From e76afe50a5abdffa92fe6184711a07b9c7f7cd82 Mon Sep 17 00:00:00 2001
From: Sanal
Date: Wed, 25 Sep 2024 15:17:34 -0700
Subject: [PATCH 001/170] Add support for raft repl dev replace member. (#546)

When replacing a member, first add the new member, then sync the raft
log for the replace entry, and finally remove the old member. Once the
new member is added, a baseline or incremental resync starts. Removing
the old member causes nuraft mesg to exit the group, and we periodically
gc the destroyed group.

Made the repl dev test base common so that both test files can use it.
Tests by default create the repl group with num_replicas members.
Dynamic tests create additional spare replicas, which can be added to
the group dynamically by calling replace member.
---
 conanfile.py                                  |   2 +-
 .../homestore/replication/repl_decls.h        |  19 +-
 src/include/homestore/replication/repl_dev.h  |   6 +-
 src/lib/replication/repl_dev/common.cpp       |   3 +-
 .../replication/repl_dev/raft_repl_dev.cpp    | 109 ++-
 src/lib/replication/repl_dev/raft_repl_dev.h  |   7 +
 .../replication/service/raft_repl_service.cpp |  16 +-
 .../replication/service/raft_repl_service.h   |   1 -
 src/tests/CMakeLists.txt                      |   6 +
 src/tests/test_common/hs_repl_test_common.hpp |  19 +-
 src/tests/test_common/raft_repl_test_base.hpp | 629 ++++++++++++++++++
 src/tests/test_raft_repl_dev.cpp              | 608 +----------------
 src/tests/test_raft_repl_dev_dynamic.cpp      | 133 ++++
 src/tests/test_solo_repl_dev.cpp              |   1 +
 14 files changed, 930 insertions(+), 629 deletions(-)
 create mode 100644 src/tests/test_common/raft_repl_test_base.hpp
 create mode 100644 src/tests/test_raft_repl_dev_dynamic.cpp

diff --git a/conanfile.py b/conanfile.py
index be8689f9b..524cd6a1d 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@

 class HomestoreConan(ConanFile):
     name = "homestore"
-    version = "6.4.61"
+    version = "6.4.62"

     homepage = "https://github.com/eBay/Homestore"
     description = "HomeStore Storage Engine"

diff --git a/src/include/homestore/replication/repl_decls.h b/src/include/homestore/replication/repl_decls.h
index ac15a53af..24c6a7571 100644
--- a/src/include/homestore/replication/repl_decls.h
+++ b/src/include/homestore/replication/repl_decls.h
@@ -19,17 +19,18 @@ namespace homestore {
 VENUM(ReplServiceError, int32_t,
       OK = 0,         // Everything OK
       CANCELLED = -1, // Request was cancelled
-      TIMEOUT = -2,
-      NOT_LEADER = -3,
-      BAD_REQUEST = -4,
-      SERVER_ALREADY_EXISTS = -5,
+      TIMEOUT = -2,
+      NOT_LEADER = -3,
+      BAD_REQUEST = -4,
+      SERVER_ALREADY_EXISTS = -5,
       CONFIG_CHANGING = -6,
-      SERVER_IS_JOINING = -7,
-      SERVER_NOT_FOUND = -8,
-      CANNOT_REMOVE_LEADER = -9,
+      SERVER_IS_JOINING = -7,
+      SERVER_NOT_FOUND = -8,
+      CANNOT_REMOVE_LEADER = -9,
       SERVER_IS_LEAVING = -10,
-      TERM_MISMATCH = -11,
-      RESULT_NOT_EXIST_YET = -10000,
+      TERM_MISMATCH = -11,
+      RETRY_REQUEST = -12,
+      RESULT_NOT_EXIST_YET = -10000,
       NOT_IMPLEMENTED = -10001,
       NO_SPACE_LEFT = -20000,
       DRIVE_WRITE_ERROR = -20001,

diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h
index 9965ada5d..15dc4872a 100644
--- a/src/include/homestore/replication/repl_dev.h
+++ b/src/include/homestore/replication/repl_dev.h
@@ -42,7 +42,8 @@ VENUM(repl_req_state_t, uint32_t,
 VENUM(journal_type_t, uint16_t,
       HS_DATA_LINKED = 0,  // Linked data where each entry will store physical blkid where data reside
       HS_DATA_INLINED = 1, // Data is inlined in the header of journal entry
-      HS_CTRL_DESTROY = 2  // Control message to destroy the repl_dev
+      HS_CTRL_DESTROY = 2, // Control message to destroy the repl_dev
+      HS_CTRL_REPLACE = 3, // Control message to replace a member
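+                           // (The journal entry for HS_CTRL_REPLACE carries a replace_members_ctx
+                           //  header; see raft_repl_dev.h.)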
 )

 struct repl_key {
@@ -346,6 +347,9 @@ class ReplDevListener {
     /// after restart in case crash happened during the destroy.
     virtual void on_destroy() = 0;

+    /// @brief Called when replace member is performed.
+    virtual void replace_member(replica_id_t member_out, replica_id_t member_in) = 0;
+
     /// @brief Called when the snapshot is being created by nuraft
     virtual AsyncReplResult<> create_snapshot(shared< snapshot_context > context) = 0;

diff --git a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp
index 71927a3ad..b8800afea 100644
--- a/src/lib/replication/repl_dev/common.cpp
+++ b/src/lib/replication/repl_dev/common.cpp
@@ -192,9 +192,10 @@ std::string repl_req_ctx::to_string() const {
 }

 std::string repl_req_ctx::to_compact_string() const {
-    if (m_op_code == journal_type_t::HS_CTRL_DESTROY) {
+    if (m_op_code == journal_type_t::HS_CTRL_DESTROY || m_op_code == journal_type_t::HS_CTRL_REPLACE) {
         return fmt::format("term={} lsn={} op={}", m_rkey.term, m_lsn, enum_name(m_op_code));
     }
+
     return fmt::format("dsn={} term={} lsn={} op={} local_blkid={} state=[{}]", m_rkey.dsn, m_rkey.term, m_lsn,
                        enum_name(m_op_code), m_local_blkid.to_string(), req_state_name(uint32_cast(state())));
 }

diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp
index 45a018d92..e928f8996 100644
--- a/src/lib/replication/repl_dev/raft_repl_dev.cpp
+++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp
@@ -107,13 +107,94 @@ bool RaftReplDev::join_group() {
         m_msg_mgr.join_group(m_group_id, "homestore_replication",
                              std::dynamic_pointer_cast< nuraft_mesg::mesg_state_mgr >(shared_from_this()));
     if (!raft_result) {
-        HS_DBG_ASSERT(false, "Unable to join the group_id={} with error={}", boost::uuids::to_string(m_group_id),
-                      raft_result.error());
+        HS_DBG_ASSERT(false, "Unable to join the group_id={} with error={}", group_id_str(), raft_result.error());
         return false;
     }
     return true;
 }

+AsyncReplResult<> RaftReplDev::replace_member(replica_id_t member_out_uuid, replica_id_t member_in_uuid) {
+    LOGINFO("Replace member group_id={} member_out={} member_in={}", group_id_str(),
+            boost::uuids::to_string(member_out_uuid), boost::uuids::to_string(member_in_uuid));
+
+    // Step 1: Check if the leader itself is the member requested to move out.
+    if (m_my_repl_id == member_out_uuid && m_my_repl_id == get_leader_id()) {
+        // If the leader is the member requested to move out, give up leadership and return an error.
+        // The client will retry the replace_member request against the new leader.
+        raft_server()->yield_leadership(true /* immediate */, -1 /* successor */);
+        RD_LOGI("Replace member: leader is the member_out, so yield leadership");
+        return make_async_error<>(ReplServiceError::NOT_LEADER);
+    }
+
+    // Step 2: Add the new member.
+    return m_msg_mgr.add_member(m_group_id, member_in_uuid)
+        .via(&folly::InlineExecutor::instance())
+        .thenValue([this, member_in_uuid, member_out_uuid](auto&& e) -> AsyncReplResult<> {
+            // TODO: Currently we ignore CANCELLED; fix nuraft_mesg to not time out when adding a
+            // member. The member is added to the cluster config before it has fully synced (at least
+            // up to the stop gap), and that sync can take a long time for block or object storage.
+            if (e.hasError()) {
+                // Ignore SERVER_ALREADY_EXISTS, as the server has already been added to the cluster.
+                // The pg member-change requests from the control path are idempotent: a request can
+                // be resent, and either the add or the remove step can fail and be retried.
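+                // Both ignored errors (CANCELLED, SERVER_ALREADY_EXISTS) fall through, so the flow
+                // continues with Steps 3 and 4 below.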
+                if (e.error() == nuraft::cmd_result_code::CANCELLED ||
+                    e.error() == nuraft::cmd_result_code::SERVER_ALREADY_EXISTS) {
+                    RD_LOGW("Ignoring error returned from nuraft add_member {}", e.error());
+                } else {
+                    RD_LOGE("Replace member error in add member : {}", e.error());
+                    return make_async_error<>(RaftReplService::to_repl_error(e.error()));
+                }
+            }
+            auto member_out = boost::uuids::to_string(member_out_uuid);
+            auto member_in = boost::uuids::to_string(member_in_uuid);
+
+            RD_LOGI("Replace member added member={} to group_id={}", member_in, group_id_str());
+
+            // Step 3: Append a log entry to mark that the old member is out and the new member is added.
+            auto rreq = repl_req_ptr_t(new repl_req_ctx{});
+            replace_members_ctx members;
+            std::copy(member_in_uuid.begin(), member_in_uuid.end(), members.in_replica_id.begin());
+            std::copy(member_out_uuid.begin(), member_out_uuid.end(), members.out_replica_id.begin());
+            sisl::blob header(r_cast< uint8_t* >(&members),
+                              members.in_replica_id.size() + members.out_replica_id.size());
+            rreq->init(
+                repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)},
+                journal_type_t::HS_CTRL_REPLACE, true, header, sisl::blob{}, 0);
+
+            auto err = m_state_machine->propose_to_raft(std::move(rreq));
+            if (err != ReplServiceError::OK) {
+                LOGERROR("Replace member propose to raft failed {}", err);
+                return make_async_error<>(std::move(err));
+            }
+
+            RD_LOGI("Replace member proposed to raft group_id={}", group_id_str());
+
+            // Step 4: Remove the old member. Even if the old member is temporarily down and later
+            // recovers, nuraft mesg sees the member-removal entry in the cluster log and calls
+            // exit_group() and leave().
+            return m_msg_mgr.rem_member(m_group_id, member_out_uuid)
+                .via(&folly::InlineExecutor::instance())
+                .thenValue([this, member_out](auto&& e) -> AsyncReplResult<> {
+                    if (e.hasError()) {
+                        // Ignore SERVER_NOT_FOUND, as the server has already been removed from the
+                        // cluster; the requests are idempotent and can be resent.
+                        if (e.error() == nuraft::cmd_result_code::SERVER_NOT_FOUND) {
+                            RD_LOGW("Remove member not found in group error, ignoring");
+                        } else {
+                            // It is OK to retry this request, as replace member is idempotent.
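+                            // Surfacing RETRY_REQUEST (new in this patch) tells the caller it is
+                            // safe to re-issue replace_member.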
+ RD_LOGE("Replace member failed to remove member : {}", e.error()); + return make_async_error<>(ReplServiceError::RETRY_REQUEST); + } + } else { + RD_LOGI("Replace member removed member={} from group_id={}", member_out, group_id_str()); + } + return make_async_success<>(); + }); + }); +} + folly::SemiFuture< ReplServiceError > RaftReplDev::destroy_group() { // Set the intent to destroy the group m_stage.update([](auto* stage) { *stage = repl_dev_stage_t::DESTROYING; }); @@ -141,7 +222,7 @@ folly::SemiFuture< ReplServiceError > RaftReplDev::destroy_group() { LOGERROR("RaftReplDev::destroy_group failed {}", err); } - LOGINFO("Raft repl dev destroy_group={}", boost::uuids::to_string(m_group_id)); + LOGINFO("Raft repl dev destroy_group={}", group_id_str()); return m_destroy_promise.getSemiFuture(); } @@ -786,6 +867,8 @@ void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { RD_LOGD("Raft channel: Commit rreq=[{}]", rreq->to_string()); if (rreq->op_code() == journal_type_t::HS_CTRL_DESTROY) { leave(); + } else if (rreq->op_code() == journal_type_t::HS_CTRL_REPLACE) { + replace_member(rreq); } else { m_listener->on_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq->local_blkid(), rreq); } @@ -820,7 +903,8 @@ void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) blkid.to_string()); }); } - } else if (rreq->op_code() == journal_type_t::HS_CTRL_DESTROY) { + } else if (rreq->op_code() == journal_type_t::HS_CTRL_DESTROY || + rreq->op_code() == journal_type_t::HS_CTRL_REPLACE) { if (rreq->is_proposer()) { m_destroy_promise.setValue(err); } } @@ -836,6 +920,17 @@ void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) rreq->clear(); } +void RaftReplDev::replace_member(repl_req_ptr_t rreq) { + auto members = r_cast< const replace_members_ctx* >(rreq->header().cbytes()); + replica_id_t member_in, member_out; + std::copy(members->out_replica_id.begin(), members->out_replica_id.end(), member_out.begin()); + std::copy(members->in_replica_id.begin(), members->in_replica_id.end(), member_in.begin()); + RD_LOGI("Raft repl replace_member member_out={} member_in={}", boost::uuids::to_string(member_out), + boost::uuids::to_string(member_in)); + + m_listener->replace_member(member_out, member_in); +} + static bool blob_equals(sisl::blob const& a, sisl::blob const& b) { if (a.size() != b.size()) { return false; } return (std::memcmp(a.cbytes(), b.cbytes(), a.size()) == 0); @@ -971,12 +1066,14 @@ void RaftReplDev::save_config(const nuraft::cluster_config& config) { std::unique_lock lg{m_config_mtx}; (*m_raft_config_sb)["config"] = serialize_cluster_config(config); m_raft_config_sb.write(); + RD_LOGI("Saved config {}", (*m_raft_config_sb)["config"].dump()); } void RaftReplDev::save_state(const nuraft::srv_state& state) { std::unique_lock lg{m_config_mtx}; (*m_raft_config_sb)["state"] = nlohmann::json{{"term", state.get_term()}, {"voted_for", state.get_voted_for()}}; m_raft_config_sb.write(); + RD_LOGI("Saved state {}", (*m_raft_config_sb)["state"].dump()); } nuraft::ptr< nuraft::srv_state > RaftReplDev::read_state() { @@ -1013,7 +1110,7 @@ uint32_t RaftReplDev::get_logstore_id() const { return m_data_journal->logstore_ std::shared_ptr< nuraft::state_machine > RaftReplDev::get_state_machine() { return m_state_machine; } void RaftReplDev::permanent_destroy() { - RD_LOGI("Permanent destroy for raft repl dev"); + RD_LOGI("Permanent destroy for raft repl dev group_id={}", group_id_str()); m_rd_sb.destroy(); m_raft_config_sb.destroy(); 
    m_data_journal->remove_store();
@@ -1035,7 +1132,7 @@ void RaftReplDev::leave() {
     m_rd_sb->destroy_pending = 0x1;
     m_rd_sb.write();

-    RD_LOGI("RaftReplDev leave group");
+    RD_LOGI("RaftReplDev leave group_id={}", group_id_str());
     m_destroy_promise.setValue(ReplServiceError::OK); // In case proposer is waiting for the destroy to complete
 }

diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h
index 41594b528..82fdcaa23 100644
--- a/src/lib/replication/repl_dev/raft_repl_dev.h
+++ b/src/lib/replication/repl_dev/raft_repl_dev.h
@@ -35,6 +35,11 @@ using raft_cluster_config_ptr_t = nuraft::ptr< nuraft::cluster_config >;

 ENUM(repl_dev_stage_t, uint8_t, INIT, ACTIVE, DESTROYING, DESTROYED, PERMANENT_DESTROYED);

+struct replace_members_ctx {
+    std::array< uint8_t, 16 > out_replica_id;
+    std::array< uint8_t, 16 > in_replica_id;
+};
+
 class RaftReplDevMetrics : public sisl::MetricsGroup {
 public:
     explicit RaftReplDevMetrics(const char* inst_name) : sisl::MetricsGroup("RaftReplDev", inst_name) {
@@ -150,6 +155,7 @@ class RaftReplDev : public ReplDev,
     virtual ~RaftReplDev() = default;

     bool join_group();
+    AsyncReplResult<> replace_member(replica_id_t member_out, replica_id_t member_in);
     folly::SemiFuture< ReplServiceError > destroy_group();

     //////////////// All ReplDev overrides/implementation ///////////////////////
@@ -268,6 +274,7 @@ class RaftReplDev : public ReplDev,
     bool wait_for_data_receive(std::vector< repl_req_ptr_t > const& rreqs, uint64_t timeout_ms);
     void on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx);
     void commit_blk(repl_req_ptr_t rreq);
+    void replace_member(repl_req_ptr_t rreq);
 };

 } // namespace homestore

diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp
index 65d928390..bbf921685 100644
--- a/src/lib/replication/service/raft_repl_service.cpp
+++ b/src/lib/replication/service/raft_repl_service.cpp
@@ -93,7 +93,12 @@ void RaftReplService::start() {
                           .with_hb_interval(HS_DYNAMIC_CONFIG(consensus.heartbeat_period_ms))
                           .with_max_append_size(HS_DYNAMIC_CONFIG(consensus.max_append_batch_size))
                           .with_log_sync_batch_size(HS_DYNAMIC_CONFIG(consensus.log_sync_batch_size))
+                          // TODO: fix the log-gap thresholds when adding a new member.
+                          // When this option is enabled, a new member doing log sync gets stuck after
+                          // the first batch, whereas with the option disabled the new member goes
+                          // through append entries and works.
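+                          // Until that is fixed, min_log_gap_to_join is compiled out: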
+#if 0 .with_log_sync_stopping_gap(HS_DYNAMIC_CONFIG(consensus.min_log_gap_to_join)) +#endif .with_stale_log_gap(HS_DYNAMIC_CONFIG(consensus.stale_log_gap_hi_threshold)) .with_fresh_log_gap(HS_DYNAMIC_CONFIG(consensus.stale_log_gap_lo_threshold)) .with_snapshot_enabled(HS_DYNAMIC_CONFIG(consensus.snapshot_freq_distance)) @@ -327,7 +332,16 @@ void RaftReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, replica_id_t member_out, replica_id_t member_in) const { - return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); + auto rdev_result = get_repl_dev(group_id); + if (!rdev_result) { return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); } + + return std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value()) + ->replace_member(member_out, member_in) + .via(&folly::InlineExecutor::instance()) + .thenValue([this](auto&& e) mutable { + if (e.hasError()) { return make_async_error<>(e.error()); } + return make_async_success<>(); + }); } ////////////////////// Reaper Thread related ////////////////////////////////// diff --git a/src/lib/replication/service/raft_repl_service.h b/src/lib/replication/service/raft_repl_service.h index a38cbbccb..cba90e2e0 100644 --- a/src/lib/replication/service/raft_repl_service.h +++ b/src/lib/replication/service/raft_repl_service.h @@ -80,7 +80,6 @@ class RaftReplService : public GenericReplService, void gc_repl_devs(); void gc_repl_reqs(); void flush_durable_commit_lsn(); - }; class RaftReplServiceCPHandler : public CPCallbacks { diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index ddbac4c94..ce8ccb422 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -118,6 +118,10 @@ if (${io_tests}) target_sources(test_raft_repl_dev PRIVATE test_raft_repl_dev.cpp) target_link_libraries(test_raft_repl_dev homestore ${COMMON_TEST_DEPS} GTest::gmock) + add_executable(test_raft_repl_dev_dynamic) + target_sources(test_raft_repl_dev_dynamic PRIVATE test_raft_repl_dev_dynamic.cpp) + target_link_libraries(test_raft_repl_dev_dynamic homestore ${COMMON_TEST_DEPS} GTest::gmock) + can_build_epoll_io_tests(epoll_tests) if(${epoll_tests}) add_test(NAME LogDev-Epoll COMMAND test_log_dev) @@ -126,6 +130,7 @@ if (${io_tests}) add_test(NAME MetaBlkMgr-Epoll COMMAND test_meta_blk_mgr) add_test(NAME DataService-Epoll COMMAND test_data_service) add_test(NAME RaftReplDev-Epoll COMMAND test_raft_repl_dev) + add_test(NAME RaftReplDevDynamic-Epoll COMMAND test_raft_repl_dev_dynamic) # add_test(NAME SoloReplDev-Epoll COMMAND test_solo_repl_dev) endif() @@ -138,6 +143,7 @@ if (${io_tests}) add_test(NAME SoloReplDev-Spdk COMMAND test_solo_repl_dev -- --spdk "true") add_test(NAME HomeRaftLogStore-Spdk COMMAND test_home_raft_logstore -- --spdk "true") add_test(NAME RaftReplDev-Spdk COMMAND test_raft_repl_dev -- --spdk "true") + add_test(NAME RaftReplDevDynamic-Spdk COMMAND test_raft_repl_dev_dynamic -- --spdk "true") if(${epoll_tests}) SET_TESTS_PROPERTIES(MetaBlkMgr-Spdk PROPERTIES DEPENDS LogStore-Spdk) SET_TESTS_PROPERTIES(DataService-Spdk PROPERTIES DEPENDS MetaBlkMgr-Spdk) diff --git a/src/tests/test_common/hs_repl_test_common.hpp b/src/tests/test_common/hs_repl_test_common.hpp index 67abe2f8e..672acffcb 100644 --- a/src/tests/test_common/hs_repl_test_common.hpp +++ b/src/tests/test_common/hs_repl_test_common.hpp @@ -38,6 +38,8 @@ SISL_OPTION_GROUP(test_repl_common_setup, (replicas, "", "replicas", "Total number of replicas", ::cxxopts::value< uint32_t 
>()->default_value("3"), "number"), + (spare_replicas, "", "spare_replicas", "Additional number of spare replicas not part of repldev", + ::cxxopts::value< uint32_t >()->default_value("1"), "number"), (base_port, "", "base_port", "Port number of first replica", ::cxxopts::value< uint16_t >()->default_value("4000"), "number"), (replica_num, "", "replica_num", @@ -134,11 +136,12 @@ class HSReplTestHelper : public HSTestHelper { HSReplTestHelper(std::string const& name, std::vector< std::string > const& args, char** argv) : name_{name}, args_{args}, argv_{argv} {} - void setup() { + void setup(uint32_t num_replicas) { + num_replicas_ = num_replicas; replica_num_ = SISL_OPTIONS["replica_num"].as< uint16_t >(); + sisl::logging::SetLogger(name_ + std::string("_replica_") + std::to_string(replica_num_)); sisl::logging::SetLogPattern("[%D %T%z] [%^%L%$] [%n] [%t] %v"); - auto const num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >(); boost::uuids::string_generator gen; for (uint32_t i{0}; i < num_replicas; ++i) { @@ -226,7 +229,7 @@ class HSReplTestHelper : public HSTestHelper { void reset_setup() { teardown(); - setup(); + setup(num_replicas_); } void restart(uint32_t shutdown_delay_secs = 5u) { @@ -273,8 +276,12 @@ class HSReplTestHelper : public HSTestHelper { if (replica_num_ == 0) { std::set< homestore::replica_id_t > members; - std::transform(members_.begin(), members_.end(), std::inserter(members, members.end()), - [](auto const& p) { return p.first; }); + // By default we create repl dev with number of members equal to replicas argument. + // We dont add spare replica's to the group by default. + for (auto& m : members_) { + if (m.second < SISL_OPTIONS["replicas"].as< uint32_t >()) { members.insert(m.first); } + } + group_id_t repl_group_id = hs_utils::gen_random_uuid(); { std::unique_lock lg(groups_mtx_); @@ -299,6 +306,7 @@ class HSReplTestHelper : public HSTestHelper { auto listener = std::move(pending_listeners_[0]); repl_groups_.insert(std::pair(group_id, listener)); pending_listeners_.erase(pending_listeners_.begin()); + LOGINFO("Got listener for group_id={} replica={}", boost::uuids::to_string(group_id), replica_num_); return listener; } @@ -346,6 +354,7 @@ class HSReplTestHelper : public HSTestHelper { std::string name_; std::vector< std::string > args_; char** argv_; + uint32_t num_replicas_; std::vector< homestore::dev_info > dev_list_; diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp new file mode 100644 index 000000000..7b96afa4c --- /dev/null +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -0,0 +1,629 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. 
+ * + *********************************************************************************/ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include "common/homestore_config.hpp" +#include "common/homestore_assert.hpp" +#include "common/homestore_utils.hpp" + +#define private public +#include "test_common/hs_repl_test_common.hpp" +#include "replication/service/raft_repl_service.h" +#include "replication/repl_dev/raft_repl_dev.h" + +using namespace homestore; + +SISL_LOGGING_DEF(test_raft_repl_dev) +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS, nuraft_mesg) + +SISL_OPTION_GROUP(test_raft_repl_dev, + (block_size, "", "block_size", "block size to io", + ::cxxopts::value< uint32_t >()->default_value("4096"), "number"), + (num_raft_groups, "", "num_raft_groups", "number of raft groups per test", + ::cxxopts::value< uint32_t >()->default_value("1"), "number"), + // for below replication parameter, their default value always get from dynamic config, only used + // when specified by user + (snapshot_distance, "", "snapshot_distance", "distance between snapshots", + ::cxxopts::value< uint32_t >()->default_value("0"), "number"), + (num_raft_logs_resv, "", "num_raft_logs_resv", "number of raft logs reserved", + ::cxxopts::value< uint32_t >()->default_value("0"), "number"), + (res_mgr_audit_timer_ms, "", "res_mgr_audit_timer_ms", "resource manager audit timer", + ::cxxopts::value< uint32_t >()->default_value("0"), "number")); + +SISL_OPTIONS_ENABLE(logging, test_raft_repl_dev, iomgr, config, test_common_setup, test_repl_common_setup) + +static std::unique_ptr< test_common::HSReplTestHelper > g_helper; +static std::random_device g_rd{}; +static std::default_random_engine g_re{g_rd()}; + +class TestReplicatedDB : public homestore::ReplDevListener { +public: + struct Key { + uint64_t id_; + bool operator<(Key const& other) const { return id_ < other.id_; } + }; + + struct Value { + int64_t lsn_; + uint64_t data_size_; + uint64_t data_pattern_; + MultiBlkId blkid_; + uint64_t id_; + }; + + struct KeyValuePair { + Key key; + Value value; + }; + + struct test_req : public repl_req_ctx { + struct journal_header { + uint64_t data_size; + uint64_t data_pattern; + }; + + journal_header jheader; + uint64_t key_id; + sisl::sg_list write_sgs; + sisl::sg_list read_sgs; + + sisl::blob header_blob() { return sisl::blob(uintptr_cast(&jheader), sizeof(journal_header)); } + sisl::blob key_blob() { return sisl::blob{uintptr_cast(&key_id), sizeof(uint64_t)}; } + + test_req() { + write_sgs.size = 0; + read_sgs.size = 0; + key_id = (uint64_t)rand() << 32 | rand(); + } + + ~test_req() { + for (auto const& iov : write_sgs.iovs) { + iomanager.iobuf_free(uintptr_cast(iov.iov_base)); + } + + for (auto const& iov : read_sgs.iovs) { + iomanager.iobuf_free(uintptr_cast(iov.iov_base)); + } + } + }; + + TestReplicatedDB() = default; + virtual ~TestReplicatedDB() = default; + + void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, MultiBlkId const& blkids, + cintrusive< repl_req_ctx >& ctx) override { + ASSERT_EQ(header.size(), sizeof(test_req::journal_header)); + + auto jheader = r_cast< test_req::journal_header const* >(header.cbytes()); + Key k{.id_ = *(r_cast< uint64_t const* >(key.cbytes()))}; + Value v{.lsn_ = lsn, + .data_size_ = jheader->data_size, + .data_pattern_ = jheader->data_pattern, + .blkid_ = blkids, + .id_ = k.id_}; + + 
LOGINFOMOD(replication, "[Replica={}] Received commit on lsn={} dsn={} key={} value[blkid={} pattern={}]",
+                   g_helper->replica_num(), lsn, ctx->dsn(), k.id_, v.blkid_.to_string(), v.data_pattern_);
+
+        {
+            std::unique_lock lk(db_mtx_);
+            inmem_db_.insert_or_assign(k, v);
+            lsn_index_.emplace(lsn, v);
+            last_committed_lsn = lsn;
+            ++commit_count_;
+        }
+
+        if (ctx->is_proposer()) { g_helper->runner().next_task(); }
+    }
+
+    bool on_pre_commit(int64_t lsn, const sisl::blob& header, const sisl::blob& key,
+                       cintrusive< repl_req_ctx >& ctx) override {
+        LOGINFOMOD(replication, "[Replica={}] Received pre-commit on lsn={} dsn={}", g_helper->replica_num(), lsn,
+                   ctx->dsn());
+        return true;
+    }
+
+    void on_rollback(int64_t lsn, const sisl::blob& header, const sisl::blob& key,
+                     cintrusive< repl_req_ctx >& ctx) override {
+        LOGINFOMOD(replication, "[Replica={}] Received rollback on lsn={}", g_helper->replica_num(), lsn);
+    }
+
+    void on_restart() {
+        LOGINFOMOD(replication, "restarted repl dev for [Replica={}] Group={}", g_helper->replica_num(),
+                   boost::uuids::to_string(repl_dev()->group_id()));
+    }
+
+    void on_error(ReplServiceError error, const sisl::blob& header, const sisl::blob& key,
+                  cintrusive< repl_req_ctx >& ctx) override {
+        LOGINFOMOD(replication, "[Replica={}] Received error={} on key={}", g_helper->replica_num(), enum_name(error),
+                   *(r_cast< uint64_t const* >(key.cbytes())));
+    }
+
+    AsyncReplResult<> create_snapshot(shared< snapshot_context > context) override {
+        std::lock_guard< std::mutex > lock(m_snapshot_lock);
+        auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot();
+        LOGINFOMOD(replication, "[Replica={}] Got snapshot callback term={} idx={}", g_helper->replica_num(),
+                   s->get_last_log_term(), s->get_last_log_idx());
+        m_last_snapshot = context;
+        return make_async_success<>();
+    }
+
+    int read_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data) override {
+        auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot();
+
+        if (snp_data->offset == 0) {
+            snp_data->is_last_obj = false;
+            snp_data->blob = sisl::io_blob_safe(sizeof(ulong));
+            LOGINFOMOD(replication,
+                       "[Replica={}] Read logical snapshot callback first message obj_id={} term={} idx={}",
+                       g_helper->replica_num(), snp_data->offset, s->get_last_log_term(), s->get_last_log_idx());
+            return 0;
+        }
+
+        int64_t next_lsn = snp_data->offset;
+        std::vector< KeyValuePair > kv_snapshot_data;
+        // We cannot use find() to get the next element: if the next lsn is a config lsn, it is not put
+        // into lsn_index_, so find() would return the end of the map. Use lower_bound() instead to get
+        // the first element to be read and transferred.
+ for (auto iter = lsn_index_.lower_bound(next_lsn); iter != lsn_index_.end(); iter++) { + auto& v = iter->second; + kv_snapshot_data.emplace_back(Key{v.id_}, v); + LOGTRACEMOD(replication, "[Replica={}] Read logical snapshot callback fetching lsn={} size={} pattern={}", + g_helper->replica_num(), v.lsn_, v.data_size_, v.data_pattern_); + if (kv_snapshot_data.size() >= 1000) { break; } + } + + if (kv_snapshot_data.size() == 0) { + snp_data->is_last_obj = true; + LOGINFOMOD(replication, "Snapshot is_last_obj is true"); + return 0; + } + + int64_t kv_snapshot_data_size = sizeof(KeyValuePair) * kv_snapshot_data.size(); + sisl::io_blob_safe blob{static_cast< uint32_t >(kv_snapshot_data_size)}; + std::memcpy(blob.bytes(), kv_snapshot_data.data(), kv_snapshot_data_size); + snp_data->blob = std::move(blob); + snp_data->is_last_obj = false; + LOGINFOMOD(replication, "[Replica={}] Read logical snapshot callback obj_id={} term={} idx={} num_items={}", + g_helper->replica_num(), snp_data->offset, s->get_last_log_term(), s->get_last_log_idx(), + kv_snapshot_data.size()); + + return 0; + } + + void snapshot_data_write(uint64_t data_size, uint64_t data_pattern, MultiBlkId& out_blkids) { + auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); + auto write_sgs = test_common::HSTestHelper::create_sgs(data_size, block_size, data_pattern); + auto fut = homestore::data_service().async_alloc_write(write_sgs, blk_alloc_hints{}, out_blkids); + std::move(fut).get(); + for (auto const& iov : write_sgs.iovs) { + iomanager.iobuf_free(uintptr_cast(iov.iov_base)); + } + } + + void write_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data) override { + auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); + auto last_committed_idx = + std::dynamic_pointer_cast< RaftReplDev >(repl_dev())->raft_server()->get_committed_log_idx(); + if (snp_data->offset == 0) { + snp_data->offset = last_committed_lsn + 1; + LOGINFOMOD(replication, "[Replica={}] Save logical snapshot callback return obj_id={}", + g_helper->replica_num(), snp_data->offset); + return; + } + + size_t kv_snapshot_data_size = snp_data->blob.size(); + if (kv_snapshot_data_size == 0) return; + + size_t num_items = kv_snapshot_data_size / sizeof(KeyValuePair); + std::unique_lock lk(db_mtx_); + auto ptr = r_cast< const KeyValuePair* >(snp_data->blob.bytes()); + for (size_t i = 0; i < num_items; i++) { + auto key = ptr->key; + auto value = ptr->value; + LOGTRACEMOD(replication, "[Replica={}] Save logical snapshot got lsn={} data_size={} data_pattern={}", + g_helper->replica_num(), value.lsn_, value.data_size_, value.data_pattern_); + + // Write to data service and inmem map. 
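+            // The blkid carried in the snapshot payload refers to the leader's data service, so
+            // allocate and write fresh local blocks, then point the value at the local blkid.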
+ MultiBlkId out_blkids; + if (value.data_size_ != 0) { + snapshot_data_write(value.data_size_, value.data_pattern_, out_blkids); + value.blkid_ = out_blkids; + } + inmem_db_.insert_or_assign(key, value); + last_committed_lsn = value.lsn_; + ++commit_count_; + ptr++; + } + + snp_data->offset = last_committed_lsn + 1; + LOGINFOMOD(replication, + "[Replica={}] Save logical snapshot callback obj_id={} term={} idx={} is_last={} num_items={}", + g_helper->replica_num(), snp_data->offset, s->get_last_log_term(), s->get_last_log_idx(), + snp_data->is_last_obj, num_items); + } + + bool apply_snapshot(shared< snapshot_context > context) override { + std::lock_guard< std::mutex > lock(m_snapshot_lock); + auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); + LOGINFOMOD(replication, "[Replica={}] Apply snapshot term={} idx={}", g_helper->replica_num(), + s->get_last_log_term(), s->get_last_log_idx()); + m_last_snapshot = context; + return true; + } + + shared< snapshot_context > last_snapshot() override { + std::lock_guard< std::mutex > lock(m_snapshot_lock); + if (!m_last_snapshot) return nullptr; + + auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(m_last_snapshot)->nuraft_snapshot(); + LOGINFOMOD(replication, "[Replica={}] Last snapshot term={} idx={}", g_helper->replica_num(), + s->get_last_log_term(), s->get_last_log_idx()); + return m_last_snapshot; + } + + void free_user_snp_ctx(void*& user_snp_ctx) override {} + + ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size) override { + return blk_alloc_hints{}; + } + void replace_member(replica_id_t member_out, replica_id_t member_in) override {} + + void on_destroy() override { + LOGINFOMOD(replication, "[Replica={}] Group={} is being destroyed", g_helper->replica_num(), + boost::uuids::to_string(repl_dev()->group_id())); + g_helper->unregister_listener(repl_dev()->group_id()); + } + + void db_write(uint64_t data_size, uint32_t max_size_per_iov) { + static std::atomic< uint32_t > s_uniq_num{0}; + auto req = intrusive< test_req >(new test_req()); + req->jheader.data_size = data_size; + req->jheader.data_pattern = ((long long)rand() << 32) | ++s_uniq_num; + auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); + + LOGINFOMOD(replication, "[Replica={}] Db write key={} data_size={} pattern={} block_size={}", + g_helper->replica_num(), req->key_id, data_size, req->jheader.data_pattern, block_size); + + if (data_size != 0) { + req->write_sgs = + test_common::HSTestHelper::create_sgs(data_size, max_size_per_iov, req->jheader.data_pattern); + } + + repl_dev()->async_alloc_write(req->header_blob(), req->key_blob(), req->write_sgs, req); + } + + void validate_db_data() { + g_helper->runner().set_num_tasks(inmem_db_.size()); + + LOGINFOMOD(replication, "[{}]: Total {} keys committed, validating them", + boost::uuids::to_string(repl_dev()->group_id()), inmem_db_.size()); + auto it = inmem_db_.begin(); + g_helper->runner().set_task([this, &it]() { + Key k; + Value v; + { + std::unique_lock lk(db_mtx_); + std::tie(k, v) = *it; + ++it; + } + + if (v.data_size_ != 0) { + auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); + auto read_sgs = test_common::HSTestHelper::create_sgs(v.data_size_, block_size); + + repl_dev()->async_read(v.blkid_, read_sgs, v.data_size_).thenValue([read_sgs, k, v](auto const ec) { + LOGINFOMOD(replication, "Validating key={} value[blkid={} pattern={}]", k.id_, v.blkid_.to_string(), + v.data_pattern_); + RELEASE_ASSERT(!ec, 
"Read of blkid={} for key={} error={}", v.blkid_.to_string(), k.id_, + ec.message()); + for (auto const& iov : read_sgs.iovs) { + test_common::HSTestHelper::validate_data_buf(uintptr_cast(iov.iov_base), iov.iov_len, + v.data_pattern_); + iomanager.iobuf_free(uintptr_cast(iov.iov_base)); + } + g_helper->runner().next_task(); + }); + } else { + g_helper->runner().next_task(); + } + }); + g_helper->runner().execute().get(); + } + + uint64_t db_commit_count() const { + std::shared_lock lk(db_mtx_); + return commit_count_; + } + + uint64_t db_size() const { + std::shared_lock lk(db_mtx_); + return inmem_db_.size(); + } + + void create_snapshot() { + auto raft_repl_dev = std::dynamic_pointer_cast< RaftReplDev >(repl_dev()); + ulong snapshot_idx = raft_repl_dev->raft_server()->create_snapshot(); + LOGINFO("Manually create snapshot got index {}", snapshot_idx); + } + + void truncate(int num_reserved_entries) { + auto raft_repl_dev = std::dynamic_pointer_cast< RaftReplDev >(repl_dev()); + raft_repl_dev->truncate(num_reserved_entries); + LOGINFO("Manually truncated"); + } + + void set_zombie() { zombie_ = true; } + bool is_zombie() { + // Wether a group is zombie(non recoverable) + return zombie_; + } + +private: + std::map< Key, Value > inmem_db_; + std::map< int64_t, Value > lsn_index_; + uint64_t commit_count_{0}; + std::shared_mutex db_mtx_; + uint64_t last_committed_lsn{0}; + std::shared_ptr< snapshot_context > m_last_snapshot{nullptr}; + std::mutex m_snapshot_lock; + bool zombie_{false}; +}; + +class RaftReplDevTestBase : public testing::Test { +public: + void SetUp() override { + // By default it will create one db + for (uint32_t i{0}; i < SISL_OPTIONS["num_raft_groups"].as< uint32_t >(); ++i) { + auto db = std::make_shared< TestReplicatedDB >(); + g_helper->register_listener(db); + dbs_.emplace_back(std::move(db)); + } + } + + void TearDown() override { + for (auto const& db : dbs_) { + if (db->is_zombie()) { continue; } + run_on_leader(db, [this, db]() { + auto err = hs()->repl_service().remove_repl_dev(db->repl_dev()->group_id()).get(); + ASSERT_EQ(err, ReplServiceError::OK) << "Error in destroying the group"; + }); + } + + for (auto const& db : dbs_) { + if (db->is_zombie()) { continue; } + auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); + int i = 0; + bool force_leave = false; + do { + std::this_thread::sleep_for(std::chrono::seconds(1)); + auto& raft_repl_svc = dynamic_cast< RaftReplService& >(hs()->repl_service()); + raft_repl_svc.gc_repl_devs(); + LOGINFO("Waiting for repl dev to get destroyed"); + + // TODO: if leader is destroyed, but the follower does not receive the notification, it will not be + // destroyed for ever. we need handle this in raft_repl_dev. revisit here after making changes at + // raft_repl_dev side to hanle this case. this is a workaround to avoid the infinite loop for now. 
+ if (i++ > 10 && !force_leave) { + LOGWARN("has already waited for repl dev to get destroyed for 10 times, so do a force leave"); + repl_dev->force_leave(); + force_leave = true; + } + + } while (!repl_dev->is_destroyed()); + } + } + + void generate_writes(uint64_t data_size, uint32_t max_size_per_iov, shared< TestReplicatedDB > db = nullptr) { + if (db == nullptr) { db = pick_one_db(); } + // LOGINFO("Writing on group_id={}", db->repl_dev()->group_id()); + db->db_write(data_size, max_size_per_iov); + } + + void wait_for_all_commits() { wait_for_commits(written_entries_); } + + void wait_for_commits(uint64_t exp_writes) { + uint64_t total_writes{0}; + while (true) { + total_writes = 0; + for (auto const& db : dbs_) { + total_writes += db->db_commit_count(); + } + + if (total_writes >= exp_writes) { break; } + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + LOGINFO("Replica={} received {} commits but expected {}", g_helper->replica_num(), total_writes, + exp_writes); + } + LOGINFO("Replica={} has received {} commits as expected", g_helper->replica_num(), total_writes); + } + + void validate_data() { + for (auto const& db : dbs_) { + db->validate_db_data(); + } + } + + shared< TestReplicatedDB > pick_one_db() { return dbs_[0]; } + + void assign_leader(uint16_t replica) { + LOGINFO("Switch the leader to replica_num = {}", replica); + if (g_helper->replica_num() == replica) { + for (auto const& db : dbs_) { + do { + auto result = db->repl_dev()->become_leader().get(); + if (result.hasError()) { + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } else { + break; + } + } while (true); + } + } else { + for (auto const& db : dbs_) { + homestore::replica_id_t leader_uuid; + while (true) { + leader_uuid = db->repl_dev()->get_leader_id(); + if (!leader_uuid.is_nil() && (g_helper->member_id(leader_uuid) == replica)) { break; } + + LOGINFO("Waiting for replica={} to become leader", replica); + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + } + } + } + } + + void run_on_leader(std::shared_ptr< TestReplicatedDB > db, auto&& lambda) { + do { + auto leader_uuid = db->repl_dev()->get_leader_id(); + + if (leader_uuid.is_nil()) { + LOGINFO("Waiting for leader to be elected for group={}", db->repl_dev()->group_id()); + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + } else if (leader_uuid == g_helper->my_replica_id()) { + lambda(); + break; + } else { + break; + } + } while (true); + } + + void write_on_leader(uint32_t num_entries, bool wait_for_commit = true, shared< TestReplicatedDB > db = nullptr) { + do { + auto leader_uuid = dbs_[0]->repl_dev()->get_leader_id(); + + if (leader_uuid.is_nil()) { + LOGINFO("Waiting for leader to be elected"); + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + } else if (leader_uuid == g_helper->my_replica_id()) { + LOGINFO("Writing {} entries since I am the leader my_uuid={}", num_entries, + boost::uuids::to_string(g_helper->my_replica_id())); + auto const block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); + g_helper->runner().set_num_tasks(num_entries); + + LOGINFO("Run on worker threads to schedule append on repldev for {} Bytes.", block_size); + g_helper->runner().set_task([this, block_size, db]() { + static std::normal_distribution<> num_blks_gen{3.0, 2.0}; + this->generate_writes(std::abs(std::lround(num_blks_gen(g_re))) * block_size, block_size, db); + }); + if (wait_for_commit) { g_helper->runner().execute().get(); } + break; + } else { + LOGINFO("{} entries were written on the 
leader_uuid={} my_uuid={}", num_entries, + boost::uuids::to_string(leader_uuid), boost::uuids::to_string(g_helper->my_replica_id())); + break; + } + } while (true); + + written_entries_ += num_entries; + if (wait_for_commit) { this->wait_for_all_commits(); } + } + + void remove_db(std::shared_ptr< TestReplicatedDB > db, bool wait_for_removal) { + this->run_on_leader(db, [this, db]() { + auto err = hs()->repl_service().remove_repl_dev(db->repl_dev()->group_id()).get(); + ASSERT_EQ(err, ReplServiceError::OK) << "Error in destroying the group"; + }); + + // Remove the db from the dbs_ list and check if count matches with repl_device + for (auto it = dbs_.begin(); it != dbs_.end(); ++it) { + if (*it == db) { + dbs_.erase(it); + break; + } + } + + if (wait_for_removal) { wait_for_listener_destroy(dbs_.size()); } + } + + void wait_for_listener_destroy(uint64_t exp_listeners) { + while (true) { + auto total_listeners = g_helper->num_listeners(); + if (total_listeners == exp_listeners) { break; } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + } + + void restart_replica(uint16_t replica, uint32_t shutdown_delay_sec = 5u) { + if (g_helper->replica_num() == replica) { + LOGINFO("Restart homestore: replica_num = {}", replica); + g_helper->restart(shutdown_delay_sec); + // g_helper->sync_for_test_start(); + } else { + LOGINFO("Wait for replica={} to completely go down and removed from alive raft-groups", replica); + std::this_thread::sleep_for(std::chrono::seconds{5}); + } + } + + void shutdown_replica(uint16_t replica) { + if (g_helper->replica_num() == replica) { + LOGINFO("Shutdown homestore: replica_num = {}", replica); + g_helper->shutdown(); + } else { + LOGINFO("Wait for replica={} to completely go down and removed from alive raft-groups", replica); + std::this_thread::sleep_for(std::chrono::seconds{5}); + } + } + + void start_replica(uint16_t replica) { + if (g_helper->replica_num() == replica) { + LOGINFO("Start homestore: replica_num = {}", replica); + g_helper->start(); + } + } + + void create_snapshot() { dbs_[0]->create_snapshot(); } + void truncate(int num_reserved_entries) { dbs_[0]->truncate(num_reserved_entries); } + + void replace_member(std::shared_ptr< TestReplicatedDB > db, replica_id_t member_out, replica_id_t member_in) { + this->run_on_leader(db, [this, db, member_out, member_in]() { + LOGINFO("Replace member out={} in={}", boost::uuids::to_string(member_out), + boost::uuids::to_string(member_in)); + auto v = hs()->repl_service().replace_member(db->repl_dev()->group_id(), member_out, member_in).get(); + ASSERT_EQ(v.hasError(), false) << "Error in replacing member"; + }); + } + +protected: + std::vector< std::shared_ptr< TestReplicatedDB > > dbs_; + uint32_t written_entries_{0}; + +#ifdef _PRERELEASE + flip::FlipClient m_fc{iomgr_flip::instance()}; +#endif +}; diff --git a/src/tests/test_raft_repl_dev.cpp b/src/tests/test_raft_repl_dev.cpp index caf28e0ee..9ccc40dfc 100644 --- a/src/tests/test_raft_repl_dev.cpp +++ b/src/tests/test_raft_repl_dev.cpp @@ -12,609 +12,9 @@ * specific language governing permissions and limitations under the License. 
* *********************************************************************************/ -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include "common/homestore_config.hpp" -#include "common/homestore_assert.hpp" -#include "common/homestore_utils.hpp" - -#define private public -#include "test_common/hs_repl_test_common.hpp" -#include "replication/service/raft_repl_service.h" -#include "replication/repl_dev/raft_repl_dev.h" - -using namespace homestore; - -SISL_LOGGING_DEF(test_raft_repl_dev) -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS, nuraft_mesg) - -SISL_OPTION_GROUP(test_raft_repl_dev, - (block_size, "", "block_size", "block size to io", - ::cxxopts::value< uint32_t >()->default_value("4096"), "number"), - (num_raft_groups, "", "num_raft_groups", "number of raft groups per test", - ::cxxopts::value< uint32_t >()->default_value("1"), "number"), - // for below replication parameter, their default value always get from dynamic config, only used - // when specified by user - (snapshot_distance, "", "snapshot_distance", "distance between snapshots", - ::cxxopts::value< uint32_t >()->default_value("0"), "number"), - (num_raft_logs_resv, "", "num_raft_logs_resv", "number of raft logs reserved", - ::cxxopts::value< uint32_t >()->default_value("0"), "number"), - (res_mgr_audit_timer_ms, "", "res_mgr_audit_timer_ms", "resource manager audit timer", - ::cxxopts::value< uint32_t >()->default_value("0"), "number")); - -SISL_OPTIONS_ENABLE(logging, test_raft_repl_dev, iomgr, config, test_common_setup, test_repl_common_setup) - -static std::unique_ptr< test_common::HSReplTestHelper > g_helper; -static std::random_device g_rd{}; -static std::default_random_engine g_re{g_rd()}; - -class TestReplicatedDB : public homestore::ReplDevListener { -public: - struct Key { - uint64_t id_; - bool operator<(Key const& other) const { return id_ < other.id_; } - }; - - struct Value { - int64_t lsn_; - uint64_t data_size_; - uint64_t data_pattern_; - MultiBlkId blkid_; - uint64_t id_; - }; - - struct KeyValuePair { - Key key; - Value value; - }; - - struct test_req : public repl_req_ctx { - struct journal_header { - uint64_t data_size; - uint64_t data_pattern; - }; - - journal_header jheader; - uint64_t key_id; - sisl::sg_list write_sgs; - sisl::sg_list read_sgs; - - sisl::blob header_blob() { return sisl::blob(uintptr_cast(&jheader), sizeof(journal_header)); } - sisl::blob key_blob() { return sisl::blob{uintptr_cast(&key_id), sizeof(uint64_t)}; } - - test_req() { - write_sgs.size = 0; - read_sgs.size = 0; - key_id = (uint64_t)rand() << 32 | rand(); - } - - ~test_req() { - for (auto const& iov : write_sgs.iovs) { - iomanager.iobuf_free(uintptr_cast(iov.iov_base)); - } - - for (auto const& iov : read_sgs.iovs) { - iomanager.iobuf_free(uintptr_cast(iov.iov_base)); - } - } - }; - - TestReplicatedDB() = default; - virtual ~TestReplicatedDB() = default; - - void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, MultiBlkId const& blkids, - cintrusive< repl_req_ctx >& ctx) override { - ASSERT_EQ(header.size(), sizeof(test_req::journal_header)); - - auto jheader = r_cast< test_req::journal_header const* >(header.cbytes()); - Key k{.id_ = *(r_cast< uint64_t const* >(key.cbytes()))}; - Value v{.lsn_ = lsn, - .data_size_ = jheader->data_size, - .data_pattern_ = jheader->data_pattern, - .blkid_ = blkids, - .id_ = k.id_}; - - LOGINFOMOD(replication, 
"[Replica={}] Received commit on lsn={} dsn={} key={} value[blkid={} pattern={}]", - g_helper->replica_num(), lsn, ctx->dsn(), k.id_, v.blkid_.to_string(), v.data_pattern_); - - { - std::unique_lock lk(db_mtx_); - inmem_db_.insert_or_assign(k, v); - lsn_index_.emplace(lsn, v); - last_data_committed_lsn = lsn; - ++commit_count_; - } - - if (ctx->is_proposer()) { g_helper->runner().next_task(); } - } - - bool on_pre_commit(int64_t lsn, const sisl::blob& header, const sisl::blob& key, - cintrusive< repl_req_ctx >& ctx) override { - LOGINFOMOD(replication, "[Replica={}] Received pre-commit on lsn={} dsn={}", g_helper->replica_num(), lsn, - ctx->dsn()); - return true; - } - - void on_rollback(int64_t lsn, const sisl::blob& header, const sisl::blob& key, - cintrusive< repl_req_ctx >& ctx) override { - LOGINFOMOD(replication, "[Replica={}] Received rollback on lsn={}", g_helper->replica_num(), lsn); - } - - void on_restart() { - LOGINFOMOD(replication, "restarted repl dev for [Replica={}] Group={}", g_helper->replica_num(), - boost::uuids::to_string(repl_dev()->group_id())); - } - - void on_error(ReplServiceError error, const sisl::blob& header, const sisl::blob& key, - cintrusive< repl_req_ctx >& ctx) override { - LOGINFOMOD(replication, "[Replica={}] Received error={} on key={}", g_helper->replica_num(), enum_name(error), - *(r_cast< uint64_t const* >(key.cbytes()))); - } - - AsyncReplResult<> create_snapshot(shared< snapshot_context > context) override { - std::lock_guard< std::mutex > lock(m_snapshot_lock); - auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); - LOGINFOMOD(replication, "[Replica={}] Got snapshot callback term={} idx={}", g_helper->replica_num(), - s->get_last_log_term(), s->get_last_log_idx()); - m_last_snapshot = context; - return make_async_success<>(); - } - - int read_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data) override { - auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); - - if (snp_data->offset == 0) { - snp_data->is_last_obj = false; - snp_data->blob = sisl::io_blob_safe(sizeof(ulong)); - LOGINFOMOD(replication, "[Replica={}] Read logical snapshot callback obj_id={} term={} idx={}", - g_helper->replica_num(), snp_data->offset, s->get_last_log_term(), s->get_last_log_idx()); - return 0; - } - - int64_t next_lsn = snp_data->offset; - std::vector< KeyValuePair > kv_snapshot_data; - // we can not use find to get the next element, since if the next lsn is a config lsn , it will not be put into - // lsn_index_ and as a result, the find will return the end of the map. so here we use lower_bound to get the - // first element to be read and transfered. 
- for (auto iter = lsn_index_.lower_bound(next_lsn); iter != lsn_index_.end(); iter++) { - auto& v = iter->second; - kv_snapshot_data.emplace_back(Key{v.id_}, v); - LOGTRACEMOD(replication, "[Replica={}] Read logical snapshot callback fetching lsn={} size={} pattern={}", - g_helper->replica_num(), v.lsn_, v.data_size_, v.data_pattern_); - if (kv_snapshot_data.size() >= 1000) { break; } - } - - if (kv_snapshot_data.size() == 0) { - snp_data->is_last_obj = true; - LOGINFOMOD(replication, "Snapshot is_last_obj is true"); - return 0; - } - - int64_t kv_snapshot_data_size = sizeof(KeyValuePair) * kv_snapshot_data.size(); - sisl::io_blob_safe blob{static_cast< uint32_t >(kv_snapshot_data_size)}; - std::memcpy(blob.bytes(), kv_snapshot_data.data(), kv_snapshot_data_size); - snp_data->blob = std::move(blob); - snp_data->is_last_obj = false; - LOGINFOMOD(replication, "[Replica={}] Read logical snapshot callback obj_id={} term={} idx={} num_items={}", - g_helper->replica_num(), snp_data->offset, s->get_last_log_term(), s->get_last_log_idx(), - kv_snapshot_data.size()); - - return 0; - } - - void snapshot_data_write(uint64_t data_size, uint64_t data_pattern, MultiBlkId& out_blkids) { - auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); - auto write_sgs = test_common::HSTestHelper::create_sgs(data_size, block_size, data_pattern); - auto fut = homestore::data_service().async_alloc_write(write_sgs, blk_alloc_hints{}, out_blkids); - std::move(fut).get(); - for (auto const& iov : write_sgs.iovs) { - iomanager.iobuf_free(uintptr_cast(iov.iov_base)); - } - } - - void write_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data) override { - auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); - if (snp_data->offset == 0) { - snp_data->offset = last_data_committed_lsn + 1; - LOGINFOMOD(replication, "[Replica={}] Save logical snapshot callback return obj_id={}", - g_helper->replica_num(), snp_data->offset); - return; - } - - size_t kv_snapshot_data_size = snp_data->blob.size(); - if (kv_snapshot_data_size == 0) return; - - size_t num_items = kv_snapshot_data_size / sizeof(KeyValuePair); - std::unique_lock lk(db_mtx_); - auto ptr = r_cast< const KeyValuePair* >(snp_data->blob.bytes()); - for (size_t i = 0; i < num_items; i++) { - auto key = ptr->key; - auto value = ptr->value; - LOGTRACEMOD(replication, "[Replica={}] Save logical snapshot got lsn={} data_size={} data_pattern={}", - g_helper->replica_num(), value.lsn_, value.data_size_, value.data_pattern_); - - // Write to data service and inmem map. - MultiBlkId out_blkids; - if (value.data_size_ != 0) { - snapshot_data_write(value.data_size_, value.data_pattern_, out_blkids); - value.blkid_ = out_blkids; - } - last_data_committed_lsn = value.lsn_; - inmem_db_.insert_or_assign(key, value); - ++commit_count_; - ptr++; - } - - LOGINFOMOD(replication, - "[Replica={}] Save logical snapshot callback obj_id={} term={} idx={} is_last={} num_items={}", - g_helper->replica_num(), snp_data->offset, s->get_last_log_term(), s->get_last_log_idx(), - snp_data->is_last_obj, num_items); - - // before we finish install snapshot, raft_server()->get_committed_log_idx() will always be the same. so we need - // last_data_committed_lsn to notify leader to transfer new data to follower. 
- snp_data->offset = last_data_committed_lsn + 1; - } - - bool apply_snapshot(shared< snapshot_context > context) override { - std::lock_guard< std::mutex > lock(m_snapshot_lock); - auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); - LOGINFOMOD(replication, "[Replica={}] Apply snapshot term={} idx={}", g_helper->replica_num(), - s->get_last_log_term(), s->get_last_log_idx()); - m_last_snapshot = context; - return true; - } - - shared< snapshot_context > last_snapshot() override { - std::lock_guard< std::mutex > lock(m_snapshot_lock); - if (!m_last_snapshot) return nullptr; - - auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(m_last_snapshot)->nuraft_snapshot(); - LOGINFOMOD(replication, "[Replica={}] Last snapshot term={} idx={}", g_helper->replica_num(), - s->get_last_log_term(), s->get_last_log_idx()); - return m_last_snapshot; - } - - void free_user_snp_ctx(void*& user_snp_ctx) override {} - - ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size) override { - return blk_alloc_hints{}; - } - - void on_destroy() override { - LOGINFOMOD(replication, "[Replica={}] Group={} is being destroyed", g_helper->replica_num(), - boost::uuids::to_string(repl_dev()->group_id())); - g_helper->unregister_listener(repl_dev()->group_id()); - } - - void db_write(uint64_t data_size, uint32_t max_size_per_iov) { - static std::atomic< uint32_t > s_uniq_num{0}; - auto req = intrusive< test_req >(new test_req()); - req->jheader.data_size = data_size; - req->jheader.data_pattern = ((long long)rand() << 32) | ++s_uniq_num; - auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); - - LOGINFOMOD(replication, "[Replica={}] Db write key={} data_size={} pattern={} block_size={}", - g_helper->replica_num(), req->key_id, data_size, req->jheader.data_pattern, block_size); - - if (data_size != 0) { - req->write_sgs = - test_common::HSTestHelper::create_sgs(data_size, max_size_per_iov, req->jheader.data_pattern); - } - - repl_dev()->async_alloc_write(req->header_blob(), req->key_blob(), req->write_sgs, req); - } - - void validate_db_data() { - g_helper->runner().set_num_tasks(inmem_db_.size()); - - LOGINFOMOD(replication, "[{}]: Total {} keys committed, validating them", - boost::uuids::to_string(repl_dev()->group_id()), inmem_db_.size()); - auto it = inmem_db_.begin(); - g_helper->runner().set_task([this, &it]() { - Key k; - Value v; - { - std::unique_lock lk(db_mtx_); - std::tie(k, v) = *it; - ++it; - } - - if (v.data_size_ != 0) { - auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); - auto read_sgs = test_common::HSTestHelper::create_sgs(v.data_size_, block_size); - - repl_dev()->async_read(v.blkid_, read_sgs, v.data_size_).thenValue([read_sgs, k, v](auto const ec) { - LOGINFOMOD(replication, "Validating key={} value[blkid={} pattern={}]", k.id_, v.blkid_.to_string(), - v.data_pattern_); - RELEASE_ASSERT(!ec, "Read of blkid={} for key={} error={}", v.blkid_.to_string(), k.id_, - ec.message()); - for (auto const& iov : read_sgs.iovs) { - test_common::HSTestHelper::validate_data_buf(uintptr_cast(iov.iov_base), iov.iov_len, - v.data_pattern_); - iomanager.iobuf_free(uintptr_cast(iov.iov_base)); - } - g_helper->runner().next_task(); - }); - } else { - g_helper->runner().next_task(); - } - }); - g_helper->runner().execute().get(); - } +#include "test_common/raft_repl_test_base.hpp" - uint64_t db_commit_count() const { - std::shared_lock lk(db_mtx_); - return commit_count_; - } - - uint64_t db_size() const { - 
std::shared_lock lk(db_mtx_); - return inmem_db_.size(); - } - - void create_snapshot() { - auto raft_repl_dev = std::dynamic_pointer_cast< RaftReplDev >(repl_dev()); - ulong snapshot_idx = raft_repl_dev->raft_server()->create_snapshot(); - LOGINFO("Manually create snapshot got index {}", snapshot_idx); - } - - void truncate(int num_reserved_entries) { - auto raft_repl_dev = std::dynamic_pointer_cast< RaftReplDev >(repl_dev()); - raft_repl_dev->truncate(num_reserved_entries); - LOGINFO("Manually truncated"); - } - - void set_zombie() { zombie_ = true; } - bool is_zombie() { - // Wether a group is zombie(non recoverable) - return zombie_; - } - -private: - std::map< Key, Value > inmem_db_; - std::map< int64_t, Value > lsn_index_; - uint64_t commit_count_{0}; - // this is the last lsn for data, might not be the same with the real last committed lsn - // which should be get by raft_server()->get_committed_log_idx() - uint64_t last_data_committed_lsn{0}; - std::shared_mutex db_mtx_; - std::shared_ptr< snapshot_context > m_last_snapshot{nullptr}; - std::mutex m_snapshot_lock; - bool zombie_{false}; -}; - -class RaftReplDevTest : public testing::Test { -public: - void SetUp() override { - // By default it will create one db - for (uint32_t i{0}; i < SISL_OPTIONS["num_raft_groups"].as< uint32_t >(); ++i) { - auto db = std::make_shared< TestReplicatedDB >(); - g_helper->register_listener(db); - dbs_.emplace_back(std::move(db)); - } - } - - void TearDown() override { - for (auto const& db : dbs_) { - if (db->is_zombie()) { continue; } - run_on_leader(db, [this, db]() { - auto err = hs()->repl_service().remove_repl_dev(db->repl_dev()->group_id()).get(); - ASSERT_EQ(err, ReplServiceError::OK) << "Error in destroying the group"; - }); - } - - for (auto const& db : dbs_) { - if (db->is_zombie()) { continue; } - auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); - int i = 0; - bool force_leave = false; - do { - std::this_thread::sleep_for(std::chrono::seconds(1)); - auto& raft_repl_svc = dynamic_cast< RaftReplService& >(hs()->repl_service()); - raft_repl_svc.gc_repl_devs(); - LOGINFO("Waiting for repl dev to get destroyed"); - - // TODO: if leader is destroyed, but the follower does not receive the notification, it will not be - // destroyed for ever. we need handle this in raft_repl_dev. revisit here after making changes at - // raft_repl_dev side to hanle this case. this is a workaround to avoid the infinite loop for now. 
- if (i++ > 10 && !force_leave) { - LOGWARN("has already waited for repl dev to get destroyed for 10 times, so do a force leave"); - repl_dev->force_leave(); - force_leave = true; - } - - } while (!repl_dev->is_destroyed()); - } - } - - void generate_writes(uint64_t data_size, uint32_t max_size_per_iov, shared< TestReplicatedDB > db = nullptr) { - if (db == nullptr) { db = pick_one_db(); } - // LOGINFO("Writing on group_id={}", db->repl_dev()->group_id()); - db->db_write(data_size, max_size_per_iov); - } - - void wait_for_all_commits() { wait_for_commits(written_entries_); } - - void wait_for_commits(uint64_t exp_writes) { - uint64_t total_writes{0}; - while (true) { - total_writes = 0; - for (auto const& db : dbs_) { - total_writes += db->db_commit_count(); - } - - if (total_writes >= exp_writes) { break; } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } - LOGINFO("Replica={} has received {} commits as expected", g_helper->replica_num(), total_writes); - } - - void validate_data() { - for (auto const& db : dbs_) { - db->validate_db_data(); - } - } - - shared< TestReplicatedDB > pick_one_db() { return dbs_[0]; } - - void assign_leader(uint16_t replica) { - LOGINFO("Switch the leader to replica_num = {}", replica); - if (g_helper->replica_num() == replica) { - for (auto const& db : dbs_) { - do { - auto result = db->repl_dev()->become_leader().get(); - if (result.hasError()) { - std::this_thread::sleep_for(std::chrono::milliseconds(1000)); - } else { - break; - } - } while (true); - } - } else { - for (auto const& db : dbs_) { - homestore::replica_id_t leader_uuid; - while (true) { - leader_uuid = db->repl_dev()->get_leader_id(); - if (!leader_uuid.is_nil() && (g_helper->member_id(leader_uuid) == replica)) { break; } - - LOGINFO("Waiting for replica={} to become leader", replica); - std::this_thread::sleep_for(std::chrono::milliseconds{500}); - } - } - } - } - - void run_on_leader(std::shared_ptr< TestReplicatedDB > db, auto&& lambda) { - do { - auto leader_uuid = db->repl_dev()->get_leader_id(); - - if (leader_uuid.is_nil()) { - LOGINFO("Waiting for leader to be elected for group={}", db->repl_dev()->group_id()); - std::this_thread::sleep_for(std::chrono::milliseconds{500}); - } else if (leader_uuid == g_helper->my_replica_id()) { - lambda(); - break; - } else { - break; - } - } while (true); - } - - void write_on_leader(uint32_t num_entries, bool wait_for_commit = true, shared< TestReplicatedDB > db = nullptr) { - do { - auto leader_uuid = dbs_[0]->repl_dev()->get_leader_id(); - - if (leader_uuid.is_nil()) { - LOGINFO("Waiting for leader to be elected"); - std::this_thread::sleep_for(std::chrono::milliseconds{500}); - } else if (leader_uuid == g_helper->my_replica_id()) { - LOGINFO("Writing {} entries since I am the leader my_uuid={}", num_entries, - boost::uuids::to_string(g_helper->my_replica_id())); - auto const block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); - g_helper->runner().set_num_tasks(num_entries); - - LOGINFO("Run on worker threads to schedule append on repldev for {} Bytes.", block_size); - g_helper->runner().set_task([this, block_size, db]() { - static std::normal_distribution<> num_blks_gen{3.0, 2.0}; - this->generate_writes(std::abs(std::lround(num_blks_gen(g_re))) * block_size, block_size, db); - }); - if (wait_for_commit) { g_helper->runner().execute().get(); } - break; - } else { - LOGINFO("{} entries were written on the leader_uuid={} my_uuid={}", num_entries, - boost::uuids::to_string(leader_uuid), 
boost::uuids::to_string(g_helper->my_replica_id())); - break; - } - } while (true); - - written_entries_ += num_entries; - if (wait_for_commit) { this->wait_for_all_commits(); } - } - - void remove_db(std::shared_ptr< TestReplicatedDB > db, bool wait_for_removal) { - this->run_on_leader(db, [this, db]() { - auto err = hs()->repl_service().remove_repl_dev(db->repl_dev()->group_id()).get(); - ASSERT_EQ(err, ReplServiceError::OK) << "Error in destroying the group"; - }); - - // Remove the db from the dbs_ list and check if count matches with repl_device - for (auto it = dbs_.begin(); it != dbs_.end(); ++it) { - if (*it == db) { - dbs_.erase(it); - break; - } - } - - if (wait_for_removal) { wait_for_listener_destroy(dbs_.size()); } - } - - void wait_for_listener_destroy(uint64_t exp_listeners) { - while (true) { - auto total_listeners = g_helper->num_listeners(); - if (total_listeners == exp_listeners) { break; } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } - } - - void restart_replica(uint16_t replica, uint32_t shutdown_delay_sec = 5u) { - if (g_helper->replica_num() == replica) { - LOGINFO("Restart homestore: replica_num = {}", replica); - g_helper->restart(shutdown_delay_sec); - // g_helper->sync_for_test_start(); - } else { - LOGINFO("Wait for replica={} to completely go down and removed from alive raft-groups", replica); - std::this_thread::sleep_for(std::chrono::seconds{5}); - } - } - - void shutdown_replica(uint16_t replica) { - if (g_helper->replica_num() == replica) { - LOGINFO("Shutdown homestore: replica_num = {}", replica); - g_helper->shutdown(); - } else { - LOGINFO("Wait for replica={} to completely go down and removed from alive raft-groups", replica); - std::this_thread::sleep_for(std::chrono::seconds{5}); - } - } - - void start_replica(uint16_t replica) { - if (g_helper->replica_num() == replica) { - LOGINFO("Start homestore: replica_num = {}", replica); - g_helper->start(); - } - } - - void create_snapshot() { dbs_[0]->create_snapshot(); } - void truncate(int num_reserved_entries) { dbs_[0]->truncate(num_reserved_entries); } - -protected: - std::vector< std::shared_ptr< TestReplicatedDB > > dbs_; - uint32_t written_entries_{0}; - -#ifdef _PRERELEASE - flip::FlipClient m_fc{iomgr_flip::instance()}; -#endif -}; +class RaftReplDevTest : public RaftReplDevTestBase {}; TEST_F(RaftReplDevTest, Write_Restart_Write) { LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); @@ -1015,7 +415,6 @@ int main(int argc, char* argv[]) { // Snapshot and truncation tests needs num reserved to be 0 and distance 10. s.consensus.num_reserved_log_items = 0; - s.consensus.snapshot_freq_distance = 10; s.resource_limits.resource_audit_timer_ms = 0; // only reset when user specified the value for test; @@ -1033,7 +432,8 @@ int main(int argc, char* argv[]) { FLAGS_folly_global_cpu_executor_threads = 4; g_helper = std::make_unique< test_common::HSReplTestHelper >("test_raft_repl_dev", args, orig_argv); - g_helper->setup(); + // No spare replica's are created. Test cases in this file expects fixed number of replica's. 
+    g_helper->setup(SISL_OPTIONS["replicas"].as< uint32_t >());
 
     auto ret = RUN_ALL_TESTS();
     g_helper->teardown();
diff --git a/src/tests/test_raft_repl_dev_dynamic.cpp b/src/tests/test_raft_repl_dev_dynamic.cpp
new file mode 100644
index 000000000..7bd69a13c
--- /dev/null
+++ b/src/tests/test_raft_repl_dev_dynamic.cpp
@@ -0,0 +1,133 @@
+/*********************************************************************************
+ * Modifications Copyright 2017-2019 eBay Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed
+ * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+ * CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ *
+ *********************************************************************************/
+#include "test_common/raft_repl_test_base.hpp"
+
+// Dynamic tests also spawn spare replicas, which can be added to and removed from a repl dev.
+class ReplDevDynamicTest : public RaftReplDevTestBase {};
+
+TEST_F(ReplDevDynamicTest, ReplaceMember) {
+    // Write some IOs, replace a member, then validate data on all members except the one moved out.
+    LOGINFO("Homestore replica={} setup completed", g_helper->replica_num());
+    auto db = dbs_.back();
+    auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >();
+    auto num_members = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >();
+    uint64_t num_io_entries = SISL_OPTIONS["num_io"].as< uint64_t >();
+
+    // Replace the last member in the group, with index (num_replicas - 1), with a spare
+    // replica with index (num_replicas). Member ids are 0,...,num_replicas-1, num_replicas,...,N
+    uint32_t member_out = num_replicas - 1;
+    uint32_t member_in = num_replicas;
+
+    g_helper->sync_for_test_start(num_members);
+    if (g_helper->replica_num() < num_replicas) {
+        // With the existing raft repl dev group, write IOs, validate, and call replace_member on the leader.
+        LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num());
+        this->write_on_leader(num_io_entries, true /* wait_for_commit */);
+
+        replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in));
+        std::this_thread::sleep_for(std::chrono::seconds(3));
+    } else if (g_helper->replica_num() == member_in) {
+        LOGINFO("Wait for commits replica={}", g_helper->replica_num());
+        wait_for_commits(num_io_entries);
+    }
+
+    g_helper->sync_for_verify_start(num_members);
+    LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num());
+    if (g_helper->replica_num() != member_out) {
+        // Skip the member which is going to be replaced. Validate data on all other replicas.
+        LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num());
+        this->validate_data();
+    } else {
+        // The out member will have the repl dev destroyed.
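+        // [Editor's note] On the polling loop below (an editorial sketch of intent, not a
+        // behavioral change): the reaper thread normally runs gc_repl_devs() on its own timer,
+        // so the test invokes it directly between polls of is_destroyed() to speed things up.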
+        auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev());
+        do {
+            std::this_thread::sleep_for(std::chrono::seconds(1));
+            auto& raft_repl_svc = dynamic_cast< RaftReplService& >(hs()->repl_service());
+            raft_repl_svc.gc_repl_devs();
+            LOGINFO("Waiting for repl dev to get destroyed on out member replica={}", g_helper->replica_num());
+        } while (!repl_dev->is_destroyed());
+        LOGINFO("Repl dev destroyed on out member replica={}", g_helper->replica_num());
+    }
+
+    g_helper->sync_for_cleanup_start(num_members);
+    LOGINFO("ReplaceMember test done");
+}
+
+// TODO: add more tests with leader and member restart, multiple member replace,
+// leader replace, and commit quorum.
+
+int main(int argc, char* argv[]) {
+    int parsed_argc = argc;
+    char** orig_argv = argv;
+
+    // Save the args for replica use
+    std::vector< std::string > args;
+    for (int i = 0; i < argc; ++i) {
+        args.emplace_back(argv[i]);
+    }
+
+    ::testing::InitGoogleTest(&parsed_argc, argv);
+
+    SISL_OPTIONS_LOAD(parsed_argc, argv, logging, config, test_raft_repl_dev, iomgr, test_common_setup,
+                      test_repl_common_setup);
+
+    //
+    // Entire test suite assumes that once a replica takes over as leader, it stays until it is explicitly yielded.
+    // Otherwise it is very hard to control or accurately test behavior. Hence we forcibly override the
+    // leadership_expiry time.
+    //
+    HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) {
+        s.consensus.leadership_expiry_ms = -1; // -1 means never expires;
+        s.generic.repl_dev_cleanup_interval_sec = 1;
+
+        // Disable implicit flush and timer.
+        s.logstore.flush_threshold_size = 0;
+        s.logstore.flush_timer_frequency_us = 0;
+
+        // Snapshot and truncation tests need num reserved to be 0 and distance 10.
+        s.consensus.num_reserved_log_items = 0;
+        s.resource_limits.resource_audit_timer_ms = 0;
+
+        // Only reset when the user specified the value for the test;
+        if (SISL_OPTIONS.count("snapshot_distance")) {
+            s.consensus.snapshot_freq_distance = SISL_OPTIONS["snapshot_distance"].as< uint32_t >();
+        }
+        if (SISL_OPTIONS.count("num_raft_logs_resv")) {
+            s.resource_limits.raft_logstore_reserve_threshold = SISL_OPTIONS["num_raft_logs_resv"].as< uint32_t >();
+        }
+        if (SISL_OPTIONS.count("res_mgr_audit_timer_ms")) {
+            s.resource_limits.resource_audit_timer_ms = SISL_OPTIONS["res_mgr_audit_timer_ms"].as< uint32_t >();
+        }
+    });
+    HS_SETTINGS_FACTORY().save();
+
+    FLAGS_folly_global_cpu_executor_threads = 4;
+    g_helper = std::make_unique< test_common::HSReplTestHelper >("test_raft_repl_dev_dynamic", args, orig_argv);
+
+    // We also spawn spare replicas for dynamic repl dev tests.
+    auto total_replicas = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >();
+    g_helper->setup(total_replicas);
+
+    auto ret = RUN_ALL_TESTS();
+    g_helper->teardown();
+
+    std::string str;
+    sisl::ObjCounterRegistry::foreach ([&str](const std::string& name, int64_t created, int64_t alive) {
+        fmt::format_to(std::back_inserter(str), "{}: created={} alive={}\n", name, created, alive);
+    });
+    LOGINFO("Object Life Counter\n:{}", str);
+
+    return ret;
+}
diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp
index 9367d64f0..c26ba273d 100644
--- a/src/tests/test_solo_repl_dev.cpp
+++ b/src/tests/test_solo_repl_dev.cpp
@@ -136,6 +136,7 @@ class SoloReplDevTest : public testing::Test {
                       cintrusive< repl_req_ctx >& ctx) override {
         LOGINFO("Received error={} on repl_dev", enum_name(error));
     }
+    void replace_member(replica_id_t member_out, replica_id_t member_in) override {}
     void on_destroy() override {}
 };
 

From af56137fff5a56e379c71df7258c1e8b30c58958 Mon Sep 17 00:00:00 2001
From: Xiaoxi Chen
Date: Sun, 22 Sep 2024 11:47:29 +0800
Subject: [PATCH 002/170] Generalize and introduce Sealer into CP.

Sealer is a special consumer that provides information regarding where the cp
is up to. It will be the first one during cp switchover, serving as a
conservative marker: everything at or below this point should be in the
current cp, while some consumers may be above this point, which is fine.
And Sealer is the last one during cp flush, after all other services have
flushed successfully.

Signed-off-by: Xiaoxi Chen
---
 src/include/homestore/checkpoint/cp.hpp       |  5 +++
 src/lib/checkpoint/cp_mgr.cpp                 | 28 +++++++------
 .../replication/repl_dev/raft_repl_dev.cpp    | 33 +++++++++++++---
 src/lib/replication/repl_dev/raft_repl_dev.h  |  9 ++++-
 .../replication/service/raft_repl_service.cpp | 39 +++++++++++++++++--
 .../replication/service/raft_repl_service.h   | 17 ++++++++
 6 files changed, 110 insertions(+), 21 deletions(-)

diff --git a/src/include/homestore/checkpoint/cp.hpp b/src/include/homestore/checkpoint/cp.hpp
index c15bed87a..e88a9e4e2 100644
--- a/src/include/homestore/checkpoint/cp.hpp
+++ b/src/include/homestore/checkpoint/cp.hpp
@@ -70,6 +70,11 @@ class CPContext;
 class CPManager;
 
 VENUM(cp_consumer_t, uint8_t,
+      // Sealer is a special consumer that provides information regarding where the cp is up to.
+      // It will be the first one during cp switchover, serving as a conservative marker: everything at
+      // or below this point should be in the current cp, while some consumers may be above this point, which is fine.
+      // And Sealer is the last one during cp flush, after all other services have flushed successfully.
+ SEALER = 3, HS_CLIENT = 0, // Client of the homestore module INDEX_SVC = 1, // Index service module BLK_DATA_SVC = 2, // Block data service module diff --git a/src/lib/checkpoint/cp_mgr.cpp b/src/lib/checkpoint/cp_mgr.cpp index 687dae1e5..7072d7c91 100644 --- a/src/lib/checkpoint/cp_mgr.cpp +++ b/src/lib/checkpoint/cp_mgr.cpp @@ -184,10 +184,16 @@ folly::Future< bool > CPManager::do_trigger_cp_flush(bool force, bool flush_on_s new_cp->m_cp_id = cur_cp->m_cp_id + 1; HS_PERIODIC_LOG(DEBUG, cp, "Create New CP session", new_cp->id()); - size_t idx{0}; - for (auto& consumer : m_cp_cb_table) { - if (consumer) { new_cp->m_contexts[idx] = std::move(consumer->on_switchover_cp(cur_cp.get(), new_cp)); } - ++idx; + // sealer should be the first one to switch over + auto& sealer_cp = m_cp_cb_table[(size_t)cp_consumer_t::SEALER]; + if (sealer_cp) { + new_cp->m_contexts[(size_t)cp_consumer_t::SEALER] = std::move(sealer_cp->on_switchover_cp(cur_cp.get(), new_cp)); + } + // switch over other consumers + for (size_t svcid = 0; svcid < (size_t)cp_consumer_t::SENTINEL; svcid++) { + if (svcid == (size_t)cp_consumer_t::SEALER) { continue; } + auto& consumer = m_cp_cb_table[svcid]; + if (consumer) { new_cp->m_contexts[svcid] = std::move(consumer->on_switchover_cp(cur_cp.get(), new_cp)); } } HS_PERIODIC_LOG(DEBUG, cp, "CP Attached completed, proceed to exit cp critical section"); @@ -218,20 +224,18 @@ void CPManager::cp_start_flush(CP* cp) { std::vector< folly::Future< bool > > futs; HS_PERIODIC_LOG(INFO, cp, "Starting CP {} flush", cp->id()); cp->m_cp_status = cp_status_t::cp_flushing; - for (size_t svcid = 0; svcid < (size_t)cp_consumer_t::SENTINEL; svcid++) { - if (svcid == (size_t)cp_consumer_t::REPLICATION_SVC) { - continue; - } + if (svcid == (size_t)cp_consumer_t::SEALER) { continue; } auto& consumer = m_cp_cb_table[svcid]; if (consumer) { futs.emplace_back(std::move(consumer->cp_flush(cp))); } } folly::collectAllUnsafe(futs).thenValue([this, cp](auto) { - // Sync flushing replication svc at last as the cp_lsn updated here - // other component should at least flushed to cp_lsn - auto& repl_cp = m_cp_cb_table[(size_t)cp_consumer_t::REPLICATION_SVC]; - if (repl_cp) {repl_cp->cp_flush(cp).wait();} + // Sync flushing SEALER svc which is the replication service + // at last as the cp_lsn updated here. Other component should + // at least flushed to cp_lsn. 
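+    // [Editor's note] The net ordering this patch establishes, as a sketch:
+    //   cp switchover: SEALER first, then { HS_CLIENT, INDEX_SVC, BLK_DATA_SVC, ... }
+    //   cp flush:      { HS_CLIENT, INDEX_SVC, BLK_DATA_SVC, ... } first, then SEALER last
+    // so the sealer's recorded lsn is a conservative lower bound on what the cp contains.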
+    auto& sealer_cp = m_cp_cb_table[(size_t)cp_consumer_t::SEALER];
+    if (sealer_cp) { sealer_cp->cp_flush(cp).wait(); }
     // All consumers have flushed for the cp
     on_cp_flush_done(cp);
 });
diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp
index e928f8996..088270de0 100644
--- a/src/lib/replication/repl_dev/raft_repl_dev.cpp
+++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp
@@ -1189,9 +1189,10 @@ void RaftReplDev::flush_durable_commit_lsn() {
 }
 
 /////////////////////////////////// Private methods ////////////////////////////////////
-void RaftReplDev::cp_flush(CP* cp) {
-    auto const lsn = m_commit_upto_lsn.load();
-    auto const clsn = m_compact_lsn.load();
+void RaftReplDev::cp_flush(CP* cp, cshared< ReplDevCPContext > ctx) {
+    auto const lsn = ctx->cp_lsn;
+    auto const clsn = ctx->compacted_to_lsn;
+    auto const dsn = ctx->last_applied_dsn;
 
     if (lsn == m_last_flushed_commit_lsn) {
         // Not dirtied since last flush ignore
@@ -1200,15 +1201,31 @@
     std::unique_lock lg{m_sb_mtx};
     m_rd_sb->compact_lsn = clsn;
-    m_rd_sb->durable_commit_lsn = lsn;
+    // dc_lsn is also flushed in flush_durable_commit_lsn()
+    // we need to take a max to avoid rolling back.
+    m_rd_sb->durable_commit_lsn = std::max(lsn, m_rd_sb->durable_commit_lsn);
     m_rd_sb->checkpoint_lsn = lsn;
-    m_rd_sb->last_applied_dsn = m_next_dsn.load();
+    m_rd_sb->last_applied_dsn = dsn;
     m_rd_sb.write();
     m_last_flushed_commit_lsn = lsn;
     RD_LOGD("cp flush in raft repl dev, lsn={}, clsn={}, next_dsn={}, cp string:{}", lsn, clsn, m_next_dsn.load(),
             cp->to_string());
 }
 
+cshared< ReplDevCPContext > RaftReplDev::get_cp_ctx(CP* cp) {
+    auto const cp_lsn = m_commit_upto_lsn.load();
+    auto const clsn = m_compact_lsn.load();
+    auto const dsn = m_next_dsn.load();
+
+    RD_LOGD("getting cp_ctx for raft repl dev {}, cp_lsn={}, clsn={}, next_dsn={}, cp string:{}",
+            (void*)this, cp_lsn, clsn, dsn, cp->to_string());
+    auto dev_ctx = std::make_shared< ReplDevCPContext >();
+    dev_ctx->cp_lsn = cp_lsn;
+    dev_ctx->compacted_to_lsn = clsn;
+    dev_ctx->last_applied_dsn = dsn;
+    return dev_ctx;
+}
+
 void RaftReplDev::cp_cleanup(CP*) {}
 
 void RaftReplDev::gc_repl_reqs() {
@@ -1300,6 +1317,12 @@ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx
     // keep lentry in scope for the life cycle of the rreq
     rreq->set_lentry(lentry);
     rreq->init(rkey, jentry->code, false /* is_proposer */, entry_to_hdr(jentry), entry_to_key(jentry), data_size);
+    // we load the log from log device, implies log flushed. We only flush log after data is written to data device.
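+    // [Editor's note] Hence, on replay, every pre-commit stage below can safely be marked as
+    // already satisfied; only the commit itself may still be outstanding for this rreq.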
+    rreq->add_state(repl_req_state_t::BLK_ALLOCATED);
+    rreq->add_state(repl_req_state_t::DATA_RECEIVED);
+    rreq->add_state(repl_req_state_t::DATA_WRITTEN);
+    rreq->add_state(repl_req_state_t::LOG_RECEIVED);
+    rreq->add_state(repl_req_state_t::LOG_FLUSHED);
     RD_LOGD("Replay log on restart, rreq=[{}]", rreq->to_string());
 
     if (repl_lsn > m_rd_sb->durable_commit_lsn) {
diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h
index 82fdcaa23..e2e95550d 100644
--- a/src/lib/replication/repl_dev/raft_repl_dev.h
+++ b/src/lib/replication/repl_dev/raft_repl_dev.h
@@ -107,6 +107,12 @@ class RaftReplDevMetrics : public sisl::MetricsGroup {
 class RaftReplService;
 class CP;
 
+struct ReplDevCPContext {
+    repl_lsn_t cp_lsn;
+    repl_lsn_t compacted_to_lsn;
+    uint64_t last_applied_dsn;
+};
+
 class RaftReplDev : public ReplDev,
                     public nuraft_mesg::mesg_state_mgr,
                     public std::enable_shared_from_this< RaftReplDev > {
@@ -192,7 +198,8 @@
                          sisl::blob const& key, uint32_t data_size, bool is_data_channel);
     folly::Future< folly::Unit > notify_after_data_written(std::vector< repl_req_ptr_t >* rreqs);
     void check_and_fetch_remote_data(std::vector< repl_req_ptr_t > rreqs);
-    void cp_flush(CP* cp);
+    void cp_flush(CP* cp, cshared< ReplDevCPContext > ctx);
+    cshared< ReplDevCPContext > get_cp_ctx(CP* cp);
     void cp_cleanup(CP* cp);
     void become_ready();
diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp
index bbf921685..974984ca3 100644
--- a/src/lib/replication/service/raft_repl_service.cpp
+++ b/src/lib/replication/service/raft_repl_service.cpp
@@ -448,11 +448,44 @@ void RaftReplService::flush_durable_commit_lsn() {
 }
 
 ///////////////////// RaftReplService CP Callbacks /////////////////////////////
-std::unique_ptr< CPContext > RaftReplServiceCPHandler::on_switchover_cp(CP* cur_cp, CP* new_cp) { return nullptr; }
+int ReplSvcCPContext::add_repl_dev_ctx(ReplDev* dev, cshared< ReplDevCPContext > dev_ctx) {
+    m_cp_ctx_map.emplace(dev, dev_ctx);
+    return 0;
+}
+
+cshared< ReplDevCPContext > ReplSvcCPContext::get_repl_dev_ctx(ReplDev* dev) {
+    if (m_cp_ctx_map.count(dev) == 0) {
+        // It is possible if a repl dev was added during the cp flush
+        return std::make_shared< ReplDevCPContext >();
+    }
+    return m_cp_ctx_map[dev];
+}
+
+std::unique_ptr< CPContext > RaftReplServiceCPHandler::on_switchover_cp(CP* cur_cp, CP* new_cp) {
+    // Checking if cur_cp == nullptr, as on_switchover_cp will be called when registering the cp handler
+    if (cur_cp != nullptr) {
+        // Add cp info from all devices to the current cp.
+        // We don't need to take cp_guard, as cp_mgr has already taken it in do_trigger_cp_flush
+        auto cur_cp_ctx = s_cast< ReplSvcCPContext* >(cur_cp->context(cp_consumer_t::REPLICATION_SVC));
+        repl_service().iterate_repl_devs([cur_cp, cur_cp_ctx](cshared< ReplDev >& repl_dev) {
+            // We need to collect the LSN of each repl dev and put it into the current CP.
+            // There are no dirty buffers accumulated in new_cp yet, as the cp_mgr ensures replication_svc
+            // is the first one being called during cp switchover.
+            auto dev_ctx = std::static_pointer_cast< RaftReplDev >(repl_dev)->get_cp_ctx(cur_cp);
+            cur_cp_ctx->add_repl_dev_ctx(repl_dev.get(), std::move(dev_ctx));
+        });
+    }
+    // create new ctx
+    auto ctx = std::make_unique< ReplSvcCPContext >(new_cp);
+    return ctx;
+}
 
 folly::Future< bool > RaftReplServiceCPHandler::cp_flush(CP* cp) {
-    repl_service().iterate_repl_devs(
-        [cp](cshared< ReplDev >& repl_dev) { std::static_pointer_cast< RaftReplDev >(repl_dev)->cp_flush(cp); });
+    auto cp_ctx = s_cast< ReplSvcCPContext* >(cp->context(cp_consumer_t::REPLICATION_SVC));
+    repl_service().iterate_repl_devs([cp, cp_ctx](cshared< ReplDev >& repl_dev) {
+        auto dev_ctx = cp_ctx->get_repl_dev_ctx(repl_dev.get());
+        std::static_pointer_cast< RaftReplDev >(repl_dev)->cp_flush(cp, dev_ctx);
+    });
     return folly::makeFuture< bool >(true);
 }
diff --git a/src/lib/replication/service/raft_repl_service.h b/src/lib/replication/service/raft_repl_service.h
index cba90e2e0..4985d4eea 100644
--- a/src/lib/replication/service/raft_repl_service.h
+++ b/src/lib/replication/service/raft_repl_service.h
@@ -82,6 +82,23 @@ class RaftReplService : public GenericReplService,
     void flush_durable_commit_lsn();
 };
 
+// cp context for repl_dev. The repl_dev cp_lsn is a critical cursor in the system:
+// anything below the cp_lsn is believed to be persisted through cp and will not
+// go through replay. The cp_lsn needs to be kept in the ctx on switchover_cp,
+// and the persist of repl_dev_cp needs to be done after all other consumers succeed.
+
+struct ReplDevCPContext;
+
+class ReplSvcCPContext : public CPContext {
+    std::shared_mutex m_cp_map_mtx;
+    std::map< ReplDev*, cshared< ReplDevCPContext > > m_cp_ctx_map;
+public:
+    ReplSvcCPContext(CP* cp) : CPContext(cp){};
+    virtual ~ReplSvcCPContext() = default;
+    int add_repl_dev_ctx(ReplDev* dev, cshared< ReplDevCPContext > dev_ctx);
+    cshared< ReplDevCPContext > get_repl_dev_ctx(ReplDev* dev);
+};
+
 class RaftReplServiceCPHandler : public CPCallbacks {
 public:
     RaftReplServiceCPHandler() = default;

From 81a80f25082e042f9848accec0524fc59f20bbdb Mon Sep 17 00:00:00 2001
From: Xiaoxi Chen
Date: Sat, 28 Sep 2024 01:13:35 +0800
Subject: [PATCH 003/170] Start data service after log replay done.

Signed-off-by: Xiaoxi Chen
---
 conanfile.py                                  |  2 +-
 .../replication/repl_dev/raft_repl_dev.cpp    | 22 +++++++++++++++----
 src/lib/replication/repl_dev/raft_repl_dev.h  |  1 +
 .../replication/service/raft_repl_service.cpp | 20 +++++++++++++++--
 4 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/conanfile.py b/conanfile.py
index 524cd6a1d..f34a966c7 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@
 class HomestoreConan(ConanFile):
     name = "homestore"
-    version = "6.4.62"
+    version = "6.4.63"
 
     homepage = "https://github.com/eBay/Homestore"
     description = "HomeStore Storage Engine"
diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp
index 088270de0..4db39382b 100644
--- a/src/lib/replication/repl_dev/raft_repl_dev.cpp
+++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp
@@ -74,6 +74,7 @@ RaftReplDev::RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk
         m_rd_sb->free_blks_journal_id = m_free_blks_journal->get_store_id();
     }
     m_rd_sb.write();
+    bind_data_service();
 }
 
 RD_LOG(INFO,
@@ -83,9 +84,13 @@ RaftReplDev::RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk
        (load_existing ?
"Existing" : "New"), group_id_str(), my_replica_id_str(), m_raft_server_id, m_commit_upto_lsn.load(), m_compact_lsn.load(), m_rd_sb->checkpoint_lsn, m_next_dsn.load(), m_rd_sb->logdev_id, m_rd_sb->logstore_id); +} +bool RaftReplDev::bind_data_service() { + RD_LOG(INFO, "Starting data channel, group_id={}, replica_id={}", group_id_str(), my_replica_id_str()); + bool success = false; #ifdef _PRERELEASE - m_msg_mgr.bind_data_service_request(PUSH_DATA, m_group_id, [this](intrusive< sisl::GenericRpcData >& rpc_data) { + success = m_msg_mgr.bind_data_service_request(PUSH_DATA, m_group_id, [this](intrusive< sisl::GenericRpcData >& rpc_data) { if (iomgr_flip::instance()->delay_flip("slow_down_data_channel", [this, rpc_data]() mutable { RD_LOGI("Resuming after slow down data channel flip"); on_push_data_received(rpc_data); @@ -96,13 +101,22 @@ RaftReplDev::RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk } }); #else - m_msg_mgr.bind_data_service_request(PUSH_DATA, m_group_id, bind_this(RaftReplDev::on_push_data_received, 1)); + success = m_msg_mgr.bind_data_service_request(PUSH_DATA, m_group_id, bind_this(RaftReplDev::on_push_data_received, 1)); #endif - - m_msg_mgr.bind_data_service_request(FETCH_DATA, m_group_id, bind_this(RaftReplDev::on_fetch_data_received, 1)); + if (!success) { + RD_LOGE("Failed to bind data service request for PUSH_DATA"); + return false; + } + success = m_msg_mgr.bind_data_service_request(FETCH_DATA, m_group_id, bind_this(RaftReplDev::on_fetch_data_received, 1)); + if (!success) { + RD_LOGE("Failed to bind data service request for FETCH_DATA"); + return false; + } + return true; } bool RaftReplDev::join_group() { + bind_data_service(); auto raft_result = m_msg_mgr.join_group(m_group_id, "homestore_replication", std::dynamic_pointer_cast< nuraft_mesg::mesg_state_mgr >(shared_from_this())); diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index e2e95550d..f78308aba 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -160,6 +160,7 @@ class RaftReplDev : public ReplDev, RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk >&& rd_sb, bool load_existing); virtual ~RaftReplDev() = default; + bool bind_data_service(); bool join_group(); AsyncReplResult<> replace_member(replica_id_t member_out, replica_id_t member_in); folly::SemiFuture< ReplServiceError > destroy_group(); diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index 974984ca3..bd7cd1945 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -128,14 +128,30 @@ void RaftReplService::start() { m_config_sb_bufs.clear(); // Step 5: Start the data and logstore service now. This step is essential before we can ask Raft to join groups etc - hs()->data_service().start(); + + // It is crucial to start the logstore before the enalbe data channel. This is because during log replay, + // the commit_blks() function is called, which interacts with the allocator. + // Starting the data channel before the log replay is complete can lead to a race condition between + // PUSHDATA operations and log replay. + // For example, consider LSN 100 in the log store is associated with PBA1. After a restart, the allocator + // is only aware of allocations up to the last checkpoint and may consider PBA1 as available. 
+    // If a PUSHDATA request is received during this time, PBA1 could be allocated again to a new request,
+    // leading to data corruption by overwriting the data associated with LSN 100.
+    // Now the data channel is started in join_group().
+
+    LOGINFO("Starting LogStore service, first_boot = {}", hs()->is_first_time_boot());
     hs()->logstore_service().start(hs()->is_first_time_boot());
+    LOGINFO("Started LogStore service, log replay should already be done by this point");
+    // All log stores are replayed; time to start the data service.
+    LOGINFO("Starting DataService");
+    hs()->data_service().start();
 
     // Step 6: Iterate all the repl devs and ask each one of them to join the raft group.
     for (auto it = m_rd_map.begin(); it != m_rd_map.end();) {
         auto rdev = std::dynamic_pointer_cast< RaftReplDev >(it->second);
         rdev->wait_for_logstore_ready();
         if (!rdev->join_group()) {
+            HS_REL_ASSERT(false, "FAILED TO JOIN GROUP, PANIC HERE");
             it = m_rd_map.erase(it);
         } else {
             ++it;
@@ -358,7 +374,7 @@ void RaftReplService::start_reaper_thread() {
         m_rdev_gc_timer_hdl = iomanager.schedule_thread_timer(
             HS_DYNAMIC_CONFIG(generic.repl_dev_cleanup_interval_sec) * 1000 * 1000 * 1000, true /* recurring */, nullptr,
             [this](void*) {
-                LOGINFOMOD(replication, "Reaper Thread: Doing GC");
+                LOGDEBUGMOD(replication, "Reaper Thread: Doing GC");
                 gc_repl_reqs();
                 gc_repl_devs();
             });

From d445658b192bb2abb92baf09bf8683a7c7421c4c Mon Sep 17 00:00:00 2001
From: Xiaoxi Chen
Date: Sun, 29 Sep 2024 18:41:08 +0800
Subject: [PATCH 004/170] Flushing log after data written.

Signed-off-by: Xiaoxi Chen
---
 src/lib/replication/log_store/repl_log_store.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/lib/replication/log_store/repl_log_store.cpp b/src/lib/replication/log_store/repl_log_store.cpp
index 4271d8b88..36cec9370 100644
--- a/src/lib/replication/log_store/repl_log_store.cpp
+++ b/src/lib/replication/log_store/repl_log_store.cpp
@@ -66,17 +66,18 @@ void ReplLogStore::end_of_append_batch(ulong start_lsn, ulong count) {
     // a fetch and write. Once all requests are completed and written, these requests are popped out of the map and
     // the future will be ready.
     auto fut = m_rd.notify_after_data_written(reqs);
+    // Wait for the fetch and write to be completed successfully.
+    // It is essential to complete the data write before appending to the log. If the logs are flushed
+    // before the data is written and a restart with subsequent log replay occurs, the in-memory state is lost,
+    // leaving us uncertain about whether the data was actually written, potentially leading to data inconsistency.
+    std::move(fut).wait();
 
-    // In the meanwhile, we can flush the journal for this lsn batch. It is ok to flush the entries in log before
-    // actual data is written, because, even if we have the log, it doesn't mean data is committed, until state
-    // machine reports that. This way the flush and fetch both can run in parallel.
+    // Flushing log now.
     auto cur_time = std::chrono::steady_clock::now();
     HomeRaftLogStore::end_of_append_batch(start_lsn, count);
     HISTOGRAM_OBSERVE(m_rd.metrics(), raft_end_of_append_batch_latency_us, get_elapsed_time_us(cur_time));
 
     cur_time = std::chrono::steady_clock::now();
-    // Wait for the fetch and write to be completed successfully.
-    std::move(fut).wait();
     HISTOGRAM_OBSERVE(m_rd.metrics(), data_channel_wait_latency_us, get_elapsed_time_us(cur_time));
 
     // Mark all the reqs also completely written

From 15741a8c8dc22308ce5080b2b3ec87ba09dfc93b Mon Sep 17 00:00:00 2001
From: Xiaoxi Chen
Date: Sun, 29 Sep 2024 18:21:51 -0700
Subject: [PATCH 005/170] Workaround: temporarily disable assert of
 dirty_buf_cnt.

Signed-off-by: Xiaoxi Chen
---
 src/lib/common/resource_mgr.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/lib/common/resource_mgr.cpp b/src/lib/common/resource_mgr.cpp
index a4276529b..0ba4803c4 100644
--- a/src/lib/common/resource_mgr.cpp
+++ b/src/lib/common/resource_mgr.cpp
@@ -94,7 +94,10 @@ void ResourceMgr::dec_dirty_buf_size(const uint32_t size) {
     HS_REL_ASSERT_GT(size, 0);
     const int64_t dirty_buf_cnt = m_hs_dirty_buf_cnt.fetch_sub(size, std::memory_order_relaxed);
     COUNTER_DECREMENT(m_metrics, dirty_buf_cnt, size);
-    HS_REL_ASSERT_GE(dirty_buf_cnt, size);
+    if (dirty_buf_cnt < size) {
+        LOGERROR("dirty_buf_cnt {} is now less than size {}", dirty_buf_cnt, size);
+    }
+    //HS_REL_ASSERT_GE(dirty_buf_cnt, size);
 }
 
 void ResourceMgr::register_dirty_buf_exceed_cb(exceed_limit_cb_t cb) { m_dirty_buf_exceed_cb = std::move(cb); }

From 5e6bf9d2634e857040b981226b85dfeb49c15460 Mon Sep 17 00:00:00 2001
From: Sanal
Date: Tue, 1 Oct 2024 17:15:05 -0700
Subject: [PATCH 006/170] Add raft commit quorum for replace member if two
 members down. (#559)

---
 src/include/homestore/replication_service.hpp |  4 +-
 .../replication/repl_dev/raft_repl_dev.cpp    | 27 +++++++-
 src/lib/replication/repl_dev/raft_repl_dev.h  |  3 +-
 .../replication/service/generic_repl_svc.cpp  |  4 +-
 .../replication/service/generic_repl_svc.h    |  4 +-
 .../replication/service/raft_repl_service.cpp |  6 +-
 .../replication/service/raft_repl_service.h   |  4 +-
 src/tests/test_common/raft_repl_test_base.hpp |  9 ++-
 src/tests/test_raft_repl_dev_dynamic.cpp      | 68 ++++++++++++++++++-
 9 files changed, 109 insertions(+), 20 deletions(-)

diff --git a/src/include/homestore/replication_service.hpp b/src/include/homestore/replication_service.hpp
index 8f535b855..f9b4f2986 100644
--- a/src/include/homestore/replication_service.hpp
+++ b/src/include/homestore/replication_service.hpp
@@ -41,8 +41,8 @@ class ReplicationService {
     /// @return A Future which gets called after schedule to release (before garbage collection is kicked in)
     virtual folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) = 0;
 
-    virtual AsyncReplResult<> replace_member(group_id_t group_id, replica_id_t member_out,
-                                             replica_id_t member_in) const = 0;
+    virtual AsyncReplResult<> replace_member(group_id_t group_id, replica_id_t member_out, replica_id_t member_in,
+                                             uint32_t commit_quorum = 0) const = 0;
 
     /// @brief Get the repl dev for a given group id if it is already created or opened
     /// @param group_id Group id interested in
diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp
index 4db39382b..565bc0d67 100644
--- a/src/lib/replication/repl_dev/raft_repl_dev.cpp
+++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp
@@ -127,23 +127,30 @@ bool RaftReplDev::join_group() {
     return true;
 }
 
-AsyncReplResult<> RaftReplDev::replace_member(replica_id_t member_out_uuid, replica_id_t member_in_uuid) {
+AsyncReplResult<> RaftReplDev::replace_member(replica_id_t member_out_uuid, replica_id_t member_in_uuid,
+                                              uint32_t commit_quorum) {
     LOGINFO("Replace member group_id={} member_out={} member_in={}", group_id_str(),
boost::uuids::to_string(member_out_uuid), boost::uuids::to_string(member_in_uuid)); + if (commit_quorum >= 1) { + // Two members are down and leader cant form the quorum. Reduce the quorum size. + reset_quorum_size(commit_quorum); + } + // Step 1: Check if leader itself is requested to move out. if (m_my_repl_id == member_out_uuid && m_my_repl_id == get_leader_id()) { // If leader is the member requested to move out, then give up leadership and return error. // Client will retry replace_member request to the new leader. raft_server()->yield_leadership(true /* immediate */, -1 /* successor */); RD_LOGI("Replace member leader is the member_out so yield leadership"); + reset_quorum_size(0); return make_async_error<>(ReplServiceError::NOT_LEADER); } // Step 2. Add the new member. return m_msg_mgr.add_member(m_group_id, member_in_uuid) .via(&folly::InlineExecutor::instance()) - .thenValue([this, member_in_uuid, member_out_uuid](auto&& e) -> AsyncReplResult<> { + .thenValue([this, member_in_uuid, member_out_uuid, commit_quorum](auto&& e) -> AsyncReplResult<> { // TODO Currently we ignore the cancelled, fix nuraft_mesg to not timeout // when adding member. Member is added to cluster config until member syncs fully // with atleast stop gap. This will take a lot of time for block or @@ -157,6 +164,7 @@ AsyncReplResult<> RaftReplDev::replace_member(replica_id_t member_out_uuid, repl RD_LOGW("Ignoring error returned from nuraft add_member {}", e.error()); } else { RD_LOGE("Replace member error in add member : {}", e.error()); + reset_quorum_size(0); return make_async_error<>(RaftReplService::to_repl_error(e.error())); } } @@ -179,6 +187,7 @@ AsyncReplResult<> RaftReplDev::replace_member(replica_id_t member_out_uuid, repl auto err = m_state_machine->propose_to_raft(std::move(rreq)); if (err != ReplServiceError::OK) { LOGERROR("Replace member propose to raft failed {}", err); + reset_quorum_size(0); return make_async_error<>(std::move(err)); } @@ -189,7 +198,7 @@ AsyncReplResult<> RaftReplDev::replace_member(replica_id_t member_out_uuid, repl // entry and call exit_group() and leave(). return m_msg_mgr.rem_member(m_group_id, member_out_uuid) .via(&folly::InlineExecutor::instance()) - .thenValue([this, member_out](auto&& e) -> AsyncReplResult<> { + .thenValue([this, member_out, commit_quorum](auto&& e) -> AsyncReplResult<> { if (e.hasError()) { // Ignore the server not found as server removed from the cluster // as requests are idempotent and can be resend. @@ -199,16 +208,28 @@ AsyncReplResult<> RaftReplDev::replace_member(replica_id_t member_out_uuid, repl // Its ok to retry this request as the request // of replace member is idempotent. RD_LOGE("Replace member failed to remove member : {}", e.error()); + reset_quorum_size(0); return make_async_error<>(ReplServiceError::RETRY_REQUEST); } } else { RD_LOGI("Replace member removed member={} from group_id={}", member_out, group_id_str()); } + + // Revert the quorum size back to 0. 
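+                    // [Editor's note] A quorum size of 0 is understood here as "fall back to the
+                    // default majority quorum" (see reset_quorum_size() below), restoring normal
+                    // commit semantics once the replacement completes.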
+ reset_quorum_size(0); return make_async_success<>(); }); }); } +void RaftReplDev::reset_quorum_size(uint32_t commit_quorum) { + RD_LOGI("Reset raft quorum size={}", commit_quorum); + nuraft::raft_params params = raft_server()->get_current_params(); + params.with_custom_commit_quorum_size(commit_quorum); + params.with_custom_election_quorum_size(commit_quorum); + raft_server()->update_params(params); +} + folly::SemiFuture< ReplServiceError > RaftReplDev::destroy_group() { // Set the intent to destroy the group m_stage.update([](auto* stage) { *stage = repl_dev_stage_t::DESTROYING; }); diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index f78308aba..3b25cb23b 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -162,7 +162,7 @@ class RaftReplDev : public ReplDev, bool bind_data_service(); bool join_group(); - AsyncReplResult<> replace_member(replica_id_t member_out, replica_id_t member_in); + AsyncReplResult<> replace_member(replica_id_t member_out, replica_id_t member_in, uint32_t commit_quorum); folly::SemiFuture< ReplServiceError > destroy_group(); //////////////// All ReplDev overrides/implementation /////////////////////// @@ -283,6 +283,7 @@ class RaftReplDev : public ReplDev, void on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx); void commit_blk(repl_req_ptr_t rreq); void replace_member(repl_req_ptr_t rreq); + void reset_quorum_size(uint32_t commit_quorum); }; } // namespace homestore diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index 89800df3f..8e5c9a7a1 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -147,8 +147,8 @@ void SoloReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki } } -AsyncReplResult<> SoloReplService::replace_member(group_id_t group_id, replica_id_t member_out, - replica_id_t member_in) const { +AsyncReplResult<> SoloReplService::replace_member(group_id_t group_id, replica_id_t member_out, replica_id_t member_in, + uint32_t commit_quorum) const { return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); } diff --git a/src/lib/replication/service/generic_repl_svc.h b/src/lib/replication/service/generic_repl_svc.h index e2d445427..5e0cb84a3 100644 --- a/src/lib/replication/service/generic_repl_svc.h +++ b/src/lib/replication/service/generic_repl_svc.h @@ -73,8 +73,8 @@ class SoloReplService : public GenericReplService { std::set< replica_id_t > const& members) override; folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) override; void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; - AsyncReplResult<> replace_member(group_id_t group_id, replica_id_t member_out, - replica_id_t member_in) const override; + AsyncReplResult<> replace_member(group_id_t group_id, replica_id_t member_out, replica_id_t member_in, + uint32_t commit_quorum = 0) const override; }; class SoloReplServiceCPHandler : public CPCallbacks { diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index bd7cd1945..d862c2098 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -346,13 +346,13 @@ void RaftReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki add_repl_dev(group_id, rdev); } -AsyncReplResult<> 
RaftReplService::replace_member(group_id_t group_id, replica_id_t member_out,
-                                replica_id_t member_in) const {
+AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, replica_id_t member_out, replica_id_t member_in,
+                                                  uint32_t commit_quorum) const {
     auto rdev_result = get_repl_dev(group_id);
     if (!rdev_result) { return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); }
 
     return std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value())
-        ->replace_member(member_out, member_in)
+        ->replace_member(member_out, member_in, commit_quorum)
         .via(&folly::InlineExecutor::instance())
         .thenValue([this](auto&& e) mutable {
             if (e.hasError()) { return make_async_error<>(e.error()); }
diff --git a/src/lib/replication/service/raft_repl_service.h b/src/lib/replication/service/raft_repl_service.h
index 4985d4eea..44ed06332 100644
--- a/src/lib/replication/service/raft_repl_service.h
+++ b/src/lib/replication/service/raft_repl_service.h
@@ -69,8 +69,8 @@ class RaftReplService : public GenericReplService,
                             std::set< replica_id_t > const& members) override;
     folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) override;
     void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override;
-    AsyncReplResult<> replace_member(group_id_t group_id, replica_id_t member_out,
-                                     replica_id_t member_in) const override;
+    AsyncReplResult<> replace_member(group_id_t group_id, replica_id_t member_out, replica_id_t member_in,
+                                     uint32_t commit_quorum = 0) const override;
 
 private:
     RaftReplDev* raft_group_config_found(sisl::byte_view const& buf, void* meta_cookie);
diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp
index 7b96afa4c..a3160f13a 100644
--- a/src/tests/test_common/raft_repl_test_base.hpp
+++ b/src/tests/test_common/raft_repl_test_base.hpp
@@ -610,11 +610,14 @@ class RaftReplDevTestBase : public testing::Test {
     void create_snapshot() { dbs_[0]->create_snapshot(); }
     void truncate(int num_reserved_entries) { dbs_[0]->truncate(num_reserved_entries); }
 
-    void replace_member(std::shared_ptr< TestReplicatedDB > db, replica_id_t member_out, replica_id_t member_in) {
-        this->run_on_leader(db, [this, db, member_out, member_in]() {
+    void replace_member(std::shared_ptr< TestReplicatedDB > db, replica_id_t member_out, replica_id_t member_in,
+                        uint32_t commit_quorum = 0) {
+        this->run_on_leader(db, [this, db, member_out, member_in, commit_quorum]() {
             LOGINFO("Replace member out={} in={}", boost::uuids::to_string(member_out),
                     boost::uuids::to_string(member_in));
-            auto v = hs()->repl_service().replace_member(db->repl_dev()->group_id(), member_out, member_in).get();
+            auto v = hs()->repl_service()
+                         .replace_member(db->repl_dev()->group_id(), member_out, member_in, commit_quorum)
+                         .get();
             ASSERT_EQ(v.hasError(), false) << "Error in replacing member";
         });
     }
diff --git a/src/tests/test_raft_repl_dev_dynamic.cpp b/src/tests/test_raft_repl_dev_dynamic.cpp
index 7bd69a13c..c29f239e1 100644
--- a/src/tests/test_raft_repl_dev_dynamic.cpp
+++ b/src/tests/test_raft_repl_dev_dynamic.cpp
@@ -65,8 +65,73 @@ TEST_F(ReplDevDynamicTest, ReplaceMember) {
     LOGINFO("ReplaceMember test done");
 }
 
+TEST_F(ReplDevDynamicTest, TwoMemberDown) {
+    LOGINFO("TwoMemberDown test started");
+
+    // Make two members down in a group so that the leader can't reach a quorum.
+    // We set the custom quorum size to 1 and call replace member.
+    // The leader should do some writes to validate that it has reached the quorum size.
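+    // [Editor's note] With commit_quorum = 1, the leader alone can commit entries even while
+    // two members are down; the writes below exercise exactly that.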
+ LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); + auto db = dbs_.back(); + auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >(); + auto num_members = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >(); + + uint64_t num_io_entries = SISL_OPTIONS["num_io"].as< uint64_t >(); + + // Replace the last member in the group with index(num_replicas - 1) with a spare + // replica with index (num_replica). Member id's are 0,...,num_replicas-1, num_replicas,...,N + uint32_t member_out = num_replicas - 1; + uint32_t member_in = num_replicas; + + g_helper->sync_for_test_start(num_members); + + // Shutdown replica 1 and replica 2 to simulate two member down. + if (g_helper->replica_num() == 1) { + this->shutdown_replica(1); + LOGINFO("Shutdown replica 1"); + } + + if (g_helper->replica_num() == 2) { + this->shutdown_replica(2); + LOGINFO("Shutdown replica 2"); + } + + if (g_helper->replica_num() == 0) { + // Replace down replica 2 with spare replica 3 with commit quorum 1 + // so that leader can go ahead with replacing member. + LOGINFO("Replace member started"); + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 1 /* commit quorum*/); + this->write_on_leader(num_io_entries, true /* wait_for_commit */); + LOGINFO("Leader completed num_io={}", num_io_entries); + } + + if (g_helper->replica_num() == member_in) { + wait_for_commits(num_io_entries); + LOGINFO("Member in got all commits"); + } + + if (g_helper->replica_num() == 0 || g_helper->replica_num() == member_in) { + // Validate data on leader replica 0 and replica 3 + LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num()); + this->validate_data(); + } + + g_helper->sync_for_cleanup_start(num_members); + + if (g_helper->replica_num() == 1) { + LOGINFO("Start replica 1"); + this->start_replica(1); + } + if (g_helper->replica_num() == 2) { + LOGINFO("Start replica 2"); + this->start_replica(2); + } + + LOGINFO("TwoMemberDown test done"); +} + // TODO add more tests with leader and member restart, multiple member replace -// leader replace, commit quorum +// leader replace int main(int argc, char* argv[]) { int parsed_argc = argc; @@ -89,7 +154,6 @@ int main(int argc, char* argv[]) { // leadership_expiry time. // HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { - s.consensus.leadership_expiry_ms = -1; // -1 means never expires; s.generic.repl_dev_cleanup_interval_sec = 1; // Disable implicit flush and timer. 
From 87963c39f01d7485e128f95cd3cfd6f7e0953d9e Mon Sep 17 00:00:00 2001
From: yuwmao
Date: Mon, 14 Oct 2024 22:08:03 -0700
Subject: [PATCH 007/170] Add cert watcher

---
 .../replication/service/raft_repl_service.cpp | 47 +++++++++++++++++++
 .../replication/service/raft_repl_service.h   |  9 +++-
 2 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp
index d862c2098..8417b141c 100644
--- a/src/lib/replication/service/raft_repl_service.cpp
+++ b/src/lib/replication/service/raft_repl_service.cpp
@@ -85,6 +85,13 @@ void RaftReplService::start() {
     LOGINFO("Starting RaftReplService with server_uuid={} port={}", boost::uuids::to_string(params.server_uuid_),
             params.mesg_port_);
 
+    // Check if ssl cert files are provided; if yes, monitor the changes
+    if (!params.ssl_key_.empty() && !params.ssl_cert_.empty()) {
+        ioenvironment.with_file_watcher();
+        monitor_cert_changes();
+    }
+
+
     // Step 2: Register all RAFT parameters. At the end of this step, raft is ready to be created/join group
     auto r_params = nuraft::raft_params()
                         .with_election_timeout_lower(HS_DYNAMIC_CONFIG(consensus.elect_to_low_ms))
@@ -175,6 +182,46 @@ void RaftReplService::stop() {
     hs()->logstore_service().stop();
 }
 
+void RaftReplService::monitor_cert_changes() {
+    auto fw = ioenvironment.get_file_watcher();
+    auto cert_change_cb = [this](const std::string filepath, const bool deleted) {
+        LOGINFO("file change event for {}, deleted? {}", filepath, deleted)
+        // do not block the file_watcher thread
+        std::thread restart_svc(&RaftReplService::restart_raft_svc, this, filepath, deleted);
+        restart_svc.detach();
+    };
+
+    // Monitor the ssl cert file
+    if (!fw->register_listener(ioenvironment.get_ssl_cert(), "hs_ssl_cert_watcher", cert_change_cb)) {
+        LOGERROR("Failed to register listener, {} to watch file {}, Not monitoring cert files",
+                 "hs_ssl_cert_watcher", ioenvironment.get_ssl_cert());
+    }
+    // Monitor the ssl key file
+    if (!fw->register_listener(ioenvironment.get_ssl_key(), "hs_ssl_key_watcher", cert_change_cb)) {
+        LOGERROR("Failed to register listener, {} to watch file {}, Not monitoring cert files",
+                 "hs_ssl_key_watcher", ioenvironment.get_ssl_key());
+    }
+}
+
+void RaftReplService::restart_raft_svc(const std::string filepath, const bool deleted) {
+    if (deleted && !wait_for_cert(filepath)) {
+        LOGINFO("file {} deleted, ", filepath)
+        // wait for the deleted file to be added again
+        throw std::runtime_error(fmt::format("file {} not found!
Can not start grpc server", filepath)); + } + const std::unique_lock lock(raft_restart_mutex); + m_msg_mgr->restart_server(); + if (deleted) { monitor_cert_changes(); } +} + +bool RaftReplService::wait_for_cert(const std::string& filepath) { + for (auto i = cert_change_timeout; i > 0; --i) { + if (std::filesystem::exists(filepath)) { return true; } + std::this_thread::sleep_for(cert_check_sleep); + } + return false; +} + RaftReplDev* RaftReplService::raft_group_config_found(sisl::byte_view const& buf, void* meta_cookie) { json_superblk group_config; auto& js = group_config.load(buf, meta_cookie); diff --git a/src/lib/replication/service/raft_repl_service.h b/src/lib/replication/service/raft_repl_service.h index 44ed06332..4daaad9b3 100644 --- a/src/lib/replication/service/raft_repl_service.h +++ b/src/lib/replication/service/raft_repl_service.h @@ -31,6 +31,9 @@ namespace homestore { +constexpr auto cert_change_timeout = 1200; +constexpr auto cert_check_sleep = std::chrono::seconds(1); + struct repl_dev_superblk; class RaftReplDev; @@ -47,7 +50,8 @@ class RaftReplService : public GenericReplService, iomgr::timer_handle_t m_rdev_gc_timer_hdl; iomgr::timer_handle_t m_flush_durable_commit_timer_hdl; iomgr::io_fiber_t m_reaper_fiber; - + std::mutex raft_restart_mutex; + public: RaftReplService(cshared< ReplApplication >& repl_app); @@ -80,6 +84,9 @@ class RaftReplService : public GenericReplService, void gc_repl_devs(); void gc_repl_reqs(); void flush_durable_commit_lsn(); + void monitor_cert_changes(); + void restart_raft_svc(const std::string filepath, const bool deleted); + bool wait_for_cert(const std::string& filepath); }; // cp context for repl_dev, repl_dev cp_lsn is critical cursor in the system, From d35f75ec6f2abb297ddc56a6c60d17fc356aa304 Mon Sep 17 00:00:00 2001 From: yuwmao Date: Mon, 14 Oct 2024 22:24:25 -0700 Subject: [PATCH 008/170] upgrade version --- conanfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index f34a966c7..51d8e4923 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.4.63" + version = "6.4.64" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" From f88317da4cae7323f0cba249cc5fb9e90285a143 Mon Sep 17 00:00:00 2001 From: yuwmao Date: Tue, 15 Oct 2024 22:43:59 -0700 Subject: [PATCH 009/170] fix nit --- src/lib/replication/service/raft_repl_service.cpp | 3 ++- src/lib/replication/service/raft_repl_service.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index 8417b141c..c4aefe1ca 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -215,7 +215,8 @@ void RaftReplService::restart_raft_svc(const std::string filepath, const bool de } bool RaftReplService::wait_for_cert(const std::string& filepath) { - for (auto i = cert_change_timeout; i > 0; --i) { + auto attempts = cert_change_timeout/cert_check_sleep; + for (auto i = attempts; i > 0; --i) { if (std::filesystem::exists(filepath)) { return true; } std::this_thread::sleep_for(cert_check_sleep); } diff --git a/src/lib/replication/service/raft_repl_service.h b/src/lib/replication/service/raft_repl_service.h index 4daaad9b3..e0d1e6718 100644 --- a/src/lib/replication/service/raft_repl_service.h +++ b/src/lib/replication/service/raft_repl_service.h @@ -31,7 +31,7 @@ 
namespace homestore {

-constexpr auto cert_change_timeout = 1200;
+constexpr auto cert_change_timeout = std::chrono::seconds(1200);
 constexpr auto cert_check_sleep = std::chrono::seconds(1);

 struct repl_dev_superblk;

From d90b54df2f79db33f6c629f0f2f8ebdb67209799 Mon Sep 17 00:00:00 2001
From: Xiaoxi Chen
Date: Thu, 17 Oct 2024 21:36:28 -0700
Subject: [PATCH 010/170] Fix read_io in dataservice test.

Previous code could overflow io_size: in `remaining_io_size -= sub_io_size;`,
sub_io_size can exceed remaining_io_size, and since remaining_io_size is
unsigned the subtraction wraps around to a huge number (e.g. 4096 - 8192
wraps to nearly 2^64), so the loop takes ages to finish.

Signed-off-by: Xiaoxi Chen
---
 conanfile.py                    | 2 +-
 src/tests/test_data_service.cpp | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/conanfile.py b/conanfile.py
index 51d8e4923..0e7fa7d89 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@ class HomestoreConan(ConanFile):
     name = "homestore"
-    version = "6.4.64"
+    version = "6.4.65"

     homepage = "https://github.com/eBay/Homestore"
     description = "HomeStore Storage Engine"
diff --git a/src/tests/test_data_service.cpp b/src/tests/test_data_service.cpp
index 9592da475..0cb855752 100644
--- a/src/tests/test_data_service.cpp
+++ b/src/tests/test_data_service.cpp
@@ -446,7 +446,7 @@ class BlkDataServiceTest : public testing::Test {
     void read_io(uint32_t io_size) {
         auto remaining_io_size = io_size;
         while (remaining_io_size > 0) {
-            auto const bid = get_rand_blkid_to_read(io_size);
+            auto const bid = get_rand_blkid_to_read(remaining_io_size);
             if (!bid.is_valid()) {
                 // didn't find any block to read, either write blk map is empty or
                 // all blks are pending on free.
@@ -456,6 +456,7 @@ class BlkDataServiceTest : public testing::Test {
             // every piece in bid is a single block, e.g. nblks = 1
             auto const nbids = bid.num_pieces();
             auto sub_io_size = nbids * inst().get_blk_size();
+            HS_REL_ASSERT_LE(sub_io_size, remaining_io_size, "not expecting sub_io_size to exceed remaining_io_size");

             // we pass crc from lambda because if there is any async_free_blk, the written blks in the blkcrc map will
             // be removed by the time read thenValue is called;
@@ -582,7 +583,7 @@ class BlkDataServiceTest : public testing::Test {
         auto nbids = io_size / inst().get_blk_size(); // number of blks to read;

         // nbids should not exceed max pieces that MultiBlkId can hold;
-        nbids = std::max(nbids, MultiBlkId::max_addln_pieces);
+        nbids = std::min(nbids, MultiBlkId::max_addln_pieces);

         // make sure skip + nbids are in the range of m_blk_crc_map;
         if (skip_nbids + nbids > m_blk_crc_map.size()) { skip_nbids = m_blk_crc_map.size() - nbids; }

From b182e7f1a9055e122a23cb7e758a1cb08765dfdd Mon Sep 17 00:00:00 2001
From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com>
Date: Mon, 21 Oct 2024 17:45:22 -0700
Subject: [PATCH 011/170] FIX wbcache for put and modify long running index
 (#567)

---
 conanfile.py                                  |   2 +-
 src/include/homestore/btree/btree.hpp         |   2 +-
 .../homestore/btree/detail/btree_common.ipp   |   1 +
 .../btree/detail/btree_mutate_impl.ipp        |   5 +
 .../homestore/btree/detail/simple_node.hpp    |   4 +-
 src/include/homestore/index/index_table.hpp   |  11 +-
 src/include/homestore/index_service.hpp       |   1 +
 src/lib/index/index_cp.cpp                    |  19 +-
 src/lib/index/index_cp.hpp                    |   5 +-
 src/lib/index/index_service.cpp               |   8 +
 src/lib/index/wb_cache.cpp                    | 169 ++++++---
 src/lib/index/wb_cache.hpp                    |   2 +-
 src/tests/btree_helpers/btree_test_helper.hpp |   2 +-
 src/tests/btree_helpers/shadow_map.hpp        |   1 +
 .../test_common/homestore_test_common.hpp     |   2 +-
 src/tests/test_index_crash_recovery.cpp       | 339 ++++++++++++++----
 src/tests/test_mem_btree.cpp                  |   8 +
 src/tests/test_scripts/index_test.py          |  17 +-
 18 files changed, 455 insertions(+), 143 deletions(-)

diff --git a/conanfile.py b/conanfile.py
index 0e7fa7d89..deee0421d 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@ class HomestoreConan(ConanFile):
     name = "homestore"
-    version = "6.4.65"
+    version = "6.4.66"

     homepage = "https://github.com/eBay/Homestore"
     description = "HomeStore Storage Engine"
diff --git a/src/include/homestore/btree/btree.hpp b/src/include/homestore/btree/btree.hpp
index 2ef1e1d44..0fec83ddd 100644
--- a/src/include/homestore/btree/btree.hpp
+++ b/src/include/homestore/btree/btree.hpp
@@ -123,7 +123,7 @@ class Btree {
     void dump_tree_to_file(const std::string& file = "") const;
     std::string to_custom_string(to_string_cb_t< K, V > const& cb) const;
     std::string visualize_tree_keys(const std::string& file) const;
-    uint64_t count_keys(bnodeid_t bnodeid) const;
+    uint64_t count_keys(bnodeid_t bnodeid = 0) const;

     nlohmann::json get_metrics_in_json(bool updated = true);
     bnodeid_t root_node_id() const;
diff --git a/src/include/homestore/btree/detail/btree_common.ipp b/src/include/homestore/btree/detail/btree_common.ipp
index b21305497..ecda7e138 100644
--- a/src/include/homestore/btree/detail/btree_common.ipp
+++ b/src/include/homestore/btree/detail/btree_common.ipp
@@ -201,6 +201,7 @@ void Btree< K, V >::to_dot_keys(bnodeid_t bnodeid, std::string& buf,

 template < typename K, typename V >
 uint64_t Btree< K, V >::count_keys(bnodeid_t bnodeid) const {
+    if (bnodeid == 0) { bnodeid = this->root_node_id(); }
     BtreeNodePtr node;
     locktype_t acq_lock = locktype_t::READ;
     if (read_and_lock_node(bnodeid, node, acq_lock, acq_lock, nullptr) != btree_status_t::success) { return 0; }
diff --git a/src/include/homestore/btree/detail/btree_mutate_impl.ipp b/src/include/homestore/btree/detail/btree_mutate_impl.ipp
index 209b35558..3e90ccfd5 100644
--- a/src/include/homestore/btree/detail/btree_mutate_impl.ipp
+++ b/src/include/homestore/btree/detail/btree_mutate_impl.ipp
@@ -283,6 +283,11 @@ btree_status_t Btree< K, V >::split_node(const BtreeNodePtr& parent_node, const
     child_node1->inc_link_version();

     // Update the existing parent node entry to point to second child ptr.
+    // Don't change the order: first update the parent node and then insert the new key. This is important for the
+    // case where the split key is the last key in the parent node; there the split key must be inserted into the
+    // parent node, and if we inserted the split key first, the existing entries would shift and the last key in
+    // the parent node would be lost, leaving the tree inconsistent. In the case of an empty parent (i.e., a new
+    // root) or when updating the edge, this order also makes sure that the edge is updated.
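+    // In other words: update(parent_ind) first repoints the existing slot (or the edge) at child_node2, and only
+    // then does insert() shift entries to add out_split_key -> child_node1. If insert() ran first, parent_ind
+    // would no longer address the original slot/edge, and update() would overwrite the newly inserted entry.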
     parent_node->update(parent_ind, child_node2->link_info());
     parent_node->insert(parent_ind, *out_split_key, child_node1->link_info());

diff --git a/src/include/homestore/btree/detail/simple_node.hpp b/src/include/homestore/btree/detail/simple_node.hpp
index 1f4c30e32..e85d1190c 100644
--- a/src/include/homestore/btree/detail/simple_node.hpp
+++ b/src/include/homestore/btree/detail/simple_node.hpp
@@ -204,7 +204,7 @@ class SimpleNode : public VariantNode< K, V > {
     bool has_room_for_put(btree_put_type put_type, uint32_t key_size, uint32_t value_size) const override {
 #ifdef _PRERELEASE
         auto max_keys = max_keys_in_node();
-        if(max_keys) {return (this->total_entries() < max_keys);}
+        if(max_keys) {return (this->total_entries() < max_keys);}
 #endif
         return ((put_type == btree_put_type::UPSERT) || (put_type == btree_put_type::INSERT))
                    ? (get_available_entries() > 0)
@@ -230,7 +230,7 @@ class SimpleNode : public VariantNode< K, V > {
         return str;
     }
     std::string to_dot_keys() const override {
-        return to_dot_keys_impl(std::is_same< K, uint64_t >{});
+        return to_dot_keys_impl(std::is_same< decltype(std::declval< K >().key()), uint64_t >{});
     }

     std::string to_dot_keys_impl(std::false_type) const {
diff --git a/src/include/homestore/index/index_table.hpp b/src/include/homestore/index/index_table.hpp
index 2bec275e3..86f3a8c86 100644
--- a/src/include/homestore/index/index_table.hpp
+++ b/src/include/homestore/index/index_table.hpp
@@ -224,10 +224,13 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
     }

     btree_status_t on_root_changed(BtreeNodePtr const& new_root, void* context) override {
+        // todo: if(m_sb->root_node == new_root->node_id() && m_sb->root_link_version == new_root->link_version()){
+        // return btree_status_t::success;}
         m_sb->root_node = new_root->node_id();
         m_sb->root_link_version = new_root->link_version();

         if (!wb_cache().refresh_meta_buf(m_sb_buffer, r_cast< CPContext* >(context))) {
+            LOGTRACEMOD(wbcache, "CP mismatch error - discard transact for meta node");
             return btree_status_t::cp_mismatch;
         }

@@ -238,8 +241,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
     btree_status_t repair_links(BtreeNodePtr const& parent_node, void* cp_ctx) {
         BT_LOG(DEBUG, "Repairing links for parent node {}", parent_node->to_string());
-
-        // Get the last key in the node
+        // TODO: is it possible that repairing many nodes causes an increase to level of btree? If so, then this needs
+        // to be handled.
Get the last key in the node auto const last_parent_key = parent_node->get_last_key< K >(); auto const is_parent_edge_node = parent_node->has_valid_edge(); if ((parent_node->total_entries() == 0) && !is_parent_edge_node) { @@ -285,8 +288,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { BT_LOG(INFO, "Repairing node={} child_node={} child_last_key={}", cur_parent->node_id(), child_node->to_string(), child_last_key.to_string()); - if (child_last_key.compare(last_parent_key) > 0) { - // We have reached the last key, we can stop now + if (child_last_key.compare(last_parent_key) > 0 && !is_parent_edge_node) { + // We have reached the last key, and the parent node doesn't have edge, so we can stop now break; } diff --git a/src/include/homestore/index_service.hpp b/src/include/homestore/index_service.hpp index 0530c6846..c8801c9d2 100644 --- a/src/include/homestore/index_service.hpp +++ b/src/include/homestore/index_service.hpp @@ -70,6 +70,7 @@ class IndexService { void stop(); // Add/Remove Index Table to/from the index service + uint64_t num_tables(); void add_index_table(const std::shared_ptr< IndexTableBase >& tbl); void remove_index_table(const std::shared_ptr< IndexTableBase >& tbl); std::shared_ptr< IndexTableBase > get_index_table(uuid_t uuid) const; diff --git a/src/lib/index/index_cp.cpp b/src/lib/index/index_cp.cpp index 955bd523f..99ba7dbba 100644 --- a/src/lib/index/index_cp.cpp +++ b/src/lib/index/index_cp.cpp @@ -142,6 +142,17 @@ void IndexCPContext::to_string_dot(const std::string& filename) { file << "}\n"; file.close(); + LOGINFO("cp dag is stored in file {}", filename); +} + +uint16_t IndexCPContext::num_dags() { + // count number of buffers whose up_buffers are nullptr + uint16_t count = 0; + std::unique_lock lg{m_flush_buffer_mtx}; + m_dirty_buf_list.foreach_entry([&count](IndexBufferPtr buf) { + if (buf->m_up_buffer == nullptr) { count++; } + }); + return count; } std::string IndexCPContext::to_string_with_dags() { @@ -219,6 +230,7 @@ std::map< BlkId, IndexBufferPtr > IndexCPContext::recover(sisl::byte_view sb) { process_txn_record(rec, buf_map); cur_ptr += rec->size(); + LOGTRACEMOD(wbcache, "Recovered txn record: {}: {}", t, rec->to_string()); } return buf_map; @@ -314,8 +326,8 @@ std::string IndexCPContext::txn_record::to_string() const { if (id_count == 0) { fmt::format_to(std::back_inserter(str), "empty]"); } else { - for (uint8_t i{0}; i < id_count; ++i, ++idx) { - fmt::format_to(std::back_inserter(str), "[chunk={}, blk={}],", ids[idx].second, ids[idx].first); + for (uint8_t i{0}; i < id_count; ++i) { + fmt::format_to(std::back_inserter(str), "[{}],", blk_id(idx++).to_integer()); } fmt::format_to(std::back_inserter(str), "]"); } @@ -324,12 +336,13 @@ std::string IndexCPContext::txn_record::to_string() const { std::string str = fmt::format("ordinal={}, parent=[{}], in_place_child=[{}]", index_ordinal, parent_id_string(), child_id_string(), num_new_ids, num_freed_ids); - uint8_t idx = (has_inplace_parent == 0x1) ? 1 : 0 + (has_inplace_child == 0x1) ? 1 : 0; + uint8_t idx = ((has_inplace_parent == 0x1) ? 1 : 0) + ((has_inplace_child == 0x1) ? 1 : 0); fmt::format_to(std::back_inserter(str), ", new_ids=["); add_to_string(str, idx, num_new_ids); fmt::format_to(std::back_inserter(str), ", freed_ids=["); add_to_string(str, idx, num_freed_ids); + fmt::format_to(std::back_inserter(str), "{}", (is_parent_meta ? 
", parent is meta" : "")); return str; } } // namespace homestore diff --git a/src/lib/index/index_cp.hpp b/src/lib/index/index_cp.hpp index 1b8a2a2b0..d7bd124df 100644 --- a/src/lib/index/index_cp.hpp +++ b/src/lib/index/index_cp.hpp @@ -94,12 +94,12 @@ struct IndexCPContext : public VDevCPContext { } std::string parent_id_string() const { - return (has_inplace_parent == 0x1) ? fmt::format("chunk={}, blk={}", ids[0].second, ids[0].first) : "empty"; + return (has_inplace_parent == 0x1) ? fmt::format("{}", blk_id(0).to_integer()) : "empty"; } std::string child_id_string() const { auto const idx = (has_inplace_parent == 0x1) ? 1 : 0; - return (has_inplace_child == 0x1) ? fmt::format("chunk={}, blk={}", ids[idx].second, ids[idx].first) + return (has_inplace_child == 0x1) ? fmt::format("{}", blk_id(idx).to_integer()) : "empty"; } @@ -162,6 +162,7 @@ struct IndexCPContext : public VDevCPContext { std::optional< IndexBufferPtr > next_dirty(); std::string to_string(); std::string to_string_with_dags(); + uint16_t num_dags(); void to_string_dot(const std::string& filename); private: diff --git a/src/lib/index/index_service.cpp b/src/lib/index/index_service.cpp index cc199bbd5..4b3fb5d11 100644 --- a/src/lib/index/index_service.cpp +++ b/src/lib/index/index_service.cpp @@ -87,10 +87,18 @@ void IndexService::start() { for (const auto& [_, tbl] : m_index_map) { tbl->recovery_completed(); } + // Force taking cp after recovery done. This makes sure that the index table is in consistent state and dirty buffer + // after recovery can be added to dirty list for flushing in the new cp + hs()->cp_mgr().trigger_cp_flush(true /* force */); } void IndexService::stop() { m_wb_cache.reset(); } +uint64_t IndexService::num_tables() { + std::unique_lock lg(m_index_map_mtx); + return m_index_map.size(); +} + void IndexService::add_index_table(const std::shared_ptr< IndexTableBase >& tbl) { std::unique_lock lg(m_index_map_mtx); m_index_map.insert(std::make_pair(tbl->uuid(), tbl)); diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp index 1b7523363..ed5dd7e6d 100644 --- a/src/lib/index/wb_cache.cpp +++ b/src/lib/index/wb_cache.cpp @@ -196,14 +196,19 @@ bool IndexWBCache::refresh_meta_buf(shared< MetaIndexBuffer >& meta_buf, CPConte return false; // meta_buf modified by a newer CP, we shouldn't overwrite that } else if (meta_buf->m_dirtied_cp_id == cp_ctx->id()) { // Modified by the same cp, no need to create new index buffer, but we only copy the superblk to the buffer + LOGTRACEMOD(wbcache, "meta buf {} is already dirtied in cp {} now is in recovery {}", meta_buf->to_string(), + cp_ctx->id(), m_in_recovery); meta_buf->copy_sb_to_buf(); + // TODO: corner case , meta buffer is dirtied by the same cp but not added to dirty list due to previously + // recovery mode } else { // We always create a new meta index buffer on every meta buf update, which copies the superblk auto new_buf = std::make_shared< MetaIndexBuffer >(meta_buf); new_buf->m_dirtied_cp_id = cp_ctx->id(); write_buf(nullptr, new_buf, cp_ctx); meta_buf = new_buf; // Replace the meta_buf with new buf - LOGTRACEMOD(wbcache, "meta buf {} is created in cp {}", meta_buf->to_string(), cp_ctx->id()); + LOGTRACEMOD(wbcache, "meta buf {} is created in cp {} in recovery = {}", meta_buf->to_string(), cp_ctx->id(), + m_in_recovery); } return true; } @@ -292,10 +297,49 @@ void IndexWBCache::transact_bufs(uint32_t index_ordinal, IndexBufferPtr const& p freed_node_bufs // free_node_bufs ); } +#ifdef _PRERELEASE + // log new nodes and freed nodes and parent 
and child
+    static uint32_t txn_id = 0;
+    static int last_cp_id = -2;
+    static std::string txn = "";
+    if (last_cp_id != icp_ctx->id()) {
+        last_cp_id = icp_ctx->id();
+        txn_id = 0;
+        txn = "";
+    }
+
+    if (new_node_bufs.empty() && freed_node_bufs.empty()) {
+        fmt::format_to(std::back_inserter(txn), "\n{} - parent=[{}] child=[{}] new=[{}] freed=[{}]", txn_id,
+                       (parent_buf && parent_buf->blkid().to_integer() != 0)
+                           ? std::to_string(parent_buf->blkid().to_integer())
+                           : "empty",
+                       child_buf->blkid().to_integer(), "empty", "empty");
+    } else {
+        std::string new_nodes;
+        for (auto const& buf : new_node_bufs) {
+            new_nodes += std::to_string(buf->blkid().to_integer()) + ", ";
+        }
+        std::string freed_nodes;
+        for (auto const& buf : freed_node_bufs) {
+            freed_nodes += std::to_string(buf->blkid().to_integer()) + ", ";
+        }
+        std::string parent_str = (parent_buf && parent_buf->blkid().to_integer() != 0)
+            ? std::to_string(parent_buf->blkid().to_integer())
+            : "empty";
+        std::string child_str = (child_buf && child_buf->blkid().to_integer() != 0)
+            ? std::to_string(child_buf->blkid().to_integer())
+            : "empty";
+
+        fmt::format_to(std::back_inserter(txn), "\n{} - parent={} child={} new=[{}] freed=[{}]", txn_id, parent_str,
+                       child_str, new_nodes, freed_nodes);
+    }
+    LOGTRACEMOD(wbcache, "\ttransaction till now: cp: {} \n{}\n", icp_ctx->id(), txn);
+    txn_id++;
+#endif

 #if 0
     static int id = 0;
-    auto filename = "transact_bufs_"+std::to_string(id++)+ "_" +std::to_string(rand()%100)+".dot";
-    LOGINFO("Transact cp is in cp\n{} and storing in {}\n\n\n", icp_ctx->to_string(), filename);
+    auto filename = fmt::format("txn_buf_{}_{}.dot", icp_ctx->id(), id++);
+    LOGTRACEMOD(wbcache, "Writing txn to file: {}", filename);
     icp_ctx->to_string_dot(filename);
 #endif
 }

@@ -390,6 +434,14 @@ void IndexWBCache::free_buf(const IndexBufferPtr& buf, CPContext* cp_ctx) {
 }

 //////////////////// Recovery Related section /////////////////////////////////
+void IndexWBCache::load_buf(IndexBufferPtr const& buf) {
+    if (buf->m_bytes == nullptr) {
+        buf->m_bytes = hs_utils::iobuf_alloc(m_node_size, sisl::buftag::btree_node, m_vdev->align_size());
+        m_vdev->sync_read(r_cast< char* >(buf->m_bytes), m_node_size, buf->blkid());
+        buf->m_dirtied_cp_id = BtreeNode::get_modified_cp_id(buf->m_bytes);
+    }
+}
+
 void IndexWBCache::recover(sisl::byte_view sb) {
     // If sb is empty, it's possibly a first-time boot.
     if ((sb.bytes() == nullptr) || (sb.size() == 0)) {
@@ -408,6 +460,29 @@ void IndexWBCache::recover(sisl::byte_view sb) {
     LOGINFOMOD(wbcache, "Detected unclean shutdown, prior cp={} had to flush {} nodes, recovering...
", icp_ctx->id(), bufs.size()); +#ifdef _PRERELEASE + auto detailed_log = [this](std::map< BlkId, IndexBufferPtr > const& bufs, + std::vector< IndexBufferPtr > const& l0_bufs) { + std::string log = fmt::format("\trecovered bufs (#of bufs = {})\n", bufs.size()); + for (auto const& [_, buf] : bufs) { + load_buf(buf); + fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); + } + + // list of new_bufs + if (!l0_bufs.empty()) { + fmt::format_to(std::back_inserter(log), "\n\tl0_bufs (#of bufs = {})\n", l0_bufs.size()); + for (auto const& buf : l0_bufs) { + fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); + } + } + return log; + }; + + std::string log = fmt::format("Recovering bufs (#of bufs = {}) before processing them\n", bufs.size()); + LOGTRACEMOD(wbcache, "{}\n{}", log, detailed_log(bufs, {})); +#endif + // At this point, we have the DAG structure (up/down dependency graph), exactly the same as prior to crash, with one // addition of all freed buffers also put in the DAG structure. // @@ -433,30 +508,30 @@ void IndexWBCache::recover(sisl::byte_view sb) { l0_bufs.push_back(buf); } else { buf->m_up_buffer->m_wait_for_down_buffers.decrement(); +#ifndef NDEBUG + bool found{false}; + for (auto it = buf->m_up_buffer->m_down_buffers.begin(); + it != buf->m_up_buffer->m_down_buffers.end(); ++it) { + auto sp = it->lock(); + if (sp && sp == buf) { + found = true; + buf->m_up_buffer->m_down_buffers.erase(it); + break; + } + } + HS_DBG_ASSERT(found, + "Down buffer is linked to Up buf, but up_buf doesn't have down_buf in its list"); +#endif } } } } +#ifdef _PRERELEASE LOGINFOMOD(wbcache, "Index Recovery detected {} nodes out of {} as new/freed nodes to be recovered in prev cp={}", l0_bufs.size(), bufs.size(), icp_ctx->id()); - - auto detailed_log = [this](std::map< BlkId, IndexBufferPtr > const& bufs, - std::vector< IndexBufferPtr > const& l0_bufs) { - // Logs to detect down_waits are set correctly for up buffers list of all recovered bufs - std::string log = fmt::format("\trecovered bufs (#of bufs = {})\n", bufs.size()); - for (auto const& [_, buf] : bufs) { - fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); - } - - // list of new_bufs - fmt::format_to(std::back_inserter(log), "\n\tl0_bufs (#of bufs = {})\n", l0_bufs.size()); - for (auto const& buf : l0_bufs) { - fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); - } - return log; - }; LOGTRACEMOD(wbcache, "All unclean bufs list\n{}", detailed_log(bufs, l0_bufs)); +#endif // Second iteration we start from the lowest levels (which are all new_bufs) and check if up_buffers need to be // repaired. All L1 buffers are not needed to repair, because they are sibling nodes and so we pass false in @@ -469,7 +544,10 @@ void IndexWBCache::recover(sisl::byte_view sb) { } void IndexWBCache::recover_buf(IndexBufferPtr const& buf) { - if (!buf->m_wait_for_down_buffers.decrement_testz()) { return; } + if (!buf->m_wait_for_down_buffers.decrement_testz()) { + // TODO: remove the buf_>m_up_buffer from down_buffers list of buf->m_up_buffer + return; + } // All down buffers are completed and given a nod saying that they are committed. If this buffer is not committed, // then we need to repair this node/buffer. After that we will keep going to the next up level to repair them if @@ -495,21 +573,21 @@ bool IndexWBCache::was_node_committed(IndexBufferPtr const& buf) { } // All down_buf has indicated that they have seen this up buffer, now its time to repair them. 
- if (buf->m_bytes == nullptr) { - // Read the btree node and get its modified cp_id - buf->m_bytes = hs_utils::iobuf_alloc(m_node_size, sisl::buftag::btree_node, m_vdev->align_size()); - m_vdev->sync_read(r_cast< char* >(buf->m_bytes), m_node_size, buf->blkid()); - if (!BtreeNode::is_valid_node(sisl::blob{buf->m_bytes, m_node_size})) { return false; } - - buf->m_dirtied_cp_id = BtreeNode::get_modified_cp_id(buf->m_bytes); - } - auto cpg = cp_mgr().cp_guard(); - return (buf->m_dirtied_cp_id == cpg->id()); + load_buf(buf); + if (!BtreeNode::is_valid_node(sisl::blob{buf->m_bytes, m_node_size})) { return false; } + return (buf->m_dirtied_cp_id == cp_mgr().cp_guard()->id()); } //////////////////// CP Related API section ///////////////////////////////// folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) { - LOGTRACEMOD(wbcache, "Starting Index CP Flush with cp context={}", cp_ctx->to_string_with_dags()); + LOGTRACEMOD(wbcache, "Starting Index CP Flush with cp \ndag={}\n\n cp context {}", cp_ctx->to_string_with_dags(), + cp_ctx->to_string()); + // #ifdef _PRERELEASE + // static int id = 0; + // auto filename = "cp_" + std::to_string(id++) + "_" + std::to_string(rand() % 100) + ".dot"; + // LOGTRACEMOD(wbcache, "Transact cp storing in file {}\n\n\n", filename); + // cp_ctx->to_string_dot(filename); + // #endif if (!cp_ctx->any_dirty_buffers()) { if (cp_ctx->id() == 0) { // For the first CP, we need to flush the journal buffer to the meta blk @@ -523,17 +601,20 @@ folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) { #ifdef _PRERELEASE if (hs()->crash_simulator().is_crashed()) { - LOGINFOMOD(wbcache, "crash simulation is ongoing, so skip the cp flush"); + LOGINFO("crash simulation is ongoing, so skip the cp flush"); return folly::makeFuture< bool >(true); } #endif - // First thing is to flush the new_blks created as part of the CP. + // First thing is to flush the journal created as part of the CP. 
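+    // (Note, inferred from the code below and from recover(): the txn journal for this CP is persisted via the
+    // meta service before any node buffers are written, which is what lets recovery rebuild the dirty-buffer
+    // dependency DAG after an unclean shutdown.)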
auto const& journal_buf = cp_ctx->journal_buf(); + auto txn = r_cast< IndexCPContext::txn_journal const* >(journal_buf.cbytes()); if (journal_buf.size() != 0) { if (m_meta_blk) { + LOGTRACEMOD(wbcache, " journal {} ", txn->to_string()); meta_service().update_sub_sb(journal_buf.cbytes(), journal_buf.size(), m_meta_blk); } else { + LOGTRACEMOD(wbcache, " First time journal {} ", txn->to_string()); meta_service().add_sub_sb("wb_cache", journal_buf.cbytes(), journal_buf.size(), m_meta_blk); } } @@ -556,21 +637,20 @@ folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) { void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const& buf, bool part_of_batch) { #ifdef _PRERELEASE + static std::once_flag flag; if (buf->m_crash_flag_on) { -// std::string filename = "crash_buf_" + std::to_string(cp_ctx->id()) + ".dot"; -// LOGINFOMOD(wbcache, "Simulating crash while writing buffer {}, stored in file {}", buf->to_string(), filename); -// cp_ctx->to_string_dot(filename); - LOGINFOMOD(wbcache, "Simulating crash while writing buffer {}", buf->to_string()); + std::string filename = "crash_buf_" + std::to_string(cp_ctx->id()) + ".dot"; + LOGINFO("Simulating crash while writing buffer {}, stored in file {}", buf->to_string(), filename); + // cp_ctx->to_string_dot(filename); hs()->crash_simulator().crash(); cp_ctx->complete(true); return; } else if (hs()->crash_simulator().is_crashed()) { - LOGINFOMOD(wbcache, "crash simulation is ongoing, aid simulation by not flushing"); + std::call_once(flag, []() { LOGINFO("Crash simulation is ongoing; aid simulation by not flushing."); }); return; } #endif - LOGTRACEMOD(wbcache, "cp={} {}", cp_ctx->id(), buf->to_string()); buf->set_state(index_buf_state_t::FLUSHING); if (buf->is_meta_buf()) { @@ -584,16 +664,13 @@ void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const buf->to_string()); process_write_completion(cp_ctx, buf); } else { - LOGTRACEMOD(wbcache, "flushing cp {} buf {} info: {}", cp_ctx->id(), buf->to_string(), - BtreeNode::to_string_buf(buf->raw_buffer())); + LOGTRACEMOD(wbcache, "flushing cp {} buf {}", cp_ctx->id(), buf->to_string()); m_vdev->async_write(r_cast< const char* >(buf->raw_buffer()), m_node_size, buf->m_blkid, part_of_batch) .thenValue([buf, cp_ctx](auto) { try { auto& pthis = s_cast< IndexWBCache& >(wb_cache()); pthis.process_write_completion(cp_ctx, buf); - } catch (const std::runtime_error& e) { - LOGERROR("Failed to access write-back cache: {}", e.what()); - } + } catch (const std::runtime_error& e) { LOGERROR("Failed to access write-back cache: {}", e.what()); } }); if (!part_of_batch) { m_vdev->submit_batch(); } @@ -602,8 +679,10 @@ void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const void IndexWBCache::process_write_completion(IndexCPContext* cp_ctx, IndexBufferPtr const& buf) { #ifdef _PRERELEASE + static std::once_flag flag; if (hs()->crash_simulator().is_crashed()) { - LOGINFOMOD(wbcache, "Crash simulation is ongoing, ignore all process_write_completion"); + std::call_once( + flag, []() { LOGINFOMOD(wbcache, "Crash simulation is ongoing, ignore all process_write_completion"); }); return; } #endif diff --git a/src/lib/index/wb_cache.hpp b/src/lib/index/wb_cache.hpp index 209d3845e..25a4c8201 100644 --- a/src/lib/index/wb_cache.hpp +++ b/src/lib/index/wb_cache.hpp @@ -41,7 +41,6 @@ class IndexWBCache : public IndexWBCacheBase { std::mutex m_flush_mtx; void* m_meta_blk; bool m_in_recovery{false}; - public: IndexWBCache(const std::shared_ptr< 
VirtualDev >& vdev, std::pair< meta_blk*, sisl::byte_view > sb, const std::shared_ptr< sisl::Evictor >& evictor, uint32_t node_size); @@ -78,5 +77,6 @@ class IndexWBCache : public IndexWBCacheBase { void recover_buf(IndexBufferPtr const& buf); bool was_node_committed(IndexBufferPtr const& buf); + void load_buf(IndexBufferPtr const& buf); }; } // namespace homestore diff --git a/src/tests/btree_helpers/btree_test_helper.hpp b/src/tests/btree_helpers/btree_test_helper.hpp index 6c00975ea..a047fed23 100644 --- a/src/tests/btree_helpers/btree_test_helper.hpp +++ b/src/tests/btree_helpers/btree_test_helper.hpp @@ -402,7 +402,7 @@ struct BtreeTestHelper { LOGINFO("{}{}", preamble.empty() ? "" : preamble + ":\n", m_bt->to_custom_string(print_key_range)); } - void visualize_keys(const std::string& file) const { m_bt->visualize_tree_keys(file); } + void visualize_keys(const std::string& file) const { /*m_bt->visualize_tree_keys(file);*/ } void compare_files(const std::string& before, const std::string& after) { std::ifstream b(before, std::ifstream::ate); diff --git a/src/tests/btree_helpers/shadow_map.hpp b/src/tests/btree_helpers/shadow_map.hpp index 9818c8a45..3e8c998ef 100644 --- a/src/tests/btree_helpers/shadow_map.hpp +++ b/src/tests/btree_helpers/shadow_map.hpp @@ -240,6 +240,7 @@ class ShadowMap { file << key << " " << value << '\n'; } file.close(); + LOGINFO("Saved shadow map to file: {}", filename); } void load(const std::string& filename) { diff --git a/src/tests/test_common/homestore_test_common.hpp b/src/tests/test_common/homestore_test_common.hpp index 174039495..97ca410f7 100644 --- a/src/tests/test_common/homestore_test_common.hpp +++ b/src/tests/test_common/homestore_test_common.hpp @@ -434,7 +434,7 @@ class HSTestHelper { } #ifdef _PRERELEASE hsi->with_crash_simulator([this](void) mutable { - LOGINFO("CrashSimulator::crash() is called - restarting homestore"); + LOGWARN("CrashSimulator::crash() is called - restarting homestore"); this->restart_homestore(); m_crash_recovered.setValue(); }); diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp index 11235be6a..0f5963eff 100644 --- a/src/tests/test_index_crash_recovery.cpp +++ b/src/tests/test_index_crash_recovery.cpp @@ -34,25 +34,30 @@ SISL_LOGGING_DECL(test_index_crash_recovery) // TODO Add tests to do write,remove after recovery. // TODO Test with var len key with io mgr page size is 512. 
-SISL_OPTION_GROUP(test_index_crash_recovery, - (num_iters, "", "num_iters", "number of iterations for rand ops", - ::cxxopts::value< uint32_t >()->default_value("500"), "number"), - (num_entries, "", "num_entries", "number of entries to test with", - ::cxxopts::value< uint32_t >()->default_value("5000"), "number"), - (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), - "seconds"), - (max_keys_in_node, "", "max_keys_in_node", "max_keys_in_node", - ::cxxopts::value< uint32_t >()->default_value("0"), ""), - (operation_list, "", "operation_list", - "operation list instead of default created following by percentage", - ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), - (preload_size, "", "preload_size", "number of entries to preload tree with", - ::cxxopts::value< uint32_t >()->default_value("1000"), "number"), - (init_device, "", "init_device", "init device", ::cxxopts::value< bool >()->default_value("1"), ""), - (cleanup_after_shutdown, "", "cleanup_after_shutdown", "cleanup after shutdown", - ::cxxopts::value< bool >()->default_value("1"), ""), - (seed, "", "seed", "random engine seed, use random if not defined", - ::cxxopts::value< uint64_t >()->default_value("0"), "number")) +SISL_OPTION_GROUP( + test_index_crash_recovery, + (num_iters, "", "num_iters", "number of iterations for rand ops", + ::cxxopts::value< uint32_t >()->default_value("500"), "number"), + (num_entries, "", "num_entries", "number of entries to test with", + ::cxxopts::value< uint32_t >()->default_value("5000"), "number"), + (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), "seconds"), + (num_rounds, "", "num_rounds", "number of rounds to test with", + ::cxxopts::value< uint32_t >()->default_value("100"), "number"), + (num_entries_per_rounds, "", "num_entries_per_rounds", "number of entries per rounds", + ::cxxopts::value< uint32_t >()->default_value("40"), "number"), + (max_keys_in_node, "", "max_keys_in_node", "max_keys_in_node", ::cxxopts::value< uint32_t >()->default_value("0"), + ""), + (operation_list, "", "operation_list", "operation list instead of default created following by percentage", + ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), + (preload_size, "", "preload_size", "number of entries to preload tree with", + ::cxxopts::value< uint32_t >()->default_value("1000"), "number"), + (init_device, "", "init_device", "init device", ::cxxopts::value< bool >()->default_value("1"), ""), + (load_from_file, "", "load_from_file", "load from file", ::cxxopts::value< bool >()->default_value("0"), ""), + (save_to_file, "", "save_to_file", "save to file", ::cxxopts::value< bool >()->default_value("0"), ""), + (cleanup_after_shutdown, "", "cleanup_after_shutdown", "cleanup after shutdown", + ::cxxopts::value< bool >()->default_value("1"), ""), + (seed, "", "seed", "random engine seed, use random if not defined", + ::cxxopts::value< uint64_t >()->default_value("0"), "number")) void log_obj_life_counter() { std::string str; @@ -74,8 +79,6 @@ class SequenceGenerator { public: SequenceGenerator(int putFreq, int removeFreq, uint64_t start_range, uint64_t end_range) : putFreq_(putFreq), removeFreq_(removeFreq), start_range_(start_range), end_range_(end_range) { - std::random_device rd; - gen_ = std::mt19937(rd()); keyDist_ = std::uniform_int_distribution<>(start_range_, end_range_); updateOperationTypeDistribution(); } @@ -100,11 +103,11 @@ class SequenceGenerator { std::vector< 
Operation > operations; if (reset) { this->reset(); } for (size_t i = 0; i < numOperations; ++i) { - uint32_t key = keyDist_(gen_); + uint32_t key = keyDist_(g_re); auto [it, inserted] = keyStates.try_emplace(key, false); auto& inUse = it->second; - OperationType operation = static_cast< OperationType >(opTypeDist_(gen_)); + OperationType operation = static_cast< OperationType >(opTypeDist_(g_re)); if (operation == OperationType::Put && !inUse) { operations.emplace_back(key, OperationType::Put); @@ -131,15 +134,16 @@ class SequenceGenerator { } return occurrences; } - __attribute__((noinline)) std::string printOperations(const OperationList& operations) const { + __attribute__((noinline)) static std::string printOperations(const OperationList& operations) { std::ostringstream oss; + auto count = 1; for (const auto& [key, opType] : operations) { std::string opTypeStr = (opType == OperationType::Put) ? "Put" : "Remove"; - oss << "{" << key << ", " << opTypeStr << "}\n"; + oss << count++ << "- {" << key << ", " << opTypeStr << "}\n"; } return oss.str(); } - __attribute__((noinline)) std::string printKeysOccurrences(const OperationList& operations) const { + __attribute__((noinline)) static std::string printKeysOccurrences(const OperationList& operations) { std::set< uint64_t > keys = collectUniqueKeys(operations); std::ostringstream oss; for (auto key : keys) { @@ -152,16 +156,51 @@ class SequenceGenerator { } return oss.str(); } - __attribute__((noinline)) std::string printKeyOccurrences(const OperationList& operations, uint64_t key ) const { + __attribute__((noinline)) static std::string printKeyOccurrences(const OperationList& operations, uint64_t key) { std::ostringstream oss; auto keyOccurrences = inspect(operations, key); oss << "Occurrences of key " << key << ":\n"; for (const auto& [index, operation] : keyOccurrences) { std::string opTypeStr = (operation == OperationType::Put) ? 
"Put" : "Remove"; - oss << "Index: " << index << ", Operation: " << opTypeStr << "\n"; + oss << "Index: " << index << ", Operation: " << opTypeStr << "\n"; } return oss.str(); } + + static std::set< uint64_t > collectUniqueKeys(const OperationList& operations) { + std::set< uint64_t > keys; + for (const auto& [key, _] : operations) { + keys.insert(key); + } + return keys; + } + static void save_to_file(std::string filename, const OperationList& operations) { + std::ofstream file(filename); + if (file.is_open()) { + for (const auto& [key, opType] : operations) { + file << key << " " << static_cast< int >(opType) << "\n"; + } + file.close(); + } + } + + static OperationList load_from_file(std::string filename) { + std::ifstream file(filename); + OperationList operations; + if (file.is_open()) { + std::string line; + while (std::getline(file, line)) { + std::istringstream iss(line); + uint64_t key; + int opType; + iss >> key >> opType; + operations.emplace_back(key, static_cast< OperationType >(opType)); + } + file.close(); + } + return operations; + } + void reset() { keyStates.clear(); } private: @@ -169,7 +208,6 @@ class SequenceGenerator { int removeFreq_; uint64_t start_range_; uint64_t end_range_; - std::mt19937 gen_; std::uniform_int_distribution<> keyDist_; std::discrete_distribution<> opTypeDist_; std::map< uint64_t, bool > keyStates; @@ -178,15 +216,8 @@ class SequenceGenerator { opTypeDist_ = std::discrete_distribution<>({static_cast< double >(putFreq_), static_cast< double >(removeFreq_)}); } - - std::set< uint64_t > collectUniqueKeys(const OperationList& operations) const { - std::set< uint64_t > keys; - for (const auto& [key, _] : operations) { - keys.insert(key); - } - return keys; - } }; + #ifdef _PRERELEASE template < typename TestType > struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestType >, public ::testing::Test { @@ -198,7 +229,9 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT TestIndexServiceCallbacks(IndexCrashTest* test) : m_test(test) {} std::shared_ptr< IndexTableBase > on_index_table_found(superblk< index_table_sb >&& sb) override { - LOGINFO("Index table recovered, root bnode_id {} version {}", sb->root_node, sb->root_link_version); + LOGINFO("Index table recovered, root bnode_id {} uuid {} ordinal {} version {}", + static_cast< uint64_t >(sb->root_node), boost::uuids::to_string(sb->uuid), sb->ordinal, + sb->root_link_version); m_test->m_cfg = BtreeConfig(hs()->index_service().node_size()); m_test->m_cfg.m_leaf_node_type = T::leaf_node_type; @@ -241,18 +274,29 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT BtreeTestHelper< TestType >::SetUp(); if (this->m_bt == nullptr || SISL_OPTIONS["init_device"].as< bool >()) { this->m_bt = std::make_shared< typename T::BtreeType >(uuid, parent_uuid, 0, this->m_cfg); + auto num_keys = this->m_bt->count_keys(this->m_bt->root_node_id()); + // LOGINFO("Creating new index table with uuid {} - init_device:{:s} bt: {} root id {}, num of + // keys {}", boost::uuids::to_string(uuid), SISL_OPTIONS["init_device"].as< bool >(), + // this->m_bt, this->m_bt->root_node_id(), num_keys); + LOGINFO("Creating new index table with uuid {} - root id {}, num of keys {}", boost::uuids::to_string(uuid), + this->m_bt->root_node_id(), num_keys); + } else { populate_shadow_map(); } hs()->index_service().add_index_table(this->m_bt); - LOGINFO("Added index table to index service"); + LOGINFO("Added index table to index service with uuid {} - total 
tables in the system is currently {}", + boost::uuids::to_string(uuid), hs()->index_service().num_tables()); } void populate_shadow_map() { + LOGINFO("Populating shadow map"); this->m_shadow_map.load(m_shadow_filename); - ASSERT_EQ(this->m_shadow_map.size(), this->m_bt->count_keys(this->m_bt->root_node_id())) - << "shadow map size and tree size mismatch"; + auto num_keys = this->m_bt->count_keys(this->m_bt->root_node_id()); + LOGINFO("Shadow map size {} - btree keys {} - root id {}", this->m_shadow_map.size(), num_keys, + this->m_bt->root_node_id()); + ASSERT_EQ(this->m_shadow_map.size(), num_keys) << "shadow map size and tree size mismatch"; this->get_all(); } @@ -263,6 +307,8 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT this->m_bt = std::make_shared< typename T::BtreeType >(uuid, parent_uuid, 0, this->m_cfg); hs()->index_service().add_index_table(this->m_bt); this->m_shadow_map.range_erase(0, SISL_OPTIONS["num_entries"].as< uint32_t >() - 1); + this->m_shadow_map.save(m_shadow_filename); + LOGINFO("Reset btree with uuid {} - erase shadow map {}", boost::uuids::to_string(uuid), m_shadow_filename); } void restart_homestore(uint32_t shutdown_delay_sec = 3) override { @@ -274,7 +320,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT void reapply_after_crash() { ShadowMap< K, V > snapshot_map{this->m_shadow_map.max_keys()}; snapshot_map.load(m_shadow_filename); - LOGDEBUG("\tSnapshot before crash\n{}", snapshot_map.to_string()); + LOGINFO("\tSnapshot before crash\n{}", snapshot_map.to_string()); auto diff = this->m_shadow_map.diff(snapshot_map); // visualize tree after crash @@ -286,7 +332,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT for (const auto& [k, addition] : diff) { dif_str += fmt::format(" {} \t{}\n", k.key(), addition); } - LOGDEBUG("Diff between shadow map and snapshot map\n{}\n", dif_str); + LOGINFO("Diff between shadow map and snapshot map\n{}\n", dif_str); for (const auto& [k, addition] : diff) { // this->print_keys(fmt::format("reapply: before inserting key {}", k.key())); @@ -324,8 +370,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT LOGINFO("Error: failed to remove {}", m_shadow_filename); } } - LOGINFO("Teardown with Root bnode_id {} tree size: {}", this->m_bt->root_node_id(), - this->m_bt->count_keys(this->m_bt->root_node_id())); + LOGINFO("Teardown with Root bnode_id {} tree size: {}", this->m_bt->root_node_id(), this->tree_key_count()); BtreeTestHelper< TestType >::TearDown(); this->shutdown_homestore(false); } @@ -341,30 +386,67 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT this->get_all(); LOGINFO("Expect to have [{},{}) in tree and it is actually{} ", s_key, e_key, tree_key_count()); - ASSERT_EQ(this->m_shadow_map.size(), this->m_bt->count_keys(this->m_bt->root_node_id())) - << "shadow map size and tree size mismatch"; + ASSERT_EQ(this->m_shadow_map.size(), this->tree_key_count()) << "shadow map size and tree size mismatch"; + } + + void sanity_check(OperationList& operations) const { + std::set< uint64_t > new_keys; + std::transform(operations.begin(), operations.end(), std::inserter(new_keys, new_keys.end()), + [](const Operation& operation) { return operation.first; }); + uint32_t count = 1; + this->m_shadow_map.foreach ([this, new_keys, &count](K key, V value) { + // discard the new keys to check + if (new_keys.find(key.key()) != new_keys.end()) { return; } + auto copy_key = 
std::make_unique< K >();
+            *copy_key = key;
+            auto out_v = std::make_unique< V >();
+            auto req = BtreeSingleGetRequest{copy_key.get(), out_v.get()};
+            req.enable_route_tracing();
+            const auto ret = this->m_bt->get(req);
+            ASSERT_EQ(ret, btree_status_t::success) << "Missing key " << key << " in btree but present in shadow map";
+            LOGINFO("{} - Key {} passed sanity check!", count++, key.key());
+        });
+    }

     void crash_and_recover(OperationList& operations, std::string filename = "") {
-        // this->print_keys("Btree prior to CP and subsequent simulated crash: ");
+        this->print_keys("Btree prior to CP and subsequent simulated crash: ");
+        LOGINFO("Before Crash: {} keys in shadow map and it is actually {} keys in tree - operations size {}",
+                this->m_shadow_map.size(), tree_key_count(), operations.size());
+
+        if (!filename.empty()) {
+            std::string b_filename = filename + "_before_crash.dot";
+            LOGINFO("Visualize the tree before crash file {}", b_filename);
+            this->visualize_keys(b_filename);
+        }
+
         test_common::HSTestHelper::trigger_cp(false);
+        LOGINFO("waiting for crash to recover");
         this->wait_for_crash_recovery();
-        // this->print_keys("Post crash and recovery, btree structure:");

         if (!filename.empty()) {
-            LOGINFO("Visualize the tree file {}", filename);
-            this->visualize_keys(filename);
+            std::string rec_filename = filename + "_after_recovery.dot";
+            LOGINFO("Visualize the tree file after recovery: {}", rec_filename);
+            this->visualize_keys(rec_filename);
+            this->print_keys("Post crash and recovery, btree structure: ");
         }
-
+        sanity_check(operations);
+        // Added to the index service right after recovery. Not needed here
+        // test_common::HSTestHelper::trigger_cp(true);
+        LOGINFO("Before Reapply: {} keys in shadow map and actually {} in tree - operations size {}",
+                this->m_shadow_map.size(), tree_key_count(), operations.size());
         this->reapply_after_crash(operations);
-
-        // this->print_keys("\n\nafter reapply keys");

         if (!filename.empty()) {
-            LOGINFO("Visualize the tree file after_reapply__{}", filename);
-            this->visualize_keys("after_reapply__" + filename);
+            std::string re_filename = filename + "_after_reapply.dot";
+            LOGINFO("Visualize the tree after reapply {}", re_filename);
+            this->visualize_keys(re_filename);
+            this->print_keys("Post crash and recovery, btree structure: ");
         }

         this->get_all();
+        LOGINFO("After reapply: {} keys in shadow map and actually {} in trees", this->m_shadow_map.size(),
+                tree_key_count());
+        ASSERT_EQ(this->m_shadow_map.size(), this->m_bt->count_keys(this->m_bt->root_node_id()))
+            << "shadow map size and tree size mismatch";
     }

     uint32_t tree_key_count() { return this->m_bt->count_keys(this->m_bt->root_node_id()); }
@@ -378,6 +460,8 @@
 using BtreeTypes = testing::Types< FixedLenBtree >;

 TYPED_TEST_SUITE(IndexCrashTest, BtreeTypes);

 TYPED_TEST(IndexCrashTest, CrashBeforeFirstCp) {
+    this->m_shadow_map.range_erase(0, SISL_OPTIONS["num_entries"].as< uint32_t >() - 1);
+    this->m_shadow_map.save(this->m_shadow_filename);
     // Simulate the crash even before first cp
     this->set_basic_flip("crash_flush_on_root");
@@ -393,6 +477,8 @@ TYPED_TEST(IndexCrashTest, CrashBeforeFirstCp) {
 }

 TYPED_TEST(IndexCrashTest, SplitOnLeftEdge) {
+    this->m_shadow_map.range_erase(0, SISL_OPTIONS["num_entries"].as< uint32_t >() - 1);
+    this->m_shadow_map.save(this->m_shadow_filename);
     // Insert into 4 phases, first fill up the last part, since we need to test split on left edge
     LOGINFO("Step 1: Fill up the last quarter of the tree");
     auto const num_entries = SISL_OPTIONS["num_entries"].as<
uint32_t >();
@@ -525,11 +611,11 @@ TYPED_TEST(IndexCrashTest, SplitCrash1) {
     vector< std::string > flips = {"crash_flush_on_split_at_parent", "crash_flush_on_split_at_left_child",
                                    "crash_flush_on_split_at_right_child"};
     OperationList operations;
+    bool renew_btree_after_crash = true;
     for (size_t i = 0; i < flips.size(); ++i) {
-        this->reset_btree();
         LOGINFO("Step 1-{}: Set flag {}", i + 1, flips[i]);
         this->set_basic_flip(flips[i]);
-        operations = generator.generateOperations(num_entries -1 , true /* reset */);
+        operations = generator.generateOperations(num_entries - 1, renew_btree_after_crash /* reset */);
         // LOGINFO("Batch {} Operations:\n {} \n ", i + 1, generator.printOperations(operations));
         // LOGINFO("Detailed Key Occurrences for Batch {}:\n {} \n ", i + 1,
         // generator.printKeyOccurrences(operations));
         for (auto [k, _] : operations) {
             this->put(k, btree_put_type::INSERT, true /* expect_success */);
         }
         this->crash_and_recover(operations, fmt::format("recover_tree_crash_{}.dot", i + 1));
+        if (renew_btree_after_crash) { this->reset_btree(); }
     }
 }

 TYPED_TEST(IndexCrashTest, long_running_put_crash) {
+    // Read the test options
     auto const num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >();
+    auto const preload_size = SISL_OPTIONS["preload_size"].as< uint32_t >();
+    auto const rounds = SISL_OPTIONS["num_rounds"].as< uint32_t >();
+    auto const num_entries_per_rounds = SISL_OPTIONS["num_entries_per_rounds"].as< uint32_t >();
+    bool load_mode = SISL_OPTIONS.count("load_from_file");
+    bool save_mode = SISL_OPTIONS.count("save_to_file");
     SequenceGenerator generator(100 /*putFreq*/, 0 /* removeFreq*/, 0 /*start_range*/, num_entries - 1 /*end_range*/);
     vector< std::string > flips = {"crash_flush_on_split_at_parent", "crash_flush_on_split_at_left_child",
                                    "crash_flush_on_split_at_right_child"};
+
+    std::string flip = "";
     OperationList operations;
     auto m_start_time = Clock::now();
     auto time_to_stop = [this, m_start_time]() { return (get_elapsed_time_sec(m_start_time) > this->m_run_time); };
     double elapsed_time, progress_percent, last_progress_time = 0;
-    for (size_t i = 0; !time_to_stop(); ++i) {
-        bool print_time = false;
-        elapsed_time = get_elapsed_time_sec(m_start_time);
+    bool renew_btree_after_crash = false;
+    auto cur_flip_idx = 0;
+    std::uniform_int_distribution<> dis(1, 100);
+    int flip_percentage = 90; // Set the desired percentage here
+    bool normal_execution = true;
+    bool clean_shutdown = true;
+    // if in save mode, delete the files saved by previous runs (std::filesystem does not glob,
+    // so scan /tmp for the operation files explicitly)
+    if (save_mode) {
+        for (auto const& entry : std::filesystem::directory_iterator{"/tmp"}) {
+            auto const fname = entry.path().filename().string();
+            if (fname.rfind("operations_", 0) == 0 && entry.path().extension() == ".txt") {
+                std::filesystem::remove(entry.path());
+            }
+        }
+        std::filesystem::remove("/tmp/flips_history.txt");
+    }
+    // init tree
+    LOGINFO("Step 0: Fill up the tree with {} entries", preload_size);
+    if (load_mode) {
+        operations = SequenceGenerator::load_from_file(fmt::format("/tmp/operations_0.txt"));
+    } else {
+        operations = generator.generateOperations(preload_size, true /* reset */);
+        if (save_mode) { SequenceGenerator::save_to_file(fmt::format("/tmp/operations_0.txt"), operations); }
+    }
+    auto opstr = SequenceGenerator::printOperations(operations);
+    LOGINFO("Operations to apply before the crash:\n{}", opstr);
-        this->reset_btree();
-        auto flip = flips[i % flips.size()];
-        LOGINFO("Step 1-{}: Set flag {}", i + 1, flip);
+    for (auto [k, _] : operations) {
+        this->put(k, btree_put_type::INSERT, true /* expect_success */);
+    }
-        this->set_basic_flip(flip, 1, 10);
-        operations = generator.generateOperations(num_entries -1, true /* reset */);
-        // operations =
generator.generateOperations(num_entries/10, false /* reset */);
-        // LOGINFO("Batch {} Operations:\n {} \n ", i + 1, generator.printOperations(operations));
-        // LOGINFO("Detailed Key Occurrences for Batch {}:\n {} \n ", i + 1,
-        // generator.printKeyOccurrences(operations));
+    // Trigger the cp to make sure the preload part is durable
+    LOGINFO("Step 0-1: Flush all the entries so far");
+    test_common::HSTestHelper::trigger_cp(true);
+    this->get_all();
+    this->m_shadow_map.save(this->m_shadow_filename);
+    this->print_keys("reapply: after preload");
+    this->visualize_keys("tree_after_preload.dot");
+
+    for (uint32_t round = 1;
+         round <= rounds && !time_to_stop() && this->tree_key_count() < num_entries - num_entries_per_rounds; round++) {
+        LOGINFO("\n\n\n\n\n\nRound {} of {}\n\n\n\n\n\n", round, rounds);
+        bool print_time = false;
+        elapsed_time = get_elapsed_time_sec(m_start_time);
+        if (load_mode) {
+            std::ifstream file("/tmp/flips_history.txt");
+            std::string line;
+            bool found = false;
+            for (uint32_t i = 0; i < round && std::getline(file, line); i++) {
+                if (i == round - 1) {
+                    found = true;
+                    break;
+                }
+            }
+            if (found && !line.empty()) {
+                if (line == "normal") {
+                    normal_execution = true;
+                } else {
+                    normal_execution = false;
+                    flip = line;
+                    LOGINFO("Step 1-{}: Set flag {}", round, flip);
+                    this->set_basic_flip(flip, 1, 100);
+                }
+            }
+            file.close();
+        } else {
+            if (dis(g_re) <= flip_percentage) {
+                flip = flips[cur_flip_idx++ % flips.size()];
+                LOGINFO("Step 1-{}: Set flag {}", round, flip);
+                this->set_basic_flip(flip, 1, 100);
+                normal_execution = false;
+            } else {
+                normal_execution = true;
+                LOGINFO("Step 1-{}: No flip set", round);
+            }
+            if (save_mode) {
+                // save the flip name to a file for later use
+                std::ofstream file("/tmp/flips_history.txt", std::ios::app);
+                if (file.is_open()) { file << (normal_execution ?
"normal" : flip) << "\n"; } + file.close(); + } + } + if (load_mode) { + operations = SequenceGenerator::load_from_file(fmt::format("/tmp/operations_{}.txt", round)); + } else { + operations = generator.generateOperations(num_entries_per_rounds, renew_btree_after_crash /* reset */); + if (save_mode) { + SequenceGenerator::save_to_file(fmt::format("/tmp/operations_{}.txt", round), operations); + } + } + LOGINFO("Lets before crash print operations\n{}", SequenceGenerator::printOperations(operations)); for (auto [k, _] : operations) { - // LOGINFO("\t\t\t\t\t\t\t\t\t\t\t\t\tupserting entry {}", k); this->put(k, btree_put_type::INSERT, true /* expect_success */); + if (!time_to_stop()) { + static bool print_alert = false; + if (print_alert) { + LOGINFO("It is time to stop but let's finish this round and then stop!"); + print_alert = false; + } + } + } + if (normal_execution) { + if (clean_shutdown) { + this->m_shadow_map.save(this->m_shadow_filename); + this->restart_homestore(); + } else { + test_common::HSTestHelper::trigger_cp(true); + this->get_all(); + } + } else { + this->crash_and_recover(operations, fmt::format("long_tree_{}", round)); } - this->crash_and_recover(operations/*, fmt::format("recover_tree_crash_{}.dot", i + 1)*/); if (elapsed_time - last_progress_time > 30) { last_progress_time = elapsed_time; print_time = true; } if (print_time) { - LOGINFO("\n\n\n\t\t\tProgress: {} iterations completed - Elapsed time: {:.0f} seconds of total " - "{} ({:.2f}%)\n\n\n", - i, elapsed_time, this->m_run_time, elapsed_time * 100.0 / this->m_run_time); + LOGINFO("\n\n\n\t\t\tProgress: {} rounds of total {} ({:.2f}%) completed - Elapsed time: {:.0f} seconds of " + "total {} ({:.2f}%) - {} keys of maximum {} keys ({:.2f}%) inserted\n\n\n", + round, rounds, round * 100.0 / rounds, elapsed_time, this->m_run_time, + elapsed_time * 100.0 / this->m_run_time, this->tree_key_count(), num_entries, + this->tree_key_count() * 100.0 / num_entries); } - this->print_keys(fmt::format("reapply: after iteration {}", i)); - + this->print_keys(fmt::format("reapply: after round {}", round)); + if (renew_btree_after_crash) { this->reset_btree(); }; } } #endif diff --git a/src/tests/test_mem_btree.cpp b/src/tests/test_mem_btree.cpp index 1ef55302b..141fcf5e2 100644 --- a/src/tests/test_mem_btree.cpp +++ b/src/tests/test_mem_btree.cpp @@ -48,6 +48,8 @@ SISL_OPTION_GROUP( ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), (preload_size, "", "preload_size", "number of entries to preload tree with", ::cxxopts::value< uint32_t >()->default_value("1000"), "number"), + (max_keys_in_node, "", "max_keys_in_node", "max_keys_in_node", + ::cxxopts::value< uint32_t >()->default_value("0"), ""), (seed, "", "seed", "random engine seed, use random if not defined", ::cxxopts::value< uint64_t >()->default_value("0"), "number"), (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), "seconds")) @@ -102,6 +104,9 @@ struct BtreeTest : public BtreeTestHelper< TestType >, public ::testing::Test { void SetUp() override { BtreeTestHelper< TestType >::SetUp(); +#ifdef _PRERELEASE + this->m_cfg.m_max_keys_in_node = SISL_OPTIONS["max_keys_in_node"].as< uint32_t >(); +#endif this->m_bt = std::make_shared< typename T::BtreeType >(this->m_cfg); } }; @@ -300,6 +305,9 @@ struct BtreeConcurrentTest : public BtreeTestHelper< TestType >, public ::testin .hugepage_size_mb = 0}); BtreeTestHelper< TestType >::SetUp(); +#ifdef _PRERELEASE + this->m_cfg.m_max_keys_in_node = 
SISL_OPTIONS["max_keys_in_node"].as< uint32_t >(); +#endif this->m_bt = std::make_shared< typename T::BtreeType >(this->m_cfg); } diff --git a/src/tests/test_scripts/index_test.py b/src/tests/test_scripts/index_test.py index 4e4814ccb..02c3e4c2c 100755 --- a/src/tests/test_scripts/index_test.py +++ b/src/tests/test_scripts/index_test.py @@ -20,11 +20,13 @@ def run_test(options, type): raise TestFailedError(f"Test failed for type {type}") print("Test completed") + def run_crash_test(options): - cmd_opts = f"--gtest_filter=IndexCrashTest/0.long_running_put_crash --gtest_break_on_failure --max_keys_in_node={options['max_keys_in_node']} --init_device={options['init_device']} {options['log_mods']} --run_time={options['run_time']} --num_entries={options['num_entries']} {options['dev_list']}" + cmd_opts = f"--gtest_filter=IndexCrashTest/0.long_running_put_crash --gtest_break_on_failure --log_mods=wbcache:trace --max_keys_in_node={options['max_keys_in_node']} --num_entries_per_rounds={options['num_entries_per_rounds']} --init_device={options['init_device']} {options['log_mods']} --run_time={options['run_time']} --num_entries={options['num_entries']} --num_rounds={options['num_rounds']} {options['dev_list']} " # print(f"Running test with options: {cmd_opts}") try: - subprocess.check_call(f"{options['dirpath']}test_index_crash_recovery {cmd_opts}", stderr=subprocess.STDOUT, shell=True) + subprocess.check_call(f"{options['dirpath']}test_index_crash_recovery {cmd_opts}", stderr=subprocess.STDOUT, + shell=True) except subprocess.CalledProcessError as e: print(f"Test failed: {e}") raise TestFailedError(f"Test failed for type {type}") @@ -49,7 +51,9 @@ def parse_arguments(): parser.add_argument('--dev_list', help='Device list', default='') parser.add_argument('--cleanup_after_shutdown', help='Cleanup after shutdown', type=bool, default=False) parser.add_argument('--init_device', help='Initialize device', type=bool, default=True) - parser.add_argument('--max_keys_in_node', help='Maximum num of keys in btree nodes', type=int, default=20) + parser.add_argument('--max_keys_in_node', help='Maximum num of keys in btree nodes', type=int, default=5) + parser.add_argument('--num_rounds', help='number of rounds for crash test', type=int, default=10000) + parser.add_argument('--num_entries_per_rounds', help='number of rounds for crash test', type=int, default=60) # Parse the known arguments and ignore any unknown arguments args, unknown = parser.parse_known_args() @@ -73,7 +77,6 @@ def long_runnig_index(options, type=0): def long_running_clean_shutdown(options, type=0): print("Long running clean shutdown started") - options['run_time'] = int(options['run_time']) // 10 # 20 minutes try: run_test(options, type) @@ -87,14 +90,18 @@ def long_running_clean_shutdown(options, type=0): raise print("Long running clean shutdown completed") + def long_running_crash_put(options): print("Long running crash put started") - options['num_entries'] = 20480 # 20K + options['num_entries'] = 131072 # 128K options['init_device'] = True + options['run_time'] = 14400 # 4 hours + options['preload_size'] = 100 print(f"options: {options}") run_crash_test(options) print("Long running crash put completed") + def main(): options = parse_arguments() test_suite_name = options['test_suits'] From 9a06c059943a264f8548d374532c7bded3a59501 Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Mon, 21 Oct 2024 15:04:05 +0800 Subject: [PATCH 012/170] Count in ovf headers. 
We see a no-space error in the write_to_full UT. This might be because when the
space left == max_wrt_sz we take max_wrt_sz, however two extra blks (the ovf blk
header and the meta blk) are also needed.

Signed-off-by: Xiaoxi Chen
---
 src/tests/test_meta_blk_mgr.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/tests/test_meta_blk_mgr.cpp b/src/tests/test_meta_blk_mgr.cpp
index 640d6ac84..af80e9da5 100644
--- a/src/tests/test_meta_blk_mgr.cpp
+++ b/src/tests/test_meta_blk_mgr.cpp
@@ -195,7 +195,8 @@ class VMetaBlkMgrTest : public ::testing::Test {
         uint64_t size_written{0};
         while (free_size > 0) {
-            if (free_size >= gp.max_wrt_sz) {
+            // if it is overflow, 2 extra blocks are needed for ovf blk header and meta blk;
+            if (free_size - 2 * m_mbm->block_size() >= gp.max_wrt_sz) {
                 size_written = do_sb_write(do_overflow(), 0);
             } else {
                 size_written = do_sb_write(false, m_mbm->meta_blk_context_sz());

From 634047ce95051e6f5303a38506aaf3fd0a1bd1b9 Mon Sep 17 00:00:00 2001
From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com>
Date: Tue, 22 Oct 2024 17:24:14 -0700
Subject: [PATCH 013/170] Reduce logs (#571)

---
 conanfile.py                            | 2 +-
 src/tests/test_index_crash_recovery.cpp | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/conanfile.py b/conanfile.py
index deee0421d..f79b22549 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@ class HomestoreConan(ConanFile):
     name = "homestore"
-    version = "6.4.66"
+    version = "6.4.67"

     homepage = "https://github.com/eBay/Homestore"
     description = "HomeStore Storage Engine"

diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp
index 0f5963eff..83b1928cd 100644
--- a/src/tests/test_index_crash_recovery.cpp
+++ b/src/tests/test_index_crash_recovery.cpp
@@ -404,8 +404,9 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT
             req.enable_route_tracing();
             const auto ret = this->m_bt->get(req);
             ASSERT_EQ(ret, btree_status_t::success) << "Missing key " << key << " in btree but present in shadow map";
-            LOGINFO("{} - Key {} passed sanity check!", count++, key.key());
         });
+        LOGINFO("Sanity check passed for {} keys!", count);
+
     }

     void crash_and_recover(OperationList& operations, std::string filename = "") {
@@ -439,7 +440,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT
             std::string re_filename = filename + "_after_reapply.dot";
             LOGINFO("Visualize the tree after reapply {}", re_filename);
             this->visualize_keys(re_filename);
-            this->print_keys("Post crash and recovery, btree structure: ");
+//            this->print_keys("Post crash and recovery, btree structure: ");
         }

         this->get_all();

From a5b2969e98abaf7bb9b11d03ab8f45562f2026a7 Mon Sep 17 00:00:00 2001
From: Sanal P
Date: Mon, 21 Oct 2024 13:51:28 -0700
Subject: [PATCH 014/170] Change the replace member API signature.

Add replica member info with name, priority and id.
Use replica member info for the replace member API and listener callbacks.
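For illustration only (not part of this change), a minimal caller-side sketch of
the revised API. The names svc, group_id, out_uuid and in_uuid are hypothetical
placeholders and the usual includes are assumed; the struct fields and the
replace_member() signature are the ones introduced by this patch:

    // Hypothetical usage sketch; assumes a ReplicationService& svc and two
    // replica UUIDs obtained elsewhere. Not part of this patch.
    homestore::replica_member_info out{};
    out.id = out_uuid; // member being moved out
    std::strncpy(out.name, "host-old", homestore::replica_member_info::max_name_len - 1);

    homestore::replica_member_info in{};
    in.id = in_uuid; // member being brought in
    in.priority = 0;
    std::strncpy(in.name, "host-new", homestore::replica_member_info::max_name_len - 1);

    svc.replace_member(group_id, out, in, 0 /* commit_quorum */)
        .via(&folly::InlineExecutor::instance())
        .thenValue([](auto&& res) {
            // NOT_LEADER and RETRY_REQUEST are retryable per the flow above
            if (res.hasError()) { LOGERROR("replace_member failed: {}", enum_name(res.error())); }
        });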
--- conanfile.py | 2 +- .../homestore/replication/repl_decls.h | 7 ++ src/include/homestore/replication/repl_dev.h | 2 +- src/include/homestore/replication_service.hpp | 3 +- .../replication/repl_dev/raft_repl_dev.cpp | 81 ++++++++++--------- src/lib/replication/repl_dev/raft_repl_dev.h | 11 +-- .../replication/service/generic_repl_svc.cpp | 4 +- .../replication/service/generic_repl_svc.h | 4 +- .../replication/service/raft_repl_service.cpp | 25 +++--- .../replication/service/raft_repl_service.h | 15 ++-- src/tests/test_common/raft_repl_test_base.hpp | 11 ++- src/tests/test_solo_repl_dev.cpp | 2 +- 12 files changed, 90 insertions(+), 77 deletions(-) diff --git a/conanfile.py b/conanfile.py index f79b22549..a59c35466 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.4.67" + version = "6.5.1" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/replication/repl_decls.h b/src/include/homestore/replication/repl_decls.h index 24c6a7571..edcdbe51e 100644 --- a/src/include/homestore/replication/repl_decls.h +++ b/src/include/homestore/replication/repl_decls.h @@ -78,6 +78,13 @@ struct peer_info { uint64_t last_succ_resp_us_; }; +struct replica_member_info { + static constexpr uint64_t max_name_len = 128; + replica_id_t id; + char name[max_name_len]; + int32_t priority{0}; +}; + } // namespace homestore // hash function definitions diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index 15dc4872a..c2223455f 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -348,7 +348,7 @@ class ReplDevListener { virtual void on_destroy() = 0; /// @brief Called when replace member is performed. 
- virtual void replace_member(replica_id_t member_out, replica_id_t member_in) = 0; + virtual void on_replace_member(const replica_member_info& member_out, const replica_member_info& member_in) = 0; /// @brief Called when the snapshot is being created by nuraft virtual AsyncReplResult<> create_snapshot(shared< snapshot_context > context) = 0; diff --git a/src/include/homestore/replication_service.hpp b/src/include/homestore/replication_service.hpp index f9b4f2986..c3e56d9a3 100644 --- a/src/include/homestore/replication_service.hpp +++ b/src/include/homestore/replication_service.hpp @@ -41,7 +41,8 @@ class ReplicationService { /// @return A Future which gets called after schedule to release (before garbage collection is kicked in) virtual folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) = 0; - virtual AsyncReplResult<> replace_member(group_id_t group_id, replica_id_t member_out, replica_id_t member_in, + virtual AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum = 0) const = 0; /// @brief Get the repl dev for a given group id if it is already created or opened diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 565bc0d67..f3a4a2461 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -74,7 +74,7 @@ RaftReplDev::RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk m_rd_sb->free_blks_journal_id = m_free_blks_journal->get_store_id(); } m_rd_sb.write(); - bind_data_service(); + bind_data_service(); } RD_LOG(INFO, @@ -90,27 +90,30 @@ bool RaftReplDev::bind_data_service() { RD_LOG(INFO, "Starting data channel, group_id={}, replica_id={}", group_id_str(), my_replica_id_str()); bool success = false; #ifdef _PRERELEASE - success = m_msg_mgr.bind_data_service_request(PUSH_DATA, m_group_id, [this](intrusive< sisl::GenericRpcData >& rpc_data) { - if (iomgr_flip::instance()->delay_flip("slow_down_data_channel", [this, rpc_data]() mutable { - RD_LOGI("Resuming after slow down data channel flip"); + success = + m_msg_mgr.bind_data_service_request(PUSH_DATA, m_group_id, [this](intrusive< sisl::GenericRpcData >& rpc_data) { + if (iomgr_flip::instance()->delay_flip("slow_down_data_channel", [this, rpc_data]() mutable { + RD_LOGI("Resuming after slow down data channel flip"); + on_push_data_received(rpc_data); + })) { + RD_LOGI("Slow down data channel flip is enabled, scheduling to call later"); + } else { on_push_data_received(rpc_data); - })) { - RD_LOGI("Slow down data channel flip is enabled, scheduling to call later"); - } else { - on_push_data_received(rpc_data); - } - }); + } + }); #else - success = m_msg_mgr.bind_data_service_request(PUSH_DATA, m_group_id, bind_this(RaftReplDev::on_push_data_received, 1)); + success = + m_msg_mgr.bind_data_service_request(PUSH_DATA, m_group_id, bind_this(RaftReplDev::on_push_data_received, 1)); #endif if (!success) { RD_LOGE("Failed to bind data service request for PUSH_DATA"); - return false; + return false; } - success = m_msg_mgr.bind_data_service_request(FETCH_DATA, m_group_id, bind_this(RaftReplDev::on_fetch_data_received, 1)); + success = + m_msg_mgr.bind_data_service_request(FETCH_DATA, m_group_id, bind_this(RaftReplDev::on_fetch_data_received, 1)); if (!success) { RD_LOGE("Failed to bind data service request for FETCH_DATA"); - return false; + return false; } return true; } @@ -127,10 +130,10 
@@ bool RaftReplDev::join_group() { return true; } -AsyncReplResult<> RaftReplDev::replace_member(replica_id_t member_out_uuid, replica_id_t member_in_uuid, - uint32_t commit_quorum) { +AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum) { LOGINFO("Replace member group_id={} member_out={} member_in={}", group_id_str(), - boost::uuids::to_string(member_out_uuid), boost::uuids::to_string(member_in_uuid)); + boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); if (commit_quorum >= 1) { // Two members are down and leader cant form the quorum. Reduce the quorum size. @@ -138,7 +141,7 @@ AsyncReplResult<> RaftReplDev::replace_member(replica_id_t member_out_uuid, repl } // Step 1: Check if leader itself is requested to move out. - if (m_my_repl_id == member_out_uuid && m_my_repl_id == get_leader_id()) { + if (m_my_repl_id == member_out.id && m_my_repl_id == get_leader_id()) { // If leader is the member requested to move out, then give up leadership and return error. // Client will retry replace_member request to the new leader. raft_server()->yield_leadership(true /* immediate */, -1 /* successor */); @@ -148,9 +151,9 @@ AsyncReplResult<> RaftReplDev::replace_member(replica_id_t member_out_uuid, repl } // Step 2. Add the new member. - return m_msg_mgr.add_member(m_group_id, member_in_uuid) + return m_msg_mgr.add_member(m_group_id, member_in.id) .via(&folly::InlineExecutor::instance()) - .thenValue([this, member_in_uuid, member_out_uuid, commit_quorum](auto&& e) -> AsyncReplResult<> { + .thenValue([this, member_in, member_out, commit_quorum](auto&& e) -> AsyncReplResult<> { // TODO Currently we ignore the cancelled, fix nuraft_mesg to not timeout // when adding member. Member is added to cluster config until member syncs fully // with atleast stop gap. This will take a lot of time for block or @@ -168,18 +171,17 @@ AsyncReplResult<> RaftReplDev::replace_member(replica_id_t member_out_uuid, repl return make_async_error<>(RaftReplService::to_repl_error(e.error())); } } - auto member_out = boost::uuids::to_string(member_out_uuid); - auto member_in = boost::uuids::to_string(member_in_uuid); - RD_LOGI("Replace member added member={} to group_id={}", member_in, group_id_str()); + RD_LOGI("Replace member added member={} to group_id={}", boost::uuids::to_string(member_in.id), + group_id_str()); // Step 3. Append log entry to mark the old member is out and new member is added. auto rreq = repl_req_ptr_t(new repl_req_ctx{}); replace_members_ctx members; - std::copy(member_in_uuid.begin(), member_in_uuid.end(), members.in_replica_id.begin()); - std::copy(member_out_uuid.begin(), member_out_uuid.end(), members.out_replica_id.begin()); - sisl::blob header(r_cast< uint8_t* >(&members), - members.in_replica_id.size() + members.out_replica_id.size()); + members.replica_out = member_out; + members.replica_in = member_in; + + sisl::blob header(r_cast< uint8_t* >(&members), sizeof(replace_members_ctx)); rreq->init( repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)}, journal_type_t::HS_CTRL_REPLACE, true, header, sisl::blob{}, 0); @@ -196,7 +198,7 @@ AsyncReplResult<> RaftReplDev::replace_member(replica_id_t member_out_uuid, repl // Step 4. Remove the old member. Even if the old member is temporarily // down and recovers, nuraft mesg see member remove from cluster log // entry and call exit_group() and leave(). 
-            return m_msg_mgr.rem_member(m_group_id, member_out_uuid)
+            return m_msg_mgr.rem_member(m_group_id, member_out.id)
                 .via(&folly::InlineExecutor::instance())
                 .thenValue([this, member_out, commit_quorum](auto&& e) -> AsyncReplResult<> {
                     if (e.hasError()) {
@@ -212,7 +214,8 @@ AsyncReplResult<> RaftReplDev::replace_member(replica_id_t member_out_uuid, repl
                             return make_async_error<>(ReplServiceError::RETRY_REQUEST);
                         }
                     } else {
-                        RD_LOGI("Replace member removed member={} from group_id={}", member_out, group_id_str());
+                        RD_LOGI("Replace member removed member={} from group_id={}",
+                                boost::uuids::to_string(member_out.id), group_id_str());
                     }

                     // Revert the quorum size back to 0.
@@ -957,13 +960,11 @@ void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err)

 void RaftReplDev::replace_member(repl_req_ptr_t rreq) {
     auto members = r_cast< const replace_members_ctx* >(rreq->header().cbytes());
-    replica_id_t member_in, member_out;
-    std::copy(members->out_replica_id.begin(), members->out_replica_id.end(), member_out.begin());
-    std::copy(members->in_replica_id.begin(), members->in_replica_id.end(), member_in.begin());
-    RD_LOGI("Raft repl replace_member member_out={} member_in={}", boost::uuids::to_string(member_out),
-            boost::uuids::to_string(member_in));
-    m_listener->replace_member(member_out, member_in);
+    RD_LOGI("Raft repl replace_member commit member_out={} member_in={}",
+            boost::uuids::to_string(members->replica_out.id), boost::uuids::to_string(members->replica_in.id));
+
+    m_listener->on_replace_member(members->replica_out, members->replica_in);
 }

 static bool blob_equals(sisl::blob const& a, sisl::blob const& b) {
@@ -1224,7 +1225,7 @@ void RaftReplDev::flush_durable_commit_lsn() {
 }

 /////////////////////////////////// Private methods ////////////////////////////////////
-void RaftReplDev::cp_flush(CP* cp, cshared<ReplDevCPContext> ctx) {
+void RaftReplDev::cp_flush(CP* cp, cshared< ReplDevCPContext > ctx) {
     auto const lsn = ctx->cp_lsn;
     auto const clsn = ctx->compacted_to_lsn;
     auto const dsn = ctx->last_applied_dsn;
@@ -1247,14 +1248,14 @@ void RaftReplDev::cp_flush(CP* cp, cshared<ReplDevCPContext> ctx) {
                 cp->to_string());
 }

-cshared<ReplDevCPContext> RaftReplDev::get_cp_ctx(CP* cp) {
+cshared< ReplDevCPContext > RaftReplDev::get_cp_ctx(CP* cp) {
     auto const cp_lsn = m_commit_upto_lsn.load();
     auto const clsn = m_compact_lsn.load();
     auto const dsn = m_next_dsn.load();
-    RD_LOGD("getting cp_ctx for raft repl dev {}, cp_lsn={}, clsn={}, next_dsn={}, cp string:{}",
-            (void *)this, cp_lsn, clsn, dsn, cp->to_string());
-    auto dev_ctx = std::make_shared<ReplDevCPContext>();
+    RD_LOGD("getting cp_ctx for raft repl dev {}, cp_lsn={}, clsn={}, next_dsn={}, cp string:{}", (void*)this, cp_lsn,
+            clsn, dsn, cp->to_string());
+    auto dev_ctx = std::make_shared< ReplDevCPContext >();
     dev_ctx->cp_lsn = cp_lsn;
     dev_ctx->compacted_to_lsn = clsn;
     dev_ctx->last_applied_dsn = dsn;
RaftReplDev : public ReplDev, bool bind_data_service(); bool join_group(); - AsyncReplResult<> replace_member(replica_id_t member_out, replica_id_t member_in, uint32_t commit_quorum); + AsyncReplResult<> replace_member(const replica_member_info& member_out, const replica_member_info& member_in, + uint32_t commit_quorum); folly::SemiFuture< ReplServiceError > destroy_group(); //////////////// All ReplDev overrides/implementation /////////////////////// @@ -199,8 +200,8 @@ class RaftReplDev : public ReplDev, sisl::blob const& key, uint32_t data_size, bool is_data_channel); folly::Future< folly::Unit > notify_after_data_written(std::vector< repl_req_ptr_t >* rreqs); void check_and_fetch_remote_data(std::vector< repl_req_ptr_t > rreqs); - void cp_flush(CP* cp, cshared ctx); - cshared get_cp_ctx(CP* cp); + void cp_flush(CP* cp, cshared< ReplDevCPContext > ctx); + cshared< ReplDevCPContext > get_cp_ctx(CP* cp); void cp_cleanup(CP* cp); void become_ready(); diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index 8e5c9a7a1..9aa2c044d 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -147,8 +147,8 @@ void SoloReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki } } -AsyncReplResult<> SoloReplService::replace_member(group_id_t group_id, replica_id_t member_out, replica_id_t member_in, - uint32_t commit_quorum) const { +AsyncReplResult<> SoloReplService::replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum) const { return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); } diff --git a/src/lib/replication/service/generic_repl_svc.h b/src/lib/replication/service/generic_repl_svc.h index 5e0cb84a3..acdff7bd4 100644 --- a/src/lib/replication/service/generic_repl_svc.h +++ b/src/lib/replication/service/generic_repl_svc.h @@ -73,8 +73,8 @@ class SoloReplService : public GenericReplService { std::set< replica_id_t > const& members) override; folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) override; void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; - AsyncReplResult<> replace_member(group_id_t group_id, replica_id_t member_out, replica_id_t member_in, - uint32_t commit_quorum = 0) const override; + AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum = 0) const override; }; class SoloReplServiceCPHandler : public CPCallbacks { diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index c4aefe1ca..0469d7829 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -85,12 +85,11 @@ void RaftReplService::start() { LOGINFO("Starting RaftReplService with server_uuid={} port={}", boost::uuids::to_string(params.server_uuid_), params.mesg_port_); - //check if ssl cert files are provided, if yes, monitor the changes + // check if ssl cert files are provided, if yes, monitor the changes if (!params.ssl_key_.empty() && !params.ssl_cert_.empty()) { ioenvironment.with_file_watcher(); monitor_cert_changes(); } - // Step 2: Register all RAFT parameters. 
At the end of this step, raft is ready to be created/join group auto r_params = nuraft::raft_params() @@ -158,7 +157,7 @@ void RaftReplService::start() { auto rdev = std::dynamic_pointer_cast< RaftReplDev >(it->second); rdev->wait_for_logstore_ready(); if (!rdev->join_group()) { - HS_REL_ASSERT(false, "FAILED TO JOIN GROUP, PANIC HERE"); + HS_REL_ASSERT(false, "FAILED TO JOIN GROUP, PANIC HERE"); it = m_rd_map.erase(it); } else { ++it; @@ -191,19 +190,19 @@ void RaftReplService::monitor_cert_changes() { restart_svc.detach(); }; - //monitor ssl cert file + // monitor ssl cert file if (!fw->register_listener(ioenvironment.get_ssl_cert(), "hs_ssl_cert_watcher", cert_change_cb)) { - LOGERROR("Failed to register listner, {} to watch file {}, Not monitoring cert files", - "hs_ssl_cert_watcher", ioenvironment.get_ssl_cert()); + LOGERROR("Failed to register listner, {} to watch file {}, Not monitoring cert files", "hs_ssl_cert_watcher", + ioenvironment.get_ssl_cert()); } - //monitor ssl key file + // monitor ssl key file if (!fw->register_listener(ioenvironment.get_ssl_key(), "hs_ssl_key_watcher", cert_change_cb)) { - LOGERROR("Failed to register listner, {} to watch file {}, Not monitoring cert files", - "hs_ssl_key_watcher", ioenvironment.get_ssl_key()); + LOGERROR("Failed to register listner, {} to watch file {}, Not monitoring cert files", "hs_ssl_key_watcher", + ioenvironment.get_ssl_key()); } } -void RaftReplService::restart_raft_svc(const std::string filepath, const bool deleted){ +void RaftReplService::restart_raft_svc(const std::string filepath, const bool deleted) { if (deleted && !wait_for_cert(filepath)) { LOGINFO("file {} deleted, ", filepath) // wait for the deleted file to be added again @@ -215,7 +214,7 @@ void RaftReplService::restart_raft_svc(const std::string filepath, const bool de } bool RaftReplService::wait_for_cert(const std::string& filepath) { - auto attempts = cert_change_timeout/cert_check_sleep; + auto attempts = cert_change_timeout / cert_check_sleep; for (auto i = attempts; i > 0; --i) { if (std::filesystem::exists(filepath)) { return true; } std::this_thread::sleep_for(cert_check_sleep); @@ -394,8 +393,8 @@ void RaftReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki add_repl_dev(group_id, rdev); } -AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, replica_id_t member_out, replica_id_t member_in, - uint32_t commit_quorum) const { +AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum) const { auto rdev_result = get_repl_dev(group_id); if (!rdev_result) { return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); } diff --git a/src/lib/replication/service/raft_repl_service.h b/src/lib/replication/service/raft_repl_service.h index e0d1e6718..9a53ad07d 100644 --- a/src/lib/replication/service/raft_repl_service.h +++ b/src/lib/replication/service/raft_repl_service.h @@ -51,7 +51,7 @@ class RaftReplService : public GenericReplService, iomgr::timer_handle_t m_flush_durable_commit_timer_hdl; iomgr::io_fiber_t m_reaper_fiber; std::mutex raft_restart_mutex; - + public: RaftReplService(cshared< ReplApplication >& repl_app); @@ -73,8 +73,8 @@ class RaftReplService : public GenericReplService, std::set< replica_id_t > const& members) override; folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) override; void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; - 
AsyncReplResult<> replace_member(group_id_t group_id, replica_id_t member_out, replica_id_t member_in,
-                                     uint32_t commit_quorum = 0) const override;
+    AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out,
+                                     const replica_member_info& member_in, uint32_t commit_quorum = 0) const override;

 private:
     RaftReplDev* raft_group_config_found(sisl::byte_view const& buf, void* meta_cookie);
@@ -98,12 +98,13 @@ struct ReplDevCPContext;

 class ReplSvcCPContext : public CPContext {
     std::shared_mutex m_cp_map_mtx;
-    std::map< ReplDev*, cshared<ReplDevCPContext> > m_cp_ctx_map;
+    std::map< ReplDev*, cshared< ReplDevCPContext > > m_cp_ctx_map;
+
 public:
-    ReplSvcCPContext(CP* cp) : CPContext(cp){};
+    ReplSvcCPContext(CP* cp) : CPContext(cp) {};
     virtual ~ReplSvcCPContext() = default;
-    int add_repl_dev_ctx(ReplDev* dev, cshared<ReplDevCPContext> dev_ctx);
-    cshared<ReplDevCPContext> get_repl_dev_ctx(ReplDev* dev);
+    int add_repl_dev_ctx(ReplDev* dev, cshared< ReplDevCPContext > dev_ctx);
+    cshared< ReplDevCPContext > get_repl_dev_ctx(ReplDev* dev);
 };

 class RaftReplServiceCPHandler : public CPCallbacks {
diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp
index a3160f13a..e0e2f6487 100644
--- a/src/tests/test_common/raft_repl_test_base.hpp
+++ b/src/tests/test_common/raft_repl_test_base.hpp
@@ -301,7 +301,10 @@ class TestReplicatedDB : public homestore::ReplDevListener {
     ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size) override {
         return blk_alloc_hints{};
     }
-    void replace_member(replica_id_t member_out, replica_id_t member_in) override {}
+    void on_replace_member(const replica_member_info& member_out, const replica_member_info& member_in) override {
+        LOGINFO("[Replica={}] replace member out {} in {}", g_helper->replica_num(),
+                boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id));
+    }

     void on_destroy() override {
         LOGINFOMOD(replication, "[Replica={}] Group={} is being destroyed", g_helper->replica_num(),
@@ -615,9 +618,9 @@ class RaftReplDevTestBase : public testing::Test {
         this->run_on_leader(db, [this, db, member_out, member_in, commit_quorum]() {
             LOGINFO("Replace member out={} in={}", boost::uuids::to_string(member_out),
                     boost::uuids::to_string(member_in));
-            auto v = hs()->repl_service()
-                         .replace_member(db->repl_dev()->group_id(), member_out, member_in, commit_quorum)
-                         .get();
+            replica_member_info out{member_out, ""};
+            replica_member_info in{member_in, ""};
+            auto v = hs()->repl_service().replace_member(db->repl_dev()->group_id(), out, in, commit_quorum).get();
             ASSERT_EQ(v.hasError(), false) << "Error in replacing member";
         });
     }
diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp
index c26ba273d..2e17235f2 100644
--- a/src/tests/test_solo_repl_dev.cpp
+++ b/src/tests/test_solo_repl_dev.cpp
@@ -136,7 +136,7 @@ class SoloReplDevTest : public testing::Test {
                       cintrusive< repl_req_ctx >& ctx) override {
         LOGINFO("Received error={} on repl_dev", enum_name(error));
     }
-    void replace_member(replica_id_t member_out, replica_id_t member_in) override {}
+    void on_replace_member(const replica_member_info& member_out, const replica_member_info& member_in) override {}
     void on_destroy() override {}
 };

From 8a80eef33716ceeb668979ea57b21b6ff6b340a0 Mon Sep 17 00:00:00 2001
From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com>
Date: Mon, 28 Oct 2024 09:20:30 -0700
Subject: [PATCH 015/170] Add package version and show in log (#575)

---
 conanfile.py          | 4 +++-
src/lib/homestore.cpp | 8 ++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index a59c35466..7b94c14ca 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.1" + version = "6.5.2" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" @@ -94,6 +94,8 @@ def generate(self): tc.variables['BUILD_COVERAGE'] = 'ON' elif self.options.get_safe("sanitize"): tc.variables['MEMORY_SANITIZER_ON'] = 'ON' + tc.variables["CONAN_PACKAGE_NAME"] = self.name + tc.variables["CONAN_PACKAGE_VERSION"] = self.version tc.generate() # This generates "boost-config.cmake" and "grpc-config.cmake" etc in self.generators_folder diff --git a/src/lib/homestore.cpp b/src/lib/homestore.cpp index af2d521c5..0b9e0ed76 100644 --- a/src/lib/homestore.cpp +++ b/src/lib/homestore.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -58,6 +59,7 @@ HomeStoreSafePtr HomeStore::s_instance{nullptr}; static std::unique_ptr< IndexServiceCallbacks > s_index_cbs; static shared< ChunkSelector > s_custom_chunk_selector{nullptr}; static shared< ReplApplication > s_repl_app{nullptr}; +std::string version = PACKAGE_VERSION; HomeStore* HomeStore::instance() { if (s_instance == nullptr) { s_instance = std::make_shared< HomeStore >(); } @@ -121,6 +123,12 @@ bool HomeStore::start(const hs_input_params& input, hs_before_services_starting_ static std::once_flag flag1; std::call_once(flag1, [this]() { +#ifndef NDEBUG + LOGINFO("HomeStore DEBUG version: {}", version); +#else + LOGINFO("HomeStore RELEASE version: {}", version); +#endif + sisl::VersionMgr::addVersion(PACKAGE_NAME, version::Semver200_version(PACKAGE_VERSION)); m_periodic_logger = sisl::logging::CreateCustomLogger("homestore", "_periodic", false, true /* tee_to_stdout_stderr */); sisl::logging::SetLogPattern("[%D %T.%f] [%^%L%$] [%t] %v", m_periodic_logger); From a7a9fe53f59fdb1c78a1327c4d5946e54d97b1d5 Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Tue, 29 Oct 2024 09:57:45 +0800 Subject: [PATCH 016/170] add chunksize to vchunk interface (#572) --- conanfile.py | 2 +- src/include/homestore/vchunk.h | 1 + src/lib/device/vchunk.cpp | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index 7b94c14ca..7c0d5aa76 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.2" + version = "6.5.3" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/vchunk.h b/src/include/homestore/vchunk.h index b52832faa..0406d428f 100644 --- a/src/include/homestore/vchunk.h +++ b/src/include/homestore/vchunk.h @@ -35,6 +35,7 @@ class VChunk { uint32_t get_pdev_id() const; uint16_t get_chunk_id() const; cshared< Chunk > get_internal_chunk() const; + uint64_t size() const; private: shared< Chunk > m_internal_chunk; diff --git a/src/lib/device/vchunk.cpp b/src/lib/device/vchunk.cpp index 1a7aaeac5..26391ac1b 100644 --- a/src/lib/device/vchunk.cpp +++ b/src/lib/device/vchunk.cpp @@ -33,5 +33,7 @@ uint32_t VChunk::get_pdev_id() const { return m_internal_chunk->physical_dev()-> uint16_t VChunk::get_chunk_id() const { return m_internal_chunk->chunk_id(); } +uint64_t VChunk::size() const { return m_internal_chunk->size(); } + cshared< Chunk > VChunk::get_internal_chunk() const { return m_internal_chunk; } } // namespace homestore From 
60eea4afa6f059148e5f16ba1e63b894a9553de9 Mon Sep 17 00:00:00 2001 From: koujl <108138320+koujl@users.noreply.github.com> Date: Tue, 29 Oct 2024 16:10:39 +0800 Subject: [PATCH 017/170] Add index CR UT for basic merge (#556) Signed-off-by: Jilong Kou --- conanfile.py | 2 +- .../homestore/btree/detail/btree_internal.hpp | 4 +- .../homestore/btree/detail/btree_node.hpp | 21 +- src/include/homestore/index/index_table.hpp | 120 ++++-- src/lib/device/virtual_dev.cpp | 9 + src/lib/homestore.cpp | 2 - src/lib/index/index_cp.cpp | 15 +- src/lib/index/wb_cache.cpp | 166 +++++--- .../test_common/homestore_test_common.hpp | 7 +- src/tests/test_index_crash_recovery.cpp | 375 +++++++++++++----- src/tests/test_scripts/index_test.py | 1 + 11 files changed, 519 insertions(+), 203 deletions(-) diff --git a/conanfile.py b/conanfile.py index 7c0d5aa76..f61f1ef54 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.3" + version = "6.5.4" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/btree/detail/btree_internal.hpp b/src/include/homestore/btree/detail/btree_internal.hpp index 67b33b089..8989a2d5d 100644 --- a/src/include/homestore/btree/detail/btree_internal.hpp +++ b/src/include/homestore/btree/detail/btree_internal.hpp @@ -245,7 +245,9 @@ struct BtreeConfig { uint8_t m_split_pct{50}; uint32_t m_max_merge_nodes{3}; #ifdef _PRERELEASE - uint64_t m_max_keys_in_node{0}; + // These are for testing purpose only + uint64_t m_max_keys_in_node{0}; + uint64_t m_min_keys_in_node{0}; #endif bool m_rebalance_turned_on{false}; bool m_merge_turned_on{true}; diff --git a/src/include/homestore/btree/detail/btree_node.hpp b/src/include/homestore/btree/detail/btree_node.hpp index a3285ef35..988b683cf 100644 --- a/src/include/homestore/btree/detail/btree_node.hpp +++ b/src/include/homestore/btree/detail/btree_node.hpp @@ -37,6 +37,7 @@ struct transient_hdr_t { /* these variables are accessed without taking lock and are not expected to change after init */ uint8_t leaf_node{0}; uint64_t max_keys_in_node{0}; + uint64_t min_keys_in_node{0}; // to specify the threshold for triggering merge bool is_leaf() const { return (leaf_node != 0); } }; @@ -116,6 +117,7 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { m_trans_hdr.leaf_node = is_leaf; #ifdef _PRERELEASE m_trans_hdr.max_keys_in_node = cfg.m_max_keys_in_node; + m_trans_hdr.min_keys_in_node = cfg.m_min_keys_in_node; #endif } @@ -299,6 +301,7 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { template < typename K > K get_first_key() const { + if (total_entries() == 0) { return K{}; } return get_nth_key< K >(0, true); } @@ -333,6 +336,7 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { // uint32_t total_entries() const { return (has_valid_edge() ? 
total_entries() + 1 : total_entries()); }

     uint64_t max_keys_in_node() const { return m_trans_hdr.max_keys_in_node; }
+    uint64_t min_keys_in_node() const { return m_trans_hdr.min_keys_in_node; }

     void lock(locktype_t l) const {
         if (l == locktype_t::READ) {
@@ -392,6 +396,12 @@
             }
             fmt::format_to(std::back_inserter(str), "]");
         }
+
+        // Should not happen
+        if (this->is_node_deleted()) {
+            fmt::format_to(std::back_inserter(str), " **DELETED** ");
+        }
+
         return str;
     }

@@ -527,15 +537,10 @@
     virtual uint32_t occupied_size() const { return (node_data_size() - available_size()); }

     bool is_merge_needed(const BtreeConfig& cfg) const {
-#if 0
 #ifdef _PRERELEASE
-        if (iomgr_flip::instance()->test_flip("btree_merge_node") && occupied_size() < node_data_size) {
-            return true;
-        }
-
-        auto ret = iomgr_flip::instance()->get_test_flip< uint64_t >("btree_merge_node_pct");
-        if (ret && occupied_size() < (ret.get() * node_data_size() / 100)) { return true; }
-#endif
+        if (min_keys_in_node()) {
+            return total_entries() < min_keys_in_node();
+        }
 #endif
         return (occupied_size() < cfg.suggested_min_size());
     }
diff --git a/src/include/homestore/index/index_table.hpp b/src/include/homestore/index/index_table.hpp
index 86f3a8c86..94b8685a3 100644
--- a/src/include/homestore/index/index_table.hpp
+++ b/src/include/homestore/index/index_table.hpp
@@ -78,7 +78,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
     }

     void destroy() override {
-        Btree< K, V >::destroy_btree(nullptr);
+        auto cpg = cp_mgr().cp_guard();
+        Btree<K, V>::destroy_btree(cpg.context(cp_consumer_t::INDEX_SVC));
         m_sb.destroy();
     }

@@ -130,13 +131,16 @@
         idx_buf->m_dirtied_cp_id = cpg->id();
         BtreeNodePtr bn = BtreeNodePtr{n};

-        LOGTRACEMOD(wbcache, "repair_node cp={} buf={}", cpg->id(), idx_buf->to_string());
-        repair_links(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC));
+        // Only interior nodes need their links repaired
+        if (!bn->is_leaf()) {
+            LOGTRACEMOD(wbcache, "repair_node cp={} buf={}", cpg->id(), idx_buf->to_string());
+            repair_links(bn, (void *) cpg.context(cp_consumer_t::INDEX_SVC));
+        }

         if (idx_buf->m_up_buffer && idx_buf->m_up_buffer->is_meta_buf()) {
             // Our up buffer is a meta buffer, which means that we are the new root node, we need to update the
             // meta_buf with new root as well
-            on_root_changed(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC));
+            on_root_changed(bn, (void *) cpg.context(cp_consumer_t::INDEX_SVC));
         }
     }

@@ -223,7 +227,8 @@
         wb_cache().free_buf(n->m_idx_buf, r_cast< CPContext* >(context));
     }

-    btree_status_t on_root_changed(BtreeNodePtr const& new_root, void* context) override {
+    btree_status_t
+    on_root_changed(BtreeNodePtr const &new_root, void *context) override {
         // todo: if(m_sb->root_node == new_root->node_id() && m_sb->root_link_version == new_root->link_version()){
        // return btree_status_t::success;}
         m_sb->root_node = new_root->node_id();
@@ -235,12 +240,12 @@
         }

         auto& root_buf = static_cast< IndexBtreeNode* >(new_root.get())->m_idx_buf;
-        wb_cache().transact_bufs(ordinal(), m_sb_buffer, root_buf, {}, {}, r_cast< CPContext* >(context));
+        wb_cache().transact_bufs(ordinal(), m_sb_buffer, root_buf, {}, {}, r_cast<CPContext*>(context));
         return btree_status_t::success;
     }

     btree_status_t repair_links(BtreeNodePtr const& parent_node, void* cp_ctx) {
-        BT_LOG(DEBUG, "Repairing links for parent node {}", parent_node->to_string());
+        BT_LOG(DEBUG, "Repairing links for parent node [{}]", parent_node->to_string());
         // TODO: is it possible that repairing many nodes causes an increase to level of btree? If so, then this needs
         // to be handled. Get the last key in the node
         auto const last_parent_key = parent_node->get_last_key< K >();
@@ -250,7 +255,15 @@
                           parent_node->node_id());
             return btree_status_t::not_found;
         }
-        BT_LOG(INFO, "Repairing node={} with last_parent_key={}", parent_node->to_string(),
+
+        // Get all original child ids, to help check whether we have gone beyond the last child node
+        std::set<bnodeid_t> orig_child_ids;
+        for (uint32_t i = 0; i < parent_node->total_entries(); ++i) {
+            BtreeLinkInfo link_info;
+            parent_node->get_nth_value(i, &link_info, true);
+            orig_child_ids.insert(link_info.bnode_id());
+        }
+        BT_LOG(INFO, "Repairing node=[{}] with last_parent_key={}", parent_node->to_string(),
                last_parent_key.to_string());

         // Get the first child node and its link info
@@ -275,22 +288,45 @@
         auto cur_parent = parent_node;
         BtreeNodeList new_parent_nodes;
         do {
-            if (child_node->has_valid_edge() ||
-                (child_node->is_leaf() && (child_node->next_bnode() == empty_bnodeid))) {
-                BT_DBG_ASSERT(is_parent_edge_node,
-                              "Child node={} is an edge node but parent_node={} is not an edge node",
-                              child_node->node_id(), cur_parent->node_id());
-                cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), child_node->link_version()});
+            if (child_node->has_valid_edge() || (child_node->is_leaf() && child_node->next_bnode() == empty_bnodeid)) {
+                if (child_node->is_node_deleted()) {
+                    // Edge node is merged, we need to set the current last entry as edge
+                    if (cur_parent->total_entries() > 0) {
+                        auto prev_val = V{};
+                        cur_parent->get_nth_value(cur_parent->total_entries() - 1, &prev_val, true);
+                        cur_parent->remove(cur_parent->total_entries() - 1);
+                        cur_parent->set_edge_value(prev_val);
+                        BT_LOG(INFO, "Repairing node={}, child_node=[{}] is deleted, set previous as edge_value={}",
+                               cur_parent->node_id(), child_node->to_string(), prev_val.to_string());
+                    } else {
+                        BT_LOG(INFO, "Found an empty interior node {} with possibly all children deleted",
+                               cur_parent->node_id());
+                    }
+                } else {
+                    // Update edge and finish
+                    BT_LOG(INFO, "Repairing node={}, child_node=[{}] is an edge node, end loop", cur_parent->node_id(),
+                           child_node->to_string());
+                    child_node->set_next_bnode(empty_bnodeid);
+                    write_node_impl(child_node, cp_ctx);
+                    cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), child_node->link_version()});
+                }
                 break;
             }

             auto const child_last_key = child_node->get_last_key< K >();
-            BT_LOG(INFO, "Repairing node={} child_node={} child_last_key={}", cur_parent->node_id(),
+            BT_LOG(INFO, "Repairing node={}, child_node=[{}] child_last_key={}", cur_parent->node_id(),
                    child_node->to_string(), child_last_key.to_string());

-            if (child_last_key.compare(last_parent_key) > 0 && !is_parent_edge_node) {
-                // We have reached the last key, and the parent node doesn't have edge, so we can stop now
-                break;
+            // Check if we are beyond the last child node.
+            //
+            // There can be cases where the child level merge is successfully persisted but the parent level is not.
+            // In this case, you may have your rightmost child node with last key greater than the last_parent_key.
+ // That's why here we have to check if the child node is one of the original child nodes first. + if (!is_parent_edge_node && !orig_child_ids.contains(child_node->node_id())) { + if (child_node->total_entries() == 0 || child_last_key.compare(last_parent_key) > 0) { + // We have reached a child beyond this parent, we can stop now + break; + } } if (!cur_parent->has_room_for_put(btree_put_type::INSERT, K::get_max_size(), @@ -312,20 +348,37 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } // Insert the last key of the child node into parent node - cur_parent->insert(cur_parent->total_entries(), child_last_key, - BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); + if (!child_node->is_node_deleted()) { + cur_parent->insert(cur_parent->total_entries(), + child_node->total_entries() > 0 ? child_last_key : last_parent_key, + BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); + if (child_node->total_entries() == 0) { + // There should be at most one empty child node per parent - if we find one, we should stop here + BT_LOG(INFO, "Repairing node={}, child_node=[{}] is empty, end loop", cur_parent->node_id(), + child_node->to_string()); + break; + } + } else { + // Node deleted indicates it's freed & no longer used during recovery + BT_LOG(INFO, "Repairing node={}, child node=[{}] is deleted, skipping the insert", + cur_parent->node_id(), child_node->to_string()); + } - BT_LOG(INFO, "Repairing node={}, repaired so_far={}", cur_parent->node_id(), cur_parent->to_string()); + BT_LOG(INFO, "Repairing node={}, repaired so_far=[{}]", cur_parent->node_id(), cur_parent->to_string()); // Move to the next child node - this->unlock_node(child_node, locktype_t::READ); auto const next_node_id = child_node->next_bnode(); + this->unlock_node(child_node, locktype_t::READ); if (next_node_id == empty_bnodeid) { - BT_LOG_ASSERT(false, - "Child node={} next_node_id is empty, while its not a edge node, parent_node={} " - "repair is partial", - child_node->node_id(), parent_node->node_id()); - ret = btree_status_t::not_found; + // This can be a deleted edge node - only check if it is still valid + if (!child_node->is_node_deleted()) { + BT_LOG_ASSERT(false, + "Child node={} next_node_id is empty, while its not a edge node, parent_node={} " + "repair is partial", + child_node->node_id(), parent_node->node_id()); + ret = btree_status_t::not_found; + } + child_node = nullptr; break; } @@ -333,10 +386,21 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { if (ret != btree_status_t::success) { BT_LOG_ASSERT(false, "Parent node={} repair is partial, because child_node get has failed with ret={}", parent_node->node_id(), enum_name(ret)); + child_node = nullptr; break; } } while (true); - this->unlock_node(child_node, locktype_t::READ); + + if (child_node) { + this->unlock_node(child_node, locktype_t::READ); + } + + if (parent_node->total_entries() == 0 && !parent_node->has_valid_edge()) { + // We shouldn't have an empty interior node in the tree, let's delete it. 
+ // The buf will be released by the caller + BT_LOG(INFO, "Parent node={} is empty, deleting it", parent_node->node_id()); + parent_node->set_node_deleted(); + } if (ret == btree_status_t::success) { ret = transact_nodes(new_parent_nodes, {}, parent_node, nullptr, cp_ctx); diff --git a/src/lib/device/virtual_dev.cpp b/src/lib/device/virtual_dev.cpp index 3665f13b9..ac49f95dd 100644 --- a/src/lib/device/virtual_dev.cpp +++ b/src/lib/device/virtual_dev.cpp @@ -424,6 +424,8 @@ std::error_code VirtualDev::sync_write(const char* buf, uint32_t size, BlkId con Chunk* chunk; uint64_t const dev_offset = to_dev_offset(bid, &chunk); + HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", chunk->physical_dev_mutable()->pdev_id(), + dev_offset); if (sisl_unlikely(dev_offset == INVALID_DEV_OFFSET)) { return std::make_error_code(std::errc::resource_unavailable_try_again); } @@ -436,6 +438,9 @@ std::error_code VirtualDev::sync_write(const char* buf, uint32_t size, cshared< if (hs()->crash_simulator().is_crashed()) { return std::error_code{}; } #endif + HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", chunk->physical_dev_mutable()->pdev_id(), + chunk->start_offset() + offset_in_chunk); + if (sisl_unlikely(!is_chunk_available(chunk))) { return std::make_error_code(std::errc::resource_unavailable_try_again); } @@ -457,6 +462,8 @@ std::error_code VirtualDev::sync_writev(const iovec* iov, int iovcnt, BlkId cons auto const size = get_len(iov, iovcnt); auto* pdev = chunk->physical_dev_mutable(); + HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", pdev->pdev_id(), dev_offset); + COUNTER_INCREMENT(m_metrics, vdev_write_count, 1); if (sisl_unlikely(!hs_utils::mod_aligned_sz(dev_offset, pdev->align_size()))) { COUNTER_INCREMENT(m_metrics, unalign_writes, 1); @@ -479,6 +486,8 @@ std::error_code VirtualDev::sync_writev(const iovec* iov, int iovcnt, cshared< C auto const size = get_len(iov, iovcnt); auto* pdev = chunk->physical_dev_mutable(); + HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", pdev->pdev_id(), dev_offset); + COUNTER_INCREMENT(m_metrics, vdev_write_count, 1); if (sisl_unlikely(!hs_utils::mod_aligned_sz(dev_offset, pdev->align_size()))) { COUNTER_INCREMENT(m_metrics, unalign_writes, 1); diff --git a/src/lib/homestore.cpp b/src/lib/homestore.cpp index 0b9e0ed76..feec506c5 100644 --- a/src/lib/homestore.cpp +++ b/src/lib/homestore.cpp @@ -329,8 +329,6 @@ void HomeStore::shutdown() { #ifdef _PRERELEASE flip::Flip::instance().stop_rpc_server(); #endif - - HomeStore::reset_instance(); LOGINFO("Homestore is completed its shutdown"); } diff --git a/src/lib/index/index_cp.cpp b/src/lib/index/index_cp.cpp index 99ba7dbba..d58e33c22 100644 --- a/src/lib/index/index_cp.cpp +++ b/src/lib/index/index_cp.cpp @@ -261,7 +261,6 @@ void IndexCPContext::process_txn_record(txn_record const* rec, std::map< BlkId, } if (up_buf) { - DEBUG_ASSERT(((buf->m_up_buffer == nullptr) || (buf->m_up_buffer == up_buf)), "Inconsistent up buffer"); auto real_up_buf = (up_buf->m_created_cp_id == cpg->id()) ? 
up_buf->m_up_buffer : up_buf;

 #ifndef NDEBUG
@@ -279,6 +278,20 @@
 #endif

         if (buf->m_up_buffer != real_up_buf) {
+            if (buf->m_up_buffer) {
+                buf->m_up_buffer->m_wait_for_down_buffers.decrement(1);
+#ifndef NDEBUG
+                bool found{false};
+                for (auto it = buf->m_up_buffer->m_down_buffers.begin(); it != buf->m_up_buffer->m_down_buffers.end(); ++it) {
+                    if (it->lock() == buf) {
+                        buf->m_up_buffer->m_down_buffers.erase(it);
+                        found = true;
+                        break;
+                    }
+                }
+                HS_DBG_ASSERT(found, "Down buffer is linked to Up buf, but up_buf doesn't have down_buf in its list");
+#endif
+            }
             real_up_buf->m_wait_for_down_buffers.increment(1);
             buf->m_up_buffer = real_up_buf;
         }
diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp
index ed5dd7e6d..b17d67dc8 100644
--- a/src/lib/index/wb_cache.cpp
+++ b/src/lib/index/wb_cache.cpp
@@ -289,13 +289,8 @@ void IndexWBCache::transact_bufs(uint32_t index_ordinal, IndexBufferPtr const& p
         }
         icp_ctx->add_to_txn_journal(index_ordinal, parent_buf, nullptr, {child_buf}, {});
     } else {
-        icp_ctx->add_to_txn_journal(index_ordinal,          // Ordinal
-                                    child_buf->m_up_buffer, // real up buffer
-                                    new_node_bufs.empty() ? freed_node_bufs[0]->m_up_buffer
-                                                          : new_node_bufs[0]->m_up_buffer, // real in place child
-                                    new_node_bufs,                                         // new node bufs
-                                    freed_node_bufs                                        // free_node_bufs
-        );
+        icp_ctx->add_to_txn_journal(index_ordinal, child_buf->m_up_buffer /* real up buffer */, child_buf,
+                                    new_node_bufs, freed_node_bufs);
     }
 #ifdef _PRERELEASE
     // log new nodes and freed nodes and parent and child
@@ -415,6 +410,22 @@ void IndexWBCache::link_buf(IndexBufferPtr const& up_buf, IndexBufferPtr const&
     }

     // Now we link the down_buffer to the real up_buffer
+    if (down_buf->m_up_buffer) {
+        // release existing up_buffer's wait count
+        down_buf->m_up_buffer->m_wait_for_down_buffers.decrement();
+#ifndef NDEBUG
+        bool found{false};
+        for (auto it = down_buf->m_up_buffer->m_down_buffers.begin(); it != down_buf->m_up_buffer->m_down_buffers.end();
+             ++it) {
+            if (it->lock() == down_buf) {
+                down_buf->m_up_buffer->m_down_buffers.erase(it);
+                found = true;
+                break;
+            }
+        }
+        HS_DBG_ASSERT(found, "Down buffer is linked to Up buf, but up_buf doesn't have down_buf in its list");
+#endif
+    }
     real_up_buf->m_wait_for_down_buffers.increment(1);
     down_buf->m_up_buffer = real_up_buf;
 #ifndef NDEBUG
@@ -428,13 +439,13 @@ void IndexWBCache::free_buf(const IndexBufferPtr& buf, CPContext* cp_ctx) {
         bool done = m_cache.remove(buf->m_blkid, node);
         HS_REL_ASSERT_EQ(done, true, "Race on cache removal of btree blkid?");
     }
-
+    buf->m_node_freed = true;
     resource_mgr().inc_free_blk(m_node_size);
-    m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(cp_ctx));
+    m_vdev->free_blk(buf->m_blkid, s_cast<VDevCPContext*>(cp_ctx));
 }

 //////////////////// Recovery Related section /////////////////////////////////
-void IndexWBCache::load_buf(IndexBufferPtr const& buf) {
+void IndexWBCache::load_buf(IndexBufferPtr const &buf) {
     if (buf->m_bytes == nullptr) {
         buf->m_bytes = hs_utils::iobuf_alloc(m_node_size, sisl::buftag::btree_node, m_vdev->align_size());
         m_vdev->sync_read(r_cast< char* >(buf->m_bytes), m_node_size, buf->blkid());
@@ -462,17 +473,17 @@ void IndexWBCache::recover(sisl::byte_view sb) {

 #ifdef _PRERELEASE
     auto detailed_log = [this](std::map< BlkId, IndexBufferPtr > const& bufs,
-                               std::vector< IndexBufferPtr > const& l0_bufs) {
+                               std::vector<IndexBufferPtr> const &pending_bufs) {
         std::string log = fmt::format("\trecovered bufs (#of bufs = {})\n", bufs.size());
-        for (auto const& [_, buf] : bufs) {
+        for (auto const &[_, buf]: bufs) {
             load_buf(buf);
             fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string());
         }

         // list of new_bufs
-        if (!l0_bufs.empty()) {
-            fmt::format_to(std::back_inserter(log), "\n\tl0_bufs (#of bufs = {})\n", l0_bufs.size());
-            for (auto const& buf : l0_bufs) {
+        if (!pending_bufs.empty()) {
+            fmt::format_to(std::back_inserter(log), "\n\tpending_bufs (#of bufs = {})\n", pending_bufs.size());
+            for (auto const &buf: pending_bufs) {
                 fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string());
             }
         }
@@ -493,57 +504,79 @@ void IndexWBCache::recover(sisl::byte_view sb) {
     // This has to be done before doing any repair, because repair can allocate blkids and we don't want to allocate
     // the same blkid which could clash with the blkid next in the buf list.
     //
-    // On the second pass, we only take the new nodes/bufs and then repair their up buffers, if needed.
-    std::vector< IndexBufferPtr > l0_bufs;
-    for (auto const& [_, buf] : bufs) {
-        if (buf->m_node_freed || (buf->m_created_cp_id == icp_ctx->id())) {
+    // On the second pass, we only take part of the parents/siblings and then repair them, if needed.
+    std::vector<IndexBufferPtr> pending_bufs;
+    std::vector<IndexBufferPtr> deleted_bufs;
+    for (auto const &[_, buf]: bufs) {
+        if (buf->m_node_freed) {
+            // Freed node
+            load_buf(buf);
             if (was_node_committed(buf)) {
-                if (was_node_committed(buf->m_up_buffer)) {
-                    if (buf->m_node_freed) {
-                        // Up buffer was written, so this buffer can be freed and thus can free the blk.
-                        m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(icp_ctx));
-                    } else {
-                        m_vdev->commit_blk(buf->m_blkid);
-                    }
-                    l0_bufs.push_back(buf);
-                } else {
-                    buf->m_up_buffer->m_wait_for_down_buffers.decrement();
+                // Mark this buffer as deleted, so that we can avoid using it anymore when repairing its parent's link
+                r_cast<persistent_hdr_t*>(buf->m_bytes)->node_deleted = true;
+                write_buf(nullptr, buf, icp_ctx);
+                deleted_bufs.push_back(buf);
+                pending_bufs.push_back(buf->m_up_buffer);
+            } else {
+                // (Up) buffer is not committed, node needs to be kept and (potentially) repaired later
+                buf->m_node_freed = false;
+                if (buf->m_created_cp_id == icp_ctx->id()) {
+                    // New nodes need to be committed first
+                    m_vdev->commit_blk(buf->m_blkid);
+                }
+                pending_bufs.push_back(buf);
+                buf->m_wait_for_down_buffers.increment(1); // Purely for recover_buf() counter consistency
+            }
+        } else if (buf->m_created_cp_id == icp_ctx->id()) {
+            // New node
+            if (was_node_committed(buf) && was_node_committed(buf->m_up_buffer)) {
+                // Both the current and up buffer are committed, we can safely commit the current block
+                m_vdev->commit_blk(buf->m_blkid);
+                pending_bufs.push_back(buf->m_up_buffer);
+            } else {
+                // Just ignore it
+                buf->m_up_buffer->m_wait_for_down_buffers.decrement();
 #ifndef NDEBUG
-                    bool found{false};
-                    for (auto it = buf->m_up_buffer->m_down_buffers.begin();
-                         it != buf->m_up_buffer->m_down_buffers.end(); ++it) {
-                        auto sp = it->lock();
-                        if (sp && sp == buf) {
-                            found = true;
-                            buf->m_up_buffer->m_down_buffers.erase(it);
-                            break;
-                        }
+                bool found{false};
+                for (auto it = buf->m_up_buffer->m_down_buffers.begin();
+                     it != buf->m_up_buffer->m_down_buffers.end(); ++it) {
+                    auto sp = it->lock();
+                    if (sp && sp == buf) {
+                        found = true;
+                        buf->m_up_buffer->m_down_buffers.erase(it);
+                        break;
                     }
-                    HS_DBG_ASSERT(found,
-                                  "Down buffer is linked to Up buf, but up_buf doesn't have down_buf in its list");
-#endif
                 }
+                HS_DBG_ASSERT(found,
+                              "Down buffer is linked to Up buf, but up_buf doesn't have down_buf in its list");
+#endif
             }
         }
     }

 #ifdef _PRERELEASE
    LOGINFOMOD(wbcache, "Index Recovery detected {} nodes out of {} as new/freed nodes to be recovered in prev cp={}",
-               l0_bufs.size(), bufs.size(), icp_ctx->id());
-    LOGTRACEMOD(wbcache, "All unclean bufs list\n{}", detailed_log(bufs, l0_bufs));
+               pending_bufs.size(), bufs.size(), icp_ctx->id());
+    LOGTRACEMOD(wbcache, "All unclean bufs list\n{}", detailed_log(bufs, pending_bufs));
 #endif

-    // Second iteration we start from the lowest levels (which are all new_bufs) and check if up_buffers need to be
-    // repaired. All L1 buffers are not needed to repair, because they are sibling nodes and so we pass false in
-    // do_repair flag.
-    for (auto const& buf : l0_bufs) {
-        recover_buf(buf->m_up_buffer);
+    for (auto const &buf: pending_bufs) {
+        recover_buf(buf);
+        if (buf->m_bytes != nullptr && r_cast<persistent_hdr_t*>(buf->m_bytes)->node_deleted) {
+            // This buffer was marked as deleted during repair, so we also need to free it
+            deleted_bufs.push_back(buf);
+        }
     }
+
+    for (auto const &buf: deleted_bufs) {
+        m_vdev->free_blk(buf->m_blkid, s_cast<VDevCPContext*>(icp_ctx));
+    }
+
     m_in_recovery = false;
     m_vdev->recovery_completed();
 }

-void IndexWBCache::recover_buf(IndexBufferPtr const& buf) {
+void IndexWBCache::recover_buf(IndexBufferPtr const &buf) {
     if (!buf->m_wait_for_down_buffers.decrement_testz()) {
         // TODO: remove the buf->m_up_buffer from down_buffers list of buf->m_up_buffer
         return;
@@ -638,6 +671,10 @@ folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) {

 void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const& buf, bool part_of_batch) {
 #ifdef _PRERELEASE
     static std::once_flag flag;
+    if (hs()->crash_simulator().is_crashed()) {
+        std::call_once(flag, []() { LOGINFO("Crash simulation is ongoing; aid simulation by not flushing."); });
+        return;
+    }
     if (buf->m_crash_flag_on) {
         std::string filename = "crash_buf_" + std::to_string(cp_ctx->id()) + ".dot";
         LOGINFO("Simulating crash while writing buffer {}, stored in file {}", buf->to_string(), filename);
         hs()->crash_simulator().crash();
         cp_ctx->complete(true);
         return;
-    } else if (hs()->crash_simulator().is_crashed()) {
-        std::call_once(flag, []() { LOGINFO("Crash simulation is ongoing; aid simulation by not flushing."); });
-        return;
     }
 #endif

     buf->set_state(index_buf_state_t::FLUSHING);
     if (buf->is_meta_buf()) {
-        LOGTRACEMOD(wbcache, "flushing cp {} meta buf {} possibly because of root split", cp_ctx->id(),
+        LOGTRACEMOD(wbcache, "Flushing cp {} meta buf {} possibly because of root split", cp_ctx->id(),
                     buf->to_string());
-        auto const& sb = r_cast< MetaIndexBuffer* >(buf.get())->m_sb;
-        meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk());
+        auto const &sb = r_cast<MetaIndexBuffer*>(buf.get())->m_sb;
+        if (!sb.is_empty()) {
+            meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk());
+        }
         process_write_completion(cp_ctx, buf);
     } else if (buf->m_node_freed) {
         LOGTRACEMOD(wbcache, "Not flushing buf {} as it was freed, its here for merely dependency", cp_ctx->id(),
                     buf->to_string());
         process_write_completion(cp_ctx, buf);
     } else {
-        LOGTRACEMOD(wbcache, "flushing cp {} buf {}", cp_ctx->id(), buf->to_string());
-        m_vdev->async_write(r_cast< const char* >(buf->raw_buffer()), m_node_size, buf->m_blkid, part_of_batch)
-            .thenValue([buf, cp_ctx](auto) {
-                try {
-                    auto& pthis = s_cast< IndexWBCache& >(wb_cache());
-                    pthis.process_write_completion(cp_ctx, buf);
-                } catch (const std::runtime_error& e) {
LOGERROR("Failed to access write-back cache: {}", e.what()); } - }); + LOGTRACEMOD(wbcache, "Flushing cp {} buf {}", cp_ctx->id(), buf->to_string()); + m_vdev->async_write(r_cast(buf->raw_buffer()), m_node_size, buf->m_blkid, part_of_batch) + .thenValue([buf, cp_ctx](auto) { + try { + auto &pthis = s_cast(wb_cache()); + pthis.process_write_completion(cp_ctx, buf); + } catch (const std::runtime_error &e) { + LOGERROR("Failed to access write-back cache: {}", e.what()); + } + }); if (!part_of_batch) { m_vdev->submit_batch(); } } @@ -764,7 +802,7 @@ void IndexWBCache::get_next_bufs_internal(IndexCPContext* cp_ctx, uint32_t max_c std::optional< IndexBufferPtr > buf = cp_ctx->next_dirty(); if (!buf) { break; } // End of list - if ((*buf)->m_wait_for_down_buffers.testz()) { + if ((*buf)->state() == index_buf_state_t::DIRTY && (*buf)->m_wait_for_down_buffers.testz()) { bufs.emplace_back(std::move(*buf)); ++count; } else { diff --git a/src/tests/test_common/homestore_test_common.hpp b/src/tests/test_common/homestore_test_common.hpp index 97ca410f7..af1b38f0e 100644 --- a/src/tests/test_common/homestore_test_common.hpp +++ b/src/tests/test_common/homestore_test_common.hpp @@ -198,8 +198,8 @@ class HSTestHelper { } homestore::HomeStore::instance()->shutdown(); + iomanager.stop(); // Stop iomanager first in case any fiber is still referencing homestore resources homestore::HomeStore::reset_instance(); - iomanager.stop(); if (cleanup) { remove_files(m_generated_devs); @@ -251,6 +251,11 @@ class HSTestHelper { m_fc.inject_delay_flip(flip_name, {null_cond}, freq, delay_usec); LOGDEBUG("Flip {} set", flip_name); } + + void remove_flip(const std::string flip_name) { + m_fc.remove_flip(flip_name); + LOGDEBUG("Flip {} removed", flip_name); + } #endif static void fill_data_buf(uint8_t* buf, uint64_t size, uint64_t pattern = 0) { diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp index 83b1928cd..560bf0f83 100644 --- a/src/tests/test_index_crash_recovery.cpp +++ b/src/tests/test_index_crash_recovery.cpp @@ -37,27 +37,29 @@ SISL_LOGGING_DECL(test_index_crash_recovery) SISL_OPTION_GROUP( test_index_crash_recovery, (num_iters, "", "num_iters", "number of iterations for rand ops", - ::cxxopts::value< uint32_t >()->default_value("500"), "number"), + ::cxxopts::value()->default_value("500"), "number"), (num_entries, "", "num_entries", "number of entries to test with", - ::cxxopts::value< uint32_t >()->default_value("5000"), "number"), - (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), "seconds"), + ::cxxopts::value()->default_value("5000"), "number"), + (run_time, "", "run_time", "run time for io", ::cxxopts::value()->default_value("360000"), "seconds"), (num_rounds, "", "num_rounds", "number of rounds to test with", - ::cxxopts::value< uint32_t >()->default_value("100"), "number"), + ::cxxopts::value()->default_value("100"), "number"), (num_entries_per_rounds, "", "num_entries_per_rounds", "number of entries per rounds", - ::cxxopts::value< uint32_t >()->default_value("40"), "number"), - (max_keys_in_node, "", "max_keys_in_node", "max_keys_in_node", ::cxxopts::value< uint32_t >()->default_value("0"), - ""), + ::cxxopts::value()->default_value("40"), "number"), + (max_keys_in_node, "", "max_keys_in_node", "max_keys_in_node", + ::cxxopts::value()->default_value("20"), ""), + (min_keys_in_node, "", "min_keys_in_node", "min_keys_in_node", + ::cxxopts::value()->default_value("6"), ""), (operation_list, "", "operation_list", 
"operation list instead of default created following by percentage", - ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), + ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), (preload_size, "", "preload_size", "number of entries to preload tree with", - ::cxxopts::value< uint32_t >()->default_value("1000"), "number"), + ::cxxopts::value()->default_value("1000"), "number"), (init_device, "", "init_device", "init device", ::cxxopts::value< bool >()->default_value("1"), ""), (load_from_file, "", "load_from_file", "load from file", ::cxxopts::value< bool >()->default_value("0"), ""), (save_to_file, "", "save_to_file", "save to file", ::cxxopts::value< bool >()->default_value("0"), ""), (cleanup_after_shutdown, "", "cleanup_after_shutdown", "cleanup after shutdown", - ::cxxopts::value< bool >()->default_value("1"), ""), + ::cxxopts::value< bool >()->default_value("1"), ""), (seed, "", "seed", "random engine seed, use random if not defined", - ::cxxopts::value< uint64_t >()->default_value("0"), "number")) + ::cxxopts::value< uint64_t >()->default_value("0"), "number")) void log_obj_life_counter() { std::string str; @@ -99,10 +101,16 @@ class SequenceGenerator { keyDist_ = std::uniform_int_distribution<>(start_range_, end_range_); } + void fillRange(uint64_t start, uint64_t end) { + for (uint64_t i = start; i <= end; ++i) { + keyStates[i] = true; + } + } + OperationList generateOperations(size_t numOperations, bool reset = false) { std::vector< Operation > operations; if (reset) { this->reset(); } - for (size_t i = 0; i < numOperations; ++i) { + while (operations.size() < numOperations) { uint32_t key = keyDist_(g_re); auto [it, inserted] = keyStates.try_emplace(key, false); auto& inUse = it->second; @@ -120,6 +128,7 @@ class SequenceGenerator { return operations; } + __attribute__((noinline)) std::string showKeyState(uint64_t key) const { auto it = keyStates.find(key); if (it != keyStates.end()) { return it->second ? 
"Put" : "Remove"; } @@ -134,6 +143,7 @@ class SequenceGenerator { } return occurrences; } + __attribute__((noinline)) static std::string printOperations(const OperationList& operations) { std::ostringstream oss; auto count = 1; @@ -143,6 +153,7 @@ class SequenceGenerator { } return oss.str(); } + __attribute__((noinline)) static std::string printKeysOccurrences(const OperationList& operations) { std::set< uint64_t > keys = collectUniqueKeys(operations); std::ostringstream oss; @@ -156,6 +167,7 @@ class SequenceGenerator { } return oss.str(); } + __attribute__((noinline)) static std::string printKeyOccurrences(const OperationList& operations, uint64_t key) { std::ostringstream oss; auto keyOccurrences = inspect(operations, key); @@ -237,6 +249,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT m_test->m_cfg.m_leaf_node_type = T::leaf_node_type; m_test->m_cfg.m_int_node_type = T::interior_node_type; m_test->m_cfg.m_max_keys_in_node = SISL_OPTIONS["max_keys_in_node"].as< uint32_t >(); + m_test->m_cfg.m_min_keys_in_node = SISL_OPTIONS["min_keys_in_node"].as(); m_test->m_bt = std::make_shared< typename T::BtreeType >(std::move(sb), m_test->m_cfg); return m_test->m_bt; } @@ -262,9 +275,11 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT {HS_SERVICE::INDEX, {.size_pct = 70.0, .index_svc_cbs = new TestIndexServiceCallbacks(this)}}}, nullptr, {}, SISL_OPTIONS["init_device"].as< bool >()); - LOGINFO("Node size {} ", hs()->index_service().node_size()); this->m_cfg = BtreeConfig(hs()->index_service().node_size()); this->m_cfg.m_max_keys_in_node = SISL_OPTIONS["max_keys_in_node"].as< uint32_t >(); + this->m_cfg.m_min_keys_in_node = SISL_OPTIONS["min_keys_in_node"].as(); + LOGINFO("Node size {}, max_keys_in_node {}, min_keys_in_node {}", this->m_cfg.node_size(), + this->m_cfg.m_max_keys_in_node, this->m_cfg.m_min_keys_in_node); auto uuid = boost::uuids::random_generator()(); auto parent_uuid = boost::uuids::random_generator()(); @@ -301,7 +316,10 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT } void reset_btree() { + hs()->index_service().remove_index_table(this->m_bt); this->m_bt->destroy(); + this->trigger_cp(true); + auto uuid = boost::uuids::random_generator()(); auto parent_uuid = boost::uuids::random_generator()(); this->m_bt = std::make_shared< typename T::BtreeType >(uuid, parent_uuid, 0, this->m_cfg); @@ -334,14 +352,21 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT } LOGINFO("Diff between shadow map and snapshot map\n{}\n", dif_str); - for (const auto& [k, addition] : diff) { + for (const auto &[k, addition]: diff) { // this->print_keys(fmt::format("reapply: before inserting key {}", k.key())); // this->visualize_keys(recovered_tree_filename); - if (addition) { this->force_upsert(k.key()); } + if (addition) { + LOGDEBUG("Reapply: Inserting key {}", k.key()); + this->force_upsert(k.key()); + } else { + LOGDEBUG("Reapply: Removing key {}", k.key()); + this->remove_one(k.key(), false); + } } - test_common::HSTestHelper::trigger_cp(true); + trigger_cp(true); this->m_shadow_map.save(m_shadow_filename); } + void reapply_after_crash(OperationList& operations) { for (const auto& [key, opType] : operations) { switch (opType) { @@ -355,7 +380,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT break; } } - test_common::HSTestHelper::trigger_cp(true); + trigger_cp(true); } void TearDown() override { @@ -377,13 +402,15 @@ struct 
IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT void crash_and_recover(uint32_t s_key, uint32_t e_key) { this->print_keys("Btree prior to CP and susbsequent simulated crash: "); - test_common::HSTestHelper::trigger_cp(false); + trigger_cp(false); this->wait_for_crash_recovery(); // this->visualize_keys("tree_after_crash_" + std::to_string(s_key) + "_" + std::to_string(e_key) + ".dot"); this->print_keys("Post crash and recovery, btree structure: "); this->reapply_after_crash(); + this->print_keys("Post reapply, btree structure: "); + this->get_all(); LOGINFO("Expect to have [{},{}) in tree and it is actually{} ", s_key, e_key, tree_key_count()); ASSERT_EQ(this->m_shadow_map.size(), this->tree_key_count()) << "shadow map size and tree size mismatch"; @@ -420,7 +447,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT this->visualize_keys(b_filename); } - test_common::HSTestHelper::trigger_cp(false); + trigger_cp(false); LOGINFO("waiting for crash to recover"); this->wait_for_crash_recovery(); @@ -428,8 +455,8 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT std::string rec_filename = filename + "_after_recovery.dot"; LOGINFO("Visualize the tree file after recovery : {}", rec_filename); this->visualize_keys(rec_filename); - this->print_keys("Post crash and recovery, btree structure: "); } + this->print_keys("Post crash and recovery, btree structure: "); sanity_check(operations); // Added to the index service right after recovery. Not needed here // test_common::HSTestHelper::trigger_cp(true); @@ -440,8 +467,8 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT std::string re_filename = filename + "_after_reapply.dot"; LOGINFO("Visualize the tree after reapply {}", re_filename); this->visualize_keys(re_filename); -// this->print_keys("Post crash and recovery, btree structure: "); } + this->print_keys("Post reapply, btree structure: "); this->get_all(); LOGINFO("After reapply: {} keys in shadow map and actually {} in tress", this->m_shadow_map.size(), @@ -529,82 +556,6 @@ TYPED_TEST(IndexCrashTest, SplitOnLeftEdge) { this->query_all_paginate(80); } -/* -TYPED_TEST(IndexCrashTest, ManualMergeCrash){ - // Define the lambda function - const uint32_t num_entries = 30; - - auto initTree = [this, num_entries]() { - for (uint64_t k = 0u; k < num_entries; ++k) { - this->force_upsert(k); - } - test_common::HSTestHelper::trigger_cp(true); - this->m_shadow_map.save(this->m_shadow_filename); - }; - - std::vector< OperationList > removing_scenarios = { - {{29, OperationType::Remove}, - {28, OperationType::Remove}, - {27, OperationType::Remove}, - {26, OperationType::Remove}, - {25, OperationType::Remove}, - {24, OperationType::Remove}} - }; - - auto scenario = removing_scenarios[0]; - - LOGINFO("Step 1-1: Populate some keys and flush"); - initTree(); - this->visualize_keys("tree_init.dot"); - LOGINFO("Step 2-1: Set crash flag, remove some keys in reverse order"); - this->set_basic_flip("crash_flush_on_merge_at_parent"); - - for (auto [k, _] : scenario) { - LOGINFO("\n\n\t\t\t\t\t\t\t\t\t\t\t\t\tRemoving entry {}", k); - this->remove_one(k); - } - this->visualize_keys("tree_before_crash.dot"); - - LOGINFO("Step 3-1: Trigger cp to crash"); - this->crash_and_recover(scenario, "recover_tree_crash_1.dot"); - test_common::HSTestHelper::trigger_cp(true); - this->get_all(); - - LOGINFO("Step 1-2: Populate some keys and flush"); - initTree(); - this->visualize_keys("tree_init_02.dot"); - 
LOGINFO("Step 2-2: Set crash flag, remove some keys in reverse order"); - this->set_basic_flip("crash_flush_on_merge_at_left_child"); - for (auto [k, _] : scenario) { - LOGINFO("\n\n\t\t\t\t\t\t\t\t\t\t\t\t\tRemoving entry {}", k); - this->remove_one(k); - } - this->visualize_keys("tree_before_crash_2.dot"); - - LOGINFO("Step 3-2: Trigger cp to crash"); - this->crash_and_recover(scenario, "recover_tree_crash_2.dot"); - test_common::HSTestHelper::trigger_cp(true); - this->get_all(); - - LOGINFO("Step 1-3: Populate some keys and flush"); - initTree(); - this->visualize_keys("tree_init_03.dot"); - LOGINFO("Step 2-3: Set crash flag, remove some keys in reverse order"); - this->set_basic_flip("crash_flush_on_freed_child"); - for (auto [k, _] : scenario) { - LOGINFO("\n\n\t\t\t\t\t\t\t\t\t\t\t\t\tRemoving entry {}", k); - this->remove_one(k); - } - LOGINFO("Step 2-3: Set crash flag, remove some keys in reverse order"); - this->visualize_keys("tree_before_crash_3.dot"); - - LOGINFO("Step 3-3: Trigger cp to crash"); - this->crash_and_recover(scenario, "recover_tree_crash_3.dot"); - test_common::HSTestHelper::trigger_cp(true); - this->get_all(); -} -*/ - TYPED_TEST(IndexCrashTest, SplitCrash1) { // Define the lambda function auto const num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); @@ -769,6 +720,236 @@ TYPED_TEST(IndexCrashTest, long_running_put_crash) { if (renew_btree_after_crash) { this->reset_btree(); }; } } + +// Basic reverse and forward order remove with different flip points +TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { + vector flip_points = { + "crash_flush_on_merge_at_parent", + "crash_flush_on_merge_at_left_child", + // "crash_flush_on_freed_child", + }; + + for (size_t i = 0; i < flip_points.size(); ++i) { + this->reset_btree(); + + auto &flip_point = flip_points[i]; + LOGINFO("=== Testing flip point: {} - {} ===", i + 1, flip_point); + + // Populate some keys [1,num_entries) and trigger cp to persist + LOGINFO("Step {}-1: Populate some keys and flush", i+1); + auto const num_entries = SISL_OPTIONS["num_entries"].as(); + for (auto k = 0u; k < num_entries; ++k) { + this->put(k, btree_put_type::INSERT, true /* expect_success */); + } + test_common::HSTestHelper::trigger_cp(true); + this->m_shadow_map.save(this->m_shadow_filename); + + this->visualize_keys("tree_merge_full.dot"); + + // Split keys into batches and remove the last one in reverse order + LOGINFO("Step {}-2: Set crash flag, remove some keys in reverse order", i + 1); + int batch_num = 4; { + int n = batch_num; + auto r = num_entries * n / batch_num - 1; + auto l = num_entries * (n - 1) / batch_num; + OperationList ops; + for (auto k = r; k >= l; --k) { + ops.emplace_back(k, OperationType::Remove); + } + LOGINFO("Step {}-2-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, r, l); + + this->set_basic_flip(flip_point); + for (auto [k, _]: ops) { + LOGINFO("Removing key {}", k); + this->remove_one(k, true); + } + this->visualize_keys("tree_merge_before_first_crash.dot"); + + LOGINFO("Step {}-2-2: Trigger cp to crash", i + 1); + this->crash_and_recover(ops); + } + + // Remove the next batch of keys in forward order + LOGINFO("Step {}-3: Remove another batch in ascending order", i + 1) { + int n = batch_num - 1; + auto r = num_entries * n / batch_num - 1; + auto l = num_entries * (n - 1) / batch_num; + OperationList ops; + for (auto k = l; k <= r; ++k) { + ops.emplace_back(k, OperationType::Remove); + } + LOGINFO("Step {}-3-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, l, r); + + 
this->set_basic_flip(flip_point); + for (auto [k, _]: ops) { + LOGINFO("Removing key {}", k); + this->remove_one(k, true); + } + this->visualize_keys("tree_merge_before_second_crash.dot"); + + LOGINFO("Step {}-3-2: Trigger cp to crash", i + 1); + this->crash_and_recover(ops); + } + + // Remove the next batch of keys in random order + LOGINFO("Step {}-4: Remove another batch in random order", i + 1); { + int n = batch_num - 2; + auto r = num_entries * n / batch_num - 1; + auto l = num_entries * (n - 1) / batch_num; + SequenceGenerator generator(0, 100, l, r); + generator.fillRange(l, r); + OperationList ops = generator.generateOperations(r - l + 1, false); + + LOGINFO("Step {}-4-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, l, r); + + this->set_basic_flip(flip_point); + for (auto [k, _]: ops) { + LOGINFO("Removing key {}", k); + this->remove_one(k, true); + } + this->visualize_keys("tree_merge_before_third_crash.dot"); + + LOGINFO("Step {}-4-2: Trigger cp to crash", i + 1); + this->crash_and_recover(ops); + } + + LOGINFO("Step {}-5: Cleanup the tree", i + 1); + for (auto k = 0u; k < num_entries; ++k) { + this->remove_one(k, false); + } + test_common::HSTestHelper::trigger_cp(true); + this->get_all(); + } +} + +// +// TYPED_TEST(IndexCrashTest, MergeCrash1) { +// auto const num_entries = SISL_OPTIONS["num_entries"].as<uint32_t>(); +// vector<string> flips = { +// "crash_flush_on_merge_at_parent", "crash_flush_on_merge_at_left_child", +// }; +// SequenceGenerator generator(0 /*putFreq*/, 100 /* removeFreq*/, 0 /*start_range*/, num_entries - 1 /*end_range*/); +// OperationList operations; +// for (size_t i = 0; i < flips.size(); ++i) { +// this->reset_btree(); +// LOGINFO("Step {}-1: Init btree", i + 1); +// for (auto k = 0u; k < num_entries; ++k) { +// this->put(k, btree_put_type::INSERT, true /* expect_success */); +// } +// test_common::HSTestHelper::trigger_cp(true); +// this->print_keys("Inited tree"); +// +// LOGINFO("Step {}-2: Set flag {}", i + 1, flips[i]); +// this->set_basic_flip(flips[i], 1, 10); +// generator.reset(); +// generator.fillRange(0, num_entries - 1); +// +// // Randomly remove some keys +// std::random_device rd; +// std::mt19937 gen(rd()); +// std::uniform_int_distribution<> dis(num_entries / 4, num_entries / 2); +// auto num_keys_to_remove = dis(gen); +// LOGINFO("Removing {} keys before crash", num_keys_to_remove); +// operations = generator.generateOperations(num_keys_to_remove, false /* reset */); +// for (auto [k, _]: operations) { +// LOGINFO("Removing key {}", k); +// this->remove_one(k, true); +// } +// +// LOGINFO("Step {}-3: Simulate crash and recover", i + 1); +// this->crash_and_recover(operations, fmt::format("recover_tree_crash_{}.dot", i + 1)); +// } +// } +// +// TYPED_TEST(IndexCrashTest, MergeManualCrash) { +// std::vector<std::string> flip_points = { +// "crash_flush_on_merge_at_parent", +// "crash_flush_on_merge_at_left_child", +// }; +// +// constexpr uint32_t num_entries = 28; // with max=5 & min=3 +// +// auto initTree = [this, num_entries]() { +// for (auto k = 0u; k < num_entries; ++k) { +// this->put(k, btree_put_type::INSERT, true /* expect_success */); +// } +// test_common::HSTestHelper::trigger_cp(true); +// this->m_shadow_map.save(this->m_shadow_filename); +// }; +// +// std::vector<OperationList> removing_scenarios = { +// { +// {27, OperationType::Remove}, +// {26, OperationType::Remove}, +// {25, OperationType::Remove}, +// {24, OperationType::Remove}, +// {23, OperationType::Remove}, +// {22, OperationType::Remove}, +// }, // Merge 2 rightmost leaf nodes in 1
action +// { +// {27, OperationType::Remove}, +// {26, OperationType::Remove}, +// {25, OperationType::Remove}, +// {24, OperationType::Remove}, +// {23, OperationType::Remove}, +// {20, OperationType::Remove}, +// {19, OperationType::Remove}, +// }, // Merge 3 rightmost leaf nodes in 1 action +// { +// {27, OperationType::Remove}, +// {26, OperationType::Remove}, +// {25, OperationType::Remove}, +// {24, OperationType::Remove}, +// {23, OperationType::Remove}, +// {22, OperationType::Remove}, +// {21, OperationType::Remove}, +// {20, OperationType::Remove}, +// {19, OperationType::Remove}, +// }, // Merge 3 rightmost leaf nodes in 2 actions +// { +// {23, OperationType::Remove}, +// {22, OperationType::Remove}, +// {11, OperationType::Remove}, +// {10, OperationType::Remove}, +// {13, OperationType::Remove}, +// }, // Merge from level=0 then level=1 +// // { +// // {16, OperationType::Remove}, +// // }, // Merge from level=1 then level=0 - need to set min=4 +// }; +// +// for (int i = 0; i < static_cast<int>(removing_scenarios.size()); i++) { +// auto scenario = removing_scenarios[i]; +// auto s_idx = i + 1; +// LOGINFO("\n\tTesting scenario {}", s_idx); +// for (int j = 0; j < static_cast<int>(flip_points.size()); j++) { +// const auto &flip_point = flip_points[j]; +// auto f_idx = j + 1; +// LOGINFO("\n\t\t\t\tTesting flip point: {}", flip_point); +// +// LOGINFO("Step {}-{}-1: Populate keys and flush", s_idx, f_idx); +// initTree(); +// this->visualize_keys(fmt::format("tree_init.{}_{}.dot", s_idx, f_idx)); +// +// LOGINFO("Step {}-{}-2: Set crash flag, remove keys in reverse order", s_idx, f_idx); +// this->set_basic_flip(flip_point); +// for (auto k: scenario) { +// LOGINFO("Removing entry {}", k.first); +// this->remove_one(k.first); +// } +// this->visualize_keys(fmt::format("tree_before_first_crash.{}_{}.dot", s_idx, f_idx)); +// this->remove_flip(flip_point); +// +// LOGINFO("Step {}-{}-3: Trigger cp to crash", s_idx, f_idx); +// this->crash_and_recover(scenario); +// test_common::HSTestHelper::trigger_cp(true); +// this->get_all(); +// +// this->reset_btree(); +// test_common::HSTestHelper::trigger_cp(true); +// } +// } +// } #endif int main(int argc, char* argv[]) { diff --git a/src/tests/test_scripts/index_test.py b/src/tests/test_scripts/index_test.py index 02c3e4c2c..dd2f8f010 100755 --- a/src/tests/test_scripts/index_test.py +++ b/src/tests/test_scripts/index_test.py @@ -52,6 +52,7 @@ def parse_arguments(): parser.add_argument('--cleanup_after_shutdown', help='Cleanup after shutdown', type=bool, default=False) parser.add_argument('--init_device', help='Initialize device', type=bool, default=True) parser.add_argument('--max_keys_in_node', help='Maximum num of keys in btree nodes', type=int, default=5) + parser.add_argument('--min_keys_in_node', help='Minimum num of keys in btree nodes', type=int, default=2) parser.add_argument('--num_rounds', help='number of rounds for crash test', type=int, default=10000) parser.add_argument('--num_entries_per_rounds', help='number of rounds for crash test', type=int, default=60) From c4efe11cace561b58be8155fedf284d2d5b9647e Mon Sep 17 00:00:00 2001 From: Sanal Date: Fri, 1 Nov 2024 10:22:40 -0700 Subject: [PATCH 018/170] Add additional tests for replace member (#574) --- conanfile.py | 2 +- .../repl_dev/raft_state_machine.cpp | 10 + src/tests/test_common/hs_repl_test_common.hpp | 9 + src/tests/test_common/raft_repl_test_base.hpp | 24 ++- src/tests/test_raft_repl_dev_dynamic.cpp | 182 ++++++++++++++++-- 5 files changed, 207 insertions(+), 20
deletions(-) diff --git a/conanfile.py b/conanfile.py index f61f1ef54..0cb18bd1d 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.4" + version = "6.5.5" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index 6e920b997..d1b210526 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -9,6 +10,7 @@ #include "repl_dev/raft_repl_dev.h" #include #include "common/homestore_config.hpp" +#include "common/crash_simulator.hpp" SISL_LOGGING_DECL(replication) @@ -293,6 +295,14 @@ void RaftStateMachine::save_logical_snp_obj(nuraft::snapshot& s, ulong& obj_id, // Update the object offset. obj_id = snp_data->offset; + +#ifdef _PRERELEASE + if (iomgr_flip::instance()->test_flip("baseline_resync_restart_new_follower")) { + LOGINFO("Hit flip baseline_resync_restart_new_follower crashing"); + hs()->crash_simulator().crash(); + return; + } +#endif } bool RaftStateMachine::apply_snapshot(nuraft::snapshot& s) { diff --git a/src/tests/test_common/hs_repl_test_common.hpp b/src/tests/test_common/hs_repl_test_common.hpp index 672acffcb..c9ff71567 100644 --- a/src/tests/test_common/hs_repl_test_common.hpp +++ b/src/tests/test_common/hs_repl_test_common.hpp @@ -252,6 +252,10 @@ class HSReplTestHelper : public HSTestHelper { start_homestore(); } + void reinit_repl_app() { + m_token.params(HS_SERVICE::REPLICATION).repl_app = std::make_unique< TestReplApplication >(*this); + } + uint16_t replica_num() const { return replica_num_; } homestore::replica_id_t my_replica_id() const { return my_replica_id_; } homestore::replica_id_t replica_id(uint16_t member_id) const { @@ -317,6 +321,11 @@ class HSReplTestHelper : public HSTestHelper { } } + void add_listener(std::shared_ptr< ReplDevListener > listener) { + std::unique_lock lg(groups_mtx_); + pending_listeners_.emplace_back(listener); + } + size_t num_listeners() const { std::unique_lock lg(groups_mtx_); return repl_groups_.size(); diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index e0e2f6487..1ab90143a 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -204,7 +204,7 @@ class TestReplicatedDB : public homestore::ReplDevListener { kv_snapshot_data.emplace_back(Key{v.id_}, v); LOGTRACEMOD(replication, "[Replica={}] Read logical snapshot callback fetching lsn={} size={} pattern={}", g_helper->replica_num(), v.lsn_, v.data_size_, v.data_pattern_); - if (kv_snapshot_data.size() >= 1000) { break; } + if (kv_snapshot_data.size() >= 10) { break; } } if (kv_snapshot_data.size() == 0) { @@ -430,6 +430,7 @@ class RaftReplDevTestBase : public testing::Test { for (auto const& db : dbs_) { if (db->is_zombie()) { continue; } auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); + if (!repl_dev) continue; int i = 0; bool force_leave = false; do { @@ -511,6 +512,11 @@ class RaftReplDevTestBase : public testing::Test { } void run_on_leader(std::shared_ptr< TestReplicatedDB > db, auto&& lambda) { + if (!db || !db->repl_dev()) { + // Spare which are not added to group will not have repl dev. 
+ return; + } + do { auto leader_uuid = db->repl_dev()->get_leader_id(); @@ -527,6 +533,8 @@ class RaftReplDevTestBase : public testing::Test { } void write_on_leader(uint32_t num_entries, bool wait_for_commit = true, shared< TestReplicatedDB > db = nullptr) { + if (dbs_[0]->repl_dev() == nullptr) return; + do { auto leader_uuid = dbs_[0]->repl_dev()->get_leader_id(); @@ -614,14 +622,20 @@ class RaftReplDevTestBase : public testing::Test { void truncate(int num_reserved_entries) { dbs_[0]->truncate(num_reserved_entries); } void replace_member(std::shared_ptr< TestReplicatedDB > db, replica_id_t member_out, replica_id_t member_in, - uint32_t commit_quorum = 0) { - this->run_on_leader(db, [this, db, member_out, member_in, commit_quorum]() { + uint32_t commit_quorum = 0, ReplServiceError error = ReplServiceError::OK) { + this->run_on_leader(db, [this, error, db, member_out, member_in, commit_quorum]() { LOGINFO("Replace member out={} in={}", boost::uuids::to_string(member_out), boost::uuids::to_string(member_in)); + replica_member_info out{member_out, ""}; replica_member_info in{member_in, ""}; - auto v = hs()->repl_service().replace_member(db->repl_dev()->group_id(), out, in, commit_quorum).get(); - ASSERT_EQ(v.hasError(), false) << "Error in replacing member"; + auto result = hs()->repl_service().replace_member(db->repl_dev()->group_id(), out, in, commit_quorum).get(); + if (error == ReplServiceError::OK) { + ASSERT_EQ(result.hasError(), false) << "Error in replacing member"; + } else { + ASSERT_EQ(result.hasError(), true) << "Error in replacing member"; + ASSERT_EQ(result.error(), error); + } }); } diff --git a/src/tests/test_raft_repl_dev_dynamic.cpp b/src/tests/test_raft_repl_dev_dynamic.cpp index c29f239e1..5a6095959 100644 --- a/src/tests/test_raft_repl_dev_dynamic.cpp +++ b/src/tests/test_raft_repl_dev_dynamic.cpp @@ -15,11 +15,17 @@ #include "test_common/raft_repl_test_base.hpp" // Dynamic tests spawn spare replica's also which can be used to add and remove from a repl dev. -class ReplDevDynamicTest : public RaftReplDevTestBase {}; +class ReplDevDynamicTest : public RaftReplDevTestBase { +private: + bool is_replica_num_in(const std::set< uint32_t >& replicas) { + // Check if the current replica process is in this set. + return replicas.count(g_helper->replica_num()) != 0 ? true : false; + } +}; TEST_F(ReplDevDynamicTest, ReplaceMember) { + LOGINFO("ReplaceMember test started replica={}", g_helper->replica_num()); // Write some IO's, replace a member, validate all members data except which is out. - LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); auto db = dbs_.back(); auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >(); auto num_members = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >(); @@ -45,28 +51,28 @@ TEST_F(ReplDevDynamicTest, ReplaceMember) { g_helper->sync_for_verify_start(num_members); LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num()); - if (g_helper->replica_num() != member_out) { + if (is_replica_num_in({0, 1, member_in})) { // Skip the member which is going to be replaced. Validate data on all other replica's. LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num()); this->validate_data(); - } else { + } else if (g_helper->replica_num() == member_out) { // The out member will have the repl dev destroyed. 
auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); - do { + while (repl_dev && !repl_dev->is_destroyed()) { std::this_thread::sleep_for(std::chrono::seconds(1)); auto& raft_repl_svc = dynamic_cast< RaftReplService& >(hs()->repl_service()); raft_repl_svc.gc_repl_devs(); LOGINFO("Waiting for repl dev to get destroyed on out member replica={}", g_helper->replica_num()); - } while (!repl_dev->is_destroyed()); + } LOGINFO("Repl dev destroyed on out member replica={}", g_helper->replica_num()); } g_helper->sync_for_cleanup_start(num_members); - LOGINFO("ReplaceMember test done"); + LOGINFO("ReplaceMember test done replica={}", g_helper->replica_num()); } TEST_F(ReplDevDynamicTest, TwoMemberDown) { - LOGINFO("TwoMemberDown test started"); + LOGINFO("TwoMemberDown test started replica={}", g_helper->replica_num()); // Make two members down in a group and leader cant reach a quorum. // We set the custom quorum size to 1 and call replace member. @@ -110,28 +116,176 @@ TEST_F(ReplDevDynamicTest, TwoMemberDown) { LOGINFO("Member in got all commits"); } - if (g_helper->replica_num() == 0 || g_helper->replica_num() == member_in) { + if (is_replica_num_in({0, member_in})) { // Validate data on leader replica 0 and replica 3 LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num()); this->validate_data(); } - g_helper->sync_for_cleanup_start(num_members); - if (g_helper->replica_num() == 1) { LOGINFO("Start replica 1"); + db->set_zombie(); this->start_replica(1); } if (g_helper->replica_num() == 2) { LOGINFO("Start replica 2"); + db->set_zombie(); + this->start_replica(2); + } + + g_helper->sync_for_cleanup_start(num_members); + LOGINFO("TwoMemberDown test done replica={}", g_helper->replica_num()); +} + +TEST_F(ReplDevDynamicTest, OneMemberDown) { + // replica0(leader) and replica1 up, replica2 is down. Replace replica2 with replica3. + // replica0 should be able to baseline resync to replica4(new member). + // Write some IO's, replace a member, validate all members data except which is out. + LOGINFO("OneMemberDown test started replica={}", g_helper->replica_num()); + auto db = dbs_.back(); + auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >(); + auto num_members = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >(); + uint64_t num_io_entries = SISL_OPTIONS["num_io"].as< uint64_t >(); + + // Replace the last member in the group with index(num_replicas - 1) with a spare + // replica with index (num_replica). Member id's are 0,...,num_replicas-1, num_replicas,...,N + uint32_t member_out = num_replicas - 1; + uint32_t member_in = num_replicas; + + g_helper->sync_for_test_start(num_members); + + this->shutdown_replica(2); + LOGINFO("Shutdown replica 2"); + + std::this_thread::sleep_for(std::chrono::seconds(3)); + if (g_helper->replica_num() == 0) { + // With existing raft repl dev group, write IO's, validate and call replace_member on leader. 
+ LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); + this->write_on_leader(num_io_entries, true /* wait_for_commit */); + + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + std::this_thread::sleep_for(std::chrono::seconds(3)); + } else if (g_helper->replica_num() == member_in) { + LOGINFO("Wait for commits replica={}", g_helper->replica_num()); + wait_for_commits(num_io_entries); + } + + g_helper->sync_for_verify_start(num_members); + LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num()); + if (is_replica_num_in({0, 1, member_in})) { + // Skip the member which is going to be replaced. Validate data on all other replica's. + LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num()); + this->validate_data(); + } + + g_helper->sync_for_cleanup_start(num_members); + + if (g_helper->replica_num() == 2) { + LOGINFO("Start replica 2"); + db->set_zombie(); this->start_replica(2); } - LOGINFO("TwoMemberDown test done"); + LOGINFO("OneMemberDown test done replica={}", g_helper->replica_num()); } -// TODO add more tests with leader and member restart, multiple member replace -// leader replace +TEST_F(ReplDevDynamicTest, LeaderReplace) { + // replica0(leader) and replica1 and replica2 is up. Replace replica0(leader) with replica3. + // replica0 will yield leadership and any other replica will be come leader and leader + // will do baseline resync to replica4(new member). + // Write some IO's, replace a member, validate all members data except which is out. + LOGINFO("LeaderReplace test started replica={}", g_helper->replica_num()); + auto db = dbs_.back(); + auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >(); + auto num_members = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >(); + uint64_t num_io_entries = SISL_OPTIONS["num_io"].as< uint64_t >(); + + // Replace the leader in the group with index(0) with a spare + // replica with index (num_replica). Member id's are 0,...,num_replicas-1, num_replicas,...,N + uint32_t member_out = 0; + uint32_t member_in = num_replicas; + + g_helper->sync_for_test_start(num_members); + + if (g_helper->replica_num() != member_in) { + LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); + // With existing raft repl dev group, write IO's, validate and call replace_member on leader. + this->write_on_leader(num_io_entries, true /* wait_for_commit */); + + // Leader will return error NOT_LEADER and yield leadership, sleep and connect again + // to the new leader. + LOGINFO("Replace old leader"); + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 0, + ReplServiceError::NOT_LEADER); + LOGINFO("Replace member leader yield done"); + + std::this_thread::sleep_for(std::chrono::seconds(3)); + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + LOGINFO("Replace member old leader done"); + } + + if (g_helper->replica_num() == member_in) { + LOGINFO("Wait for commits replica={}", g_helper->replica_num()); + wait_for_commits(num_io_entries); + } + + g_helper->sync_for_verify_start(num_members); + if (is_replica_num_in({0, 1, member_in})) { + // Skip the member which is going to be replaced. Validate data on all other replica's. 
+ LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num()); + this->validate_data(); + } + + if (g_helper->replica_num() == member_out) { db->set_zombie(); } + + g_helper->sync_for_cleanup_start(num_members); + LOGINFO("LeaderReplace test done replica={}", g_helper->replica_num()); +} + +TEST_F(ReplDevDynamicTest, OneMemberRestart) { + // replica0(leader) is up and replica1 is restated, replica2 is down. Replace replica2 with replica3. + // replica0 should be able to baseline resync to replica4(new member). + // Write some IO's, replace a member, validate all members data except which is out. + LOGINFO("OneMemberRestart test started replica={}", g_helper->replica_num()); + auto db = dbs_.back(); + auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >(); + auto num_members = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >(); + uint64_t num_io_entries = SISL_OPTIONS["num_io"].as< uint64_t >(); + + // Replace the last member in the group with index(num_replicas - 1) with a spare + // replica with index (num_replica). Member id's are 0,...,num_replicas-1, num_replicas,...,N + uint32_t member_out = num_replicas - 1; + uint32_t member_in = num_replicas; + + g_helper->sync_for_test_start(num_members); + if (g_helper->replica_num() == 1) { + LOGINFO("Restart replica 1"); + this->restart_replica(15); + } + + if (g_helper->replica_num() == 0) { + // With existing raft repl dev group, write IO's, validate and call replace_member on leader. + LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); + this->write_on_leader(num_io_entries, true /* wait_for_commit */); + + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + std::this_thread::sleep_for(std::chrono::seconds(3)); + } else if (g_helper->replica_num() == member_in) { + LOGINFO("Wait for commits replica={}", g_helper->replica_num()); + wait_for_commits(num_io_entries); + } + + g_helper->sync_for_verify_start(num_members); + LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num()); + if (is_replica_num_in({0, 1, member_in})) { + // Skip the member which is going to be replaced. Validate data on all other replica's. + LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num()); + this->validate_data(); + } + + g_helper->sync_for_cleanup_start(num_members); + LOGINFO("OneMemberRestart test done replica={}", g_helper->replica_num()); +} int main(int argc, char* argv[]) { int parsed_argc = argc; From 804cd6beb7bfae3c21ac0264815d2425a220ee46 Mon Sep 17 00:00:00 2001 From: koujl <108138320+koujl@users.noreply.github.com> Date: Tue, 5 Nov 2024 15:09:31 +0800 Subject: [PATCH 019/170] Add protection for concurrent access to m_down_buffers (#577) Concurrent writes to m_down_buffers may cause data inconsistency. Add a mutex lock to IndexBuffer as well as extracting add/remove operations into member functions to make the vector thread-safe. 
Signed-off-by: Jilong Kou --- conanfile.py | 2 +- .../homestore/index/index_internal.hpp | 10 +++- src/lib/index/index_cp.cpp | 25 ++------- src/lib/index/index_service.cpp | 49 ++++++++++++++++-- src/lib/index/wb_cache.cpp | 51 ++++--------------- 5 files changed, 70 insertions(+), 67 deletions(-) diff --git a/conanfile.py b/conanfile.py index 0cb18bd1d..ca96905c5 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.5" + version = "6.5.6" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/index/index_internal.hpp b/src/include/homestore/index/index_internal.hpp index 85c2a304d..fea20dbd6 100644 --- a/src/include/homestore/index/index_internal.hpp +++ b/src/include/homestore/index/index_internal.hpp @@ -97,7 +97,8 @@ struct IndexBuffer : public sisl::ObjLifeCounter< IndexBuffer > { sisl::atomic_counter< int > m_wait_for_down_buffers{0}; // Number of children need to wait for before persisting #ifndef NDEBUG // Down buffers are not mandatory members, but only to keep track of any bugs and asserts - std::vector< std::weak_ptr< IndexBuffer > > m_down_buffers; + std::vector<std::weak_ptr<IndexBuffer>> m_down_buffers; + std::mutex m_down_buffers_mtx; std::shared_ptr< IndexBuffer > m_prev_up_buffer; // Keep a copy for debugging #endif @@ -123,6 +124,13 @@ struct IndexBuffer : public sisl::ObjLifeCounter< IndexBuffer > { std::string to_string() const; std::string to_string_dot() const; + + void add_down_buffer(const IndexBufferPtr &buf); + + void remove_down_buffer(const IndexBufferPtr &buf); +#ifndef NDEBUG + bool is_in_down_buffers(const IndexBufferPtr &buf); +#endif }; // This is a special buffer which is used to write to the meta block diff --git a/src/lib/index/index_cp.cpp b/src/lib/index/index_cp.cpp index d58e33c22..578fae997 100644 --- a/src/lib/index/index_cp.cpp +++ b/src/lib/index/index_cp.cpp @@ -266,33 +266,16 @@ void IndexCPContext::process_txn_record(txn_record const* rec, std::map< BlkId, #ifndef NDEBUG // if (!is_sibling_link || (buf->m_up_buffer == real_up_buf)) { return buf;} // Already linked with same buf or its not a sibling link to override - bool found{false}; - for (auto const& dbuf : real_up_buf->m_down_buffers) { - if (dbuf.lock() == buf) { - found = true; - break; - } + if (real_up_buf->is_in_down_buffers(buf)) { + return buf; } - if (found) { return buf; } - real_up_buf->m_down_buffers.emplace_back(buf); #endif if (buf->m_up_buffer != real_up_buf) { if (buf->m_up_buffer) { - buf->m_up_buffer->m_wait_for_down_buffers.decrement(1); -#ifndef NDEBUG - bool found{false}; - for (auto it = buf->m_up_buffer->m_down_buffers.begin(); it != buf->m_up_buffer->m_down_buffers.end(); ++it) { - if (it->lock() == buf) { - buf->m_up_buffer->m_down_buffers.erase(it); - found = true; - break; - } - } - HS_DBG_ASSERT(found, "Down buffer is linked to Up buf, but up_buf doesn't have down_buf in its list"); -#endif + buf->m_up_buffer->remove_down_buffer(buf); } - real_up_buf->m_wait_for_down_buffers.increment(1); + real_up_buf->add_down_buffer(buf); buf->m_up_buffer = real_up_buf; } } diff --git a/src/lib/index/index_service.cpp b/src/lib/index/index_service.cpp index 4b3fb5d11..49755a4ef 100644 --- a/src/lib/index/index_service.cpp +++ b/src/lib/index/index_service.cpp @@ -162,9 +162,12 @@ std::string IndexBuffer::to_string() const { // store m_down_buffers in a string std::string down_bufs = ""; #ifndef NDEBUG - for (auto const& down_buf : m_down_buffers) { - if (auto
ptr = down_buf.lock()) { - fmt::format_to(std::back_inserter(down_bufs), "[{}]", voidptr_cast(ptr.get())); + { + std::lock_guard lg(m_down_buffers_mtx); + for (auto const &down_buf: m_down_buffers) { + if (auto ptr = down_buf.lock()) { + fmt::format_to(std::back_inserter(down_bufs), "[{}]", voidptr_cast(ptr.get())); + } } } #endif @@ -178,6 +181,7 @@ std::string IndexBuffer::to_string() const { down_bufs); } } + std::string IndexBuffer::to_string_dot() const { auto str = fmt::format("IndexBuffer {} ", reinterpret_cast< void* >(const_cast< IndexBuffer* >(this))); if (m_bytes == nullptr) { @@ -190,6 +194,45 @@ std::string IndexBuffer::to_string_dot() const { return str; } +void IndexBuffer::add_down_buffer(const IndexBufferPtr &buf) { + m_wait_for_down_buffers.increment(); +#ifndef NDEBUG + { + std::lock_guard lg(m_down_buffers_mtx); + m_down_buffers.push_back(buf); + } +#endif +} + +void IndexBuffer::remove_down_buffer(const IndexBufferPtr &buf) { + m_wait_for_down_buffers.decrement(); +#ifndef NDEBUG + bool found{false}; { + std::lock_guard lg(m_down_buffers_mtx); + for (auto it = buf->m_up_buffer->m_down_buffers.begin(); it != buf->m_up_buffer->m_down_buffers.end(); ++it) { + if (it->lock() == buf) { + buf->m_up_buffer->m_down_buffers.erase(it); + found = true; + break; + } + } + } + HS_DBG_ASSERT(found, "Down buffer is linked to up_buf, but up_buf doesn't have down_buf in its list"); +#endif +} + +#ifndef NDEBUG +bool IndexBuffer::is_in_down_buffers(const IndexBufferPtr &buf) { + std::lock_guard lg(m_down_buffers_mtx); + for (auto const &dbuf: m_down_buffers) { + if (dbuf.lock() == buf) { + return true; + } + } + return false; +} +#endif + MetaIndexBuffer::MetaIndexBuffer(superblk< index_table_sb >& sb) : IndexBuffer{nullptr, BlkId{}}, m_sb{sb} { m_is_meta_buf = true; } diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp index b17d67dc8..04383d8ac 100644 --- a/src/lib/index/wb_cache.cpp +++ b/src/lib/index/wb_cache.cpp @@ -396,14 +396,8 @@ void IndexWBCache::link_buf(IndexBufferPtr const& up_buf, IndexBufferPtr const& HS_DBG_ASSERT((real_up_buf->m_dirtied_cp_id == down_buf->m_dirtied_cp_id) || (real_up_buf->is_meta_buf()), "Up buffer is not modified by current cp, but down buffer is linked to it"); #ifndef NDEBUG - bool found{false}; - for (auto const& dbuf : real_up_buf->m_down_buffers) { - if (dbuf.lock() == down_buf) { - found = true; - break; - } - } - HS_DBG_ASSERT(found, "Down buffer is linked to Up buf, but up_buf doesn't have down_buf in its list"); + HS_DBG_ASSERT(real_up_buf->is_in_down_buffers(down_buf), + "Down buffer is linked to Up buf, but up_buf doesn't have down_buf in its list"); #endif return; } @@ -412,25 +406,10 @@ void IndexWBCache::link_buf(IndexBufferPtr const& up_buf, IndexBufferPtr const& // Now we link the down_buffer to the real up_buffer if (down_buf->m_up_buffer) { // release existing up_buffer's wait count - down_buf->m_up_buffer->m_wait_for_down_buffers.decrement(); -#ifndef NDEBUG - bool found{false}; - for (auto it = down_buf->m_up_buffer->m_down_buffers.begin(); it != down_buf->m_up_buffer->m_down_buffers.end(); - ++it) { - if (it->lock() == down_buf) { - down_buf->m_up_buffer->m_down_buffers.erase(it); - found = true; - break; - } - } - HS_DBG_ASSERT(found, "Down buffer is linked to Up buf, but up_buf doesn't have down_buf in its list"); -#endif + down_buf->m_up_buffer->remove_down_buffer(down_buf); } - real_up_buf->m_wait_for_down_buffers.increment(1); down_buf->m_up_buffer = real_up_buf; -#ifndef NDEBUG - 
real_up_buf->m_down_buffers.emplace_back(down_buf); -#endif + real_up_buf->add_down_buffer(down_buf); } void IndexWBCache::free_buf(const IndexBufferPtr& buf, CPContext* cp_ctx) { @@ -535,21 +514,8 @@ void IndexWBCache::recover(sisl::byte_view sb) { pending_bufs.push_back(buf->m_up_buffer); } else { // Just ignore it - buf->m_up_buffer->m_wait_for_down_buffers.decrement(); -#ifndef NDEBUG - bool found{false}; - for (auto it = buf->m_up_buffer->m_down_buffers.begin(); - it != buf->m_up_buffer->m_down_buffers.end(); ++it) { - auto sp = it->lock(); - if (sp && sp == buf) { - found = true; - buf->m_up_buffer->m_down_buffers.erase(it); - break; - } - } - HS_DBG_ASSERT(found, - "Down buffer is linked to Up buf, but up_buf doesn't have down_buf in its list"); -#endif + buf->m_up_buffer->remove_down_buffer(buf); + buf->m_up_buffer = nullptr; } } } @@ -754,7 +720,10 @@ std::pair< IndexBufferPtr, bool > IndexWBCache::on_buf_flush_done_internal(Index IndexBufferPtr const& buf) { IndexBufferPtrList buf_list; #ifndef NDEBUG - buf->m_down_buffers.clear(); + { + std::lock_guard lg(buf->m_down_buffers_mtx); + buf->m_down_buffers.clear(); + } #endif buf->set_state(index_buf_state_t::CLEAN); From 50f42fff1bb629955e4c70fbfca5b5482dc13013 Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Wed, 6 Nov 2024 17:37:42 +0800 Subject: [PATCH 020/170] Implement GC_REPL_REQ Based on DSN to Prevent Resource Leaks (#576) * Implement GC_REPL_REQ Based on DSN to Prevent Resource Leaks This commit introduces a mechanism to garbage collect (GC) replication requests (rreqs) that may hang indefinitely, thereby consuming memory and disk resources unnecessarily. These rreqs can enter a hanging state under several circumstances, as outlined below: 1. Scenario with Delayed Commit: - Follower F1 receives LSN 100 and DSN 104 from Leader L1 and takes longer than the raft timeout to precommit/commit it. - L1 resends LSN 100, causing F1 to fetch the data again. Since LSN 100 was committed in a previous attempt, this log entry is skipped, leaving the rreq hanging indefinitely. 2. Scenario with Leader Failure Before Data Completion: - Follower F1 receives LSN 100 from L1, but before all data is fetched/pushed, L1 fails and L2 becomes the new leader. - L2 resends LSN 100 with L2 as the new originator. F1 proceeds with the new rreq and commits it, but the initial rreq from L1 hangs indefinitely as it cannot fetch data from the new leader L2. 3. Scenario with Leader Failure After Data Write: - Follower F1 receives data (DSN 104) from L1 and writes it. Before the log of LSN 100 reaches F1, L1 fails and L2 becomes the new leader. - L2 resends LSN 100 to F1, and F1 fetches DSN 104 from L2, leaving the original rreq hanging. This garbage collection process cleans up based on DSN. Any rreqs in `m_repl_key_req_map`, whose DSN is already committed (`rreq->dsn < repl_dev->m_next_dsn`), will be GC'd. This is safe on the follower side, as the follower updates `m_next_dsn` during commit. Any DSN below `cur_dsn` should already be committed, implying that the rreq should already be removed from `m_repl_key_req_map`. On the leader side, since `m_next_dsn` is updated when sending out the proposal, it is not safe to clean up based on `m_next_dsn`. Therefore, we explicitly skip the leader in this GC process. This change also skips localizing raft logs that we have already committed. The leader may send duplicate raft logs; if we localize them unconditionally, duplicate data will be written to the chunk during fetch_data.
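
As a rough, hedged illustration of that duplicate-log guard (simplified; entries_to_localize is a hypothetical helper, while start_lsn and last_commit_lsn mirror the hunk in raft_repl_dev.cpp below):

    #include <cstdint>
    #include <vector>

    // Entries whose LSN is at or below the last committed LSN are skipped
    // instead of being localized, since nuraft discards them before append.
    std::vector<uint64_t> entries_to_localize(uint64_t start_lsn, uint64_t last_commit_lsn, size_t n) {
        std::vector<uint64_t> out;
        for (size_t i = 0; i < n; ++i) {
            uint64_t const lsn = start_lsn + i;
            if (lsn <= last_commit_lsn) { continue; } // duplicate: skip localization
            out.push_back(lsn);
        }
        return out;
    }
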
It is safe for us to skip those logs that are already committed; there is no way those LSNs can be overwritten. Signed-off-by: Xiaoxi Chen --- conanfile.py | 2 +- src/include/homestore/replication/repl_dev.h | 2 +- .../replication/repl_dev/raft_repl_dev.cpp | 97 ++++++++++++++----- .../repl_dev/raft_state_machine.cpp | 11 ++- .../replication/repl_dev/raft_state_machine.h | 2 +- 5 files changed, 80 insertions(+), 34 deletions(-) diff --git a/conanfile.py b/conanfile.py index ca96905c5..06e091ba0 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.6" + version = "6.5.7" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index c2223455f..1abf5ea12 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -126,7 +126,7 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: friend class SoloReplDev; public: - repl_req_ctx() {} + repl_req_ctx() { m_start_time = Clock::now(); } virtual ~repl_req_ctx(); void init(repl_key rkey, journal_type_t op_code, bool is_proposer, sisl::blob const& user_header, sisl::blob const& key, uint32_t data_size); diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index f3a4a2461..b1ff61dbb 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -895,7 +895,7 @@ void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { // Remove the request from repl_key map. m_repl_key_req_map.erase(rreq->rkey()); // Remove the request from lsn map. - m_state_machine->unlink_lsn_to_req(rreq->lsn()); + m_state_machine->unlink_lsn_to_req(rreq->lsn(), rreq); auto cur_dsn = m_next_dsn.load(std::memory_order_relaxed); while (cur_dsn <= rreq->dsn()) { @@ -1191,9 +1191,22 @@ std::pair< bool, nuraft::cb_func::ReturnCode > RaftReplDev::handle_raft_event(nu entries.size()); auto reqs = sisl::VectorPool< repl_req_ptr_t >::alloc(); - for (auto& entry : entries) { + auto last_commit_lsn = uint64_cast(get_last_commit_lsn()); + for (unsigned long i = 0; i < entries.size(); i++) { + auto& entry = entries[i]; + auto lsn = start_lsn + i; + auto term = entry->get_term(); if (entry->get_val_type() != nuraft::log_val_type::app_log) { continue; } if (entry->get_buf_ptr()->size() == 0) { continue; } + // skipping localize for already committed log(dup), they anyway will be discarded + // by nuraft before append_log. + if (lsn <= last_commit_lsn) { + RD_LOGT("Raft channel: term {}, lsn {}, skipping dup, last_commit_lsn {}", term, lsn, + last_commit_lsn); + continue; + } + // Those LSNs already in logstore but not yet committed, will be deduped here; + // applier_create_req will return same req as previous one auto req = m_state_machine->localize_journal_entry_prepare(*entry); if (req == nullptr) { sisl::VectorPool< repl_req_ptr_t >::free(reqs); @@ -1265,39 +1278,71 @@ cshared< ReplDevCPContext > RaftReplDev::get_cp_ctx(CP* cp) { void RaftReplDev::cp_cleanup(CP*) {} void RaftReplDev::gc_repl_reqs() { - std::vector< int64_t > expired_keys; - m_state_machine->iterate_repl_reqs([this, &expired_keys](auto key, auto rreq) { + auto cur_dsn = m_next_dsn.load(); + if (cur_dsn != 0) cur_dsn = cur_dsn - 1; + // On follower, DSN below cur_dsn should very likely be committed.
+ // It is not guaranteed because DSN and LSN are generated separately, + // DSN in async_alloc_write before pushing data, LSN later when + // proposing to raft. Two simultaneous write requests on leader can have + // (LSN1, DSN2) and (LSN2, DSN1) during the window. + std::vector< repl_req_ptr_t > expired_rreqs; + + auto req_map_size = m_repl_key_req_map.size(); + RD_LOGI("m_repl_key_req_map size is {};", req_map_size); + for (auto [key, rreq] : m_repl_key_req_map) { + // FIXME: Skipping proposer for now, the DSN in proposer increased in proposing stage, not when commit(). + // Need other mechanism. + if (rreq->is_proposer()) { + // don't clean up proposer's request + continue; + } + if (rreq->dsn() < cur_dsn && rreq->is_expired()) { + // The DSN can be out of order, wait till rreq expired. + RD_LOGD("legacy req with committed DSN, rreq=[{}] , dsn = {}, next_dsn = {}, gap= {}, elapsed_time_sec {}", + rreq->to_string(), rreq->dsn(), cur_dsn, cur_dsn - rreq->dsn(), + get_elapsed_time_sec(rreq->created_time())); + expired_rreqs.push_back(rreq); + } + } + int sm_req_cnt = 0; + // FIXME: we ensured data written before appending log to log store, in which we add rreq to state_machine + // and during pre-commit/commit we retrieve rreq from state_machine. Removing requests outside of state + // machine is risky. + // Below logs are logging only, can be removed once we get more confidence. + m_state_machine->iterate_repl_reqs([this, cur_dsn, &sm_req_cnt](auto key, auto rreq) { + sm_req_cnt++; if (rreq->is_proposer()) { // don't clean up proposer's request return; } - if (rreq->is_expired()) { - expired_keys.push_back(key); - RD_LOGD("rreq=[{}] is expired, cleaning up; elapsed_time_sec{};", rreq->to_string(), + RD_LOGD("StateMachine: rreq=[{}] is expired, elapsed_time_sec{};", rreq->to_string(), get_elapsed_time_sec(rreq->created_time())); - - // do garbage collection - // 1. free the allocated blocks - if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { - auto blkid = rreq->local_blkid(); - data_service().async_free_blk(blkid).thenValue([this, blkid](auto&& err) { - HS_LOG_ASSERT(!err, "freeing blkid={} upon error failed, potential to cause blk leak", - blkid.to_string()); - RD_LOGD("blkid={} freed successfully", blkid.to_string()); - }); - } - - // 2. remove from the m_repl_key_req_map - // handle_error during fetch data response might have already removed the rreq from the this map - if (m_repl_key_req_map.find(rreq->rkey()) != m_repl_key_req_map.end()) { - m_repl_key_req_map.erase(rreq->rkey()); - } } }); + RD_LOGI("state_machine req map size is {};", sm_req_cnt); - for (auto const& l : expired_keys) { - m_state_machine->unlink_lsn_to_req(l); + for (auto removing_rreq : expired_rreqs) { + // once log flushed, the commit progress controlled by raft + if (removing_rreq->has_state(repl_req_state_t::LOG_FLUSHED)) { + RD_LOGI("Skipping GC rreq [{}] because it is in state machine", removing_rreq->to_string()); + continue; + } + // do garbage collection + // 1. free the allocated blocks + RD_LOGI("Removing rreq [{}]", removing_rreq->to_string()); + if (removing_rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { + auto blkid = removing_rreq->local_blkid(); + data_service().async_free_blk(blkid).thenValue([this, blkid](auto&& err) { + HS_LOG_ASSERT(!err, "freeing blkid={} upon error failed, potential to cause blk leak", + blkid.to_string()); + RD_LOGD("GC rreq: Releasing blkid={} freed successfully", blkid.to_string()); + }); + } + // 2.
remove from the m_repl_key_req_map + if (m_repl_key_req_map.find(removing_rreq->rkey()) != m_repl_key_req_map.end()) { + m_repl_key_req_map.erase(removing_rreq->rkey()); + } } } diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index d1b210526..ae8f2a193 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -221,11 +221,12 @@ uint64_t RaftStateMachine::last_commit_index() { void RaftStateMachine::become_ready() { m_rd.become_ready(); } -void RaftStateMachine::unlink_lsn_to_req(int64_t lsn) { - auto const it = m_lsn_req_map.find(lsn); - if (it != m_lsn_req_map.cend()) { - RD_LOG(DEBUG, "Raft channel: erase lsn {}, rreq {}", lsn, it->second->to_string()); - m_lsn_req_map.erase(lsn); +void RaftStateMachine::unlink_lsn_to_req(int64_t lsn, repl_req_ptr_t rreq) { + // it is possible a LSN mapped to different rreq in history + // due to log overwritten. Verify the rreq before removing + auto deleted = m_lsn_req_map.erase_if_equal(lsn, rreq); + if (deleted) { + RD_LOG(DEBUG, "Raft channel: erase lsn {}, rreq {}", lsn, rreq->to_string()); } } diff --git a/src/lib/replication/repl_dev/raft_state_machine.h b/src/lib/replication/repl_dev/raft_state_machine.h index b931e42f4..a19d9a0ec 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.h +++ b/src/lib/replication/repl_dev/raft_state_machine.h @@ -126,7 +126,7 @@ class RaftStateMachine : public nuraft::state_machine { repl_req_ptr_t localize_journal_entry_prepare(nuraft::log_entry& lentry); repl_req_ptr_t localize_journal_entry_finish(nuraft::log_entry& lentry); void link_lsn_to_req(repl_req_ptr_t rreq, int64_t lsn); - void unlink_lsn_to_req(int64_t lsn); + void unlink_lsn_to_req(int64_t lsn, repl_req_ptr_t rreq); repl_req_ptr_t lsn_to_req(int64_t lsn); nuraft_mesg::repl_service_ctx* group_msg_service(); From 3882211dacc70cd977f71d29217ccde071a2bbd7 Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Thu, 7 Nov 2024 12:01:22 +0800 Subject: [PATCH 021/170] Releasing data buf from memory after written to disk. Data buffer persists in memory until rreq is committed or rolled back. This approach poses issues during recovery. As new data arrives via push_data and is written to disk, it remains in memory for an extended period until the replica catches up and commits the rreq. 
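
Conceptually the change looks like the following hedged sketch (simplified, hypothetical names; the actual hooks are in the diff below):

    #include <cstdint>
    #include <memory>
    #include <vector>

    struct ReplReq {
        std::unique_ptr< std::vector< uint8_t > > payload; // pushed/fetched data
        void release_data() { payload.reset(); }           // drop the bytes, keep metadata
    };

    // Hypothetical completion hook: once the async disk write finishes, the
    // payload is durable, so the in-memory copy can be freed immediately
    // instead of lingering until the rreq is committed or rolled back.
    void on_data_written(ReplReq& rreq) {
        rreq.release_data();
    }
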
Signed-off-by: Xiaoxi Chen --- conanfile.py | 2 +- src/include/homestore/replication/repl_dev.h | 6 +++++- src/lib/replication/repl_dev/common.cpp | 9 ++++++++- src/lib/replication/repl_dev/raft_repl_dev.cpp | 2 ++ 4 files changed, 16 insertions(+), 3 deletions(-) diff --git a/conanfile.py b/conanfile.py index 06e091ba0..6baa5e900 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.7" + version = "6.5.8" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index 1abf5ea12..cf0e00a0c 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -143,7 +143,10 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: sisl::blob const& key() const { return m_key; } MultiBlkId const& local_blkid() const { return m_local_blkid; } RemoteBlkId const& remote_blkid() const { return m_remote_blkid; } - const char* data() const { return r_cast< const char* >(m_data); } + const char* data() const { + DEBUG_ASSERT(m_data != nullptr, "m_data is nullptr, use before save_pushed/fetched_data or after release_data()"); + return r_cast< const char* >(m_data); + } repl_req_state_t state() const { return repl_req_state_t(m_state.load()); } bool has_state(repl_req_state_t s) const { return m_state.load() & uint32_cast(s); } repl_journal_entry const* journal_entry() const { return m_journal_entry; } @@ -209,6 +212,7 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: bool add_state_if_not_already(repl_req_state_t s); void set_lentry(nuraft::ptr< nuraft::log_entry > const& lentry) { m_lentry = lentry; } void clear(); + void release_data(); flatbuffers::FlatBufferBuilder& create_fb_builder() { return m_fb_builder; } void release_fb_builder() { m_fb_builder.Release(); } diff --git a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp index b8800afea..4fcbb0f4e 100644 --- a/src/lib/replication/repl_dev/common.cpp +++ b/src/lib/replication/repl_dev/common.cpp @@ -164,12 +164,19 @@ bool repl_req_ctx::add_state_if_not_already(repl_req_state_t s) { void repl_req_ctx::clear() { m_header = sisl::blob{}; m_key = sisl::blob{}; + release_data(); + m_pkts.clear(); +} + +void repl_req_ctx::release_data() { + m_data = nullptr; + // explicitly clear m_buf_for_unaligned_data as unaligned pushdata/fetchdata will be saved here + m_buf_for_unaligned_data = sisl::io_blob_safe{}; if (m_pushed_data) { m_pushed_data->send_response(); m_pushed_data = nullptr; } m_fetched_data = sisl::GenericClientResponse{}; - m_pkts.clear(); } static std::string req_state_name(uint32_t state) { diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index b1ff61dbb..59916d039 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -444,6 +444,7 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d } else { rreq->add_state(repl_req_state_t::DATA_WRITTEN); rreq->m_data_written_promise.setValue(); + rreq->release_data(); const auto data_log_diff_us = push_data_rcv_time.time_since_epoch().count() > rreq->created_time().time_since_epoch().count() ? 
get_elapsed_time_us(rreq->created_time(), push_data_rcv_time) @@ -862,6 +863,7 @@ void RaftReplDev::handle_fetch_data_response(sisl::GenericClientResponse respons "Error in writing data"); // TODO: Find a way to return error to the Listener rreq->add_state(repl_req_state_t::DATA_WRITTEN); rreq->m_data_written_promise.setValue(); + rreq->release_data(); RD_LOGD("Data Channel: Data Write completed rreq=[{}], data_write_latency_us={}, " "total_write_latency_us={}, write_num_pieces={}", From f8426dc86ff51796ee6e7032e8d0cc978ec085a1 Mon Sep 17 00:00:00 2001 From: ywz <649521587@qq.com> Date: Mon, 11 Nov 2024 15:35:53 +0800 Subject: [PATCH 022/170] add rollback on state machine add open Leader_Restart ut (#585) * add rollback on state machine --------- Signed-off-by: yawzhang --- conanfile.py | 2 +- .../log_store/home_raft_log_store.cpp | 16 ++++++++++++++++ .../replication/repl_dev/raft_repl_dev.cpp | 19 +++++++++++++++++++ src/lib/replication/repl_dev/raft_repl_dev.h | 1 + .../repl_dev/raft_state_machine.cpp | 19 ++++++++++++++++++- .../replication/repl_dev/raft_state_machine.h | 3 ++- src/tests/test_raft_repl_dev.cpp | 2 +- 7 files changed, 58 insertions(+), 4 deletions(-) diff --git a/conanfile.py b/conanfile.py index 6baa5e900..076e8aa43 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.8" + version = "6.5.9" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/log_store/home_raft_log_store.cpp b/src/lib/replication/log_store/home_raft_log_store.cpp index 3e82f7aa3..293494274 100644 --- a/src/lib/replication/log_store/home_raft_log_store.cpp +++ b/src/lib/replication/log_store/home_raft_log_store.cpp @@ -184,6 +184,22 @@ void HomeRaftLogStore::write_at(ulong index, nuraft::ptr< nuraft::log_entry >& e m_log_store->append_async(sisl::io_blob{buf->data_begin(), uint32_cast(buf->size()), false /* is_aligned */}, nullptr /* cookie */, [buf](int64_t, sisl::io_blob&, logdev_key, void*) {}); + + auto position_in_cache = index % m_log_entry_cache.size(); + { + std::unique_lock lk(m_mutex); + m_log_entry_cache[position_in_cache] = std::make_pair(index, entry); + + // remove all cached entries after this index + for (size_t i{0}; i < m_log_entry_cache.size(); ++i) { + if (m_log_entry_cache[i].first > index) { + m_log_entry_cache[i] = std::make_pair(0, nullptr); + } + } + } + + // flushing the log before returning to ensure new(over-written) log is persisted to disk. + end_of_append_batch(index, 1); } void HomeRaftLogStore::end_of_append_batch(ulong start, ulong cnt) { diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 59916d039..dea117736 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -891,6 +891,25 @@ void RaftReplDev::commit_blk(repl_req_ptr_t rreq) { } } +void RaftReplDev::handle_rollback(repl_req_ptr_t rreq) { + // 1. call the listener to rollback + m_listener->on_rollback(rreq->lsn(), rreq->header(), rreq->key(), rreq); + + // 2. remove the request from maps + m_state_machine->unlink_lsn_to_req(rreq->lsn(), rreq); + m_repl_key_req_map.erase(rreq->rkey()); + + // 3. 
free the allocated blocks + if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { + auto blkid = rreq->local_blkid(); + data_service().async_free_blk(blkid).thenValue([this, blkid](auto&& err) { + HS_LOG_ASSERT(!err, "freeing blkid={} upon error failed, potential to cause blk leak", + blkid.to_string()); + RD_LOGD("Rollback rreq: Releasing blkid={} freed successfully", blkid.to_string()); + }); + } +} + void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { commit_blk(rreq); diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 4be98394c..5cb1516ac 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -195,6 +195,7 @@ class RaftReplDev : public ReplDev, //////////////// Methods needed for other Raft classes to access ///////////////// void use_config(json_superblk raft_config_sb); void handle_commit(repl_req_ptr_t rreq, bool recovery = false); + void handle_rollback(repl_req_ptr_t rreq); repl_req_ptr_t repl_key_to_req(repl_key const& rkey) const; repl_req_ptr_t applier_create_req(repl_key const& rkey, journal_type_t code, sisl::blob const& user_header, sisl::blob const& key, uint32_t data_size, bool is_data_channel); diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index ae8f2a193..654851dc8 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -208,6 +208,23 @@ void RaftStateMachine::commit_config(const ulong log_idx, raft_cluster_config_pt // TODO:add more logic here if necessary } +void RaftStateMachine::rollback_config(const ulong log_idx, raft_cluster_config_ptr_t& conf) { + RD_LOGD("Raft channel: Rollback cluster conf , log_idx = {}", log_idx); + // TODO:add more logic here if necessary +} + +void RaftStateMachine::rollback_ext(const nuraft::state_machine::ext_op_params& params) { + int64_t lsn = s_cast< int64_t >(params.log_idx); + repl_req_ptr_t rreq = lsn_to_req(lsn); + if (rreq == nullptr) { + RD_LOG(ERROR, "Raft channel: Rollback lsn {} rreq not found", lsn); + return; + } + + RD_LOGD("Raft channel: Rollback lsn {}, rreq=[{}]", lsn, rreq->to_string()); + m_rd.handle_rollback(rreq); +} + void RaftStateMachine::iterate_repl_reqs(std::function< void(int64_t, repl_req_ptr_t rreq) > const& cb) { for (auto [key, rreq] : m_lsn_req_map) { cb(key, rreq); @@ -236,7 +253,7 @@ void RaftStateMachine::link_lsn_to_req(repl_req_ptr_t rreq, int64_t lsn) { // reset the rreq created_at time to now https://github.com/eBay/HomeStore/issues/506 rreq->set_created_time(); [[maybe_unused]] auto r = m_lsn_req_map.insert(lsn, std::move(rreq)); - RD_DBG_ASSERT_EQ(r.second, true, "lsn={} already in precommit list", lsn); + RD_DBG_ASSERT_EQ(r.second, true, "lsn={} already in precommit list, exist_term={}", lsn, r.first->second->term()); } repl_req_ptr_t RaftStateMachine::lsn_to_req(int64_t lsn) { diff --git a/src/lib/replication/repl_dev/raft_state_machine.h b/src/lib/replication/repl_dev/raft_state_machine.h index a19d9a0ec..6bf4faf5a 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.h +++ b/src/lib/replication/repl_dev/raft_state_machine.h @@ -109,7 +109,8 @@ class RaftStateMachine : public nuraft::state_machine { raft_buf_ptr_t pre_commit_ext(const nuraft::state_machine::ext_op_params& params) override; raft_buf_ptr_t commit_ext(const nuraft::state_machine::ext_op_params& params) override; void commit_config(const ulong 
                       log_idx, raft_cluster_config_ptr_t& new_conf) override;
-    void rollback(uint64_t lsn, nuraft::buffer&) override { LOGCRITICAL("Unimplemented rollback on: [{}]", lsn); }
+    void rollback_config(const ulong log_idx, raft_cluster_config_ptr_t& conf) override;
+    void rollback_ext(const nuraft::state_machine::ext_op_params& params) override;
     void become_ready();
     void create_snapshot(nuraft::snapshot& s, nuraft::async_result< bool >::handler_type& when_done) override;

diff --git a/src/tests/test_raft_repl_dev.cpp b/src/tests/test_raft_repl_dev.cpp
index 9ccc40dfc..169fc7f8a 100644
--- a/src/tests/test_raft_repl_dev.cpp
+++ b/src/tests/test_raft_repl_dev.cpp
@@ -152,6 +152,7 @@ TEST_F(RaftReplDevTest, Resync_From_Non_Originator) {
 }

 #if 0
+
 TEST_F(RaftReplDevTest, Leader_Restart) {
     LOGINFO("Homestore replica={} setup completed", g_helper->replica_num());
     g_helper->sync_for_test_start();
@@ -176,7 +177,6 @@ TEST_F(RaftReplDevTest, Leader_Restart) {
     g_helper->sync_for_cleanup_start();
 }

-
 TEST_F(RaftReplDevTest, Drop_Raft_Entry_Switch_Leader) {
     LOGINFO("Homestore replica={} setup completed", g_helper->replica_num());
     g_helper->sync_for_test_start();

From 8452fc54437b940d486ab0323e3d4eacc363971d Mon Sep 17 00:00:00 2001
From: Xiaoxi Chen
Date: Wed, 13 Nov 2024 09:36:18 +0800
Subject: [PATCH 023/170] PushData only pushed to active followers. (#584)

* PushData only pushed to active followers.

If a follower is lagging too far behind, do not flood it with data from new IOs (new rreqs, new LSNs); reserve that capacity for catching up. The lagging follower can still request data via FetchData.

Signed-off-by: Xiaoxi Chen
---
 conanfile.py                                 |  2 +-
 src/lib/common/homestore_config.fbs          |  4 ++
 .../replication/repl_dev/raft_repl_dev.cpp   | 58 ++++++++++++++-----
 src/lib/replication/repl_dev/raft_repl_dev.h |  1 +
 4 files changed, 48 insertions(+), 17 deletions(-)

diff --git a/conanfile.py b/conanfile.py
index 076e8aa43..fd09a4e93 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@ class HomestoreConan(ConanFile):
     name = "homestore"
-    version = "6.5.9"
+    version = "6.5.10"
     homepage = "https://github.com/eBay/Homestore"
     description = "HomeStore Storage Engine"

diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs
index 24aae9f20..32ba410f6 100644
--- a/src/lib/common/homestore_config.fbs
+++ b/src/lib/common/homestore_config.fbs
@@ -264,6 +264,10 @@ table Consensus {

     // Log difference to determine if the follower is in resync mode
     resync_log_idx_threshold: int64 = 100;
+
+    // Log difference, from the leader's point of view, to determine if a
+    // follower is laggy; if so, the leader will stop pushing data to it until the gap drops under this threshold.
+ laggy_threshold: int64 = 2000; } table HomeStoreSettings { diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index dea117736..2b116f896 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -363,23 +363,30 @@ void RaftReplDev::push_data_to_all_followers(repl_req_ptr_t rreq, sisl::sg_list flatbuffers::FlatBufferToString(builder.GetBufferPointer() + sizeof(flatbuffers::uoffset_t), PushDataRequestTypeTable()));*/ - RD_LOGD("Data Channel: Pushing data to all followers: rreq=[{}]", rreq->to_string()); - - group_msg_service() - ->data_service_request_unidirectional(nuraft_mesg::role_regex::ALL, PUSH_DATA, rreq->m_pkts) - .via(&folly::InlineExecutor::instance()) - .thenValue([this, rreq = std::move(rreq)](auto e) { - if (e.hasError()) { - RD_LOGE("Data Channel: Error in pushing data to all followers: rreq=[{}] error={}", rreq->to_string(), - e.error()); - handle_error(rreq, RaftReplService::to_repl_error(e.error())); - return; + auto peers = get_active_peers(); + auto calls = std::vector< nuraft_mesg::NullAsyncResult >(); + for (auto peer : peers) { + RD_LOGD("Data Channel: Pushing data to follower {}, rreq=[{}]", peer, rreq->to_string()); + calls.push_back(group_msg_service() + ->data_service_request_unidirectional(peer, PUSH_DATA, rreq->m_pkts) + .via(&folly::InlineExecutor::instance())); + } + folly::collectAllUnsafe(calls).thenValue([this, rreq](auto&& v_res) { + for (auto const& res : v_res) { + if (sisl_likely(res.value())) { + auto r = res.value(); + if (r.hasError()) { + // Just logging PushData error, no action is needed as follower can try by fetchData. + RD_LOGW("Data Channel: Error in pushing data to all followers: rreq=[{}] error={}", + rreq->to_string(), r.error()); + } } - // Release the buffer which holds the packets - RD_LOGD("Data Channel: Data push completed for rreq=[{}]", rreq->to_string()); - rreq->release_fb_builder(); - rreq->m_pkts.clear(); - }); + } + RD_LOGD("Data Channel: Data push completed for rreq=[{}]", rreq->to_string()); + // Release the buffer which holds the packets + rreq->release_fb_builder(); + rreq->m_pkts.clear(); + }); } void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_data) { @@ -1039,6 +1046,25 @@ std::vector< peer_info > RaftReplDev::get_replication_status() const { return pi; } +std::set< replica_id_t > RaftReplDev::get_active_peers() const { + auto repl_status = get_replication_status(); + std::set< replica_id_t > res; + auto my_committed_idx = m_commit_upto_lsn.load(); + uint64_t least_active_repl_idx = my_committed_idx > HS_DYNAMIC_CONFIG(consensus.laggy_threshold) + ? 
my_committed_idx - HS_DYNAMIC_CONFIG(consensus.laggy_threshold) + : 0; + for (auto p : repl_status) { + if (p.id_ == m_my_repl_id) { continue; } + if (p.replication_idx_ >= least_active_repl_idx) { + res.insert(p.id_); + } else { + RD_LOGW("Excluding peer {} from active_peers, lag {}, my lsn {}, peer lsn {}", p.id_, + my_committed_idx - p.replication_idx_, my_committed_idx, p.replication_idx_); + } + } + return res; +} + uint32_t RaftReplDev::get_blk_size() const { return data_service().get_blk_size(); } nuraft_mesg::repl_service_ctx* RaftReplDev::group_msg_service() { return m_repl_svc_ctx.get(); } diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 5cb1516ac..0d5c8b8d8 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -176,6 +176,7 @@ class RaftReplDev : public ReplDev, bool is_leader() const override; replica_id_t get_leader_id() const override; std::vector< peer_info > get_replication_status() const override; + std::set< replica_id_t > get_active_peers() const; group_id_t group_id() const override { return m_group_id; } std::string group_id_str() const { return boost::uuids::to_string(m_group_id); } std::string rdev_name() const { return m_rdev_name; } From 6f6b4fbb8623c8ec9e6fdd9e83df1f84794bd0c6 Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Wed, 13 Nov 2024 09:36:29 +0800 Subject: [PATCH 024/170] Set min_log_gap_to_join to max_int32 and enabled new_joiner_type Signed-off-by: Xiaoxi Chen --- conanfile.py | 2 +- src/lib/common/homestore_config.fbs | 2 +- src/lib/replication/service/raft_repl_service.cpp | 9 ++++----- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/conanfile.py b/conanfile.py index fd09a4e93..05c596b80 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.10" + version = "6.5.11" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs index 32ba410f6..da058fdb6 100644 --- a/src/lib/common/homestore_config.fbs +++ b/src/lib/common/homestore_config.fbs @@ -242,7 +242,7 @@ table Consensus { stale_log_gap_lo_threshold: int32 = 30; // Minimum log gap a replica has to be from leader before joining the replica set. - min_log_gap_to_join: int32 = 30; + min_log_gap_to_join: int32 = 2147483647; // amount of time in millis to wait on data write before fetch data from remote; wait_data_write_timer_ms: uint64 = 1500 (hotswap); diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index 0469d7829..1ec45d9d0 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -99,18 +99,17 @@ void RaftReplService::start() { .with_hb_interval(HS_DYNAMIC_CONFIG(consensus.heartbeat_period_ms)) .with_max_append_size(HS_DYNAMIC_CONFIG(consensus.max_append_batch_size)) .with_log_sync_batch_size(HS_DYNAMIC_CONFIG(consensus.log_sync_batch_size)) - // TODO to fix the log_gap thresholds when adding new member. - // When the option is enabled, new member is doing log sync is stuck after the first batch - // where if the option is disabled, new member is going through append entries and it works. 
-#if 0
         .with_log_sync_stopping_gap(HS_DYNAMIC_CONFIG(consensus.min_log_gap_to_join))
-#endif
         .with_stale_log_gap(HS_DYNAMIC_CONFIG(consensus.stale_log_gap_hi_threshold))
         .with_fresh_log_gap(HS_DYNAMIC_CONFIG(consensus.stale_log_gap_lo_threshold))
         .with_snapshot_enabled(HS_DYNAMIC_CONFIG(consensus.snapshot_freq_distance))
         .with_leadership_expiry(HS_DYNAMIC_CONFIG(consensus.leadership_expiry_ms))
         .with_reserved_log_items(HS_DYNAMIC_CONFIG(consensus.num_reserved_log_items))
         .with_auto_forwarding(false);
+    // new_joiner_type fully disables the log pack behavior.
+    // There is no callback available for handling and localizing the log entries within the pack, which could
+    // result in data corruption.
+    r_params.use_new_joiner_type_ = true;
     r_params.return_method_ = nuraft::raft_params::async_handler;
     m_msg_mgr->register_mgr_type(params.default_group_type_, r_params);

From 328cef339878f00d62072f1d841392ed6c2b7524 Mon Sep 17 00:00:00 2001
From: Jie Yao
Date: Wed, 13 Nov 2024 14:32:26 +0800
Subject: [PATCH 025/170] handle negative log batch size returned by follower (#588)

When a follower hits an error before appending log entries, it sets batch_size_hint_in_bytes to -1 to ask the leader not to send more log entries in the next append_log_req.
https://github.com/eBay/NuRaft/blob/eabdeeda538a27370943f79a2b08b5738b697ac3/src/handle_append_entries.cxx#L760

In the nuobject case, if a new member is added to a raft group and tries to append a create_shard log entry (which allocates a block from the chunks of the pg) before the create_pg log is committed (which allocates chunks to that pg), an error occurs and the log batch containing the create_shard entry is wholly rejected, with batch_size_hint_in_bytes set to -1 in the response to the leader.

This PR sets the log count of the next batch sent to the follower to 1 in that case, so that: if create_pg and create_shard are in the same log batch, the follower first rejects the batch and the leader then sends only create_pg in the next batch, which the follower accepts since it only creates the pg. If create_pg and create_shard are not in the same log batch, and create_shard tries to allocate a block before its pg is created (i.e. before chunks of the pg are allocated), the follower rejects the batch, which gives the pg creation more time; the create_shard log is resent in the next batch, by which time the pg has most likely already been created successfully.
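The leader-side adjustment can be illustrated with a small sketch, simplified from the log_entries_ext() change in the diff below:

    #include <cstdint>

    // When the follower returned a negative batch size hint, clamp the next
    // batch to a single entry so a rejected batch can make progress one log
    // at a time; a non-negative hint is not used for size limiting yet.
    uint64_t next_batch_end(uint64_t start, uint64_t end, int64_t batch_size_hint_in_bytes) {
        if (batch_size_hint_in_bytes < 0) { return start + 1; }
        return end;
    }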
---
 .../log_store/home_raft_log_store.cpp | 16 +++++++++++---
 .../log_store/home_raft_log_store.h   | 22 +++++++++++++++++++
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/src/lib/replication/log_store/home_raft_log_store.cpp b/src/lib/replication/log_store/home_raft_log_store.cpp
index 293494274..a9f7c0301 100644
--- a/src/lib/replication/log_store/home_raft_log_store.cpp
+++ b/src/lib/replication/log_store/home_raft_log_store.cpp
@@ -192,9 +192,7 @@ void HomeRaftLogStore::write_at(ulong index, nuraft::ptr< nuraft::log_entry >& e

         // remove all cached entries after this index
         for (size_t i{0}; i < m_log_entry_cache.size(); ++i) {
-            if (m_log_entry_cache[i].first > index) {
-                m_log_entry_cache[i] = std::make_pair(0, nullptr);
-            }
+            if (m_log_entry_cache[i].first > index) { m_log_entry_cache[i] = std::make_pair(0, nullptr); }
         }
     }

@@ -223,6 +221,18 @@ nuraft::ptr< std::vector< nuraft::ptr< nuraft::log_entry > > > HomeRaftLogStore:
     return out_vec;
 }

+nuraft::ptr< std::vector< nuraft::ptr< nuraft::log_entry > > >
+HomeRaftLogStore::log_entries_ext(ulong start, ulong end, int64_t batch_size_hint_in_bytes) {
+    // In nuraft, batch_size_hint_in_bytes < 0 indicates that the follower is busy now and does not want to receive
+    // any more log entries at the moment. Here we just send one log entry if this happens, which is helpful for the
+    // nuobject case and harmless for other cases.
+    if (batch_size_hint_in_bytes < 0) end = start + 1;
+
+    // For the case where batch_size_hint_in_bytes >= 0, we do not take any size check here for now.
+    // TODO: limit the size of the returned entries by batch_size_hint_in_bytes in the future if necessary.
+    return log_entries(start, end);
+}
+
 nuraft::ptr< nuraft::log_entry > HomeRaftLogStore::entry_at(ulong index) {
     auto positio_in_cache = index % m_log_entry_cache.size();
     {

diff --git a/src/lib/replication/log_store/home_raft_log_store.h b/src/lib/replication/log_store/home_raft_log_store.h
index ccf46ef92..3c4c021ef 100644
--- a/src/lib/replication/log_store/home_raft_log_store.h
+++ b/src/lib/replication/log_store/home_raft_log_store.h
@@ -99,12 +99,34 @@ class HomeRaftLogStore : public nuraft::log_store {
     /**
      * Get log entries with index [start, end).
      *
+     * Return nullptr to indicate error if any log entry within the requested range
+     * could not be retrieved (e.g. due to external log truncation).
+     *
      * @param start The start log index number (inclusive).
      * @param end The end log index number (exclusive).
      * @return The log entries between [start, end).
      */
     virtual nuraft::ptr< std::vector< nuraft::ptr< nuraft::log_entry > > > log_entries(ulong start, ulong end) override;

+    /**
+     * Get log entries with index [start, end).
+     *
+     * The total size of the returned entries is limited by batch_size_hint.
+     *
+     * Return nullptr to indicate error if any log entry within the requested range
+     * could not be retrieved (e.g. due to external log truncation).
+     *
+     * @param start The start log index number (inclusive).
+     * @param end The end log index number (exclusive).
+     * @param batch_size_hint_in_bytes Total size (in bytes) of the returned entries,
+     *        see the detailed comment at
+     *        `state_machine::get_next_batch_size_hint_in_bytes()`.
+     * @return The log entries between [start, end) and limited by the total size
+     *         given by the batch_size_hint_in_bytes.
+     */
+    virtual nuraft::ptr< std::vector< nuraft::ptr< nuraft::log_entry > > >
+    log_entries_ext(ulong start, ulong end, int64_t batch_size_hint_in_bytes = 0) override;
+
     /**
      * Get the log entry at the specified log index number.
     *

From f83679af8ce71e71fb5b6f57df6d54047cd7d940 Mon Sep 17 00:00:00 2001
From: Xiaoxi Chen
Date: Mon, 11 Nov 2024 15:14:57 +0800
Subject: [PATCH 026/170] Check received data size and reject if it does not match.

We don't need to panic in this case; FetchData can handle it.

Signed-off-by: Xiaoxi Chen
---
 conanfile.py                                   | 2 +-
 src/lib/replication/repl_dev/raft_repl_dev.cpp | 8 ++++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/conanfile.py b/conanfile.py
index 05c596b80..0bd95ab50 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@ class HomestoreConan(ConanFile):
     name = "homestore"
-    version = "6.5.11"
+    version = "6.5.12"
     homepage = "https://github.com/eBay/Homestore"
     description = "HomeStore Storage Engine"

diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp
index 2b116f896..6cf4411dd 100644
--- a/src/lib/replication/repl_dev/raft_repl_dev.cpp
+++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp
@@ -401,8 +401,12 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d
     auto const fb_size =
         flatbuffers::ReadScalar< flatbuffers::uoffset_t >(incoming_buf.cbytes()) + sizeof(flatbuffers::uoffset_t);
     auto push_req = GetSizePrefixedPushDataRequest(incoming_buf.cbytes());
-    HS_DBG_ASSERT_EQ(fb_size + push_req->data_size(), incoming_buf.size(), "Size mismatch of data size vs buffer size");
-
+    if (fb_size + push_req->data_size() != incoming_buf.size()) {
+        RD_LOGW("Data Channel: PushData received with size mismatch, header size {}, data size {}, received size {}",
+                fb_size, push_req->data_size(), incoming_buf.size());
+        rpc_data->send_response();
+        return;
+    }
     sisl::blob header = sisl::blob{push_req->user_header()->Data(), push_req->user_header()->size()};
     sisl::blob key = sisl::blob{push_req->user_key()->Data(), push_req->user_key()->size()};
     repl_key rkey{.server_id = push_req->issuer_replica_id(), .term = push_req->raft_term(), .dsn = push_req->dsn()};

From 93b04b721483518505a1130fbfdefbfd8c7177f5 Mon Sep 17 00:00:00 2001
From: Hooper <62418134+Hooper9973@users.noreply.github.com>
Date: Thu, 14 Nov 2024 17:47:02 +0800
Subject: [PATCH 027/170] Add application_hint into blk_alloc_hints (#591)

Add application_hint to the blk_alloc_hints structure. This change addresses the need for certain users of homestore, such as homeobject, to pass additional hints. The application_hint can be used to specify behavior in the select_chunk interface.
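A homeobject-style consumer might use the new hint as sketched below (illustrative; the pg-id encoding and the custom chunk selector are assumptions, homestore only passes the value through to select_chunk()):

    #include <homestore/blk.h>

    // Tag an allocation with an application-defined value (here, a pg id) so
    // that a custom chunk selector can decode it in select_chunk() and route
    // the allocation to that pg's chunks. The encoding is opaque to homestore.
    homestore::blk_alloc_hints make_pg_hints(uint64_t pg_id) {
        homestore::blk_alloc_hints hints;
        hints.application_hint = pg_id;
        return hints;
    }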
--- conanfile.py | 2 +- src/include/homestore/blk.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index 0bd95ab50..c80ca7c92 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.12" + version = "6.5.13" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/blk.h b/src/include/homestore/blk.h index 5159afb08..b7e175b35 100644 --- a/src/include/homestore/blk.h +++ b/src/include/homestore/blk.h @@ -252,6 +252,7 @@ struct blk_alloc_hints { std::optional< uint32_t > pdev_id_hint; // which physical device to pick (hint if any) -1 for don't care std::optional< chunk_num_t > chunk_id_hint; // any specific chunk id to pick for this allocation std::optional< stream_id_t > stream_id_hint; // any specific stream to pick + std::optional< uint64_t > application_hint; // hints in uint64 what will be passed opaque to select_chunk bool can_look_for_other_chunk{true}; // If alloc on device not available can I pick other device bool is_contiguous{true}; // Should the entire allocation be one contiguous block bool partial_alloc_ok{false}; // ok to allocate only portion of nblks? Mutually exclusive with is_contiguous From da19fe48a5458a448e4ec141b96bbf2f2c6127fe Mon Sep 17 00:00:00 2001 From: Sanal Date: Mon, 18 Nov 2024 09:57:46 -0800 Subject: [PATCH 028/170] Disable dynamic repl ut temporarily. (#593) --- conanfile.py | 2 +- src/tests/CMakeLists.txt | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/conanfile.py b/conanfile.py index c80ca7c92..b4326999a 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.13" + version = "6.5.14" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index ce8ccb422..940d2e891 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -130,7 +130,7 @@ if (${io_tests}) add_test(NAME MetaBlkMgr-Epoll COMMAND test_meta_blk_mgr) add_test(NAME DataService-Epoll COMMAND test_data_service) add_test(NAME RaftReplDev-Epoll COMMAND test_raft_repl_dev) - add_test(NAME RaftReplDevDynamic-Epoll COMMAND test_raft_repl_dev_dynamic) + # add_test(NAME RaftReplDevDynamic-Epoll COMMAND test_raft_repl_dev_dynamic) # add_test(NAME SoloReplDev-Epoll COMMAND test_solo_repl_dev) endif() @@ -143,7 +143,7 @@ if (${io_tests}) add_test(NAME SoloReplDev-Spdk COMMAND test_solo_repl_dev -- --spdk "true") add_test(NAME HomeRaftLogStore-Spdk COMMAND test_home_raft_logstore -- --spdk "true") add_test(NAME RaftReplDev-Spdk COMMAND test_raft_repl_dev -- --spdk "true") - add_test(NAME RaftReplDevDynamic-Spdk COMMAND test_raft_repl_dev_dynamic -- --spdk "true") + # add_test(NAME RaftReplDevDynamic-Spdk COMMAND test_raft_repl_dev_dynamic -- --spdk "true") if(${epoll_tests}) SET_TESTS_PROPERTIES(MetaBlkMgr-Spdk PROPERTIES DEPENDS LogStore-Spdk) SET_TESTS_PROPERTIES(DataService-Spdk PROPERTIES DEPENDS MetaBlkMgr-Spdk) From 1a87f7181a7da7b20145e719bb743b08a8e3fedc Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Mon, 25 Nov 2024 15:59:59 +0800 Subject: [PATCH 029/170] handle RemovedFromCluster event (#594) 1 consume nuraft::cb_func::Type::RemovedFromCluster callback 2 add reset function to allocator/vchunk as a preparation for implementing m_listener->on_destroy() --- conanfile.py | 2 +- 
src/include/homestore/replication/repl_dev.h | 9 +++-- src/include/homestore/vchunk.h | 1 + src/lib/blkalloc/append_blk_allocator.cpp | 7 ++++ src/lib/blkalloc/append_blk_allocator.h | 33 +++++++++++-------- src/lib/blkalloc/bitmap_blk_allocator.h | 1 + src/lib/blkalloc/blk_allocator.h | 1 + src/lib/blkalloc/fixed_blk_allocator.h | 1 + src/lib/blkalloc/varsize_blk_allocator.h | 1 + src/lib/device/vchunk.cpp | 2 ++ .../replication/repl_dev/raft_repl_dev.cpp | 32 ++++++++++++++---- src/lib/replication/repl_dev/raft_repl_dev.h | 6 ++-- .../repl_dev/raft_state_machine.cpp | 30 ++++++++++++++--- src/lib/replication/repl_dev/solo_repl_dev.h | 2 ++ src/tests/test_common/raft_repl_test_base.hpp | 6 ++-- src/tests/test_solo_repl_dev.cpp | 2 +- 16 files changed, 101 insertions(+), 35 deletions(-) diff --git a/conanfile.py b/conanfile.py index b4326999a..d6fd6f6f1 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.14" + version = "6.5.16" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index cf0e00a0c..20e9a170f 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -144,7 +144,8 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: MultiBlkId const& local_blkid() const { return m_local_blkid; } RemoteBlkId const& remote_blkid() const { return m_remote_blkid; } const char* data() const { - DEBUG_ASSERT(m_data != nullptr, "m_data is nullptr, use before save_pushed/fetched_data or after release_data()"); + DEBUG_ASSERT(m_data != nullptr, + "m_data is nullptr, use before save_pushed/fetched_data or after release_data()"); return r_cast< const char* >(m_data); } repl_req_state_t state() const { return repl_req_state_t(m_state.load()); } @@ -349,7 +350,7 @@ class ReplDevListener { /// @brief Called when the repl_dev is being destroyed. The consumer is expected to clean up any related resources. /// However, it is expected that this call be idempotent. It is possible in rare scenarios that this can be called /// after restart in case crash happened during the destroy. - virtual void on_destroy() = 0; + virtual void on_destroy(const group_id_t& group_id) = 0; /// @brief Called when replace member is performed. 
virtual void on_replace_member(const replica_member_info& member_out, const replica_member_info& member_in) = 0; @@ -450,6 +451,10 @@ class ReplDev { /// @return Block size virtual uint32_t get_blk_size() const = 0; + /// @brief Gets the last commit lsn of this repldev + /// @return last_commit_lsn + virtual repl_lsn_t get_last_commit_lsn() const = 0; + virtual void attach_listener(shared< ReplDevListener > listener) { m_listener = std::move(listener); } virtual void detach_listener() { diff --git a/src/include/homestore/vchunk.h b/src/include/homestore/vchunk.h index 0406d428f..4b69b1332 100644 --- a/src/include/homestore/vchunk.h +++ b/src/include/homestore/vchunk.h @@ -36,6 +36,7 @@ class VChunk { uint16_t get_chunk_id() const; cshared< Chunk > get_internal_chunk() const; uint64_t size() const; + void reset(); private: shared< Chunk > m_internal_chunk; diff --git a/src/lib/blkalloc/append_blk_allocator.cpp b/src/lib/blkalloc/append_blk_allocator.cpp index 4a4c7fd18..1380a5ff6 100644 --- a/src/lib/blkalloc/append_blk_allocator.cpp +++ b/src/lib/blkalloc/append_blk_allocator.cpp @@ -162,6 +162,13 @@ bool AppendBlkAllocator::is_blk_alloced(const BlkId& in_bid, bool) const { return in_bid.blk_num() < get_used_blks(); } +void AppendBlkAllocator::reset() { + m_last_append_offset.store(0); + m_freeable_nblks.store(0); + m_commit_offset.store(0); + m_is_dirty.store(true); +} + bool AppendBlkAllocator::is_blk_alloced_on_disk(BlkId const& bid, bool) const { return bid.blk_num() < m_sb->commit_offset; } diff --git a/src/lib/blkalloc/append_blk_allocator.h b/src/lib/blkalloc/append_blk_allocator.h index 384a4936b..5e745c33a 100644 --- a/src/lib/blkalloc/append_blk_allocator.h +++ b/src/lib/blkalloc/append_blk_allocator.h @@ -38,21 +38,21 @@ struct append_blk_sb_t { }; #pragma pack() -//class AppendBlkAllocMetrics : public sisl::MetricsGroup { -//public: -// explicit AppendBlkAllocMetrics(const char* inst_name) : sisl::MetricsGroup("AppendBlkAlloc", inst_name) { -// REGISTER_COUNTER(num_alloc, "Number of blks alloc attempts"); -// REGISTER_COUNTER(num_alloc_failure, "Number of blk alloc failures"); +// class AppendBlkAllocMetrics : public sisl::MetricsGroup { +// public: +// explicit AppendBlkAllocMetrics(const char* inst_name) : sisl::MetricsGroup("AppendBlkAlloc", inst_name) { +// REGISTER_COUNTER(num_alloc, "Number of blks alloc attempts"); +// REGISTER_COUNTER(num_alloc_failure, "Number of blk alloc failures"); // -// register_me_to_farm(); -// } +// register_me_to_farm(); +// } // -// AppendBlkAllocMetrics(const AppendBlkAllocMetrics&) = delete; -// AppendBlkAllocMetrics(AppendBlkAllocMetrics&&) noexcept = delete; -// AppendBlkAllocMetrics& operator=(const AppendBlkAllocMetrics&) = delete; -// AppendBlkAllocMetrics& operator=(AppendBlkAllocMetrics&&) noexcept = delete; -// ~AppendBlkAllocMetrics() { deregister_me_from_farm(); } -//}; +// AppendBlkAllocMetrics(const AppendBlkAllocMetrics&) = delete; +// AppendBlkAllocMetrics(AppendBlkAllocMetrics&&) noexcept = delete; +// AppendBlkAllocMetrics& operator=(const AppendBlkAllocMetrics&) = delete; +// AppendBlkAllocMetrics& operator=(AppendBlkAllocMetrics&&) noexcept = delete; +// ~AppendBlkAllocMetrics() { deregister_me_from_farm(); } +// }; // // The assumption for AppendBlkAllocator: @@ -108,6 +108,11 @@ class AppendBlkAllocator : public BlkAllocator { std::string to_string() const override; + /** + * @brief : reset the allocator to initial state, so all the blks in this chunk are free. 
+ */ + void reset() override; + void cp_flush(CP* cp) override; void recovery_completed() override {} nlohmann::json get_status(int log_level) const override; @@ -121,7 +126,7 @@ class AppendBlkAllocator : public BlkAllocator { std::atomic< blk_num_t > m_freeable_nblks{0}; // count of blks fragmentedly freed (both on-disk and in-memory) std::atomic< blk_num_t > m_commit_offset{0}; // offset in on-disk version std::atomic< bool > m_is_dirty{false}; - //AppendBlkAllocMetrics m_metrics; + // AppendBlkAllocMetrics m_metrics; superblk< append_blk_sb_t > m_sb; // only cp will be writing to this disk }; diff --git a/src/lib/blkalloc/bitmap_blk_allocator.h b/src/lib/blkalloc/bitmap_blk_allocator.h index 381767bef..a86e08757 100644 --- a/src/lib/blkalloc/bitmap_blk_allocator.h +++ b/src/lib/blkalloc/bitmap_blk_allocator.h @@ -77,6 +77,7 @@ class BitmapBlkAllocator : public BlkAllocator { void cp_flush(CP* cp) override; void recovery_completed() override {} + void reset() override {} blk_num_t get_num_portions() const { return (m_num_blks - 1) / m_blks_per_portion + 1; } blk_num_t get_blks_per_portion() const { return m_blks_per_portion; } diff --git a/src/lib/blkalloc/blk_allocator.h b/src/lib/blkalloc/blk_allocator.h index 3ba0ecf82..f02aa3dd2 100644 --- a/src/lib/blkalloc/blk_allocator.h +++ b/src/lib/blkalloc/blk_allocator.h @@ -161,6 +161,7 @@ class BlkAllocator { virtual bool is_blk_alloced(BlkId const& b, bool use_lock = false) const = 0; virtual bool is_blk_alloced_on_disk(BlkId const& b, bool use_lock = false) const = 0; virtual void recovery_completed() = 0; + virtual void reset() = 0; virtual std::string to_string() const = 0; virtual void cp_flush(CP* cp) = 0; diff --git a/src/lib/blkalloc/fixed_blk_allocator.h b/src/lib/blkalloc/fixed_blk_allocator.h index fa28681f2..01f1e1138 100644 --- a/src/lib/blkalloc/fixed_blk_allocator.h +++ b/src/lib/blkalloc/fixed_blk_allocator.h @@ -41,6 +41,7 @@ class FixedBlkAllocator : public BitmapBlkAllocator { blk_num_t available_blks() const override; blk_num_t get_used_blks() const override; blk_num_t get_defrag_nblks() const override; + void reset() override{}; bool is_blk_alloced(BlkId const& in_bid, bool use_lock = false) const override; std::string to_string() const override; diff --git a/src/lib/blkalloc/varsize_blk_allocator.h b/src/lib/blkalloc/varsize_blk_allocator.h index 1a90de8da..03a507b03 100644 --- a/src/lib/blkalloc/varsize_blk_allocator.h +++ b/src/lib/blkalloc/varsize_blk_allocator.h @@ -222,6 +222,7 @@ class VarsizeBlkAllocator : public BitmapBlkAllocator { blk_num_t get_used_blks() const override; bool is_blk_alloced(BlkId const& in_bid, bool use_lock = false) const override; std::string to_string() const override; + void reset() override{}; nlohmann::json get_metrics_in_json(); private: diff --git a/src/lib/device/vchunk.cpp b/src/lib/device/vchunk.cpp index 26391ac1b..a809450d1 100644 --- a/src/lib/device/vchunk.cpp +++ b/src/lib/device/vchunk.cpp @@ -25,6 +25,8 @@ const uint8_t* VChunk::get_user_private() const { return m_internal_chunk->user_ blk_num_t VChunk::get_total_blks() const { return m_internal_chunk->blk_allocator()->get_total_blks(); } +void VChunk::reset() { m_internal_chunk->blk_allocator_mutable()->reset(); } + blk_num_t VChunk::available_blks() const { return m_internal_chunk->blk_allocator()->available_blks(); } blk_num_t VChunk::get_defrag_nblks() const { return m_internal_chunk->blk_allocator()->get_defrag_nblks(); } diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp 
b/src/lib/replication/repl_dev/raft_repl_dev.cpp
index 6cf4411dd..7b4a407cb 100644
--- a/src/lib/replication/repl_dev/raft_repl_dev.cpp
+++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp
@@ -914,8 +914,7 @@ void RaftReplDev::handle_rollback(repl_req_ptr_t rreq) {
     if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) {
         auto blkid = rreq->local_blkid();
         data_service().async_free_blk(blkid).thenValue([this, blkid](auto&& err) {
-            HS_LOG_ASSERT(!err, "freeing blkid={} upon error failed, potential to cause blk leak",
-                          blkid.to_string());
+            HS_LOG_ASSERT(!err, "freeing blkid={} upon error failed, potential to cause blk leak", blkid.to_string());
             RD_LOGD("Rollback rreq: Releasing blkid={} freed successfully", blkid.to_string());
         });
     }
@@ -1212,7 +1211,7 @@ void RaftReplDev::leave() {

     // We let the listener know right away, so that they can cleanup persistent structures soonest. This will
     // reduce the time window of leaked resources if any
-    m_listener->on_destroy();
+    m_listener->on_destroy(group_id());

     // Persist that destroy pending in superblk, so that in case of crash before cleanup of resources, it can be done
     // post restart.
@@ -1227,7 +1226,8 @@ std::pair< bool, nuraft::cb_func::ReturnCode > RaftReplDev::handle_raft_event(nu
                                                                               nuraft::cb_func::Param* param) {
     auto ret = nuraft::cb_func::ReturnCode::Ok;

-    if (type == nuraft::cb_func::Type::GotAppendEntryReqFromLeader) {
+    switch (type) {
+    case nuraft::cb_func::Type::GotAppendEntryReqFromLeader: {
         auto raft_req = r_cast< nuraft::req_msg* >(param->ctx);
         auto const& entries = raft_req->log_entries();

@@ -1276,9 +1276,29 @@ std::pair< bool, nuraft::cb_func::ReturnCode > RaftReplDev::handle_raft_event(nu
             sisl::VectorPool< repl_req_ptr_t >::free(reqs);
         }
         return {true, ret};
-    } else {
-        return {false, ret};
     }
+
+    case nuraft::cb_func::Type::RemovedFromCluster: {
+        // A node reaches here when:
+        // 1. It is removed from the cluster and the new config (excluding this node) is being committed on this node.
+        // 2. It is removed from the cluster, but the node is down and the new config log (excluding this node) is not
+        // replicated to this removed node. When the node restarts, the leader will not send any append entry to this
+        // node, since it is not a member of the raft group. It will become a candidate and send request-vote requests
+        // to other members of this raft group. A member will send RemovedFromCluster to this node if it finds the node
+        // is no longer a member of the raft group.
+
+        // This will lazily clean up the group.
+        // TODO: clean up this repl dev ASAP if necessary.
+ leave(); + + return {true, ret}; + } + + // TODO: Add more type handler if necessary + default: + break; + } + return {false, ret}; } void RaftReplDev::flush_durable_commit_lsn() { diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 0d5c8b8d8..2bf7cc52c 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -182,7 +182,7 @@ class RaftReplDev : public ReplDev, std::string rdev_name() const { return m_rdev_name; } std::string my_replica_id_str() const { return boost::uuids::to_string(m_my_repl_id); } uint32_t get_blk_size() const override; - repl_lsn_t get_last_commit_lsn() const { return m_commit_upto_lsn.load(); } + repl_lsn_t get_last_commit_lsn() const override { return m_commit_upto_lsn.load(); } void set_last_commit_lsn(repl_lsn_t lsn) { m_commit_upto_lsn.store(lsn); } bool is_destroy_pending() const; bool is_destroyed() const; @@ -229,9 +229,7 @@ class RaftReplDev : public ReplDev, * * @param num_reserved_entries The number of reserved entries of the replication log. */ - void truncate(uint32_t num_reserved_entries) { - m_data_journal->truncate(num_reserved_entries, m_compact_lsn.load()); - } + void truncate(uint32_t num_reserved_entries) { m_data_journal->truncate(num_reserved_entries, m_compact_lsn.load()); } void wait_for_logstore_ready() { m_data_journal->wait_for_log_store_ready(); } diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index 654851dc8..2047a3b28 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -204,8 +204,32 @@ raft_buf_ptr_t RaftStateMachine::commit_ext(nuraft::state_machine::ext_op_params } void RaftStateMachine::commit_config(const ulong log_idx, raft_cluster_config_ptr_t& new_conf) { + // when reaching here, the config change log has already been committed, and the new config has been applied to the + // cluster + RD_LOGD("Raft channel: Commit new cluster conf , log_idx = {}", log_idx); - // TODO:add more logic here if necessary + +#ifdef _PRERELEASE + auto& servers_in_new_conf = new_conf->get_servers(); + std::vector< int32_t > server_ids_in_new_conf; + for (auto& server : servers_in_new_conf) + server_ids_in_new_conf.emplace_back(server->get_id()); + + auto my_id = m_rd.server_id(); + + std::ostringstream oss; + auto it = server_ids_in_new_conf.begin(); + if (it != server_ids_in_new_conf.end()) { + oss << *it; + ++it; + } + for (; it != server_ids_in_new_conf.end(); ++it) { + oss << "," << *it; + } + + RD_LOG(INFO, "Raft channel: server ids in new cluster conf : {}, my_id {}, group_id {}", oss.str(), my_id, + m_rd.group_id_str()); +#endif } void RaftStateMachine::rollback_config(const ulong log_idx, raft_cluster_config_ptr_t& conf) { @@ -242,9 +266,7 @@ void RaftStateMachine::unlink_lsn_to_req(int64_t lsn, repl_req_ptr_t rreq) { // it is possible a LSN mapped to different rreq in history // due to log overwritten. 
Verify the rreq before removing auto deleted = m_lsn_req_map.erase_if_equal(lsn, rreq); - if (deleted) { - RD_LOG(DEBUG, "Raft channel: erase lsn {}, rreq {}", lsn, rreq->to_string()); - } + if (deleted) { RD_LOG(DEBUG, "Raft channel: erase lsn {}, rreq {}", lsn, rreq->to_string()); } } void RaftStateMachine::link_lsn_to_req(repl_req_ptr_t rreq, int64_t lsn) { diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index cddb94856..911f4bd28 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -56,6 +56,8 @@ class SoloReplDev : public ReplDev { uuid_t group_id() const override { return m_group_id; } + repl_lsn_t get_last_commit_lsn() const override { return 0; } + uint32_t get_blk_size() const override; void cp_flush(CP* cp); diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 1ab90143a..889ab72bb 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -306,10 +306,10 @@ class TestReplicatedDB : public homestore::ReplDevListener { boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); } - void on_destroy() override { + void on_destroy(const group_id_t& group_id) override { LOGINFOMOD(replication, "[Replica={}] Group={} is being destroyed", g_helper->replica_num(), - boost::uuids::to_string(repl_dev()->group_id())); - g_helper->unregister_listener(repl_dev()->group_id()); + boost::uuids::to_string(group_id)); + g_helper->unregister_listener(group_id); } void db_write(uint64_t data_size, uint32_t max_size_per_iov) { diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index 2e17235f2..1b990d592 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -137,7 +137,7 @@ class SoloReplDevTest : public testing::Test { LOGINFO("Received error={} on repl_dev", enum_name(error)); } void on_replace_member(const replica_member_info& member_out, const replica_member_info& member_in) override {} - void on_destroy() override {} + void on_destroy(const group_id_t& group_id) override {} }; class Application : public ReplApplication { From 6a2dfd8e212a9a4b5c9b17c3d1de45acb3907ce2 Mon Sep 17 00:00:00 2001 From: ywz <649521587@qq.com> Date: Tue, 26 Nov 2024 10:54:08 +0800 Subject: [PATCH 030/170] Fix grpc crash (#595) * release data before set m_data_written_promise authored-by: yawzhang --- conanfile.py | 2 +- src/lib/replication/repl_dev/common.cpp | 5 ++++- src/lib/replication/repl_dev/raft_repl_dev.cpp | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/conanfile.py b/conanfile.py index d6fd6f6f1..606d1a6ea 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.16" + version = "6.5.17" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp index 4fcbb0f4e..1c2a8c560 100644 --- a/src/lib/replication/repl_dev/common.cpp +++ b/src/lib/replication/repl_dev/common.cpp @@ -164,15 +164,18 @@ bool repl_req_ctx::add_state_if_not_already(repl_req_state_t s) { void repl_req_ctx::clear() { m_header = sisl::blob{}; m_key = sisl::blob{}; - release_data(); m_pkts.clear(); } +// FIXME: Use lock to avoid concurrent release of data. 
void repl_req_ctx::release_data() {
     m_data = nullptr;
     // explicitly clear m_buf_for_unaligned_data as unaligned pushdata/fetchdata will be saved here
     m_buf_for_unaligned_data = sisl::io_blob_safe{};
     if (m_pushed_data) {
+        LOGTRACEMOD(replication, "m_pushed_data addr={}, m_rkey={}, m_lsn={}",
+                    static_cast< void* >(m_pushed_data.get()), m_rkey.to_string(), m_lsn);
         m_pushed_data->send_response();
         m_pushed_data = nullptr;
     }

diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp
index 7b4a407cb..2d93c4070 100644
--- a/src/lib/replication/repl_dev/raft_repl_dev.cpp
+++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp
@@ -453,9 +453,9 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d
             RD_DBG_ASSERT(false, "Error in writing data, error_code={}", err.value());
             handle_error(rreq, ReplServiceError::DRIVE_WRITE_ERROR);
         } else {
+            rreq->release_data();
             rreq->add_state(repl_req_state_t::DATA_WRITTEN);
             rreq->m_data_written_promise.setValue();
-            rreq->release_data();
             const auto data_log_diff_us =
                 push_data_rcv_time.time_since_epoch().count() > rreq->created_time().time_since_epoch().count()
                 ? get_elapsed_time_us(rreq->created_time(), push_data_rcv_time)
@@ -872,9 +872,9 @@ void RaftReplDev::handle_fetch_data_response(sisl::GenericClientResponse respons
                     RD_REL_ASSERT(!err, "Error in writing data"); // TODO: Find a way to return error to the Listener
+                    rreq->release_data();
                     rreq->add_state(repl_req_state_t::DATA_WRITTEN);
                     rreq->m_data_written_promise.setValue();
-                    rreq->release_data();

                     RD_LOGD("Data Channel: Data Write completed rreq=[{}], data_write_latency_us={}, "
                             "total_write_latency_us={}, write_num_pieces={}",

From 78118557ffc63d3e0f26da66079c46ebefc4ddb6 Mon Sep 17 00:00:00 2001
From: Hooper
Date: Tue, 26 Nov 2024 10:54:41 +0800
Subject: [PATCH 031/170] Support flexible virtual device creation in `homestore::BlkDataService` with num_chunks or chunk_size.

Prioritize `num_chunks` over `chunk_size` if both are provided.
---
 conanfile.py                              | 2 +-
 src/include/homestore/blkdata_service.hpp | 8 +++++---
 src/lib/blkdata_svc/blkdata_service.cpp   | 3 ++-
 src/lib/homestore.cpp                     | 4 ++--
 4 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/conanfile.py b/conanfile.py
index 606d1a6ea..99e129017 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@ class HomestoreConan(ConanFile):
     name = "homestore"
-    version = "6.5.17"
+    version = "6.5.18"
     homepage = "https://github.com/eBay/Homestore"
     description = "HomeStore Storage Engine"

diff --git a/src/include/homestore/blkdata_service.hpp b/src/include/homestore/blkdata_service.hpp
index b82ec886b..fff670f44 100644
--- a/src/include/homestore/blkdata_service.hpp
+++ b/src/include/homestore/blkdata_service.hpp
@@ -56,17 +56,19 @@ class BlkDataService {

     /**
      * @brief Creates a new virtual device with the specified size and block size, using the specified
-     * block allocator and chunk selector types. The virtual device will be composed of the specified
-     * number of chunks.
+     * block allocator and chunk selector types. The virtual device will be composed of a number of chunks.
+     * Either `num_chunks` or `chunk_size` must be specified.
+     * Prioritize `num_chunks` over `chunk_size` if both are provided.
      *
      * @param size The size of the virtual device, in bytes.
     * @param blk_size The size of each block in the virtual device, in bytes.
     * @param alloc_type The type of block allocator to use for the virtual device.
     * @param chunk_sel_type The type of chunk selector to use for the virtual device.
     * @param num_chunks The number of chunks to use for the virtual device.
+    * @param chunk_size The size of chunks to use for the virtual device, in bytes.
     */
     void create_vdev(uint64_t size, HSDevType devType, uint32_t blk_size, blk_allocator_type_t alloc_type,
-                     chunk_selector_type_t chunk_sel_type, uint32_t num_chunks);
+                     chunk_selector_type_t chunk_sel_type, uint32_t num_chunks, uint32_t chunk_size);

     /**
      * @brief Opens a virtual device with the specified virtual device information.

diff --git a/src/lib/blkdata_svc/blkdata_service.cpp b/src/lib/blkdata_svc/blkdata_service.cpp
index 4acd3d846..5e80ac7e0 100644
--- a/src/lib/blkdata_svc/blkdata_service.cpp
+++ b/src/lib/blkdata_svc/blkdata_service.cpp
@@ -38,7 +38,7 @@ BlkDataService::~BlkDataService() = default;

 // first-time boot path
 void BlkDataService::create_vdev(uint64_t size, HSDevType devType, uint32_t blk_size, blk_allocator_type_t alloc_type,
-                                 chunk_selector_type_t chunk_sel_type, uint32_t num_chunks) {
+                                 chunk_selector_type_t chunk_sel_type, uint32_t num_chunks, uint32_t chunk_size) {
     hs_vdev_context vdev_ctx;
     vdev_ctx.type = hs_vdev_type_t::DATA_VDEV;

@@ -48,6 +48,7 @@ void BlkDataService::create_vdev(uint64_t size, HSDevType devType, uint32_t blk_
                                  .vdev_size = size,
                                  .num_chunks = num_chunks,
                                  .blk_size = blk_size,
+                                 .chunk_size = chunk_size,
                                  .dev_type = devType,
                                  .alloc_type = alloc_type,
                                  .chunk_sel_type = chunk_sel_type,

diff --git a/src/lib/homestore.cpp b/src/lib/homestore.cpp
index feec506c5..35782bf8d 100644
--- a/src/lib/homestore.cpp
+++ b/src/lib/homestore.cpp
@@ -226,11 +226,11 @@ void HomeStore::format_and_start(std::map< uint32_t, hs_format_params >&& format
         } else if ((svc_type & HS_SERVICE::DATA) && has_data_service()) {
             m_data_service->create_vdev(pct_to_size(fparams.size_pct, fparams.dev_type), fparams.dev_type,
                                         fparams.block_size, fparams.alloc_type, fparams.chunk_sel_type,
-                                        fparams.num_chunks);
+                                        fparams.num_chunks, fparams.chunk_size);
         } else if ((svc_type & HS_SERVICE::REPLICATION) && has_repl_data_service()) {
             m_data_service->create_vdev(pct_to_size(fparams.size_pct, fparams.dev_type), fparams.dev_type,
                                         fparams.block_size, fparams.alloc_type, fparams.chunk_sel_type,
-                                        fparams.num_chunks);
+                                        fparams.num_chunks, fparams.chunk_size);
         }
     }

From 89b86ff3995ef214a2d18b280e1d9c0e8e4566e6 Mon Sep 17 00:00:00 2001
From: yuwmao <148639999+yuwmao@users.noreply.github.com>
Date: Wed, 4 Dec 2024 10:06:59 +0800
Subject: [PATCH 032/170] Support Baseline Resync (#596)

* Support Baseline resync

For Nuraft baseline resync, we separate the process into two layers: the HomeStore layer and the Application layer. We use the highest bit of the obj_id to indicate the message type: 0 is for HS, 1 is for Application.

In the HomeStore layer, the leader needs to transmit the DSN to the follower. This is intended to handle the following case:
1. Leader sends a snapshot at LSN T1 to follower F1.
2. F1 fully receives the snapshot and is now at T1.
3. Leader yields its leadership; F1 is elected as leader.
In this sequence the incremental resync will not kick in to update m_next_dsn, and as a result, duplication may occur.
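The obj_id split can be sketched as follows (the helper names mirror is_hs_snp_obj and snp_obj_id_type_app used in the diff below; the exact constant value is an assumption):

    #include <cstdint>

    // Assumed layout: the highest bit of obj_id selects the layer.
    // obj_id 0 is the HomeStore metadata message (carrying the DSN); once it
    // is applied, the follower advances to snp_obj_id_type_app and subsequent
    // objects are routed to the application listener.
    static constexpr uint64_t snp_obj_id_type_app = 1ULL << 63;
    constexpr bool is_hs_snp_obj(uint64_t obj_id) { return (obj_id & snp_obj_id_type_app) == 0; }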
---
 conanfile.py                                  |  2 +-
 src/include/homestore/replication/repl_dev.h  | 27 ++++++---
 .../replication/repl_dev/raft_repl_dev.cpp    | 36 ++++++++++++
 src/lib/replication/repl_dev/raft_repl_dev.h  |  2 +
 .../repl_dev/raft_state_machine.cpp           | 29 ++++++++--
 .../replication/repl_dev/raft_state_machine.h |  6 ++
 src/tests/test_common/raft_repl_test_base.hpp | 58 +++++++++++++------
 src/tests/test_solo_repl_dev.cpp              |  4 +-
 8 files changed, 131 insertions(+), 33 deletions(-)

diff --git a/conanfile.py b/conanfile.py
index 99e129017..bc914e16c 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@ class HomestoreConan(ConanFile):
     name = "homestore"
-    version = "6.5.18"
+    version = "6.5.19"
     homepage = "https://github.com/eBay/Homestore"
     description = "HomeStore Storage Engine"

diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h
index 20e9a170f..335cda834 100644
--- a/src/include/homestore/replication/repl_dev.h
+++ b/src/include/homestore/replication/repl_dev.h
@@ -46,6 +46,10 @@ VENUM(journal_type_t, uint16_t,
      HS_CTRL_REPLACE = 3,    // Control message to replace a member
 )

+// magic num comes from the first 8 bytes of 'echo homestore_resync_data | md5sum'
+static constexpr uint64_t HOMESTORE_RESYNC_DATA_MAGIC = 0xa65dbd27c213f327;
+static constexpr uint32_t HOMESTORE_RESYNC_DATA_PROTOCOL_VERSION_V1 = 0x01;
+
 struct repl_key {
     int32_t server_id{0}; // Server Id which this req is originated from
     uint64_t term;        // RAFT term number
@@ -112,14 +116,23 @@ class nuraft_snapshot_context : public snapshot_context {
     nuraft::ptr< nuraft::snapshot > snapshot_;
 };

-struct snapshot_data {
+struct snapshot_obj {
     void* user_ctx{nullptr};
-    int64_t offset{0};
+    uint64_t offset{0};
     sisl::io_blob_safe blob;
     bool is_first_obj{false};
     bool is_last_obj{false};
 };

+// HomeStore has some meta information to be transmitted during the baseline resync.
+// Although for now only the dsn needs to be synced, this structure is defined as a general message, and we can
+// easily add data to it if needed in the future.
+struct snp_repl_dev_data {
+    uint64_t magic_num{HOMESTORE_RESYNC_DATA_MAGIC};
+    uint32_t protocol_version{HOMESTORE_RESYNC_DATA_PROTOCOL_VERSION_V1};
+    uint32_t crc{0};
+    uint64_t dsn{0};
+};
+
 struct repl_journal_entry;
 struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost::thread_safe_counter >,
                       sisl::ObjLifeCounter< repl_req_ctx > {
@@ -368,16 +381,16 @@ class ReplDevListener {
     /// uses offset given by the follower to know the current state of the follower.
     /// Leader sends the snapshot data to the follower in batch. This callback is called multiple
     /// times on the leader till all the data is transferred to the follower. is_last_obj in
-    /// snapshot_data will be true once all the data has been transferred. After this the raft on
+    /// snapshot_obj will be true once all the data has been transferred. After this the raft on
     /// the follower side can do the incremental resync.
-    virtual int read_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data) = 0;
+    virtual int read_snapshot_obj(shared< snapshot_context > context, shared< snapshot_obj > snp_obj) = 0;

     /// @brief Called on the follower when the leader sends the data during the baseline resync.
     /// is_last_obj in snapshot_obj will be true once all the data has been transferred.
     /// After this the raft on the follower side can do the incremental resync.
- virtual void write_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data) = 0; + virtual void write_snapshot_obj(shared< snapshot_context > context, shared< snapshot_obj > snp_obj) = 0; - /// @brief Free up user-defined context inside the snapshot_data that is allocated during read_snapshot_data. + /// @brief Free up user-defined context inside the snapshot_obj that is allocated during read_snapshot_obj. virtual void free_user_snp_ctx(void*& user_snp_ctx) = 0; private: diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 2d93c4070..72a39a27a 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1491,6 +1491,42 @@ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx handle_commit(rreq, true /* recovery */); } +void RaftReplDev::create_snp_resync_data(raft_buf_ptr_t& data_out) { + snp_repl_dev_data msg; + auto msg_size = sizeof(snp_repl_dev_data); + msg.dsn = m_next_dsn; + auto crc = crc32_ieee(init_crc32, reinterpret_cast< const unsigned char* >(&msg), msg_size); + RD_LOGD("create snapshot resync msg, dsn={}, crc={}", msg.dsn, crc); + msg.crc = crc; + data_out = nuraft::buffer::alloc(msg_size); + std::memcpy(data_out->data_begin(), &msg, msg_size); +} + +bool RaftReplDev::apply_snp_resync_data(nuraft::buffer& data) { + auto msg = r_cast< snp_repl_dev_data* >(data.data_begin()); + if (msg->magic_num != HOMESTORE_RESYNC_DATA_MAGIC || msg->protocol_version != + HOMESTORE_RESYNC_DATA_PROTOCOL_VERSION_V1) { + RD_LOGE("Snapshot resync data validation failed, magic={}, version={}", msg->magic_num, msg->protocol_version); + return false; + } + auto received_crc = msg->crc; + RD_LOGD("received snapshot resync msg, dsn={}, crc={}, received crc={}", msg->dsn, msg->crc, received_crc); + // Clear the crc field before verification, because the crc value computed by leader doesn't contain it. 
+ msg->crc = 0; + auto computed_crc = crc32_ieee(init_crc32, reinterpret_cast< const unsigned char* >(msg), + sizeof(snp_repl_dev_data)); + if (received_crc != computed_crc) { + RD_LOGE("Snapshot resync data crc mismatch, received_crc={}, computed_crc={}", received_crc, computed_crc); + return false; + } + if (msg->dsn > m_next_dsn) { + m_next_dsn = msg->dsn; + RD_LOGD("Update next_dsn from {} to {}", m_next_dsn.load(), msg->dsn); + return true; + } + return true; +} + void RaftReplDev::on_restart() { m_listener->on_restart(); } bool RaftReplDev::is_resync_mode() { diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 2bf7cc52c..0550858cf 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -285,6 +285,8 @@ class RaftReplDev : public ReplDev, void commit_blk(repl_req_ptr_t rreq); void replace_member(repl_req_ptr_t rreq); void reset_quorum_size(uint32_t commit_quorum); + void create_snp_resync_data(raft_buf_ptr_t& data_out); + bool apply_snp_resync_data(nuraft::buffer& data); }; } // namespace homestore diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index 2047a3b28..b64a32c24 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -297,14 +297,22 @@ void RaftStateMachine::create_snapshot(nuraft::snapshot& s, nuraft::async_result int RaftStateMachine::read_logical_snp_obj(nuraft::snapshot& s, void*& user_ctx, ulong obj_id, raft_buf_ptr_t& data_out, bool& is_last_obj) { + // For Nuraft baseline resync, we separate the process into two layers: HomeStore layer and Application layer. + // We use the highest bit of the obj_id to indicate the message type: 0 is for HS, 1 is for Application. + if (is_hs_snp_obj(obj_id)) { + // This is the preserved msg for homestore to resync data + m_rd.create_snp_resync_data(data_out); + is_last_obj = false; + return 0; + } auto snp_ctx = std::make_shared< nuraft_snapshot_context >(s); - auto snp_data = std::make_shared< snapshot_data >(); + auto snp_data = std::make_shared< snapshot_obj >(); snp_data->user_ctx = user_ctx; snp_data->offset = obj_id; snp_data->is_last_obj = is_last_obj; // Listener will read the snapshot data and we pass through the same. 
- int ret = m_rd.m_listener->read_snapshot_data(snp_ctx, snp_data); + int ret = m_rd.m_listener->read_snapshot_obj(snp_ctx, snp_data); if (ret < 0) return ret; // Update user_ctx and whether is_last_obj @@ -320,8 +328,16 @@ int RaftStateMachine::read_logical_snp_obj(nuraft::snapshot& s, void*& user_ctx, void RaftStateMachine::save_logical_snp_obj(nuraft::snapshot& s, ulong& obj_id, nuraft::buffer& data, bool is_first_obj, bool is_last_obj) { + if (is_hs_snp_obj(obj_id)) { + // Homestore preserved msg + if (m_rd.apply_snp_resync_data(data)) { + obj_id = snp_obj_id_type_app; + LOGDEBUG("apply_snp_resync_data success, next obj_id={}", obj_id); + } + return; + } auto snp_ctx = std::make_shared< nuraft_snapshot_context >(s); - auto snp_data = std::make_shared< snapshot_data >(); + auto snp_data = std::make_shared< snapshot_obj >(); snp_data->offset = obj_id; snp_data->is_first_obj = is_first_obj; snp_data->is_last_obj = is_last_obj; @@ -331,7 +347,7 @@ void RaftStateMachine::save_logical_snp_obj(nuraft::snapshot& s, ulong& obj_id, std::memcpy(blob.bytes(), data.data_begin(), data.size()); snp_data->blob = std::move(blob); - m_rd.m_listener->write_snapshot_data(snp_ctx, snp_data); + m_rd.m_listener->write_snapshot_obj(snp_ctx, snp_data); // Update the object offset. obj_id = snp_data->offset; @@ -349,7 +365,10 @@ bool RaftStateMachine::apply_snapshot(nuraft::snapshot& s) { m_rd.set_last_commit_lsn(s.get_last_log_idx()); m_rd.m_data_journal->set_last_durable_lsn(s.get_last_log_idx()); auto snp_ctx = std::make_shared< nuraft_snapshot_context >(s); - return m_rd.m_listener->apply_snapshot(snp_ctx); + auto res = m_rd.m_listener->apply_snapshot(snp_ctx); + //make sure the changes are flushed. + hs()->cp_mgr().trigger_cp_flush(true /* force */).get(); + return res; } nuraft::ptr< nuraft::snapshot > RaftStateMachine::last_snapshot() { diff --git a/src/lib/replication/repl_dev/raft_state_machine.h b/src/lib/replication/repl_dev/raft_state_machine.h index 6bf4faf5a..8f00cec43 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.h +++ b/src/lib/replication/repl_dev/raft_state_machine.h @@ -86,6 +86,10 @@ class StateMachineStore; #define RD_LOGE(...) RD_LOG(ERROR, ##__VA_ARGS__) #define RD_LOGC(...) RD_LOG(CRITICAL, ##__VA_ARGS__) +// For the logic snapshot obj_id, we use the highest bit to indicate the type of the snapshot message. +// 0 is for HS, 1 is for Application. 
+static constexpr uint64_t snp_obj_id_type_app = 1ULL << 63; + using AsyncNotify = folly::SemiFuture< folly::Unit >; using AsyncNotifier = folly::Promise< folly::Unit >; @@ -135,6 +139,8 @@ class RaftStateMachine : public nuraft::state_machine { std::string rdev_name() const; + static bool is_hs_snp_obj(uint64_t obj_id) { return (obj_id & snp_obj_id_type_app) == 0; } + private: void after_precommit_in_leader(const nuraft::raft_server::req_ext_cb_params& params); }; diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 889ab72bb..7445568b8 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -182,10 +182,26 @@ class TestReplicatedDB : public homestore::ReplDevListener { return make_async_success<>(); } - int read_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data) override { + static int64_t get_next_lsn(uint64_t& obj_id) { + return obj_id & ((1ULL << 63) - 1); + } + static void set_resync_msg_type_bit(uint64_t& obj_id) { + obj_id |= 1ULL << 63; + } + + int read_snapshot_obj(shared< snapshot_context > context, shared< snapshot_obj > snp_data) override { auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); + if(RaftStateMachine::is_hs_snp_obj(snp_data->offset)) { + LOGERRORMOD(replication, "invalid snapshot offset={}", snp_data->offset); + return -1; + } + if ((snp_data->offset & snp_obj_id_type_app) == 0) { + LOGERRORMOD(replication, "invalid snapshot offset={}", snp_data->offset); + return -1; + } - if (snp_data->offset == 0) { + int64_t next_lsn = get_next_lsn(snp_data->offset); + if (next_lsn == 0) { snp_data->is_last_obj = false; snp_data->blob = sisl::io_blob_safe(sizeof(ulong)); LOGINFOMOD(replication, @@ -194,38 +210,37 @@ class TestReplicatedDB : public homestore::ReplDevListener { return 0; } - int64_t next_lsn = snp_data->offset; - std::vector< KeyValuePair > kv_snapshot_data; + std::vector< KeyValuePair > kv_snapshot_obj; // we can not use find to get the next element, since if the next lsn is a config lsn , it will not be put into // lsn_index_ and as a result, the find will return the end of the map. so here we use lower_bound to get the // first element to be read and transfered. 
for (auto iter = lsn_index_.lower_bound(next_lsn); iter != lsn_index_.end(); iter++) { auto& v = iter->second; - kv_snapshot_data.emplace_back(Key{v.id_}, v); + kv_snapshot_obj.emplace_back(Key{v.id_}, v); LOGTRACEMOD(replication, "[Replica={}] Read logical snapshot callback fetching lsn={} size={} pattern={}", g_helper->replica_num(), v.lsn_, v.data_size_, v.data_pattern_); - if (kv_snapshot_data.size() >= 10) { break; } + if (kv_snapshot_obj.size() >= 10) { break; } } - if (kv_snapshot_data.size() == 0) { + if (kv_snapshot_obj.size() == 0) { snp_data->is_last_obj = true; LOGINFOMOD(replication, "Snapshot is_last_obj is true"); return 0; } - int64_t kv_snapshot_data_size = sizeof(KeyValuePair) * kv_snapshot_data.size(); - sisl::io_blob_safe blob{static_cast< uint32_t >(kv_snapshot_data_size)}; - std::memcpy(blob.bytes(), kv_snapshot_data.data(), kv_snapshot_data_size); + int64_t kv_snapshot_obj_size = sizeof(KeyValuePair) * kv_snapshot_obj.size(); + sisl::io_blob_safe blob{static_cast< uint32_t >(kv_snapshot_obj_size)}; + std::memcpy(blob.bytes(), kv_snapshot_obj.data(), kv_snapshot_obj_size); snp_data->blob = std::move(blob); snp_data->is_last_obj = false; LOGINFOMOD(replication, "[Replica={}] Read logical snapshot callback obj_id={} term={} idx={} num_items={}", g_helper->replica_num(), snp_data->offset, s->get_last_log_term(), s->get_last_log_idx(), - kv_snapshot_data.size()); + kv_snapshot_obj.size()); return 0; } - void snapshot_data_write(uint64_t data_size, uint64_t data_pattern, MultiBlkId& out_blkids) { + void snapshot_obj_write(uint64_t data_size, uint64_t data_pattern, MultiBlkId& out_blkids) { auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); auto write_sgs = test_common::HSTestHelper::create_sgs(data_size, block_size, data_pattern); auto fut = homestore::data_service().async_alloc_write(write_sgs, blk_alloc_hints{}, out_blkids); @@ -235,21 +250,27 @@ class TestReplicatedDB : public homestore::ReplDevListener { } } - void write_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data) override { + void write_snapshot_obj(shared< snapshot_context > context, shared< snapshot_obj > snp_data) override { + if (RaftStateMachine::is_hs_snp_obj(snp_data->offset)) { + LOGERRORMOD(replication, "invalid snapshot offset={}", snp_data->offset); + return; + } + int64_t next_lsn = get_next_lsn(snp_data->offset); auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); auto last_committed_idx = std::dynamic_pointer_cast< RaftReplDev >(repl_dev())->raft_server()->get_committed_log_idx(); - if (snp_data->offset == 0) { + if (next_lsn == 0) { snp_data->offset = last_committed_lsn + 1; + set_resync_msg_type_bit(snp_data->offset); LOGINFOMOD(replication, "[Replica={}] Save logical snapshot callback return obj_id={}", g_helper->replica_num(), snp_data->offset); return; } - size_t kv_snapshot_data_size = snp_data->blob.size(); - if (kv_snapshot_data_size == 0) return; + size_t kv_snapshot_obj_size = snp_data->blob.size(); + if (kv_snapshot_obj_size == 0) return; - size_t num_items = kv_snapshot_data_size / sizeof(KeyValuePair); + size_t num_items = kv_snapshot_obj_size / sizeof(KeyValuePair); std::unique_lock lk(db_mtx_); auto ptr = r_cast< const KeyValuePair* >(snp_data->blob.bytes()); for (size_t i = 0; i < num_items; i++) { @@ -261,7 +282,7 @@ class TestReplicatedDB : public homestore::ReplDevListener { // Write to data service and inmem map. 
MultiBlkId out_blkids; if (value.data_size_ != 0) { - snapshot_data_write(value.data_size_, value.data_pattern_, out_blkids); + snapshot_obj_write(value.data_size_, value.data_pattern_, out_blkids); value.blkid_ = out_blkids; } inmem_db_.insert_or_assign(key, value); @@ -271,6 +292,7 @@ class TestReplicatedDB : public homestore::ReplDevListener { } snp_data->offset = last_committed_lsn + 1; + set_resync_msg_type_bit(snp_data->offset); LOGINFOMOD(replication, "[Replica={}] Save logical snapshot callback obj_id={} term={} idx={} is_last={} num_items={}", g_helper->replica_num(), snp_data->offset, s->get_last_log_term(), s->get_last_log_idx(), diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index 1b990d592..eaec0ff1e 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -111,10 +111,10 @@ class SoloReplDevTest : public testing::Test { AsyncReplResult<> create_snapshot(shared< snapshot_context > context) override { return make_async_success<>(); } - int read_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data) override { + int read_snapshot_obj(shared< snapshot_context > context, shared< snapshot_obj > snp_data) override { return 0; } - void write_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data) override {} + void write_snapshot_obj(shared< snapshot_context > context, shared< snapshot_obj > snp_data) override {} bool apply_snapshot(shared< snapshot_context > context) override { return true; } shared< snapshot_context > last_snapshot() override { return nullptr; } void free_user_snp_ctx(void*& user_snp_ctx) override {} From e03a7fd4a8974d2b246e9a271312f3359de999e0 Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Tue, 3 Dec 2024 12:37:53 +0800 Subject: [PATCH 033/170] Implement get_next_batch_size_hint_in_bytes() We use the `byte` as the `cnt` (entry count) as of now. Also update log_entries_ext(), which will be called on the leader: if hint < 0, the follower wants nothing; return an empty vector so that an empty append_entries_req will be sent, to carry the commit_index update and trigger the follower to commit. If hint > 0, respect the cnt that the follower wants; this is useful when two logs within the same batch have a dependency, since we can exclude the dependent one. If hint = 0, the batch size is controlled by the leader.
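A minimal sketch of that clamping rule (editor's illustration only, not the shipped code; the real logic, including an overflow guard elided here, is HomeRaftLogStore::log_entries_ext() in the diff below):

#include <algorithm>
#include <cstdint>

// Sketch: derive the adjusted batch end from the follower's hint, treating
// the byte-denominated hint as an entry count as described above.
unsigned long adjusted_end(unsigned long start, unsigned long end, int64_t hint_cnt) {
    if (hint_cnt < 0) { return start; } // follower is busy: an empty batch still carries commit_index
    if (hint_cnt == 0) { return end; }  // leader keeps control of the batch size
    return std::min(end, start + static_cast< unsigned long >(hint_cnt)); // honor the follower's count
}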
Signed-off-by: Xiaoxi Chen --- conanfile.py | 2 +- .../log_store/home_raft_log_store.cpp | 29 ++++++++++++++----- .../replication/repl_dev/raft_repl_dev.cpp | 8 +++++ .../repl_dev/raft_state_machine.cpp | 19 ++++++++++++ .../replication/repl_dev/raft_state_machine.h | 4 +++ 5 files changed, 53 insertions(+), 9 deletions(-) diff --git a/conanfile.py b/conanfile.py index bc914e16c..c3dba8dc8 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.19" + version = "6.5.20" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/log_store/home_raft_log_store.cpp b/src/lib/replication/log_store/home_raft_log_store.cpp index a9f7c0301..823ab62bc 100644 --- a/src/lib/replication/log_store/home_raft_log_store.cpp +++ b/src/lib/replication/log_store/home_raft_log_store.cpp @@ -223,14 +223,27 @@ nuraft::ptr< std::vector< nuraft::ptr< nuraft::log_entry > > > HomeRaftLogStore: nuraft::ptr< std::vector< nuraft::ptr< nuraft::log_entry > > > HomeRaftLogStore::log_entries_ext(ulong start, ulong end, int64_t batch_size_hint_in_bytes) { - // in nuraft , batch_size_hint_in_bytes < 0 indicats that follower is busy now and do not want to receive any more - // log entries ATM. here we just send one log entry if this happens which is helpful for nuobject case and no harm - // to other case. - if (batch_size_hint_in_bytes < 0) end = start + 1; - - // for the case where batch_size_hint_in_bytes >= 0, we do not take any size check here for now. - // TODO: limit the size of the returned entries by batch_size_hint_in_bytes int the future if necessary - return log_entries(start, end); + // WARNING: we interpret batch_size_hint_in_bytes as count as of now. + auto batch_size_hint_cnt = batch_size_hint_in_bytes; + auto new_end = end; + // batch_size_hint_in_bytes < 0 indicats that follower is busy now and do not want to receive any more log entry. + if (batch_size_hint_cnt < 0) + new_end = start; + else if (batch_size_hint_cnt > 0) { + // limit to the hint, also prevent overflow by a huge batch_size_hint_cnt + if (sisl_unlikely(start + (uint64_t)batch_size_hint_cnt < start)) { + new_end = end; + } else { + new_end = start + (uint64_t)batch_size_hint_cnt; + } + // limit to original end + new_end = std::min(new_end, end); + } + DEBUG_ASSERT(new_end <= end, "new end {} should be <= original end {}", new_end, end); + DEBUG_ASSERT(start <= new_end, "start {} should be <= new_end {}", start, new_end); + REPL_STORE_LOG(TRACE, "log_entries_ext, start={} end={}, hint {}, adjusted range {} ~ {}, cnt {}", start, end, + batch_size_hint_cnt, start, new_end, new_end - start); + return log_entries(start, new_end); } nuraft::ptr< nuraft::log_entry > HomeRaftLogStore::entry_at(ulong index) { diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 72a39a27a..4be1aa78e 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1261,6 +1261,13 @@ std::pair< bool, nuraft::cb_func::ReturnCode > RaftReplDev::handle_raft_event(nu auto req = m_state_machine->localize_journal_entry_prepare(*entry); if (req == nullptr) { sisl::VectorPool< repl_req_ptr_t >::free(reqs); + // The hint set here will be used by the next after next appendEntry, the next one + // always go with -1 from NuRraft code. + // + // We are rejecting this log entry, meaning we can accept previous log entries. 
+ // If there is nothing we can accept(i==0), that maens we are waiting for commit + // of previous lsn, set it to 1 in this case. + m_state_machine->reset_next_batch_size_hint(std::max(1ul, i)); return {true, nuraft::cb_func::ReturnCode::ReturnNull}; } reqs->emplace_back(std::move(req)); @@ -1275,6 +1282,7 @@ std::pair< bool, nuraft::cb_func::ReturnCode > RaftReplDev::handle_raft_event(nu } sisl::VectorPool< repl_req_ptr_t >::free(reqs); } + if (ret == nuraft::cb_func::ReturnCode::Ok) { m_state_machine->inc_next_batch_size_hint(); } return {true, ret}; } diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index b64a32c24..8909614a0 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -249,6 +249,25 @@ void RaftStateMachine::rollback_ext(const nuraft::state_machine::ext_op_params& m_rd.handle_rollback(rreq); } +int64_t RaftStateMachine::get_next_batch_size_hint_in_bytes() { return next_batch_size_hint; } + +int64_t RaftStateMachine::inc_next_batch_size_hint() { + constexpr int64_t next_batch_size_hint_limit = 16; + // set to minimal if previous hint is negative (i.e do not want any log) + if (next_batch_size_hint < 0) { + next_batch_size_hint = 1; + return next_batch_size_hint; + } + // Exponential growth till next_batch_size_hint_limit, set to 0 afterward means leader take control. + next_batch_size_hint = next_batch_size_hint * 2 > next_batch_size_hint_limit ? 0 : next_batch_size_hint * 2; + return next_batch_size_hint; +} + +int64_t RaftStateMachine::reset_next_batch_size_hint(int64_t new_hint) { + next_batch_size_hint = new_hint; + return next_batch_size_hint; +} + void RaftStateMachine::iterate_repl_reqs(std::function< void(int64_t, repl_req_ptr_t rreq) > const& cb) { for (auto [key, rreq] : m_lsn_req_map) { cb(key, rreq); diff --git a/src/lib/replication/repl_dev/raft_state_machine.h b/src/lib/replication/repl_dev/raft_state_machine.h index 8f00cec43..2b50fea7b 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.h +++ b/src/lib/replication/repl_dev/raft_state_machine.h @@ -101,6 +101,7 @@ class RaftStateMachine : public nuraft::state_machine { nuraft::ptr< nuraft::buffer > m_success_ptr; // Preallocate the success return to raft // iomgr::timer_handle_t m_wait_blkid_write_timer_hdl{iomgr::null_timer_handle}; bool m_resync_mode{false}; + int64_t next_batch_size_hint{0}; public: RaftStateMachine(RaftReplDev& rd); @@ -116,6 +117,7 @@ class RaftStateMachine : public nuraft::state_machine { void rollback_config(const ulong log_idx, raft_cluster_config_ptr_t& conf) override; void rollback_ext(const nuraft::state_machine::ext_op_params& params) override; void become_ready(); + int64_t get_next_batch_size_hint_in_bytes() override; void create_snapshot(nuraft::snapshot& s, nuraft::async_result< bool >::handler_type& when_done) override; int read_logical_snp_obj(nuraft::snapshot& s, void*& user_ctx, ulong obj_id, raft_buf_ptr_t& data_out, @@ -138,6 +140,8 @@ class RaftStateMachine : public nuraft::state_machine { void iterate_repl_reqs(std::function< void(int64_t, repl_req_ptr_t rreq) > const& cb); std::string rdev_name() const; + int64_t reset_next_batch_size_hint(int64_t new_hint); + int64_t inc_next_batch_size_hint(); static bool is_hs_snp_obj(uint64_t obj_id) { return (obj_id & snp_obj_id_type_app) == 0; } From 4f4df879e9da59f183b5bde18c6f47367329adac Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Wed, 4 Dec 2024 16:52:32 -0700 Subject: 
[PATCH 034/170] Bump up nuraft_mesg to >=3.7 Nuraft_mesg (<3.7) does not ship the batch size hint. cf. https://github.com/eBay/nuraft_mesg/pull/111 Signed-off-by: Xiaoxi Chen --- conanfile.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conanfile.py b/conanfile.py index c3dba8dc8..a71e86639 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.20" + version = "6.5.21" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" @@ -54,7 +54,7 @@ def build_requirements(self): def requirements(self): self.requires("iomgr/[^11.3]@oss/master", transitive_headers=True) self.requires("sisl/[^12.2]@oss/master", transitive_headers=True) - self.requires("nuraft_mesg/[^3.4]@oss/main", transitive_headers=True) + self.requires("nuraft_mesg/[^3.7]@oss/main", transitive_headers=True) self.requires("farmhash/cci.20190513@", transitive_headers=True) if self.settings.arch in ['x86', 'x86_64']: From b38f95614edcd509a5c0596d501e5fdcb3d95cf1 Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Sat, 7 Dec 2024 11:43:07 -0700 Subject: [PATCH 035/170] Add on_repl_devs_init_completed cb. A stable callback is needed regardless of whether any repl_dev was created. This CB is a good place for the upper layer to recover anything that depends on repl_devs. Signed-off-by: Xiaoxi Chen --- conanfile.py | 2 +- src/include/homestore/replication_service.hpp | 4 ++++ src/lib/replication/service/raft_repl_service.cpp | 2 ++ src/tests/test_common/hs_repl_test_common.hpp | 1 + src/tests/test_solo_repl_dev.cpp | 1 + 5 files changed, 9 insertions(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index a71e86639..75a10f115 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.21" + version = "6.5.22" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/replication_service.hpp b/src/include/homestore/replication_service.hpp index c3e56d9a3..bac805dd5 100644 --- a/src/include/homestore/replication_service.hpp +++ b/src/include/homestore/replication_service.hpp @@ -75,6 +75,10 @@ class ReplApplication { // Listener corresponding to the ReplDev which will be used to perform the precommit/commit/rollback. virtual shared< ReplDevListener > create_repl_dev_listener(group_id_t group_id) = 0; + // Called after all the repl devs are found upon restart of the homestore instance. + // it is a nice place for upper layer to recovery anything depends on repl_devs + virtual void on_repl_devs_init_completed() = 0; + // Given the uuid of the peer, get their address and port virtual std::pair< std::string, uint16_t > lookup_peer(replica_id_t uuid) const = 0; diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index 1ec45d9d0..0fd8940e3 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -131,6 +131,8 @@ void RaftReplService::start() { rdev->on_restart(); } m_config_sb_bufs.clear(); + LOGINFO("Repl devs load completed, calling upper layer on_repl_devs_init_completed"); + m_repl_app->on_repl_devs_init_completed(); // Step 5: Start the data and logstore service now.
This step is essential before we can ask Raft to join groups etc diff --git a/src/tests/test_common/hs_repl_test_common.hpp b/src/tests/test_common/hs_repl_test_common.hpp index c9ff71567..7b93cccb2 100644 --- a/src/tests/test_common/hs_repl_test_common.hpp +++ b/src/tests/test_common/hs_repl_test_common.hpp @@ -115,6 +115,7 @@ class HSReplTestHelper : public HSTestHelper { create_repl_dev_listener(homestore::group_id_t group_id) override { return helper_.get_listener(group_id); } + void on_repl_devs_init_completed() { LOGINFO("Repl dev init completed CB called"); } std::pair< std::string, uint16_t > lookup_peer(homestore::replica_id_t replica_id) const override { uint16_t port; diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index eaec0ff1e..e525ff494 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -153,6 +153,7 @@ class SoloReplDevTest : public testing::Test { shared< ReplDevListener > create_repl_dev_listener(uuid_t) override { return std::make_shared< Listener >(m_test); } + void on_repl_devs_init_completed() { LOGINFO("Repl dev init completed CB called"); } std::pair< std::string, uint16_t > lookup_peer(uuid_t uuid) const override { return std::make_pair("", 0u); } replica_id_t get_my_repl_id() const override { return hs_utils::gen_random_uuid(); } }; From 3b7ad0b800dd0676a974ec1ff74fad67d2b69f98 Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Wed, 11 Dec 2024 14:01:10 +0800 Subject: [PATCH 036/170] Calling pre_commit for lsn > dc_lsn. Driven by the nuraft implementation (https://github.com/eBay/NuRaft/blob/1adcc6282109c2ddf1121bbc374d48d303145e39/src/handle_append_entries.cxx#L847-L852): pre_commit is called once a log is appended to the log store, even before it is persisted. For logs recovered from the log store, pre-commit should therefore be called unconditionally. Signed-off-by: Xiaoxi Chen --- conanfile.py | 2 +- .../replication/repl_dev/raft_repl_dev.cpp | 28 +++++++++++-------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/conanfile.py b/conanfile.py index 75a10f115..6f4e8d36c 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.22" + version = "6.5.23" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 4be1aa78e..f34d14464 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1261,12 +1261,12 @@ std::pair< bool, nuraft::cb_func::ReturnCode > RaftReplDev::handle_raft_event(nu auto req = m_state_machine->localize_journal_entry_prepare(*entry); if (req == nullptr) { sisl::VectorPool< repl_req_ptr_t >::free(reqs); - // The hint set here will be used by the next after next appendEntry, the next one - // always go with -1 from NuRraft code. - // + // The hint set here will be used by the next after next appendEntry, the next one + // always go with -1 from NuRraft code. + // // We are rejecting this log entry, meaning we can accept previous log entries. - // If there is nothing we can accept(i==0), that maens we are waiting for commit - // of previous lsn, set it to 1 in this case.
m_state_machine->reset_next_batch_size_hint(std::max(1ul, i)); return {true, nuraft::cb_func::ReturnCode::ReturnNull}; } @@ -1485,16 +1485,20 @@ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx rreq->add_state(repl_req_state_t::LOG_FLUSHED); RD_LOGD("Replay log on restart, rreq=[{}]", rreq->to_string()); + // 2. Pre-commit the log entry as in nuraft pre-commit was called once log appended to logstore. + m_listener->on_pre_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq); + + // LSN above dc_lsn we forgot their states, they can either + // a. be committed before, but DC_LSN not yet flushed + // b. not yet committed, might be committed or rollback if (repl_lsn > m_rd_sb->durable_commit_lsn) { // In memory state of these blks is lost. Commit them now to avoid usage of same blk twice. commit_blk(rreq); + // add rreq to state machine, state-machine will decide to commit or rollback this rreq. m_state_machine->link_lsn_to_req(rreq, int64_cast(repl_lsn)); return; } - // 2. Pre-commit the log entry - m_listener->on_pre_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq); - // 3. Commit the log entry handle_commit(rreq, true /* recovery */); } @@ -1512,8 +1516,8 @@ void RaftReplDev::create_snp_resync_data(raft_buf_ptr_t& data_out) { bool RaftReplDev::apply_snp_resync_data(nuraft::buffer& data) { auto msg = r_cast< snp_repl_dev_data* >(data.data_begin()); - if (msg->magic_num != HOMESTORE_RESYNC_DATA_MAGIC || msg->protocol_version != - HOMESTORE_RESYNC_DATA_PROTOCOL_VERSION_V1) { + if (msg->magic_num != HOMESTORE_RESYNC_DATA_MAGIC || + msg->protocol_version != HOMESTORE_RESYNC_DATA_PROTOCOL_VERSION_V1) { RD_LOGE("Snapshot resync data validation failed, magic={}, version={}", msg->magic_num, msg->protocol_version); return false; } @@ -1521,8 +1525,8 @@ bool RaftReplDev::apply_snp_resync_data(nuraft::buffer& data) { auto received_crc = msg->crc; RD_LOGD("received snapshot resync msg, dsn={}, crc={}, received crc={}", msg->dsn, msg->crc, received_crc); // Clear the crc field before verification, because the crc value computed by leader doesn't contain it. msg->crc = 0; - auto computed_crc = crc32_ieee(init_crc32, reinterpret_cast< const unsigned char* >(msg), - sizeof(snp_repl_dev_data)); + auto computed_crc = + crc32_ieee(init_crc32, reinterpret_cast< const unsigned char* >(msg), sizeof(snp_repl_dev_data)); if (received_crc != computed_crc) { RD_LOGE("Snapshot resync data crc mismatch, received_crc={}, computed_crc={}", received_crc, computed_crc); return false; From 69ea506d3c0cc07f16385ee5d0a9c8f28f64416a Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Sun, 15 Dec 2024 20:02:04 -0700 Subject: [PATCH 037/170] fix potential bug of home raft log store initialization The home raft log store sets the log_found_cb and log_replay_done_cb in the result future of open_log_store, which is an async call. So there is a chance that, when repl_dev starts replaying the log, the above two callbacks are not registered yet, which might lead to errors.
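A hedged sketch of the race and its fix (editor's illustration, simplified from the diff below; the callback names are placeholders):

// Racy pattern: the callbacks are registered only inside the open future's
// continuation, so log replay may begin before they exist.
logstore_service().open_log_store(logdev_id, store_id, true /* append_mode */)
    .thenValue([=](auto log_store) {
        log_store->register_log_found_cb(my_log_found_cb);         // may run too late
        log_store->register_log_replay_done_cb(my_replay_done_cb); // may run too late
    });

// Fixed pattern: the callbacks travel with the open call and are wired up
// while the log store is constructed, before any replay can start.
logstore_service().open_log_store(logdev_id, store_id, true /* append_mode */,
                                  my_log_found_cb, my_replay_done_cb);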
This PR moves the registration out of the future and into the initialization phase of the log store, which avoids this case. --- conanfile.py | 2 +- src/include/homestore/logstore_service.hpp | 3 ++- src/lib/logstore/log_dev.cpp | 8 +++++++- src/lib/logstore/log_dev.hpp | 6 +++++- src/lib/logstore/log_store_service.cpp | 5 +++-- src/lib/replication/log_store/home_raft_log_store.cpp | 6 ++---- 6 files changed, 20 insertions(+), 10 deletions(-) diff --git a/conanfile.py b/conanfile.py index 6f4e8d36c..e14783d2c 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.23" + version = "6.5.25" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/logstore_service.hpp b/src/include/homestore/logstore_service.hpp index 44ba1ab53..18c1e75e3 100644 @@ -132,7 +132,8 @@ class LogStoreService { * @return std::shared_ptr< HomeLogStore > */ folly::Future< shared< HomeLogStore > > open_log_store(logdev_id_t logdev_id, logstore_id_t store_id, - bool append_mode); + bool append_mode, log_found_cb_t log_found_cb = nullptr, + log_replay_done_cb_t log_replay_done_cb = nullptr); /** * @brief Close the log store instance and free-up the resources diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp index e70c7b0f5..6f1f8760f 100644 --- a/src/lib/logstore/log_dev.cpp +++ b/src/lib/logstore/log_dev.cpp @@ -617,7 +617,9 @@ std::shared_ptr< HomeLogStore > LogDev::create_new_log_store(bool append_mode) { return lstore; } -folly::Future< shared< HomeLogStore > > LogDev::open_log_store(logstore_id_t store_id, bool append_mode) { +folly::Future< shared< HomeLogStore > > LogDev::open_log_store(logstore_id_t store_id, bool append_mode, + log_found_cb_t log_found_cb, + log_replay_done_cb_t log_replay_done_cb) { folly::SharedMutexWritePriority::WriteHolder holder(m_store_map_mtx); auto it = m_id_logstore_map.find(store_id); if (it == m_id_logstore_map.end()) { @@ -626,6 +628,8 @@ folly::Future< shared< HomeLogStore > > LogDev::open_log_store(logstore_id_t sto logstore_info{ .log_store = nullptr, .append_mode = append_mode, + .log_found_cb = log_found_cb, + .log_replay_done_cb = log_replay_done_cb, })); HS_REL_ASSERT_EQ(happened, true, "Unable to insert logstore into id_logstore_map"); } @@ -658,6 +662,8 @@ void LogDev::on_log_store_found(logstore_id_t store_id, const logstore_superblk& logstore_info& info = it->second; info.log_store = std::make_shared< HomeLogStore >(shared_from_this(), store_id, info.append_mode, sb.m_first_seq_num); + info.log_store->register_log_found_cb(info.log_found_cb); + info.log_store->register_log_replay_done_cb(info.log_replay_done_cb); info.promise.setValue(info.log_store); } diff --git a/src/lib/logstore/log_dev.hpp b/src/lib/logstore/log_dev.hpp index cf09e57bc..2875d7823 100644 --- a/src/lib/logstore/log_dev.hpp +++ b/src/lib/logstore/log_dev.hpp @@ -564,6 +564,8 @@ class log_stream_reader { struct logstore_info { std::shared_ptr< HomeLogStore > log_store; bool append_mode; + log_found_cb_t log_found_cb{nullptr}; + log_replay_done_cb_t log_replay_done_cb{nullptr}; folly::SharedPromise< std::shared_ptr< HomeLogStore > > promise{}; }; @@ -708,7 +710,9 @@ class LogDev : public std::enable_shared_from_this< LogDev > { /// @param append_mode Is this log store is append mode or not.
If append mode, write_async call fails and only /// append_async calls succeed. /// @return future< shared< HomeLogStore > > : Future which will be set with the log store once it is opened - folly::Future< shared< HomeLogStore > > open_log_store(logstore_id_t store_id, bool append_mode); + folly::Future< shared< HomeLogStore > > open_log_store(logstore_id_t store_id, bool append_mode, + log_found_cb_t log_found_cb = nullptr, + log_replay_done_cb_t log_replay_done_cb = nullptr); /// @brief Remove the log store and its associated resources /// @param store_id Store id that was created/opened diff --git a/src/lib/logstore/log_store_service.cpp b/src/lib/logstore/log_store_service.cpp index d09d02f7c..c98593b43 100644 --- a/src/lib/logstore/log_store_service.cpp +++ b/src/lib/logstore/log_store_service.cpp @@ -273,12 +273,13 @@ std::shared_ptr< HomeLogStore > LogStoreService::create_new_log_store(logdev_id_ } folly::Future< shared< HomeLogStore > > LogStoreService::open_log_store(logdev_id_t logdev_id, logstore_id_t store_id, - bool append_mode) { + bool append_mode, log_found_cb_t log_found_cb, + log_replay_done_cb_t log_replay_done_cb) { folly::SharedMutexWritePriority::ReadHolder holder(m_logdev_map_mtx); const auto it = m_id_logdev_map.find(logdev_id); HS_REL_ASSERT((it != m_id_logdev_map.end()), "logdev id {} doesnt exist", logdev_id); COUNTER_INCREMENT(m_metrics, logstores_count, 1); - return it->second->open_log_store(store_id, append_mode); + return it->second->open_log_store(store_id, append_mode, log_found_cb, log_replay_done_cb); } void LogStoreService::remove_log_store(logdev_id_t logdev_id, logstore_id_t store_id) { diff --git a/src/lib/replication/log_store/home_raft_log_store.cpp b/src/lib/replication/log_store/home_raft_log_store.cpp index 823ab62bc..1c09afa91 100644 --- a/src/lib/replication/log_store/home_raft_log_store.cpp +++ b/src/lib/replication/log_store/home_raft_log_store.cpp @@ -99,13 +99,11 @@ HomeRaftLogStore::HomeRaftLogStore(logdev_id_t logdev_id, logstore_id_t logstore LOGDEBUGMOD(replication, "Opening existing home log_dev={} log_store={}", m_logdev_id, logstore_id); logstore_service().open_logdev(m_logdev_id); m_log_store_future = logstore_service() - .open_log_store(m_logdev_id, logstore_id, true) - .thenValue([this, log_found_cb, log_replay_done_cb](auto log_store) { + .open_log_store(m_logdev_id, logstore_id, true, log_found_cb, log_replay_done_cb) + .thenValue([this](auto log_store) { m_log_store = std::move(log_store); DEBUG_ASSERT_EQ(m_logstore_id, m_log_store->get_store_id(), "Mismatch in passed and create logstore id"); - m_log_store->register_log_found_cb(log_found_cb); - m_log_store->register_log_replay_done_cb(log_replay_done_cb); REPL_STORE_LOG(DEBUG, "Home Log store created/opened successfully"); }); } From 6756b810a0d8ab136c4f99ffafd2ac75a243f6f1 Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Tue, 17 Dec 2024 08:19:19 +0800 Subject: [PATCH 038/170] disable restart for destroy-pending repl-dev (#605) --- conanfile.py | 2 +- src/lib/logstore/log_dev.cpp | 5 ++++- src/lib/logstore/log_store_service.cpp | 5 ++++- .../replication/repl_dev/raft_repl_dev.cpp | 4 +++- .../replication/service/raft_repl_service.cpp | 20 +++++++++++++++++-- 5 files changed, 30 insertions(+), 6 deletions(-) diff --git a/conanfile.py b/conanfile.py index e14783d2c..a8d70345a 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.25" + version = "6.5.26" homepage = "https://github.com/eBay/Homestore" 
description = "HomeStore Storage Engine" diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp index 6f1f8760f..8644692db 100644 --- a/src/lib/logstore/log_dev.cpp +++ b/src/lib/logstore/log_dev.cpp @@ -641,7 +641,10 @@ void LogDev::remove_log_store(logstore_id_t store_id) { { folly::SharedMutexWritePriority::WriteHolder holder(m_store_map_mtx); auto ret = m_id_logstore_map.erase(store_id); - HS_REL_ASSERT((ret == 1), "try to remove invalid store_id {}-{}", m_logdev_id, store_id); + if (ret == 0) { + LOGWARN("try to remove invalid store_id {}-{}", m_logdev_id, store_id); + return; + } } unreserve_store_id(store_id); } diff --git a/src/lib/logstore/log_store_service.cpp b/src/lib/logstore/log_store_service.cpp index c98593b43..375f892b3 100644 --- a/src/lib/logstore/log_store_service.cpp +++ b/src/lib/logstore/log_store_service.cpp @@ -286,7 +286,10 @@ void LogStoreService::remove_log_store(logdev_id_t logdev_id, logstore_id_t stor folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); COUNTER_INCREMENT(m_metrics, logstores_count, 1); const auto it = m_id_logdev_map.find(logdev_id); - HS_REL_ASSERT((it != m_id_logdev_map.end()), "logdev id {} doesnt exist", logdev_id); + if (it == m_id_logdev_map.end()) { + HS_LOG(WARN, logstore, "logdev id {} doesnt exist", logdev_id); + return; + } it->second->remove_log_store(store_id); COUNTER_DECREMENT(m_metrics, logstores_count, 1); } diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index f34d14464..da8535602 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1197,11 +1197,13 @@ std::shared_ptr< nuraft::state_machine > RaftReplDev::get_state_machine() { retu void RaftReplDev::permanent_destroy() { RD_LOGI("Permanent destroy for raft repl dev group_id={}", group_id_str()); - m_rd_sb.destroy(); m_raft_config_sb.destroy(); m_data_journal->remove_store(); logstore_service().destroy_log_dev(m_data_journal->logdev_id()); m_stage.update([](auto* stage) { *stage = repl_dev_stage_t::PERMANENT_DESTROYED; }); + // we should destroy repl_dev superblk only after all the resources are cleaned up, so that is crash recovery + // occurs, we have a chance to find the stale repl_dev and reclaim all the stale resources. + m_rd_sb.destroy(); } void RaftReplDev::leave() { diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index 0fd8940e3..2ed7a3bc1 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -128,7 +128,8 @@ void RaftReplService::start() { // We need to first load the repl_dev with its config and then attach the raft config to that repl dev. for (auto const& [buf, mblk] : m_config_sb_bufs) { auto rdev = raft_group_config_found(buf, voidptr_cast(mblk)); - rdev->on_restart(); + // if repl_dev is in destroy_pending state, it will not be loaded. 
+ if (rdev) rdev->on_restart(); } m_config_sb_bufs.clear(); LOGINFO("Repl devs load completed, calling upper layer on_repl_devs_init_completed"); @@ -383,7 +384,22 @@ void RaftReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki } if (rd_sb->destroy_pending == 0x1) { - LOGINFOMOD(replication, "ReplDev group_id={} was destroyed, skipping the load", group_id); + LOGINFOMOD(replication, "ReplDev group_id={} was destroyed, reclaim the stale resource", group_id); + // if we do not add the repl_dev to m_rd_map, it will not be permanently destroyed since gc thread finds the + // pending destroy repl_dev only from m_rd_map. so, we should try to reclaim all the repl_dev stale resources + // here. + + // 1 since we permanantly destroy the repl_dev here, it will not join_raft group where raft_server will be + // created. hence , no need to detroy it through nuraft_mesg, where raft_server will be shutdown. + // 2 m_raft_config_sb will be destroyed in raft_group_config_found() method if repl_dev is is not found, so + // skip it. + + // 3 logdev will be destroyed in delete_unopened_logdevs() if we don't open it(create repl_dev) here, so skip + // it. + + // 4 destroy the superblk, and after this, the repl_dev will not be loaded and found again. + rd_sb.destroy(); + return; } From c4a06976f335faf28af6b2a6055eb56d03426fa0 Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Tue, 17 Dec 2024 19:14:35 -0700 Subject: [PATCH 039/170] fix twice call of leave Removed From cluster will be called twice when committing config_change log and journal_type_t::HS_CTRL_DESTROY in the moved out member, so the destroy future will be setvalue twice , which will lead to an error. this PR fixes this bug --- conanfile.py | 2 +- .../replication/repl_dev/raft_repl_dev.cpp | 31 +++++++++---------- src/lib/replication/repl_dev/raft_repl_dev.h | 4 ++- src/tests/test_common/raft_repl_test_base.hpp | 10 ++---- 4 files changed, 22 insertions(+), 25 deletions(-) diff --git a/conanfile.py b/conanfile.py index a8d70345a..f241149e6 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.26" + version = "6.5.27" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index da8535602..d92464c8b 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1183,7 +1183,7 @@ nuraft::ptr< nuraft::log_store > RaftReplDev::load_log_store() { return m_data_j int32_t RaftReplDev::server_id() { return m_raft_server_id; } -bool RaftReplDev::is_destroy_pending() const { return (m_rd_sb->destroy_pending == 0x1); } +bool RaftReplDev::is_destroy_pending() const { return (*m_stage.access().get() == repl_dev_stage_t::DESTROYED); } bool RaftReplDev::is_destroyed() const { return (*m_stage.access().get() == repl_dev_stage_t::PERMANENT_DESTROYED); } /////////////////////////////////// nuraft_mesg::mesg_state_mgr overrides //////////////////////////////////// @@ -1207,6 +1207,19 @@ void RaftReplDev::permanent_destroy() { } void RaftReplDev::leave() { + // this will be called in 3 cases : + // 1. commit log entry of journal_type_t::HS_CTRL_DESTROY + // 2. it is removed from the cluster and the new config(excluding this node) is being committed on this node + // 3. 
it is removed from the cluster , but the node is down and new config log(excluding this node) is not + // replicated to this removed node. when the node restart, leader will not send any append entry to this node, + // since it is not a member of the raft group. it will become a condidate and send request-vote request to other + // members of this raft group. a member will send RemovedFromCluster to the node if this member finds the node + // is no longer a member of the raft group. + + // leave() will never be called concurrently, since config change and journal_type_t::HS_CTRL_DESTROY are all log + // entry, which will be committed sequentially. + if (is_destroy_pending()) return; + // We update that this repl_dev in destroyed state, actual clean up of resources happen in reaper thread later m_stage.update([](auto* stage) { *stage = repl_dev_stage_t::DESTROYED; }); m_destroyed_time = Clock::now(); @@ -1288,21 +1301,7 @@ std::pair< bool, nuraft::cb_func::ReturnCode > RaftReplDev::handle_raft_event(nu return {true, ret}; } - case nuraft::cb_func::Type::RemovedFromCluster: { - // a node will reach here when : - // 1. it is removed from the cluster and the new config(excluding this node) is being committed on this node - // 2. it is removed from the cluster , but the node is down and new config log(excluding this node) is not - // replicated to this removed node. when the node restart, leader will not send any append entry to this node, - // since it is not a member of the raft group. it will become a condidate and send request-vote request to other - // members of this raft group. a member will send RemovedFromCluster to the node if this member finds the node - // is no longer a member of the raft group. - - // this will lazily cleanup the group - // TODO:cleanup this repl dev ASAP if necessary. - leave(); - - return {true, ret}; - } + // RemovedFromCluster will be handled in nuraft_mesg::generic_raft_event_handler where leave() is called // TODO: Add more type handler if necessary default: diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 0550858cf..9e29a5737 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -229,7 +229,9 @@ class RaftReplDev : public ReplDev, * * @param num_reserved_entries The number of reserved entries of the replication log. 
*/ - void truncate(uint32_t num_reserved_entries) { m_data_journal->truncate(num_reserved_entries, m_compact_lsn.load()); } + void truncate(uint32_t num_reserved_entries) { + m_data_journal->truncate(num_reserved_entries, m_compact_lsn.load()); + } void wait_for_logstore_ready() { m_data_journal->wait_for_log_store_ready(); } diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 7445568b8..21d5fa3f2 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -182,16 +182,12 @@ class TestReplicatedDB : public homestore::ReplDevListener { return make_async_success<>(); } - static int64_t get_next_lsn(uint64_t& obj_id) { - return obj_id & ((1ULL << 63) - 1); - } - static void set_resync_msg_type_bit(uint64_t& obj_id) { - obj_id |= 1ULL << 63; - } + static int64_t get_next_lsn(uint64_t& obj_id) { return obj_id & ((1ULL << 63) - 1); } + static void set_resync_msg_type_bit(uint64_t& obj_id) { obj_id |= 1ULL << 63; } int read_snapshot_obj(shared< snapshot_context > context, shared< snapshot_obj > snp_data) override { auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); - if(RaftStateMachine::is_hs_snp_obj(snp_data->offset)) { + if (RaftStateMachine::is_hs_snp_obj(snp_data->offset)) { LOGERRORMOD(replication, "invalid snapshot offset={}", snp_data->offset); return -1; } From 707c11137a8de7dc859a6d00e933b8b33eee59e8 Mon Sep 17 00:00:00 2001 From: Sanal Date: Wed, 18 Dec 2024 08:59:07 -0800 Subject: [PATCH 040/170] Add lock for log dev read api's. (#612) Avoid concurrent access to journal vdev for truncate, write and read APIs from different threads. --- src/lib/logstore/log_dev.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp index 8644692db..01a6b4181 100644 --- a/src/lib/logstore/log_dev.cpp +++ b/src/lib/logstore/log_dev.cpp @@ -264,6 +264,7 @@ int64_t LogDev::append_async(logstore_id_t store_id, logstore_seq_num_t seq_num, } log_buffer LogDev::read(const logdev_key& key) { + std::unique_lock lg = flush_guard(); auto buf = sisl::make_byte_array(initial_read_size, m_flush_size_multiple, sisl::buftag::logread); auto ec = m_vdev_jd->sync_pread(buf->bytes(), initial_read_size, key.dev_offset); if (ec) { @@ -292,6 +293,7 @@ log_buffer LogDev::read(const logdev_key& key) { } void LogDev::read_record_header(const logdev_key& key, serialized_log_record& return_record_header) { + std::unique_lock lg = flush_guard(); auto buf = sisl::make_byte_array(initial_read_size, m_flush_size_multiple, sisl::buftag::logread); auto ec = m_vdev_jd->sync_pread(buf->bytes(), initial_read_size, key.dev_offset); if (ec) LOGERROR("Failed to read from Journal vdev log_dev={} {} {}", m_logdev_id, ec.value(), ec.message()); From 0de97cd8e4777b924d8aa434379f802ae159ed20 Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Thu, 12 Dec 2024 15:20:01 +0800 Subject: [PATCH 041/170] Ensure Consistent LSN Before Opening for Traffic in Raft Group We identified a gap when the majority of members in a Raft group are down. To save IO operations, we do not persist the last_commit_idx for every commit but instead at regular intervals. Consequently, upon reboot, we may not reflect the latest commit, leaving some logs in the state machine waiting for re-commitment. For instance, if we committed up to LSN 103 but only persisted up to LSN 100, then LSNs 101-103 will remain in the log-store, awaiting re-commitment from the leader.
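For intuition, a minimal sketch of why the persisted commit index can trail the in-memory one (editor's illustration only; these names are not the HomeStore ones):

// The in-memory commit index advances on every commit but is persisted only
// every flush_interval commits; after a crash, recovery resumes from the last
// persisted value, so any later LSNs await re-commitment from the leader.
struct CommitTracker {
    uint64_t commit_idx{0};          // in-memory, always current
    uint64_t durable_commit_idx{0};  // what a restart recovers
    static constexpr uint64_t flush_interval{100};
    void on_commit() {
        ++commit_idx;
        if (commit_idx % flush_interval == 0) { durable_commit_idx = commit_idx; /* persist */ }
    }
};
// Committing through LSN 103 with flush_interval 100 leaves durable_commit_idx
// at 100, so LSNs 101-103 sit in the log store until re-committed.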
If all members restart after a disaster, they face the following state: - [S1]: commit_idx 100, last_log {idx = 105, term = 1} - S2: commit_idx 100, last_log {idx = 103, term = 1} - S3: commit_idx 100, last_log {idx = 103, term = 1} If S1 opens for traffic at this point, previously committed LSN 102 might return NOT_FOUND to clients due to the uncommitted state. Proposed Solution: - Mark last_log_idx as `traffic_ready_lsn` in the BECOME_LEADER callback. In the example above, it is 105 if S1 becomes the leader. - The leader will not accept IO until it commits up to this `consistent_lsn` (105), ensuring correctness by over-committing. - The HO will call `repl_dev->is_ready_for_traffic()` for each IO. - On followers, the traffic_ready_lsn is zero so it allows all. - On the leader, all requests are rejected until it commits to the `traffic_ready_lsn` (105 in this example). Signed-off-by: Xiaoxi Chen --- conanfile.py | 4 +-- src/include/homestore/replication/repl_dev.h | 9 ++++-- src/lib/replication/repl_dev/common.cpp | 3 +- .../replication/repl_dev/raft_repl_dev.cpp | 23 +++++++++++---- src/lib/replication/repl_dev/raft_repl_dev.h | 29 +++++++++++++++++-- .../repl_dev/raft_state_machine.cpp | 2 +- src/lib/replication/repl_dev/solo_repl_dev.h | 1 + src/tests/test_common/raft_repl_test_base.hpp | 11 ++++++- 8 files changed, 67 insertions(+), 15 deletions(-) diff --git a/conanfile.py b/conanfile.py index f241149e6..f557e1bba 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.27" + version = "6.5.28" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" @@ -54,7 +54,7 @@ def build_requirements(self): def requirements(self): self.requires("iomgr/[^11.3]@oss/master", transitive_headers=True) self.requires("sisl/[^12.2]@oss/master", transitive_headers=True) - self.requires("nuraft_mesg/[^3.7]@oss/main", transitive_headers=True) + self.requires("nuraft_mesg/[^3.7.1]@oss/main", transitive_headers=True) self.requires("farmhash/cci.20190513@", transitive_headers=True) if self.settings.arch in ['x86', 'x86_64']: diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index 335cda834..ec8344be0 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -124,8 +124,9 @@ struct snapshot_obj { bool is_last_obj{false}; }; -//HomeStore has some meta information to be transmitted during the baseline resync, -//Although now only dsn needs to be synced, this structure is defined as a general message, and we can easily add data if needed in the future. +// HomeStore has some meta information to be transmitted during the baseline resync, +// Although now only dsn needs to be synced, this structure is defined as a general message, and we can easily add data +// if needed in the future. struct snp_repl_dev_data { uint64_t magic_num{HOMESTORE_RESYNC_DATA_MAGIC}; uint32_t protocol_version{HOMESTORE_RESYNC_DATA_PROTOCOL_VERSION_V1}; @@ -468,6 +469,10 @@ class ReplDev { /// @return last_commit_lsn virtual repl_lsn_t get_last_commit_lsn() const = 0; + /// @brief if this replica is ready for accepting client IO. 
+ /// @return true if ready, false otherwise + virtual bool is_ready_for_traffic() const = 0; + virtual void attach_listener(shared< ReplDevListener > listener) { m_listener = std::move(listener); } virtual void detach_listener() { diff --git a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp index 1c2a8c560..e5b34dbcd 100644 --- a/src/lib/replication/repl_dev/common.cpp +++ b/src/lib/replication/repl_dev/common.cpp @@ -174,8 +174,7 @@ void repl_req_ctx::release_data() { m_buf_for_unaligned_data = sisl::io_blob_safe{}; if (m_pushed_data) { LOGTRACEMOD(replication, "m_pushed_data addr={}, m_rkey={}, m_lsn={}", - static_cast(m_pushed_data.get()), - m_rkey.to_string(), m_lsn); + static_cast< void* >(m_pushed_data.get()), m_rkey.to_string(), m_lsn); m_pushed_data->send_response(); m_pushed_data = nullptr; } diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index d92464c8b..2449f7833 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1237,8 +1237,7 @@ void RaftReplDev::leave() { m_destroy_promise.setValue(ReplServiceError::OK); // In case proposer is waiting for the destroy to complete } -std::pair< bool, nuraft::cb_func::ReturnCode > RaftReplDev::handle_raft_event(nuraft::cb_func::Type type, - nuraft::cb_func::Param* param) { +nuraft::cb_func::ReturnCode RaftReplDev::raft_event(nuraft::cb_func::Type type, nuraft::cb_func::Param* param) { auto ret = nuraft::cb_func::ReturnCode::Ok; switch (type) { @@ -1283,7 +1282,7 @@ std::pair< bool, nuraft::cb_func::ReturnCode > RaftReplDev::handle_raft_event(nu // If there is nothing we can accept(i==0), that maens we are waiting for commit // of previous lsn, set it to 1 in this case. 
m_state_machine->reset_next_batch_size_hint(std::max(1ul, i)); - return {true, nuraft::cb_func::ReturnCode::ReturnNull}; + return nuraft::cb_func::ReturnCode::ReturnNull; } reqs->emplace_back(std::move(req)); } @@ -1298,7 +1297,21 @@ std::pair< bool, nuraft::cb_func::ReturnCode > RaftReplDev::handle_raft_event(nu sisl::VectorPool< repl_req_ptr_t >::free(reqs); } if (ret == nuraft::cb_func::ReturnCode::Ok) { m_state_machine->inc_next_batch_size_hint(); } - return {true, ret}; + return ret; + } + case nuraft::cb_func::Type::JoinedCluster: + RD_LOGD("Raft channel: Received JoinedCluster, implies become_follower"); + become_follower_cb(); + return nuraft::cb_func::ReturnCode::Ok; + case nuraft::cb_func::Type::BecomeFollower: { + RD_LOGD("Raft channel: Received BecomeFollower"); + become_follower_cb(); + return nuraft::cb_func::ReturnCode::Ok; + } + case nuraft::cb_func::Type::BecomeLeader: { + RD_LOGD("Raft channel: Received BecomeLeader"); + become_leader_cb(); + return nuraft::cb_func::ReturnCode::Ok; } // RemovedFromCluster will be handled in nuraft_mesg::generic_raft_event_handler where leave() is called @@ -1307,7 +1320,7 @@ std::pair< bool, nuraft::cb_func::ReturnCode > RaftReplDev::handle_raft_event(nu default: break; } - return {false, ret}; + return nuraft::cb_func::ReturnCode::Ok; } void RaftReplDev::flush_durable_commit_lsn() { diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 9e29a5737..e9ec2a1ad 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -137,6 +137,10 @@ class RaftReplDev : public ReplDev, std::atomic< repl_lsn_t > m_commit_upto_lsn{0}; // LSN which was lastly written, to track flushes std::atomic< repl_lsn_t > m_compact_lsn{0}; // LSN upto which it was compacted, it is used to track where to + // The `traffic_ready_lsn` variable holds the Log Sequence Number (LSN) up to which + // the state machine should committed to before accepting traffic. This threshold ensures that + // all potential committed log be committed before handling incoming requests. + std::atomic< repl_lsn_t > m_traffic_ready_lsn{0}; std::mutex m_sb_mtx; // Lock to protect the repl dev superblock @@ -187,6 +191,13 @@ class RaftReplDev : public ReplDev, bool is_destroy_pending() const; bool is_destroyed() const; Clock::time_point destroyed_time() const { return m_destroyed_time; } + bool is_ready_for_traffic() const { + auto committed_lsn = m_commit_upto_lsn.load(); + auto gate = m_traffic_ready_lsn.load(); + bool ready = committed_lsn >= gate; + if (!ready) { RD_LOGD("Not yet ready for traffic, committed to {} but gate is {}", committed_lsn, gate); } + return ready; + } //////////////// Accessor/shortcut methods /////////////////////// nuraft_mesg::repl_service_ctx* group_msg_service(); @@ -206,6 +217,20 @@ class RaftReplDev : public ReplDev, cshared< ReplDevCPContext > get_cp_ctx(CP* cp); void cp_cleanup(CP* cp); void become_ready(); + void become_leader_cb() { + auto new_gate = raft_server()->get_last_log_idx(); + repl_lsn_t existing_gate = 0; + if (!m_traffic_ready_lsn.compare_exchange_strong(existing_gate, new_gate)) { + // was a follower, m_traffic_ready_lsn should be zero on follower. + RD_REL_ASSERT(existing_gate == 0, "existing gate should be zero"); + } + RD_LOGD("become_leader_cb: setting traffic_ready_lsn from {} to {}", existing_gate, new_gate); + }; + void become_follower_cb() { + // m_traffic_ready_lsn should be zero on follower. 
+ m_traffic_ready_lsn.store(0); + RD_LOGD("become_follower_cb setting traffic_ready_lsn to 0"); + } /// @brief This method is called when the data journal is compacted /// @@ -270,8 +295,8 @@ class RaftReplDev : public ReplDev, std::shared_ptr< nuraft::state_machine > get_state_machine() override; void permanent_destroy() override; void leave() override; - std::pair< bool, nuraft::cb_func::ReturnCode > handle_raft_event(nuraft::cb_func::Type, - nuraft::cb_func::Param*) override; + + nuraft::cb_func::ReturnCode raft_event(nuraft::cb_func::Type, nuraft::cb_func::Param*) override; private: shared< nuraft::log_store > data_journal() { return m_data_journal; } diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index 8909614a0..09bd6b7ba 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -385,7 +385,7 @@ bool RaftStateMachine::apply_snapshot(nuraft::snapshot& s) { m_rd.m_data_journal->set_last_durable_lsn(s.get_last_log_idx()); auto snp_ctx = std::make_shared< nuraft_snapshot_context >(s); auto res = m_rd.m_listener->apply_snapshot(snp_ctx); - //make sure the changes are flushed. + // make sure the changes are flushed. hs()->cp_mgr().trigger_cp_flush(true /* force */).get(); return res; } diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index 911f4bd28..e5f33fb63 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -53,6 +53,7 @@ class SoloReplDev : public ReplDev { std::vector< peer_info > get_replication_status() const override { return std::vector< peer_info >{peer_info{.id_ = m_group_id, .replication_idx_ = 0, .last_succ_resp_us_ = 0}}; } + bool is_ready_for_traffic() const override { return true; } uuid_t group_id() const override { return m_group_id; } diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 21d5fa3f2..19a346f5a 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -350,6 +350,10 @@ class TestReplicatedDB : public homestore::ReplDevListener { void validate_db_data() { g_helper->runner().set_num_tasks(inmem_db_.size()); + while (!repl_dev()->is_ready_for_traffic()) { + LOGINFO("not yet ready for traffic, waiting"); + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + } LOGINFOMOD(replication, "[{}]: Total {} keys committed, validating them", boost::uuids::to_string(repl_dev()->group_id()), inmem_db_.size()); @@ -554,7 +558,8 @@ class RaftReplDevTestBase : public testing::Test { if (dbs_[0]->repl_dev() == nullptr) return; do { - auto leader_uuid = dbs_[0]->repl_dev()->get_leader_id(); + auto repl_dev = dbs_[0]->repl_dev(); + auto leader_uuid = repl_dev->get_leader_id(); if (leader_uuid.is_nil()) { LOGINFO("Waiting for leader to be elected"); @@ -562,6 +567,10 @@ class RaftReplDevTestBase : public testing::Test { } else if (leader_uuid == g_helper->my_replica_id()) { LOGINFO("Writing {} entries since I am the leader my_uuid={}", num_entries, boost::uuids::to_string(g_helper->my_replica_id())); + if (!repl_dev->is_ready_for_traffic()) { + LOGINFO("leader is not yet ready for traffic, waiting"); + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + } auto const block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); g_helper->runner().set_num_tasks(num_entries);
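Taken together, this patch implements a simple admission gate: BecomeLeader raises `m_traffic_ready_lsn` to the leader's last raft log index, BecomeFollower (and JoinedCluster) drops it back to zero, and traffic is admitted only once the commit LSN catches up to the gate. A minimal self-contained sketch of that behavior, using hypothetical names rather than the real HomeStore types:

```
#include <atomic>
#include <cstdint>

// Hypothetical model of the traffic-ready gate: a newly elected leader must
// re-commit everything up to its last raft log index before serving traffic.
class TrafficGate {
public:
    void on_become_leader(int64_t last_log_idx) {
        int64_t expected = 0;
        // Raise the gate only if it is currently zero (i.e., we were a follower);
        // a repeated BecomeLeader event keeps the earlier gate.
        gate_.compare_exchange_strong(expected, last_log_idx);
    }
    void on_become_follower() { gate_.store(0); }           // followers are always ready
    void on_commit(int64_t lsn) { committed_.store(lsn); }  // commit LSN grows monotonically
    bool is_ready_for_traffic() const { return committed_.load() >= gate_.load(); }

private:
    std::atomic<int64_t> committed_{0};
    std::atomic<int64_t> gate_{0};
};
```

Callers poll the gate before issuing traffic, which is exactly what the test helper above does with its 500 ms retry loop.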
From 4bd3e127e880d831f0ae7f39f221bd9fcc468b65 Mon Sep 17 00:00:00 2001 From: koujl <108138320+koujl@users.noreply.github.com> Date: Fri, 20 Dec 2024 11:28:37 +0800 Subject: [PATCH 042/170] Refine variant_node::put return value (#615) Replace the generic put_failed error status with more specific error messages to enhance clarity and prevent confusion during log analysis. --- .../homestore/btree/detail/btree_internal.hpp | 4 +-- .../btree/detail/btree_mutate_impl.ipp | 8 ++---- .../homestore/btree/detail/variant_node.hpp | 28 +++++++++++-------- src/tests/btree_helpers/btree_test_helper.hpp | 3 +- src/tests/test_btree_node.cpp | 16 +++++++---- 5 files changed, 32 insertions(+), 27 deletions(-) diff --git a/src/include/homestore/btree/detail/btree_internal.hpp b/src/include/homestore/btree/detail/btree_internal.hpp index 8989a2d5d..14533a8e5 100644 --- a/src/include/homestore/btree/detail/btree_internal.hpp +++ b/src/include/homestore/btree/detail/btree_internal.hpp @@ -201,8 +201,8 @@ VENUM(btree_node_type, uint32_t, FIXED = 0, VAR_VALUE = 1, VAR_KEY = 2, VAR_OBJE VENUM(btree_store_type, uint8_t, MEM = 0, SSD = 1) #endif -ENUM(btree_status_t, uint32_t, success, not_found, retry, has_more, node_read_failed, put_failed, space_not_avail, - cp_mismatch, merge_not_required, merge_failed, crc_mismatch, not_supported, node_freed) +ENUM(btree_status_t, uint32_t, success, not_found, retry, has_more, node_read_failed, already_exists, filtered_out, + space_not_avail, cp_mismatch, merge_not_required, merge_failed, crc_mismatch, not_supported, node_freed) /*ENUM(btree_node_write_type, uint8_t, new_node, // Node write whenever a new node is created. diff --git a/src/include/homestore/btree/detail/btree_mutate_impl.ipp b/src/include/homestore/btree/detail/btree_mutate_impl.ipp index 3e90ccfd5..3cfc19a18 100644 --- a/src/include/homestore/btree/detail/btree_mutate_impl.ipp +++ b/src/include/homestore/btree/detail/btree_mutate_impl.ipp @@ -56,7 +56,7 @@ retry: const auto matched = my_node->match_range(req.working_range(), start_idx, end_idx); if (!matched) { BT_NODE_LOG_ASSERT(false, my_node, "match_range returns 0 entries for interior node is not valid pattern"); - ret = btree_status_t::put_failed; + ret = btree_status_t::not_found; goto out; } } else if constexpr (std::is_same_v< ReqT, BtreeSinglePutRequest >) { @@ -182,10 +182,8 @@ btree_status_t Btree< K, V >::mutate_write_leaf_node(const BtreeNodePtr& my_node req.shift_working_range(); } } else if constexpr (std::is_same_v< ReqT, BtreeSinglePutRequest >) { - if (!to_variant_node(my_node)->put(req.key(), req.value(), req.m_put_type, req.m_existing_val, - req.m_filter_cb)) { - ret = btree_status_t::put_failed; - } + ret = to_variant_node(my_node)->put(req.key(), req.value(), req.m_put_type, req.m_existing_val, + req.m_filter_cb); COUNTER_INCREMENT(m_metrics, btree_obj_count, 1); } diff --git a/src/include/homestore/btree/detail/variant_node.hpp b/src/include/homestore/btree/detail/variant_node.hpp index 0814f6187..004313ce1 100644 --- a/src/include/homestore/btree/detail/variant_node.hpp +++ b/src/include/homestore/btree/detail/variant_node.hpp @@ -192,14 +192,14 @@ class VariantNode : public StoreSpecificBtreeNode { /// translates into one of "Insert", "Update" or "Upsert". /// @param existing_val [optional] A pointer to a value to store the value of the existing entry if it was updated. /// @param filter_cb [optional] A callback function to be called for each entry found in the node that has a key. 
It - /// is used as an filter to remove anything that needn't be updated. - /// @return A boolean indicating whether the operation was successful. + /// is used as a filter to remove anything that needn't be updated. + /// @return A status code indicating whether the operation was successful. /// - virtual bool put(BtreeKey const& key, BtreeValue const& val, btree_put_type put_type, BtreeValue* existing_val, - put_filter_cb_t const& filter_cb = nullptr) { + virtual btree_status_t put(BtreeKey const &key, BtreeValue const &val, btree_put_type put_type, + BtreeValue *existing_val, put_filter_cb_t const &filter_cb = nullptr) { LOGMSG_ASSERT_EQ(magic(), BTREE_NODE_MAGIC, "Magic mismatch on btree_node {}", get_persistent_header_const()->to_string()); - bool ret = true; + auto ret = btree_status_t::success; DEBUG_ASSERT_EQ( this->is_leaf(), true, @@ -210,22 +210,26 @@ class VariantNode : public StoreSpecificBtreeNode { if (existing_val) { get_nth_value(idx, existing_val, true); } if (filter_cb && filter_cb(get_nth_key< K >(idx, false), get_nth_value(idx, false), val) != - put_filter_decision::replace) { - return false; + put_filter_decision::replace) { + LOGINFO("Filter callback rejected the update for key {}", key.to_string()); + return btree_status_t::filtered_out; } } if (put_type == btree_put_type::INSERT) { if (found) { - LOGDEBUG("Attempt to insert duplicate entry {}", key.to_string()); - return false; + LOGINFO("Attempt to insert duplicate entry {}", key.to_string()); + return btree_status_t::already_exists; } - ret = (insert(idx, key, val) == btree_status_t::success); + ret = insert(idx, key, val); } else if (put_type == btree_put_type::UPDATE) { - if (!found) return false; + if (!found) { + LOGINFO("Attempt to update non-existent entry {}", key.to_string()); + return btree_status_t::not_found; + } update(idx, key, val); } else if (put_type == btree_put_type::UPSERT) { - (found) ? update(idx, key, val) : (void)insert(idx, key, val); + found ? update(idx, key, val) : (void) insert(idx, key, val); } else { DEBUG_ASSERT(false, "Wrong put_type {}", put_type); } diff --git a/src/tests/btree_helpers/btree_test_helper.hpp b/src/tests/btree_helpers/btree_test_helper.hpp index a047fed23..1480f5358 100644 --- a/src/tests/btree_helpers/btree_test_helper.hpp +++ b/src/tests/btree_helpers/btree_test_helper.hpp @@ -439,8 +439,7 @@ struct BtreeTestHelper { K key = K{k}; auto sreq = BtreeSinglePutRequest{&key, &value, put_type, existing_v.get()}; sreq.enable_route_tracing(); - bool done = expect_success ? 
(m_bt->put(sreq) == btree_status_t::success) - : m_bt->put(sreq) == btree_status_t::put_failed; + bool done = expect_success == (m_bt->put(sreq) == btree_status_t::success); if (put_type == btree_put_type::INSERT) { ASSERT_EQ(done, !m_shadow_map.exists(key)); diff --git a/src/tests/test_btree_node.cpp b/src/tests/test_btree_node.cpp index 4eb775572..2b1a02e71 100644 --- a/src/tests/test_btree_node.cpp +++ b/src/tests/test_btree_node.cpp @@ -88,13 +88,17 @@ struct NodeTest : public testing::Test { K key{k}; V value{V::generate_rand()}; V existing_v; - bool done = m_node1->put(key, value, put_type, &existing_v); + btree_status_t status = m_node1->put(key, value, put_type, &existing_v); - bool expected_done{true}; - if (m_shadow_map.find(key) != m_shadow_map.end()) { expected_done = (put_type != btree_put_type::INSERT); } - ASSERT_EQ(done, expected_done) << "Expected put of key " << k << " of put_type " << enum_name(put_type) - << " to be " << expected_done; - if (expected_done) { + auto expected_status = btree_status_t::success; + if (m_shadow_map.contains(key)) { + expected_status = put_type != btree_put_type::INSERT + ? btree_status_t::success + : btree_status_t::already_exists; + } + ASSERT_EQ(status, expected_status) << "Expected put of key " << k << " of put_type " << enum_name(put_type) + << " to be " << expected_status; + if (expected_status == btree_status_t::success) { m_shadow_map.insert(std::make_pair(key, value)); } else { const auto r = m_shadow_map.find(key); From 6695df205d1b579f76a07734a9bc5c6eec2d86c4 Mon Sep 17 00:00:00 2001 From: koujl <108138320+koujl@users.noreply.github.com> Date: Fri, 20 Dec 2024 16:59:49 +0800 Subject: [PATCH 043/170] read_logical_snp_obj: pass user_ctx to prevent memleak (#617) Ensure user_ctx is passed up to NuRaft regardless of the return value to enable the cleanup of the allocated context object. Signed-off-by: Jilong Kou --- conanfile.py | 2 +- src/lib/replication/repl_dev/raft_state_machine.cpp | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/conanfile.py b/conanfile.py index f557e1bba..67cb2dad9 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.28" + version = "6.5.29" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index 09bd6b7ba..c8f7f118a 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -332,10 +332,9 @@ int RaftStateMachine::read_logical_snp_obj(nuraft::snapshot& s, void*& user_ctx, // Listener will read the snapshot data and we pass through the same. int ret = m_rd.m_listener->read_snapshot_obj(snp_ctx, snp_data); + user_ctx = snp_data->user_ctx; // Have to pass the user_ctx to NuRaft even if ret<0 to get it freed later if (ret < 0) return ret; - // Update user_ctx and whether is_last_obj - user_ctx = snp_data->user_ctx; is_last_obj = snp_data->is_last_obj; // We are doing a copy here.
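The one-line move in the patch above encodes a general out-parameter ownership rule: when a callee allocates a context that the caller is responsible for freeing, the out-parameter must be assigned on every path, including early error returns. A tiny self-contained illustration of the rule with hypothetical names (this is not the NuRaft API):

```
#include <cstdlib>

struct ReadResult { void* user_ctx{nullptr}; bool is_last_obj{false}; };

// Stub callee standing in for the listener: it can fail *after* allocating user_ctx.
int read_one_obj(ReadResult& out) {
    out.user_ctx = std::malloc(16);
    return -1;  // simulate a failure that happens after the allocation
}

int read_wrapper(void*& user_ctx, bool& is_last_obj) {
    ReadResult res{};
    int ret = read_one_obj(res);
    user_ctx = res.user_ctx;   // hand ownership out before any early return,
    if (ret < 0) return ret;   // so the caller can free it even on failure
    is_last_obj = res.is_last_obj;
    return 0;
}
```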
From f69e78ecfdac3868e285b73fe3c835231c1be142 Mon Sep 17 00:00:00 2001 From: yawzhang Date: Tue, 10 Dec 2024 18:15:30 +0800 Subject: [PATCH 044/170] 1. fix handle_error and only trigger handle_error for timeout rreqs in handle_raft_event 2. include the concept 'volatile' vs 'non-volatile' for logs 3. update replay logic: add BLK_ALLOCATED and DATA_RECEIVED only when data is linked and received --- src/include/homestore/replication/repl_dev.h | 3 ++ .../replication/log_store/repl_log_store.cpp | 13 ++++++ .../replication/repl_dev/raft_repl_dev.cpp | 44 ++++++++++++++----- src/lib/replication/repl_dev/raft_repl_dev.h | 9 +++- .../repl_dev/raft_state_machine.cpp | 8 +++- 5 files changed, 62 insertions(+), 15 deletions(-) diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index ec8344be0..db79b5f9c 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -152,6 +152,7 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: int64_t lsn() const { return m_lsn; } bool is_proposer() const { return m_is_proposer; } journal_type_t op_code() const { return m_op_code; } + bool is_volatile() const { return m_is_volatile.load(); } sisl::blob const& header() const { return m_header; } sisl::blob const& key() const { return m_key; } @@ -222,6 +223,7 @@ void set_remote_blkid(RemoteBlkId const& rbid) { m_remote_blkid = rbid; } void set_local_blkid(MultiBlkId const& lbid) { m_local_blkid = lbid; } // Only used during recovery + void set_is_volatile(bool is_volatile) { m_is_volatile.store(is_volatile); } void set_lsn(int64_t lsn); void add_state(repl_req_state_t s); bool add_state_if_not_already(repl_req_state_t s); @@ -248,6 +250,7 @@ bool m_is_proposer{false}; // Is the repl_req proposed by this node Clock::time_point m_start_time; // Start time of the request journal_type_t m_op_code{journal_type_t::HS_DATA_INLINED}; // Operation code for this request + std::atomic< bool > m_is_volatile{true}; // Is the log still in memory and not flushed to disk yet /////////////// Data related section ///////////////// MultiBlkId m_local_blkid; // Local BlkId for the data diff --git a/src/lib/replication/log_store/repl_log_store.cpp b/src/lib/replication/log_store/repl_log_store.cpp index 36cec9370..97d70ff92 100644 --- a/src/lib/replication/log_store/repl_log_store.cpp +++ b/src/lib/replication/log_store/repl_log_store.cpp @@ -93,6 +93,19 @@ void ReplLogStore::end_of_append_batch(ulong start_lsn, ulong count) { if (rreq) { rreq->add_state(repl_req_state_t::LOG_FLUSHED); } } } + + // Convert volatile logs to non-volatile logs in state machine + for (int64_t lsn = int64_cast(start_lsn); lsn <= end_lsn; ++lsn) { + auto rreq = m_sm.lsn_to_req(lsn); + if (rreq != nullptr) { + if (rreq->has_state(repl_req_state_t::ERRORED)) { + RD_LOGE("Raft Channel: rreq=[{}] met some errors before", rreq->to_compact_string()); + continue; + } + rreq->set_is_volatile(false); + } + } + sisl::VectorPool< repl_req_ptr_t >::free(reqs); sisl::VectorPool< repl_req_ptr_t >::free(proposer_reqs); }
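The loop just added to end_of_append_batch() is the only place this flag flips: every request is born volatile and becomes non-volatile exactly when the log batch containing it is reported durable, with errored requests excluded. A compact model of that lifecycle (hypothetical types, not the HomeStore classes):

```
#include <atomic>

// Minimal model of the volatile-log flag: an entry is "volatile" while its raft
// log record exists only in memory, and turns non-volatile once the batch
// containing it has been flushed to the log device.
struct LogEntryState {
    std::atomic<bool> is_volatile{true};
    std::atomic<bool> errored{false};

    void on_batch_flushed() {
        if (errored.load()) return;  // errored entries never become durable state
        is_volatile.store(false);
    }
};
```

The raft-channel changes below then use this flag as a sanity check: handle_error() must never run against a request whose log is already durable.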
diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 2449f7833..1270ed761 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -594,7 +594,8 @@ folly::Future< folly::Unit > RaftReplDev::notify_after_data_written(std::vector< }); } -bool RaftReplDev::wait_for_data_receive(std::vector< repl_req_ptr_t > const& rreqs, uint64_t timeout_ms, + std::vector< repl_req_ptr_t >* timeout_rreqs) { std::vector< folly::Future< folly::Unit > > futs; std::vector< repl_req_ptr_t > only_wait_reqs; only_wait_reqs.reserve(rreqs.size()); @@ -621,14 +622,23 @@ bool RaftReplDev::wait_for_data_receive(std::vector< repl_req_ptr_t > const& rre // We are yet to support reactive fetch from remote. if (is_resync_mode()) { - check_and_fetch_remote_data(std::move(only_wait_reqs)); + check_and_fetch_remote_data(only_wait_reqs); } else { - m_repl_svc.add_to_fetch_queue(shared_from_this(), std::move(only_wait_reqs)); + m_repl_svc.add_to_fetch_queue(shared_from_this(), only_wait_reqs); } // block waiting here until all the futs are ready (data channel filled in and promises are made); - auto all_futs = folly::collectAllUnsafe(futs).wait(std::chrono::milliseconds(timeout_ms)); - return (all_futs.isReady()); + auto all_futs_ready = folly::collectAllUnsafe(futs).wait(std::chrono::milliseconds(timeout_ms)).isReady(); + if (!all_futs_ready && timeout_rreqs != nullptr) { + timeout_rreqs->clear(); + for (size_t i{0}; i < futs.size(); ++i) { + if (!futs[i].isReady()) { + timeout_rreqs->emplace_back(only_wait_reqs[i]); + } + } + all_futs_ready = timeout_rreqs->empty(); + } + return all_futs_ready; } void RaftReplDev::check_and_fetch_remote_data(std::vector< repl_req_ptr_t > rreqs) { @@ -953,18 +963,25 @@ void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) { if (err == ReplServiceError::OK) { return; } + RD_LOGE("Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); if (!rreq->add_state_if_not_already(repl_req_state_t::ERRORED)) { - RD_LOGE("Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); + RD_LOGE("Raft Channel: Error has already been recorded for rreq=[{}] error={}", rreq->to_string(), err); return; } // Remove from the map and thus it's no longer accessible from applier_create_req m_repl_key_req_map.erase(rreq->rkey()); - if (rreq->op_code() == journal_type_t::HS_DATA_INLINED) { + // Ensure no non-volatile rreq exists for this lsn, because handle_error should not be called after append entries. 
+ auto exist_rreq = m_state_machine->lsn_to_req(rreq->lsn()); + if (exist_rreq != nullptr && !exist_rreq->is_volatile()) { + HS_REL_ASSERT(false, "Unexpected: LSN={} is already ready to commit, exist_rreq=[{}]", + rreq->lsn(), exist_rreq->to_string()); + } + + if (rreq->op_code() == journal_type_t::HS_DATA_LINKED) { // Free the blks which is allocated already - RD_LOGE("Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { auto blkid = rreq->local_blkid(); data_service().async_free_blk(blkid).thenValue([blkid](auto&& err) { @@ -1288,8 +1305,9 @@ nuraft::cb_func::ReturnCode RaftReplDev::raft_event(nuraft::cb_func::Type type, } // Wait till we receive the data from its originator for all the requests - if (!wait_for_data_receive(*reqs, HS_DYNAMIC_CONFIG(consensus.data_receive_timeout_ms))) { - for (auto const& rreq : *reqs) { + std::vector< repl_req_ptr_t > timeout_rreqs; + if (!wait_for_data_receive(*reqs, HS_DYNAMIC_CONFIG(consensus.data_receive_timeout_ms), &timeout_rreqs)) { + for (auto const& rreq : timeout_rreqs) { handle_error(rreq, ReplServiceError::TIMEOUT); } ret = nuraft::cb_func::ReturnCode::ReturnNull; @@ -1480,11 +1498,15 @@ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx RD_DBG_ASSERT(happened, "rreq already exists for rkey={}", rkey.to_string()); uint32_t data_size{0u}; + // If the data is linked and value_size is non-zero, it means blks have been allocated for data. + // Since the log is flushed after data is written, the data has already been received. if ((jentry->code == journal_type_t::HS_DATA_LINKED) && (jentry->value_size > 0)) { MultiBlkId entry_blkid; entry_blkid.deserialize(entry_to_val(jentry), true /* copy */); data_size = entry_blkid.blk_count() * get_blk_size(); rreq->set_local_blkid(entry_blkid); + rreq->add_state(repl_req_state_t::BLK_ALLOCATED); + rreq->add_state(repl_req_state_t::DATA_RECEIVED); } rreq->set_lsn(repl_lsn); @@ -1492,8 +1514,6 @@ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx rreq->set_lentry(lentry); rreq->init(rkey, jentry->code, false /* is_proposer */, entry_to_hdr(jentry), entry_to_key(jentry), data_size); // we load the log from log device, implies log flushed. We only flush log after data is written to data device. - rreq->add_state(repl_req_state_t::BLK_ALLOCATED); - rreq->add_state(repl_req_state_t::DATA_RECEIVED); rreq->add_state(repl_req_state_t::DATA_WRITTEN); rreq->add_state(repl_req_state_t::LOG_RECEIVED); rreq->add_state(repl_req_state_t::LOG_FLUSHED); diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index e9ec2a1ad..28706f716 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -306,8 +306,15 @@ class RaftReplDev : public ReplDev, void fetch_data_from_remote(std::vector< repl_req_ptr_t > rreqs); void handle_fetch_data_response(sisl::GenericClientResponse response, std::vector< repl_req_ptr_t > rreqs); bool is_resync_mode(); + + /** + * \brief This method handles errors that occur during append entries or data receiving. + * It should not be called after the append entries phase. 
+ */ void handle_error(repl_req_ptr_t const& rreq, ReplServiceError err); - bool wait_for_data_receive(std::vector< repl_req_ptr_t > const& rreqs, uint64_t timeout_ms); + + bool wait_for_data_receive(std::vector< repl_req_ptr_t > const& rreqs, uint64_t timeout_ms, + std::vector< repl_req_ptr_t >* timeout_rreqs = nullptr); void on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx); void commit_blk(repl_req_ptr_t rreq); void replace_member(repl_req_ptr_t rreq); diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index c8f7f118a..10fb9285f 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -293,8 +293,12 @@ void RaftStateMachine::link_lsn_to_req(repl_req_ptr_t rreq, int64_t lsn) { rreq->add_state(repl_req_state_t::LOG_RECEIVED); // reset the rreq created_at time to now https://github.com/eBay/HomeStore/issues/506 rreq->set_created_time(); - [[maybe_unused]] auto r = m_lsn_req_map.insert(lsn, std::move(rreq)); - RD_DBG_ASSERT_EQ(r.second, true, "lsn={} already in precommit list, exist_term={}", lsn, r.first->second->term()); + auto r = m_lsn_req_map.insert(lsn, std::move(rreq)); + if (!r.second) { + RD_LOG(ERROR, "lsn={} already in precommit list, exist_term={}, is_volatile={}", + lsn, r.first->second->term(), r.first->second->is_volatile()); + // TODO: we need to think about the case where a volatile entry is already in the map; is it safe to overwrite it? + } } repl_req_ptr_t RaftStateMachine::lsn_to_req(int64_t lsn) { From 9f9bd45b140158eee7b4c86baf5892c43d870d67 Mon Sep 17 00:00:00 2001 From: yuwmao <148639999+yuwmao@users.noreply.github.com> Date: Mon, 23 Dec 2024 12:38:33 +0800 Subject: [PATCH 045/170] Duplication Handling (#611) * Duplication Handling in Blob Write This commit addresses duplication issues on the follower side caused by resync from the leader; it mainly happens when a snapshot message is resent during baseline resync and when logs are applied after snapshot completion. This helps avoid unnecessary GC due to duplicated data. Key Changes: - Utilize allocation hints to check data existence via the application listener. - Introduce `committed_blk_id` in `blk_alloc_hints` to indicate already allocated and committed blocks and pass it from the application to HS, preventing reallocation and recommitment. - In `alloc_local_blks()`, if `committed_blk_id` is returned, also add the states `DATA_RECEIVED`, `DATA_WRITTEN`, and `DATA_COMMITTED` to skip async_write() and commit_blk(). On the leader side (`RaftReplDev::async_alloc_write`), duplication is treated as an error, as the leader should not propose duplicate data, which may result from mistakes. * Add UT and bump up to 6.6.0 * Move alloc blk logic into rreq.init This commit addresses the issue encountered during a restart. In the previous commit, the DATA_COMMITTED state was used to skip the commit_blk operation. However, after restart, the repl_req state DATA_COMMITTED is lost. In this case, if the LSN of the log entry is greater than durable_commit_lsn, the data will be committed directly without the opportunity to detect whether the data is duplicated; as a result, commit_blk may fail due to duplication.
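Before the diff itself, it helps to see the two-sided contract the message describes: the application listener reports an already committed block through the allocation hints, and the replication layer then either reuses it and skips commit_blk (follower resync) or rejects the proposal outright (leader). A condensed sketch of the listener side, with hypothetical stand-in types:

```
#include <cstdint>
#include <map>
#include <optional>

struct MultiBlkId { uint64_t blk{0}; };  // stand-in for the real blk id type
struct BlkAllocHints { std::optional<MultiBlkId> committed_blk_id; };

// Listener-side duplicate detection: if this key was already committed, return its
// block id in the hints so the repl layer reuses it instead of allocating again.
BlkAllocHints make_hints(const std::map<uint64_t, MultiBlkId>& committed, uint64_t key_id) {
    BlkAllocHints hints;
    if (auto it = committed.find(key_id); it != committed.end()) {
        hints.committed_blk_id = it->second;  // skip alloc, async_write and commit_blk
    }
    return hints;
}
```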
--- conanfile.py | 2 +- src/include/homestore/blk.h | 1 + .../homestore/replication/repl_decls.h | 1 + src/include/homestore/replication/repl_dev.h | 7 +- src/lib/replication/repl_dev/common.cpp | 30 ++++++- .../replication/repl_dev/raft_repl_dev.cpp | 63 ++++++++------- .../replication/repl_dev/solo_repl_dev.cpp | 18 ++--- src/tests/test_common/raft_repl_test_base.hpp | 78 ++++++++++++++++++- src/tests/test_raft_repl_dev.cpp | 48 ++++++++++++ 9 files changed, 199 insertions(+), 49 deletions(-) diff --git a/conanfile.py b/conanfile.py index 67cb2dad9..f4d5fc38b 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.29" + version = "6.6.0" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/blk.h b/src/include/homestore/blk.h index b7e175b35..1ceab0b8a 100644 --- a/src/include/homestore/blk.h +++ b/src/include/homestore/blk.h @@ -251,6 +251,7 @@ struct blk_alloc_hints { blk_temp_t desired_temp{0}; // Temperature hint for the device std::optional< uint32_t > pdev_id_hint; // which physical device to pick (hint if any) -1 for don't care std::optional< chunk_num_t > chunk_id_hint; // any specific chunk id to pick for this allocation + std::optional< MultiBlkId > committed_blk_id; // blk id indicates the blk was already allocated and committed, don't allocate and commit again std::optional< stream_id_t > stream_id_hint; // any specific stream to pick std::optional< uint64_t > application_hint; // hints in uint64 what will be passed opaque to select_chunk bool can_look_for_other_chunk{true}; // If alloc on device not available can I pick other device diff --git a/src/include/homestore/replication/repl_decls.h b/src/include/homestore/replication/repl_decls.h index edcdbe51e..160733c0d 100644 --- a/src/include/homestore/replication/repl_decls.h +++ b/src/include/homestore/replication/repl_decls.h @@ -34,6 +34,7 @@ VENUM(ReplServiceError, int32_t, NOT_IMPLEMENTED = -10001, NO_SPACE_LEFT = -20000, DRIVE_WRITE_ERROR = -20001, + DATA_DUPLICATED = -20002, FAILED = -32768); // clang-format on diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index db79b5f9c..d05be3fde 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -36,7 +36,8 @@ VENUM(repl_req_state_t, uint32_t, DATA_WRITTEN = 1 << 2, // Data has been written to the storage LOG_RECEIVED = 1 << 3, // Log is received and waiting for data LOG_FLUSHED = 1 << 4, // Log has been flushed - ERRORED = 1 << 5 // Error has happened and cleaned up + ERRORED = 1 << 5, // Error has happened and cleaned up + DATA_COMMITTED = 1 << 6 // Data has already been committed, used in duplication handling, will skip commit_blk ) VENUM(journal_type_t, uint16_t, @@ -142,8 +143,8 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: public: repl_req_ctx() { m_start_time = Clock::now(); } virtual ~repl_req_ctx(); - void init(repl_key rkey, journal_type_t op_code, bool is_proposer, sisl::blob const& user_header, - sisl::blob const& key, uint32_t data_size); + ReplServiceError init(repl_key rkey, journal_type_t op_code, bool is_proposer, sisl::blob const& user_header, + sisl::blob const& key, uint32_t data_size, cshared< ReplDevListener >& listener); /////////////////////// All getters /////////////////////// repl_key const& rkey() const { return m_rkey; } diff --git 
a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp index e5b34dbcd..b2ba6bce4 100644 --- a/src/lib/replication/repl_dev/common.cpp +++ b/src/lib/replication/repl_dev/common.cpp @@ -6,11 +6,12 @@ #include #include "replication/repl_dev/common.h" #include +#include namespace homestore { -void repl_req_ctx::init(repl_key rkey, journal_type_t op_code, bool is_proposer, sisl::blob const& user_header, - sisl::blob const& key, uint32_t data_size) { +ReplServiceError repl_req_ctx::init(repl_key rkey, journal_type_t op_code, bool is_proposer, sisl::blob const& user_header, + sisl::blob const& key, uint32_t data_size, cshared< ReplDevListener >& listener) { m_rkey = std::move(rkey); #ifndef NDEBUG if (data_size > 0) { @@ -24,6 +25,18 @@ void repl_req_ctx::init(repl_key rkey, journal_type_t op_code, bool is_proposer, m_header = user_header; m_key = key; m_is_jentry_localize_pending = (!is_proposer && (data_size > 0)); // Pending on the applier and with linked data + + // We need to allocate the block if the req has data linked, since the entry doesn't exist or, if it exists, two threads (data channel and raft channel) are trying to do the same + // thing. So take the state mutex and allocate the blk + std::unique_lock< std::mutex > lg(m_state_mtx); + if (has_linked_data() && !has_state(repl_req_state_t::BLK_ALLOCATED)) { + auto alloc_status = alloc_local_blks(listener, data_size); + if (alloc_status != ReplServiceError::OK) { + LOGERROR("Allocate blk for rreq failed error={}", alloc_status); + } + return alloc_status; + } + return ReplServiceError::OK; } repl_req_ctx::~repl_req_ctx() { @@ -91,6 +104,19 @@ ReplServiceError repl_req_ctx::alloc_local_blks(cshared< ReplDevListener >& list auto const hints_result = listener->get_blk_alloc_hints(m_header, data_size); if (hints_result.hasError()) { return hints_result.error(); } + if (hints_result.value().committed_blk_id.has_value()) { + // if the committed_blk_id is already present, use it and skip allocation and commitment + LOGINFO("For Repl_key=[{}] data already exists, skip", rkey().to_string()); + m_local_blkid = hints_result.value().committed_blk_id.value(); + add_state(repl_req_state_t::BLK_ALLOCATED); + add_state(repl_req_state_t::DATA_RECEIVED); + add_state(repl_req_state_t::DATA_WRITTEN); + add_state(repl_req_state_t::DATA_COMMITTED); + m_data_received_promise.setValue(); + m_data_written_promise.setValue(); + return ReplServiceError::OK; + } + auto status = data_service().alloc_blks(sisl::round_up(uint32_cast(data_size), data_service().get_blk_size()), hints_result.value(), m_local_blkid); if (status != BlkAllocStatus::SUCCESS) { diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 1270ed761..a39d6035b 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -184,7 +184,7 @@ AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_ sisl::blob header(r_cast< uint8_t* >(&members), sizeof(replace_members_ctx)); rreq->init( repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)}, - journal_type_t::HS_CTRL_REPLACE, true, header, sisl::blob{}, 0); + journal_type_t::HS_CTRL_REPLACE, true, header, sisl::blob{}, 0, m_listener); auto err = m_state_machine->propose_to_raft(std::move(rreq)); if (err != ReplServiceError::OK) { @@ -251,7 +251,7 @@ folly::SemiFuture< ReplServiceError > RaftReplDev::destroy_group() { // here, we set the dsn to a new one 
, which is definitely unique in the follower, so that the new rreq will not // have a conflict with the old rreq. rreq->init(repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)}, - journal_type_t::HS_CTRL_DESTROY, true, sisl::blob{}, sisl::blob{}, 0); + journal_type_t::HS_CTRL_DESTROY, true, sisl::blob{}, sisl::blob{}, 0, m_listener); auto err = m_state_machine->propose_to_raft(std::move(rreq)); if (err != ReplServiceError::OK) { @@ -292,25 +292,28 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& } } - rreq->init(repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)}, + auto status = rreq->init(repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)}, data.size ? journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true /* is_proposer */, - header, key, data.size); + header, key, data.size, m_listener); // Add the request to the repl_dev_rreq map, it will be accessed throughout the life cycle of this request auto const [it, happened] = m_repl_key_req_map.emplace(rreq->rkey(), rreq); RD_DBG_ASSERT(happened, "Duplicate repl_key={} found in the map", rreq->rkey().to_string()); + if (status != ReplServiceError::OK) { + RD_LOGD("Initializing rreq failed error={}, failing this req", status); + handle_error(rreq, status); + return; + } + // If it is header only entry, directly propose to the raft if (rreq->has_linked_data()) { - push_data_to_all_followers(rreq, data); - - // Step 1: Alloc Blkid - auto const status = rreq->alloc_local_blks(m_listener, data.size); - if (status != ReplServiceError::OK) { - RD_LOGD("Allocating blks failed error={}, failing this req", status); - handle_error(rreq, status); + if (rreq->is_proposer() && rreq->has_state(repl_req_state_t::DATA_COMMITTED)) { + RD_LOGD("data blks has already been allocated and committed, failing this req"); + handle_error(rreq, ReplServiceError::DATA_DUPLICATED); return; } + push_data_to_all_followers(rreq, data); COUNTER_INCREMENT(m_metrics, total_write_cnt, 1); COUNTER_INCREMENT(m_metrics, outstanding_data_write_cnt, 1); @@ -498,32 +501,24 @@ repl_req_ptr_t RaftReplDev::applier_create_req(repl_key const& rkey, journal_typ } } - // We need to allocate the block, since entry doesn't exist or if it exist, two threads are trying to do the same - // thing. So take state mutex and allocate the blk - std::unique_lock< std::mutex > lg(rreq->m_state_mtx); - rreq->init(rkey, code, false /* is_proposer */, user_header, key, data_size); - - // There is no data portion, so there is not need to allocate + // rreq->init will allocate the block if it has linked data. 
+ auto status = rreq->init(rkey, code, false /* is_proposer */, user_header, key, data_size, m_listener); if (!rreq->has_linked_data()) { return rreq; } - if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { return rreq; } - - auto alloc_status = rreq->alloc_local_blks(m_listener, data_size); #ifdef _PRERELEASE if (is_data_channel) { if (iomgr_flip::instance()->test_flip("fake_reject_append_data_channel")) { LOGINFO("Data Channel: Reject append_entries flip is triggered for rkey={}", rkey.to_string()); - alloc_status = ReplServiceError::NO_SPACE_LEFT; + status = ReplServiceError::NO_SPACE_LEFT; } } else { if (iomgr_flip::instance()->test_flip("fake_reject_append_raft_channel")) { LOGINFO("Raft Channel: Reject append_entries flip is triggered for rkey={}", rkey.to_string()); - alloc_status = ReplServiceError::NO_SPACE_LEFT; + status = ReplServiceError::NO_SPACE_LEFT; } } #endif - if (status != ReplServiceError::OK) { + RD_LOGD("For Repl_key=[{}] alloc hints returned error={}, failing this req", rkey.to_string(), status); // Do not call handle_error here, because handle_error is for rreq which needs to be terminated. This one can be // retried. return nullptr; @@ -930,8 +925,8 @@ void RaftReplDev::handle_rollback(repl_req_ptr_t rreq) { } } -void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { - commit_blk(rreq); + void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { + if (!rreq->has_state(repl_req_state_t::DATA_COMMITTED)) { commit_blk(rreq); } // Remove the request from repl_key map. m_repl_key_req_map.erase(rreq->rkey()); @@ -979,7 +974,12 @@ void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) HS_REL_ASSERT(false, "Unexpected: LSN={} is already ready to commit, exist_rreq=[{}]", rreq->lsn(), exist_rreq->to_string()); } - + if (err == ReplServiceError::DATA_DUPLICATED) { + RD_LOGE("Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); + m_listener->on_error(err, rreq->header(), rreq->key(), rreq); + rreq->clear(); + return; + } if (rreq->op_code() == journal_type_t::HS_DATA_LINKED) { // Free the blks which is allocated already if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { auto blkid = rreq->local_blkid(); data_service().async_free_blk(blkid).thenValue([blkid](auto&& err) { @@ -1512,7 +1512,12 @@ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx rreq->set_lsn(repl_lsn); // keep lentry in scope for the life cycle of the rreq rreq->set_lentry(lentry); - rreq->init(rkey, jentry->code, false /* is_proposer */, entry_to_hdr(jentry), entry_to_key(jentry), data_size); + auto status = rreq->init(rkey, jentry->code, false /* is_proposer */, entry_to_hdr(jentry), entry_to_key(jentry), + data_size, m_listener); + if (status != ReplServiceError::OK) { + RD_LOGE("Initializing rreq failed, rreq=[{}], error={}", rreq->to_string(), status); + } + 
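The recovery path that follows re-derives request state from the journal entry alone; the rule it encodes is that a log entry found on disk implies the log was flushed, and because the log is flushed only after linked data is written, linked data can also be marked allocated and received. A small sketch of that rule (hypothetical bit names):

```
#include <cstdint>

enum : uint32_t {
    BLK_ALLOCATED = 1u << 0, DATA_RECEIVED = 1u << 1, DATA_WRITTEN = 1u << 2,
    LOG_RECEIVED  = 1u << 3, LOG_FLUSHED   = 1u << 4
};

// Replay-state reconstruction: states implied by the mere existence of a
// durable journal entry, plus extra states when the entry links data blocks.
uint32_t states_from_journal(bool has_linked_data, uint32_t value_size) {
    uint32_t s = LOG_RECEIVED | LOG_FLUSHED | DATA_WRITTEN;
    if (has_linked_data && value_size > 0) { s |= BLK_ALLOCATED | DATA_RECEIVED; }
    return s;
}
```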
// we load the log from log device, implies log flushed. We only flush log after data is written to data device. rreq->add_state(repl_req_state_t::DATA_WRITTEN); rreq->add_state(repl_req_state_t::LOG_RECEIVED); diff --git a/src/lib/replication/repl_dev/solo_repl_dev.cpp b/src/lib/replication/repl_dev/solo_repl_dev.cpp index e5e2cb1a5..4a6a92144 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.cpp +++ b/src/lib/replication/repl_dev/solo_repl_dev.cpp @@ -30,24 +30,18 @@ SoloReplDev::SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existi void SoloReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, repl_req_ptr_t rreq) { if (!rreq) { auto rreq = repl_req_ptr_t(new repl_req_ctx{}); } - rreq->init(repl_key{.server_id = 0, .term = 1, .dsn = 1}, - value.size ? journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true, header, key, - value.size); - + auto status = rreq->init(repl_key{.server_id = 0, .term = 1, .dsn = 1}, + value.size ? journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true, + header, key, value.size, m_listener); + HS_REL_ASSERT_EQ(status, ReplServiceError::OK, "Error in allocating local blks"); // If it is header only entry, directly write to the journal - if (rreq->has_linked_data()) { - // Step 1: Alloc Blkid - auto const status = rreq->alloc_local_blks(m_listener, value.size); - HS_REL_ASSERT_EQ(status, ReplServiceError::OK, "Error in allocating local blks"); - + if (rreq->has_linked_data() && !rreq->has_state(repl_req_state_t::DATA_WRITTEN)) { // Write the data data_service().async_write(value, rreq->local_blkid()).thenValue([this, rreq = std::move(rreq)](auto&& err) { HS_REL_ASSERT(!err, "Error in writing data"); // TODO: Find a way to return error to the Listener write_journal(std::move(rreq)); }); - } else { - write_journal(std::move(rreq)); - } + } else { write_journal(std::move(rreq)); } } void SoloReplDev::write_journal(repl_req_ptr_t rreq) { diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 19a346f5a..2f7ab9f1c 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -94,8 +94,8 @@ class TestReplicatedDB : public homestore::ReplDevListener { struct journal_header { uint64_t data_size; uint64_t data_pattern; + uint64_t key_id; // put it in the header to test duplication in alloc_local_blks }; - journal_header jheader; uint64_t key_id; sisl::sg_list write_sgs; @@ -108,6 +108,7 @@ class TestReplicatedDB : public homestore::ReplDevListener { write_sgs.size = 0; read_sgs.size = 0; key_id = (uint64_t)rand() << 32 | rand(); + jheader.key_id = key_id; } ~test_req() { @@ -171,6 +172,7 @@ class TestReplicatedDB : public homestore::ReplDevListener { cintrusive< repl_req_ctx >& ctx) override { LOGINFOMOD(replication, "[Replica={}] Received error={} on key={}", g_helper->replica_num(), enum_name(error), *(r_cast< uint64_t const* >(key.cbytes()))); + g_helper->runner().comp_promise_.setException(folly::make_exception_wrapper< ReplServiceError >(error)); } AsyncReplResult<> create_snapshot(shared< snapshot_context > context) override { @@ -316,7 +318,16 @@ class TestReplicatedDB : public homestore::ReplDevListener { void free_user_snp_ctx(void*& user_snp_ctx) override {} - ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size) override { + ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size) override { + auto jheader = r_cast< journal_header const* >(header.cbytes()); + Key k{.id_ = jheader->key_id}; + auto iter = inmem_db_.find(k); + if (iter != inmem_db_.end()) 
{ + LOGDEBUG("data already exists in mem db, key={}", k.id_); + auto hints = blk_alloc_hints{}; + hints.committed_blk_id = iter->second.blkid_; + return hints; + } return blk_alloc_hints{}; } void on_replace_member(const replica_member_info& member_out, const replica_member_info& member_in) override { @@ -335,6 +346,7 @@ class TestReplicatedDB : public homestore::ReplDevListener { auto req = intrusive< test_req >(new test_req()); req->jheader.data_size = data_size; req->jheader.data_pattern = ((long long)rand() << 32) | ++s_uniq_num; + req->jheader.key_id = req->key_id; auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); LOGINFOMOD(replication, "[Replica={}] Db write key={} data_size={} pattern={} block_size={}", @@ -591,6 +603,68 @@ class RaftReplDevTestBase : public testing::Test { written_entries_ += num_entries; if (wait_for_commit) { this->wait_for_all_commits(); } } + replica_id_t wait_and_get_leader_id() { + do { + auto leader_uuid = dbs_[0]->repl_dev()->get_leader_id(); + if (leader_uuid.is_nil()) { + LOGINFO("Waiting for leader to be elected"); + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + } else { + return leader_uuid; + } + } while (true); + } + + ReplServiceError write_with_id(uint64_t id, bool wait_for_commit = true, shared< TestReplicatedDB > db = nullptr) { + if (dbs_[0]->repl_dev() == nullptr) return ReplServiceError::FAILED; + if (db == nullptr) { db = pick_one_db(); } + LOGINFO("Writing data {} since I am the leader my_uuid={}", id, + boost::uuids::to_string(g_helper->my_replica_id())); + auto const block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); + + LOGINFO("Run on worker threads to schedule append on repldev for {} Bytes.", block_size); + g_helper->runner().set_num_tasks(1); + g_helper->runner().set_task([this, block_size, db, id]() { + static std::normal_distribution<> num_blks_gen{3.0, 1.0}; + auto data_size = std::max(1L, std::abs(std::lround(num_blks_gen(g_re)))) * block_size; + ASSERT_GT(data_size, 0); + LOGINFO("data_size larger than 0, go ahead, data_size= {}.", data_size); + static std::atomic< uint64_t > s_uniq_num{0}; + auto req = intrusive< TestReplicatedDB::test_req >(new TestReplicatedDB::test_req()); + req->jheader.data_size = data_size; + req->jheader.data_pattern = ((long long)rand() << 32) | ++s_uniq_num; + // overwrite the key_id with the id passed in + req->jheader.key_id = id; + req->key_id = id; + + LOGINFOMOD(replication, "[Replica={}] Db write key={} data_size={} pattern={} block_size={}", + g_helper->replica_num(), req->key_id, data_size, req->jheader.data_pattern, block_size); + + if (data_size != 0) { + req->write_sgs = + test_common::HSTestHelper::create_sgs(data_size, block_size, req->jheader.data_pattern); + } + + db->repl_dev()->async_alloc_write(req->header_blob(), req->key_blob(), req->write_sgs, req); + }); + + if (!wait_for_commit) { + return ReplServiceError::OK; + } + try { + g_helper->runner().execute().get(); + LOGDEBUG("write data task complete, id={}", id); + } catch (const ReplServiceError& e) { + LOGERRORMOD(replication, "[Replica={}] Error in writing data: id={}, error={}", g_helper->replica_num(), + id, enum_name(e)); + return e; + } + + written_entries_ += 1; + LOGINFO("wait_for_commit={}", written_entries_); + this->wait_for_all_commits(); + return ReplServiceError::OK; + } void remove_db(std::shared_ptr< TestReplicatedDB > db, bool wait_for_removal) { this->run_on_leader(db, [this, db]() { diff --git a/src/tests/test_raft_repl_dev.cpp b/src/tests/test_raft_repl_dev.cpp index 169fc7f8a..51ca8e470 100644 --- 
a/src/tests/test_raft_repl_dev.cpp +++ b/src/tests/test_raft_repl_dev.cpp @@ -15,6 +15,54 @@ #include "test_common/raft_repl_test_base.hpp" class RaftReplDevTest : public RaftReplDevTestBase {}; +TEST_F(RaftReplDevTest, Write_Duplicated_Data) { + uint64_t total_writes = 1; + g_helper->runner().qdepth_ = total_writes; + g_helper->runner().total_tasks_ = total_writes; + LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); + g_helper->sync_for_test_start(); + auto leader_uuid = wait_and_get_leader_id(); + + uint64_t id; + TestReplicatedDB::Key stored_key; + TestReplicatedDB::Value stored_val; + if (leader_uuid == g_helper->my_replica_id()) { + id = (uint64_t)rand() << 32 | rand(); + LOGINFO("going to write data with id={}", id); + this->write_with_id(id, true /* wait_for_commit */); + stored_key = dbs_[0]->inmem_db_.cbegin()->first; + ASSERT_EQ(id, stored_key.id_); + } else { + LOGINFO("I am not leader, leader_uuid={} my_uuid={}, do nothing", + boost::uuids::to_string(leader_uuid), boost::uuids::to_string(g_helper->my_replica_id())); + } + wait_for_commits(total_writes); + + g_helper->sync_for_verify_start(); + LOGINFO("Validate all data written so far by reading them"); + this->validate_data(); + /* test duplication: + if duplication is found in the leader proposal, reject it; + if duplication is found on the followers, skip it. + */ + // 1. write the same data again on the leader, should fail + if (leader_uuid == g_helper->my_replica_id()) { + auto err = this->write_with_id(id, true /* wait_for_commit */); + ASSERT_EQ(ReplServiceError::DATA_DUPLICATED, err); + + // 2. delete it from the db to simulate duplication on the followers (skipping the duplication check on the leader side) + dbs_[0]->inmem_db_.erase(stored_key); + LOGINFO("data with id={} has been deleted from db", id); + err = this->write_with_id(id, true /* wait_for_commit */); + ASSERT_EQ(ReplServiceError::OK, err); + } + if (leader_uuid != g_helper->my_replica_id()) { + wait_for_commits(total_writes + 1); + ASSERT_EQ(dbs_[0]->inmem_db_.size(), total_writes); + } + + g_helper->sync_for_cleanup_start(); +} TEST_F(RaftReplDevTest, Write_Restart_Write) { LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); From c388f69d4eddd058dd4a207e48612dc4baaf2e1b Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Thu, 19 Dec 2024 15:51:28 +0800 Subject: [PATCH 046/170] Only call cp_flush for those consumers that participated in this cp. If a consumer registered after a cp goes to the flushing state, the on_switchover_cp cb will not be called for this consumer. In this CP, the ctx for this consumer is nullptr as the consumer never participated in the cp. Previous code called cp_flush for every consumer, leaving the duty of properly handling the nullptr returned by cp->context(svc_id) to the consumer. However, none of the existing consumers handled that case. As a result, we hit an occurrence where Index generated a CP solely on its own, but before the cp was fully flushed, another consumer registered and was called into cp_flush(); the replication service does not properly handle the nullptr, as shown below: `get_repl_dev_ctx` was called while this_ptr is null, which is dangerous as invalid memory gets accessed. This change is a breaking change for consumers like HO, so bump up the version. HomeObject participates in the CP as CLIENT; the current implementation of HO always returns nullptr for `on_switchover_cp`, which will result in the CLIENT being excluded from cp_flush after this commit is merged. 
callstack: ``` homestore::ReplSvcCPContext::get_repl_dev_ctx (this=0x0, dev=0x56010ab52b00) at /home/ubuntu/HomeStore/src/lib/replication/service/raft_repl_service.cpp:521 0x0000560106d58f1e in homestore::RaftReplServiceCPHandler::cp_flush (this=, cp=0x56010a467940) at /home/ubuntu/HomeStore/src/lib/replication/service/raft_repl_service.cpp:549 ``` code: ``` auto cp_ctx = s_cast< ReplSvcCPContext* >(cp->context(cp_consumer_t::REPLICATION_SVC)); ... auto dev_ctx = cp_ctx->get_repl_dev_ctx(repl_dev.get()); ``` Signed-off-by: Xiaoxi Chen --- conanfile.py | 2 +- src/lib/checkpoint/cp_mgr.cpp | 9 ++++++--- src/lib/replication/service/generic_repl_svc.cpp | 4 +++- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/conanfile.py b/conanfile.py index f4d5fc38b..087d8ca98 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.0" + version = "6.6.1" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/checkpoint/cp_mgr.cpp b/src/lib/checkpoint/cp_mgr.cpp index 7072d7c91..7fd6f7460 100644 --- a/src/lib/checkpoint/cp_mgr.cpp +++ b/src/lib/checkpoint/cp_mgr.cpp @@ -187,7 +187,8 @@ folly::Future< bool > CPManager::do_trigger_cp_flush(bool force, bool flush_on_s // sealer should be the first one to switch over auto& sealer_cp = m_cp_cb_table[(size_t)cp_consumer_t::SEALER]; if (sealer_cp) { - new_cp->m_contexts[(size_t)cp_consumer_t::SEALER] = std::move(sealer_cp->on_switchover_cp(cur_cp.get(), new_cp)); + new_cp->m_contexts[(size_t)cp_consumer_t::SEALER] = + std::move(sealer_cp->on_switchover_cp(cur_cp.get(), new_cp)); } // switch over other consumers for (size_t svcid = 0; svcid < (size_t)cp_consumer_t::SENTINEL; svcid++) { @@ -227,7 +228,8 @@ void CPManager::cp_start_flush(CP* cp) { for (size_t svcid = 0; svcid < (size_t)cp_consumer_t::SENTINEL; svcid++) { if (svcid == (size_t)cp_consumer_t::SEALER) { continue; } auto& consumer = m_cp_cb_table[svcid]; - if (consumer) { futs.emplace_back(std::move(consumer->cp_flush(cp))); } + bool participated = (cp->m_contexts[svcid] != nullptr); + if (consumer && participated) { futs.emplace_back(std::move(consumer->cp_flush(cp))); } } folly::collectAllUnsafe(futs).thenValue([this, cp](auto) { @@ -235,7 +237,8 @@ void CPManager::cp_start_flush(CP* cp) { // at last as the cp_lsn updated here. Other component should // at least flushed to cp_lsn. 
auto& sealer_cp = m_cp_cb_table[(size_t)cp_consumer_t::SEALER]; - if (sealer_cp) { sealer_cp->cp_flush(cp).wait(); } + bool participated = (cp->m_contexts[(size_t)cp_consumer_t::SEALER] != nullptr); + if (sealer_cp && participated) { sealer_cp->cp_flush(cp).wait(); } // All consumers have flushed for the cp on_cp_flush_done(cp); }); diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index 9aa2c044d..f5671cb16 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -152,7 +152,9 @@ AsyncReplResult<> SoloReplService::replace_member(group_id_t group_id, const rep return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); } -std::unique_ptr< CPContext > SoloReplServiceCPHandler::on_switchover_cp(CP* cur_cp, CP* new_cp) { return nullptr; } +std::unique_ptr< CPContext > SoloReplServiceCPHandler::on_switchover_cp(CP* cur_cp, CP* new_cp) { + return std::make_unique< CPContext >(new_cp); +} folly::Future< bool > SoloReplServiceCPHandler::cp_flush(CP* cp) { repl_service().iterate_repl_devs([cp](cshared< ReplDev >& repl_dev) {
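The participation rule this commit introduces boils down to one check: a consumer's CP context being non-null is the proof that it switched over into this checkpoint, so only those consumers may be flushed. A compact sketch of the rule (hypothetical names, not the CPManager API):

```
#include <array>
#include <memory>

struct Context {};
constexpr std::size_t kNumConsumers = 4;

struct Checkpoint {
    std::array<std::unique_ptr<Context>, kNumConsumers> contexts;  // filled at switchover
};

struct Consumer { virtual void cp_flush(Checkpoint&) = 0; virtual ~Consumer() = default; };

// Flush only consumers that actually participated in this CP, i.e. those whose
// on_switchover_cp produced a context; late registrants have a null context here.
void flush_participants(Checkpoint& cp, std::array<Consumer*, kNumConsumers>& consumers) {
    for (std::size_t i = 0; i < kNumConsumers; ++i) {
        bool participated = (cp.contexts[i] != nullptr);
        if (consumers[i] && participated) { consumers[i]->cp_flush(cp); }
    }
}
```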
From 877c041998041fcb3f206bc04b43baa75bd3d062 Mon Sep 17 00:00:00 2001 From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com> Date: Mon, 23 Dec 2024 18:03:01 -0800 Subject: [PATCH 047/170] Fix Index recovery path for split(put) (#609) --- conanfile.py | 2 +- .../homestore/index/index_internal.hpp | 9 +- src/include/homestore/index/index_table.hpp | 49 ++++-- src/include/homestore/index_service.hpp | 1 + src/lib/index/index_cp.cpp | 23 ++- src/lib/index/index_service.cpp | 68 +++++--- src/lib/index/wb_cache.cpp | 157 ++++++++++++++---- src/lib/index/wb_cache.hpp | 2 + src/tests/test_index_crash_recovery.cpp | 93 ++++++----- src/tests/test_scripts/index_test.py | 10 +- 10 files changed, 281 insertions(+), 133 deletions(-) diff --git a/conanfile.py b/conanfile.py index 087d8ca98..75fb41167 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.1" + version = "6.6.2" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/index/index_internal.hpp b/src/include/homestore/index/index_internal.hpp index fea20dbd6..989e650c4 100644 --- a/src/include/homestore/index/index_internal.hpp +++ b/src/include/homestore/index/index_internal.hpp @@ -73,6 +73,7 @@ class IndexTableBase { virtual uint64_t used_size() const = 0; virtual void destroy() = 0; virtual void repair_node(IndexBufferPtr const& buf) = 0; + virtual void repair_root_node(IndexBufferPtr const& buf) = 0; }; enum class index_buf_state_t : uint8_t { @@ -97,7 +98,7 @@ struct IndexBuffer : public sisl::ObjLifeCounter< IndexBuffer > { sisl::atomic_counter< int > m_wait_for_down_buffers{0}; // Number of children need to wait for before persisting #ifndef NDEBUG // Down buffers are not mandatory members, but only to keep track of any bugs and asserts - std::vector<std::weak_ptr<IndexBuffer>> m_down_buffers; + std::vector< std::weak_ptr< IndexBuffer > > m_down_buffers; std::mutex m_down_buffers_mtx; std::shared_ptr< IndexBuffer > m_prev_up_buffer; // Keep a copy for debugging #endif @@ -125,11 +126,11 @@ struct IndexBuffer : public sisl::ObjLifeCounter< IndexBuffer > { std::string to_string() const; std::string to_string_dot() const; - void add_down_buffer(const IndexBufferPtr &buf); + void add_down_buffer(const IndexBufferPtr& buf); - void remove_down_buffer(const IndexBufferPtr &buf); + void remove_down_buffer(const IndexBufferPtr& buf); #ifndef NDEBUG - bool is_in_down_buffers(const IndexBufferPtr &buf); + bool is_in_down_buffers(const IndexBufferPtr& buf); #endif }; diff --git a/src/include/homestore/index/index_table.hpp b/src/include/homestore/index/index_table.hpp index 94b8685a3..83411b5c0 100644 --- a/src/include/homestore/index/index_table.hpp +++ b/src/include/homestore/index/index_table.hpp @@ -79,7 +79,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { void destroy() override { auto cpg = cp_mgr().cp_guard(); - Btree<K, V>::destroy_btree(cpg.context(cp_consumer_t::INDEX_SVC)); + Btree< K, V >::destroy_btree(cpg.context(cp_consumer_t::INDEX_SVC)); m_sb.destroy(); } @@ -114,11 +114,40 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { return ret; } + void repair_root_node(IndexBufferPtr const& idx_buf) override { + LOGTRACEMOD(wbcache, "check if this was the previous root node {} for buf {} ", m_sb->root_node, + idx_buf->to_string()); + if (m_sb->root_node == idx_buf->blkid().to_integer()) { + // This is the root node, we need to update the root node in superblk + LOGTRACEMOD(wbcache, "{} is old root so we need to update the meta node ", idx_buf->to_string()); + BtreeNode* n = this->init_node(idx_buf->raw_buffer(), idx_buf->blkid().to_integer(), false /* init_buf */, + BtreeNode::identify_leaf_node(idx_buf->raw_buffer())); + static_cast< IndexBtreeNode* >(n)->attach_buf(idx_buf); + auto edge_id = n->next_bnode(); + + BT_DBG_ASSERT(!n->has_valid_edge(), + "root {} already has a valid edge {}, so we should have found the new root node", + n->to_string(), n->get_edge_value().bnode_id()); + n->set_next_bnode(empty_bnodeid); + n->set_edge_value(BtreeLinkInfo{edge_id, 0}); + LOGTRACEMOD(wbcache, "change root node {}: edge updated to {} and invalidate the next node! ", n->node_id(), + edge_id); + auto cpg = cp_mgr().cp_guard(); + write_node_impl(n, (void*)cpg.context(cp_consumer_t::INDEX_SVC)); + + } else { + LOGTRACEMOD(wbcache, "This is not the root node, so we can ignore this repair call for buf {}", + idx_buf->to_string()); + } + } + void repair_node(IndexBufferPtr const& idx_buf) override { if (idx_buf->is_meta_buf()) { // We cannot repair the meta buf on its own, we need to repair the root node which modifies the // meta_buf. It is ok to ignore this call, because repair will be done from root before meta_buf is // attempted to repair, which would have updated the meta_buf already. 
+ LOGTRACEMOD(wbcache, "Ignoring repair on meta buf {} root id {} ", idx_buf->to_string(), + this->root_node_id()); return; } BtreeNode* n = this->init_node(idx_buf->raw_buffer(), idx_buf->blkid().to_integer(), false /* init_buf */, @@ -134,13 +163,14 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { // Only for interior nodes we need to repair its links if (!bn->is_leaf()) { LOGTRACEMOD(wbcache, "repair_node cp={} buf={}", cpg->id(), idx_buf->to_string()); - repair_links(bn, (void *) cpg.context(cp_consumer_t::INDEX_SVC)); + repair_links(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC)); } if (idx_buf->m_up_buffer && idx_buf->m_up_buffer->is_meta_buf()) { // Our up buffer is a meta buffer, which means that we are the new root node, we need to update the // meta_buf with new root as well - on_root_changed(bn, (void *) cpg.context(cp_consumer_t::INDEX_SVC)); + LOGTRACEMOD(wbcache, "root change for after repairing {}\n\n", idx_buf->to_string()); + on_root_changed(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC)); } } @@ -227,10 +257,11 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { wb_cache().free_buf(n->m_idx_buf, r_cast< CPContext* >(context)); } - btree_status_t - on_root_changed(BtreeNodePtr const &new_root, void *context) override { + btree_status_t on_root_changed(BtreeNodePtr const& new_root, void* context) override { // todo: if(m_sb->root_node == new_root->node_id() && m_sb->root_link_version == new_root->link_version()){ // return btree_status_t::success;} + LOGTRACEMOD(wbcache, "root changed for index old_root={} new_root={}", m_sb->root_node, + new_root->node_id()); m_sb->root_node = new_root->node_id(); m_sb->root_link_version = new_root->link_version(); @@ -240,7 +271,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } auto& root_buf = static_cast< IndexBtreeNode* >(new_root.get())->m_idx_buf; - wb_cache().transact_bufs(ordinal(), m_sb_buffer, root_buf, {}, {}, r_cast<CPContext*>(context)); + wb_cache().transact_bufs(ordinal(), m_sb_buffer, root_buf, {}, {}, r_cast< CPContext* >(context)); return btree_status_t::success; } @@ -257,7 +288,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } // Get all original child ids as a support to check if we are beyond the last child node - std::set<bnodeid_t> orig_child_ids; + std::set< bnodeid_t > orig_child_ids; for (uint32_t i = 0; i < parent_node->total_entries(); ++i) { BtreeLinkInfo link_info; parent_node->get_nth_value(i, &link_info, true); @@ -391,9 +422,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } } while (true); - if (child_node) { - this->unlock_node(child_node, locktype_t::READ); - } + if (child_node) { this->unlock_node(child_node, locktype_t::READ); } if (parent_node->total_entries() == 0 && !parent_node->has_valid_edge()) { // We shouldn't have an empty interior node in the tree, let's delete it. 
diff --git a/src/include/homestore/index_service.hpp b/src/include/homestore/index_service.hpp index c8801c9d2..87ad63672 100644 --- a/src/include/homestore/index_service.hpp +++ b/src/include/homestore/index_service.hpp @@ -82,6 +82,7 @@ class IndexService { uint64_t used_size() const; uint32_t node_size() const; void repair_index_node(uint32_t ordinal, IndexBufferPtr const& node_buf); + void update_root(uint32_t ordinal, IndexBufferPtr const& node_buf); IndexWBCacheBase& wb_cache() { if (!m_wb_cache) { throw std::runtime_error("Attempted to access a null pointer wb_cache"); } diff --git a/src/lib/index/index_cp.cpp b/src/lib/index/index_cp.cpp index 578fae997..122667726 100644 --- a/src/lib/index/index_cp.cpp +++ b/src/lib/index/index_cp.cpp @@ -145,7 +145,7 @@ void IndexCPContext::to_string_dot(const std::string& filename) { LOGINFO("cp dag is stored in file {}", filename); } -uint16_t IndexCPContext::num_dags() { +uint16_t IndexCPContext::num_dags() { // count number of buffers whose up_buffers are nullptr uint16_t count = 0; std::unique_lock lg{m_flush_buffer_mtx}; @@ -190,15 +190,18 @@ std::string IndexCPContext::to_string_with_dags() { // Now walk through the list of graphs and prepare formatted string std::string str{fmt::format("IndexCPContext cpid={} dirty_buf_count={} dirty_buf_list_size={} #_of_dags={}\n", m_cp->id(), m_dirty_buf_count.get(), m_dirty_buf_list.size(), group_roots.size())}; + int cnt = 1; for (const auto& root : group_roots) { - std::vector< std::pair< std::shared_ptr< DagNode >, int > > stack; - stack.emplace_back(root, 0); + std::vector< std::tuple< std::shared_ptr< DagNode >, int, int > > stack; + stack.emplace_back(root, 0, cnt++); while (!stack.empty()) { - auto [node, level] = stack.back(); + auto [node, level, index] = stack.back(); stack.pop_back(); - fmt::format_to(std::back_inserter(str), "{}{} \n", std::string(level * 4, ' '), node->buf->to_string()); + fmt::format_to(std::back_inserter(str), "{}{}-{} \n", std::string(level * 4, ' '), index, + node->buf->to_string()); + int c = node->down_nodes.size(); for (const auto& d : node->down_nodes) { - stack.emplace_back(d, level + 1); + stack.emplace_back(d, level + 1, c--); } } } @@ -266,15 +269,11 @@ void IndexCPContext::process_txn_record(txn_record const* rec, std::map< BlkId, #ifndef NDEBUG // if (!is_sibling_link || (buf->m_up_buffer == real_up_buf)) { return buf;} // Already linked with same buf or its not a sibling link to override - if (real_up_buf->is_in_down_buffers(buf)) { - return buf; - } + if (real_up_buf->is_in_down_buffers(buf)) { return buf; } #endif if (buf->m_up_buffer != real_up_buf) { - if (buf->m_up_buffer) { - buf->m_up_buffer->remove_down_buffer(buf); - } + if (buf->m_up_buffer) { buf->m_up_buffer->remove_down_buffer(buf); } real_up_buf->add_down_buffer(buf); buf->m_up_buffer = real_up_buf; } diff --git a/src/lib/index/index_service.cpp b/src/lib/index/index_service.cpp index 49755a4ef..73b96b064 100644 --- a/src/lib/index/index_service.cpp +++ b/src/lib/index/index_service.cpp @@ -132,6 +132,15 @@ void IndexService::repair_index_node(uint32_t ordinal, IndexBufferPtr const& nod } } +void IndexService::update_root(uint32_t ordinal, IndexBufferPtr const& node_buf) { + auto tbl = get_index_table(ordinal); + if (tbl) { + tbl->repair_root_node(node_buf); + } else { + HS_DBG_ASSERT(false, "Index corresponding to ordinal={} has not been loaded yet, unexpected", ordinal); + } +} + uint32_t IndexService::node_size() const { return m_vdev->atomic_page_size(); } uint64_t 
IndexService::used_size() const { @@ -154,31 +163,39 @@ IndexBuffer::~IndexBuffer() { } std::string IndexBuffer::to_string() const { - if (m_is_meta_buf) { - return fmt::format("Buf={} [Meta] index={} state={} create/dirty_cp={}/{} down_wait#={} freed={}", - voidptr_cast(const_cast< IndexBuffer* >(this)), m_index_ordinal, int_cast(state()), - m_created_cp_id, m_dirtied_cp_id, m_wait_for_down_buffers.get(), m_node_freed); - } else { - // store m_down_buffers in a string - std::string down_bufs = ""; + static std::vector< std::string > state_str = {"CLEAN", "DIRTY", "FLUSHING"}; + // store m_down_buffers in a string + std::string down_bufs = ""; #ifndef NDEBUG - { - std::lock_guard lg(m_down_buffers_mtx); - for (auto const &down_buf: m_down_buffers) { + { + std::lock_guard lg(m_down_buffers_mtx); + if (m_down_buffers.empty()) { + fmt::format_to(std::back_inserter(down_bufs), "EMPTY"); + } else { + for (auto const& down_buf : m_down_buffers) { if (auto ptr = down_buf.lock()) { fmt::format_to(std::back_inserter(down_bufs), "[{}]", voidptr_cast(ptr.get())); } } + fmt::format_to(std::back_inserter(down_bufs), " #down bufs={}", m_down_buffers.size()); } + } #endif - return fmt::format("Buf={} index={} state={} create/dirty_cp={}/{} down_wait#={}{} up={} node=[{}] down=[{}]", - voidptr_cast(const_cast< IndexBuffer* >(this)), m_index_ordinal, int_cast(state()), - m_created_cp_id, m_dirtied_cp_id, m_wait_for_down_buffers.get(), - m_node_freed ? " Freed" : "", voidptr_cast(const_cast< IndexBuffer* >(m_up_buffer.get())), - (m_bytes == nullptr) ? "not attached yet" - : r_cast< persistent_hdr_t const* >(m_bytes)->to_compact_string(), - down_bufs); + if (m_is_meta_buf) { + return fmt::format("[Meta] Buf={} index={} state={} create/dirty_cp={}/{} down_wait#={}{} down={{{}}}", + voidptr_cast(const_cast< IndexBuffer* >(this)), m_index_ordinal, + state_str[int_cast(state())], m_created_cp_id, m_dirtied_cp_id, + m_wait_for_down_buffers.get(), m_node_freed ? " Freed" : "", down_bufs); + } else { + + return fmt::format( + "Buf={} index={} state={} create/dirty_cp={}/{} down_wait#={}{} up={} node=[{}] down={{{}}}", + voidptr_cast(const_cast< IndexBuffer* >(this)), m_index_ordinal, state_str[int_cast(state())], + m_created_cp_id, m_dirtied_cp_id, m_wait_for_down_buffers.get(), m_node_freed ? " Freed" : "", + voidptr_cast(const_cast< IndexBuffer* >(m_up_buffer.get())), + (m_bytes == nullptr) ? 
"not attached yet" : r_cast< persistent_hdr_t const* >(m_bytes)->to_compact_string(), + down_bufs); } } @@ -194,7 +211,7 @@ std::string IndexBuffer::to_string_dot() const { return str; } -void IndexBuffer::add_down_buffer(const IndexBufferPtr &buf) { +void IndexBuffer::add_down_buffer(const IndexBufferPtr& buf) { m_wait_for_down_buffers.increment(); #ifndef NDEBUG { @@ -204,10 +221,11 @@ void IndexBuffer::add_down_buffer(const IndexBufferPtr &buf) { #endif } -void IndexBuffer::remove_down_buffer(const IndexBufferPtr &buf) { +void IndexBuffer::remove_down_buffer(const IndexBufferPtr& buf) { m_wait_for_down_buffers.decrement(); #ifndef NDEBUG - bool found{false}; { + bool found{false}; + { std::lock_guard lg(m_down_buffers_mtx); for (auto it = buf->m_up_buffer->m_down_buffers.begin(); it != buf->m_up_buffer->m_down_buffers.end(); ++it) { if (it->lock() == buf) { @@ -222,12 +240,10 @@ void IndexBuffer::remove_down_buffer(const IndexBufferPtr &buf) { } #ifndef NDEBUG -bool IndexBuffer::is_in_down_buffers(const IndexBufferPtr &buf) { - std::lock_guard lg(m_down_buffers_mtx); - for (auto const &dbuf: m_down_buffers) { - if (dbuf.lock() == buf) { - return true; - } +bool IndexBuffer::is_in_down_buffers(const IndexBufferPtr& buf) { + std::lock_guard< std::mutex > lg(m_down_buffers_mtx); + for (auto const& dbuf : m_down_buffers) { + if (dbuf.lock() == buf) { return true; } } return false; } diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp index 04383d8ac..caf00b3d1 100644 --- a/src/lib/index/wb_cache.cpp +++ b/src/lib/index/wb_cache.cpp @@ -420,11 +420,11 @@ void IndexWBCache::free_buf(const IndexBufferPtr& buf, CPContext* cp_ctx) { } buf->m_node_freed = true; resource_mgr().inc_free_blk(m_node_size); - m_vdev->free_blk(buf->m_blkid, s_cast(cp_ctx)); + m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(cp_ctx)); } //////////////////// Recovery Related section ///////////////////////////////// -void IndexWBCache::load_buf(IndexBufferPtr const &buf) { +void IndexWBCache::load_buf(IndexBufferPtr const& buf) { if (buf->m_bytes == nullptr) { buf->m_bytes = hs_utils::iobuf_alloc(m_node_size, sisl::buftag::btree_node, m_vdev->align_size()); m_vdev->sync_read(r_cast< char* >(buf->m_bytes), m_node_size, buf->blkid()); @@ -432,6 +432,78 @@ void IndexWBCache::load_buf(IndexBufferPtr const &buf) { } } +struct DagNode { + IndexBufferPtr buffer; + std::vector< shared< DagNode > > children; +}; + +using DagPtr = std::shared_ptr< DagNode >; +using DagMap = std::map< IndexBufferPtr, DagPtr >; + +static DagMap generate_dag_buffers(std::map< BlkId, IndexBufferPtr >& bufmap) { + std::vector< IndexBufferPtr > bufs; + std::ranges::transform(bufmap, std::back_inserter(bufs), [](const auto& pair) { return pair.second; }); + + auto buildReverseMapping = [](const std::vector< IndexBufferPtr >& buffers) { + std::unordered_map< IndexBufferPtr, std::vector< IndexBufferPtr > > parentToChildren; + for (const auto& buffer : buffers) { + if (buffer->m_up_buffer) { parentToChildren[buffer->m_up_buffer].push_back(buffer); } + } + return parentToChildren; + }; + + std::function< DagPtr(IndexBufferPtr, std::unordered_map< IndexBufferPtr, std::vector< IndexBufferPtr > >&) > + buildDag; + buildDag = + [&buildDag](IndexBufferPtr buffer, + std::unordered_map< IndexBufferPtr, std::vector< IndexBufferPtr > >& parentToChildren) -> DagPtr { + auto dagNode = std::make_shared< DagNode >(); + dagNode->buffer = buffer; + if (parentToChildren.count(buffer)) { + for (const auto& child : parentToChildren[buffer]) { + 
dagNode->children.push_back(buildDag(child, parentToChildren)); + } + } + return dagNode; + }; + + auto generateDagMap = [&](const std::vector< IndexBufferPtr >& buffers) { + DagMap dagMap; + auto parentToChildren = buildReverseMapping(buffers); + for (const auto& buffer : buffers) { + if (!buffer->m_up_buffer) { // This is a root buffer + auto dagRoot = buildDag(buffer, parentToChildren); + dagMap[buffer] = dagRoot; + } + } + return dagMap; + }; + + return generateDagMap(bufs); +} + +static std::string to_string_dag_bufs(DagMap& dags, cp_id_t cp_id = 0) { + std::string str{fmt::format("#_of_dags={}\n", dags.size())}; + int cnt = 1; + for (const auto& [_, dag] : dags) { + std::vector< std::tuple< std::shared_ptr< DagNode >, int, int > > stack; + stack.emplace_back(dag, 0, cnt++); + while (!stack.empty()) { + auto [node, level, index] = stack.back(); + stack.pop_back(); + auto snew = node->buffer->m_created_cp_id == cp_id ? "NEW" : ""; + auto sfree = node->buffer->m_node_freed ? "FREED" : ""; + fmt::format_to(std::back_inserter(str), "{}{}-{} {} {}\n", std::string(level * 4, ' '), index, + node->buffer->to_string(), snew, sfree); + int c = node->children.size(); + for (const auto& d : node->children) { + stack.emplace_back(d, level + 1, c--); + } + } + } + return str; +} + void IndexWBCache::recover(sisl::byte_view sb) { // If sb is empty, its possible a first time boot. if ((sb.bytes() == nullptr) || (sb.size() == 0)) { @@ -452,9 +524,9 @@ #ifdef _PRERELEASE auto detailed_log = [this](std::map< BlkId, IndexBufferPtr > const& bufs, - std::vector<IndexBufferPtr> const &pending_bufs) { + std::vector< IndexBufferPtr > const& pending_bufs) { std::string log = fmt::format("\trecovered bufs (#of bufs = {})\n", bufs.size()); - for (auto const &[_, buf]: bufs) { + for (auto const& [_, buf] : bufs) { load_buf(buf); fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); } @@ -462,7 +534,7 @@ // list of new_bufs if (!pending_bufs.empty()) { fmt::format_to(std::back_inserter(log), "\n\tpending_bufs (#of bufs = {})\n", pending_bufs.size()); - for (auto const &buf: pending_bufs) { + for (auto const& buf : pending_bufs) { fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); } } @@ -471,6 +543,8 @@ std::string log = fmt::format("Recovering bufs (#of bufs = {}) before processing them\n", bufs.size()); LOGTRACEMOD(wbcache, "{}\n{}", log, detailed_log(bufs, {})); + auto dags = generate_dag_buffers(bufs); + LOGTRACEMOD(wbcache, "Before recovery: {}", to_string_dag_bufs(dags, icp_ctx->id())); #endif // At this point, we have the DAG structure (up/down dependency graph), exactly the same as prior to crash, with one @@ -484,15 +558,15 @@ // the same blkid which could clash with the blkid next in the buf list. // // On the second pass, we only take part of the parents/siblings and then repair them, if needed. 
- std::vector<IndexBufferPtr> pending_bufs; - std::vector<IndexBufferPtr> deleted_bufs; - for (auto const &[_, buf]: bufs) { + std::vector< IndexBufferPtr > pending_bufs; + std::vector< IndexBufferPtr > deleted_bufs; + for (auto const& [_, buf] : bufs) { if (buf->m_node_freed) { // Freed node load_buf(buf); if (was_node_committed(buf)) { // Mark this buffer as deleted, so that we can avoid using it anymore when repairing its parent's link - r_cast<persistent_hdr_t*>(buf->m_bytes)->node_deleted = true; + r_cast< persistent_hdr_t* >(buf->m_bytes)->node_deleted = true; write_buf(nullptr, buf, icp_ctx); deleted_bufs.push_back(buf); pending_bufs.push_back(buf->m_up_buffer); @@ -513,9 +587,13 @@ m_vdev->commit_blk(buf->m_blkid); pending_bufs.push_back(buf->m_up_buffer); } else { - // Just ignore it + // Up buffer is not committed, we need to repair it first buf->m_up_buffer->remove_down_buffer(buf); - buf->m_up_buffer = nullptr; + // buf->m_up_buffer = nullptr; + if (buf->m_up_buffer->m_wait_for_down_buffers.testz()) { + // if the up buffer has an up buffer, then we need to decrement its wait_for_down_buffers + update_up_buffer_counters(buf->m_up_buffer); + } } } } @@ -524,25 +602,44 @@ LOGINFOMOD(wbcache, "Index Recovery detected {} nodes out of {} as new/freed nodes to be recovered in prev cp={}", pending_bufs.size(), bufs.size(), icp_ctx->id()); LOGTRACEMOD(wbcache, "All unclean bufs list\n{}", detailed_log(bufs, pending_bufs)); + LOGTRACEMOD(wbcache, "After recovery: {}", to_string_dag_bufs(dags, icp_ctx->id())); #endif - for (auto const &buf: pending_bufs) { + for (auto const& buf : pending_bufs) { recover_buf(buf); - if (buf->m_bytes != nullptr && r_cast<persistent_hdr_t*>(buf->m_bytes)->node_deleted) { + if (buf->m_bytes != nullptr && r_cast< persistent_hdr_t* >(buf->m_bytes)->node_deleted) { // This buffer was marked as deleted during repair, so we also need to free it deleted_bufs.push_back(buf); } } - for (auto const &buf: deleted_bufs) { - m_vdev->free_blk(buf->m_blkid, s_cast<VDevCPContext*>(icp_ctx)); + for (auto const& buf : deleted_bufs) { + m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(icp_ctx)); } m_in_recovery = false; m_vdev->recovery_completed(); } -void IndexWBCache::recover_buf(IndexBufferPtr const &buf) { +// If buf->m_wait_for_down_buffers.testz() is true (which means it has no dependency on any other buffer), then we +// can decrement the wait_for_down_buffers of its up buffer. If that in turn drops the up buffer's +// wait_for_down_buffers to 0, the decrement propagates to the up buffer's own up buffer, and so on; the +// process continues until we reach the root buffer. 
+void IndexWBCache::update_up_buffer_counters(IndexBufferPtr const& buf) { + if (buf == nullptr || !buf->m_wait_for_down_buffers.testz() || buf->m_up_buffer == nullptr) { + LOGINFOMOD(wbcache, "Finish decrementing wait_for_down_buffers"); + return; + } + auto grand_buf = buf->m_up_buffer; + grand_buf->remove_down_buffer(buf); + LOGINFOMOD(wbcache, + "Decrementing wait_for_down_buffers for buffer {} due to zero dependency of child {}, Keep going up", + grand_buf->to_string(), buf->to_string()); + update_up_buffer_counters(grand_buf); +} + +void IndexWBCache::recover_buf(IndexBufferPtr const& buf) { if (!buf->m_wait_for_down_buffers.decrement_testz()) { // TODO: remove the buf_>m_up_buffer from down_buffers list of buf->m_up_buffer return; @@ -557,6 +654,12 @@ } else { LOGTRACEMOD(wbcache, "Index Recovery detected up node [{}] as committed no need to repair that", buf->to_string()); + if (buf->m_up_buffer && buf->m_up_buffer->is_meta_buf()) { + // Our up buffer is a meta buffer, which means the old root is dirtied and may need no repair, but a + // new root on an upper level is possible, so the edge needs to be restored + LOGTRACEMOD(wbcache, "check root change without repairing {}", buf->to_string()); + index_service().update_root(buf->m_index_ordinal, buf); + } } if (buf->m_up_buffer) { recover_buf(buf->m_up_buffer); } @@ -656,10 +759,8 @@ void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const if (buf->is_meta_buf()) { LOGTRACEMOD(wbcache, "Flushing cp {} meta buf {} possibly because of root split", cp_ctx->id(), buf->to_string()); - auto const &sb = r_cast<MetaIndexBuffer*>(buf.get())->m_sb; - if (!sb.is_empty()) { - meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk()); - } + auto const& sb = r_cast< MetaIndexBuffer* >(buf.get())->m_sb; + if (!sb.is_empty()) { meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk()); } process_write_completion(cp_ctx, buf); } else if (buf->m_node_freed) { LOGTRACEMOD(wbcache, "Not flushing buf {} as it was freed, its here for merely dependency", cp_ctx->id(), @@ -667,15 +768,13 @@ process_write_completion(cp_ctx, buf); } else { LOGTRACEMOD(wbcache, "Flushing cp {} buf {}", cp_ctx->id(), buf->to_string()); - m_vdev->async_write(r_cast<const char*>(buf->raw_buffer()), m_node_size, buf->m_blkid, part_of_batch) - .thenValue([buf, cp_ctx](auto) { - try { - auto &pthis = s_cast<IndexWBCache&>(wb_cache()); - pthis.process_write_completion(cp_ctx, buf); - } catch (const std::runtime_error &e) { - LOGERROR("Failed to access write-back cache: {}", e.what()); - } - }); + m_vdev->async_write(r_cast< const char* >(buf->raw_buffer()), m_node_size, buf->m_blkid, part_of_batch) + .thenValue([buf, cp_ctx](auto) { + try { + auto& pthis = s_cast< IndexWBCache& >(wb_cache()); + pthis.process_write_completion(cp_ctx, buf); + } catch (const std::runtime_error& e) { LOGERROR("Failed to access write-back cache: {}", e.what()); } + }); if (!part_of_batch) { m_vdev->submit_batch(); } } diff --git a/src/lib/index/wb_cache.hpp b/src/lib/index/wb_cache.hpp index 25a4c8201..7d10d7f54 100644 --- a/src/lib/index/wb_cache.hpp +++ b/src/lib/index/wb_cache.hpp @@ -41,6 +41,7 @@ class IndexWBCache : public IndexWBCacheBase { std::mutex m_flush_mtx; void* m_meta_blk; bool m_in_recovery{false}; + public: IndexWBCache(const std::shared_ptr< VirtualDev >& vdev, std::pair< meta_blk*, sisl::byte_view > sb, const std::shared_ptr< sisl::Evictor >& evictor, 
uint32_t node_size); @@ -78,5 +79,6 @@ class IndexWBCache : public IndexWBCacheBase { void recover_buf(IndexBufferPtr const& buf); bool was_node_committed(IndexBufferPtr const& buf); void load_buf(IndexBufferPtr const& buf); + void update_up_buffer_counters(IndexBufferPtr const& buf); }; } // namespace homestore diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp index 560bf0f83..c474db233 100644 --- a/src/tests/test_index_crash_recovery.cpp +++ b/src/tests/test_index_crash_recovery.cpp @@ -37,29 +37,29 @@ SISL_LOGGING_DECL(test_index_crash_recovery) SISL_OPTION_GROUP( test_index_crash_recovery, (num_iters, "", "num_iters", "number of iterations for rand ops", - ::cxxopts::value<uint32_t>()->default_value("500"), "number"), + ::cxxopts::value< uint32_t >()->default_value("500"), "number"), (num_entries, "", "num_entries", "number of entries to test with", - ::cxxopts::value<uint32_t>()->default_value("5000"), "number"), - (run_time, "", "run_time", "run time for io", ::cxxopts::value<uint32_t>()->default_value("360000"), "seconds"), + ::cxxopts::value< uint32_t >()->default_value("5000"), "number"), + (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), "seconds"), (num_rounds, "", "num_rounds", "number of rounds to test with", - ::cxxopts::value<uint32_t>()->default_value("100"), "number"), + ::cxxopts::value< uint32_t >()->default_value("100"), "number"), (num_entries_per_rounds, "", "num_entries_per_rounds", "number of entries per rounds", - ::cxxopts::value<uint32_t>()->default_value("40"), "number"), - (max_keys_in_node, "", "max_keys_in_node", "max_keys_in_node", - ::cxxopts::value<uint32_t>()->default_value("20"), ""), - (min_keys_in_node, "", "min_keys_in_node", "min_keys_in_node", - ::cxxopts::value<uint32_t>()->default_value("6"), ""), + ::cxxopts::value< uint32_t >()->default_value("40"), "number"), + (max_keys_in_node, "", "max_keys_in_node", "max_keys_in_node", ::cxxopts::value< uint32_t >()->default_value("20"), + ""), + (min_keys_in_node, "", "min_keys_in_node", "min_keys_in_node", ::cxxopts::value< uint32_t >()->default_value("6"), + ""), (operation_list, "", "operation_list", "operation list instead of default created following by percentage", - ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), + ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), (preload_size, "", "preload_size", "number of entries to preload tree with", - ::cxxopts::value<uint32_t>()->default_value("1000"), "number"), + ::cxxopts::value< uint32_t >()->default_value("1000"), "number"), (init_device, "", "init_device", "init device", ::cxxopts::value< bool >()->default_value("1"), ""), (load_from_file, "", "load_from_file", "load from file", ::cxxopts::value< bool >()->default_value("0"), ""), (save_to_file, "", "save_to_file", "save to file", ::cxxopts::value< bool >()->default_value("0"), ""), (cleanup_after_shutdown, "", "cleanup_after_shutdown", "cleanup after shutdown", - ::cxxopts::value< bool >()->default_value("1"), ""), + ::cxxopts::value< bool >()->default_value("1"), ""), (seed, "", "seed", "random engine seed, use random if not defined", - ::cxxopts::value< uint64_t >()->default_value("0"), "number")) + ::cxxopts::value< uint64_t >()->default_value("0"), "number")) void log_obj_life_counter() { std::string str; @@ -249,7 +249,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT m_test->m_cfg.m_leaf_node_type = T::leaf_node_type; m_test->m_cfg.m_int_node_type = T::interior_node_type; 
m_test->m_cfg.m_max_keys_in_node = SISL_OPTIONS["max_keys_in_node"].as< uint32_t >(); - m_test->m_cfg.m_min_keys_in_node = SISL_OPTIONS["min_keys_in_node"].as<uint32_t>(); + m_test->m_cfg.m_min_keys_in_node = SISL_OPTIONS["min_keys_in_node"].as< uint32_t >(); m_test->m_bt = std::make_shared< typename T::BtreeType >(std::move(sb), m_test->m_cfg); return m_test->m_bt; } @@ -277,7 +277,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT this->m_cfg = BtreeConfig(hs()->index_service().node_size()); this->m_cfg.m_max_keys_in_node = SISL_OPTIONS["max_keys_in_node"].as< uint32_t >(); - this->m_cfg.m_min_keys_in_node = SISL_OPTIONS["min_keys_in_node"].as<uint32_t>(); + this->m_cfg.m_min_keys_in_node = SISL_OPTIONS["min_keys_in_node"].as< uint32_t >(); LOGINFO("Node size {}, max_keys_in_node {}, min_keys_in_node {}", this->m_cfg.node_size(), this->m_cfg.m_max_keys_in_node, this->m_cfg.m_min_keys_in_node); auto uuid = boost::uuids::random_generator()(); @@ -338,7 +338,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT void reapply_after_crash() { ShadowMap< K, V > snapshot_map{this->m_shadow_map.max_keys()}; snapshot_map.load(m_shadow_filename); - LOGINFO("\tSnapshot before crash\n{}", snapshot_map.to_string()); + // LOGINFO("\tSnapshot before crash\n{}", snapshot_map.to_string()); auto diff = this->m_shadow_map.diff(snapshot_map); // visualize tree after crash @@ -346,13 +346,14 @@ // this->visualize_keys(recovered_tree_filename); // LOGINFO(" tree after recovered stored in {}", recovered_tree_filename); - std::string dif_str = "KEY \tADDITION\n"; - for (const auto& [k, addition] : diff) { - dif_str += fmt::format(" {} \t{}\n", k.key(), addition); + std::string dif_str = "Keys["; + for (const auto& [k, _] : diff) { + dif_str += fmt::format("{} ", k.key()); } + dif_str += "]"; LOGINFO("Diff between shadow map and snapshot map\n{}\n", dif_str); - for (const auto &[k, addition]: diff) { + for (const auto& [k, addition] : diff) { // this->print_keys(fmt::format("reapply: before inserting key {}", k.key())); // this->visualize_keys(recovered_tree_filename); if (addition) { @@ -401,15 +402,15 @@ } void crash_and_recover(uint32_t s_key, uint32_t e_key) { - this->print_keys("Btree prior to CP and susbsequent simulated crash: "); + // this->print_keys("Btree prior to CP and susbsequent simulated crash: "); trigger_cp(false); this->wait_for_crash_recovery(); // this->visualize_keys("tree_after_crash_" + std::to_string(s_key) + "_" + std::to_string(e_key) + ".dot"); - this->print_keys("Post crash and recovery, btree structure: "); + // this->print_keys("Post crash and recovery, btree structure: "); this->reapply_after_crash(); - this->print_keys("Post reapply, btree structure: "); + // this->print_keys("Post reapply, btree structure: "); this->get_all(); LOGINFO("Expect to have [{},{}) in tree and it is actually{} ", s_key, e_key, tree_key_count()); @@ -420,24 +421,28 @@ std::set< uint64_t > new_keys; std::transform(operations.begin(), operations.end(), std::inserter(new_keys, new_keys.end()), [](const Operation& operation) { return operation.first; }); - uint32_t count = 1; + uint32_t count = 0; this->m_shadow_map.foreach ([this, new_keys, &count](K key, V value) { // discard the new keys to check if 
(new_keys.find(key.key()) != new_keys.end()) { return; } + count++; auto copy_key = std::make_unique< K >(); *copy_key = key; auto out_v = std::make_unique< V >(); auto req = BtreeSingleGetRequest{copy_key.get(), out_v.get()}; req.enable_route_tracing(); const auto ret = this->m_bt->get(req); + if (ret != btree_status_t::success) { + this->print_keys(fmt::format("Sanity check: key {}", key.key())); + this->dump_to_file("sanity_fail.txt"); + } ASSERT_EQ(ret, btree_status_t::success) << "Missing key " << key << " in btree but present in shadow map"; }); LOGINFO("Sanity check passed for {} keys!", count); - } void crash_and_recover(OperationList& operations, std::string filename = "") { - this->print_keys("Btree prior to CP and susbsequent simulated crash: "); + // this->print_keys("Btree prior to CP and susbsequent simulated crash: "); LOGINFO("Before Crash: {} keys in shadow map and it is actually {} keys in tree - operations size {}", this->m_shadow_map.size(), tree_key_count(), operations.size()); @@ -456,7 +461,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT LOGINFO("Visualize the tree file after recovery : {}", rec_filename); this->visualize_keys(rec_filename); } - this->print_keys("Post crash and recovery, btree structure: "); + // this->print_keys("Post crash and recovery, btree structure: "); sanity_check(operations); // Added to the index service right after recovery. Not needed here // test_common::HSTestHelper::trigger_cp(true); @@ -468,7 +473,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT LOGINFO("Visualize the tree after reapply {}", re_filename); this->visualize_keys(re_filename); } - this->print_keys("Post reapply, btree structure: "); + // this->print_keys("Post reapply, btree structure: "); this->get_all(); LOGINFO("After reapply: {} keys in shadow map and actually {} in tress", this->m_shadow_map.size(), @@ -629,7 +634,7 @@ TYPED_TEST(IndexCrashTest, long_running_put_crash) { test_common::HSTestHelper::trigger_cp(true); this->get_all(); this->m_shadow_map.save(this->m_shadow_filename); - this->print_keys("reapply: after preload"); + // this->print_keys("reapply: after preload"); this->visualize_keys("tree_after_preload.dot"); for (uint32_t round = 1; @@ -716,28 +721,27 @@ TYPED_TEST(IndexCrashTest, long_running_put_crash) { elapsed_time * 100.0 / this->m_run_time, this->tree_key_count(), num_entries, this->tree_key_count() * 100.0 / num_entries); } - this->print_keys(fmt::format("reapply: after round {}", round)); + // this->print_keys(fmt::format("reapply: after round {}", round)); if (renew_btree_after_crash) { this->reset_btree(); }; } } // Basic reverse and forward order remove with different flip points TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { - vector flip_points = { - "crash_flush_on_merge_at_parent", - "crash_flush_on_merge_at_left_child", + vector< std::string > flip_points = { + "crash_flush_on_merge_at_parent", "crash_flush_on_merge_at_left_child", // "crash_flush_on_freed_child", }; for (size_t i = 0; i < flip_points.size(); ++i) { this->reset_btree(); - auto &flip_point = flip_points[i]; + auto& flip_point = flip_points[i]; LOGINFO("=== Testing flip point: {} - {} ===", i + 1, flip_point); // Populate some keys [1,num_entries) and trigger cp to persist - LOGINFO("Step {}-1: Populate some keys and flush", i+1); - auto const num_entries = SISL_OPTIONS["num_entries"].as(); + LOGINFO("Step {}-1: Populate some keys and flush", i + 1); + auto const num_entries = 
SISL_OPTIONS["num_entries"].as< uint32_t >(); for (auto k = 0u; k < num_entries; ++k) { this->put(k, btree_put_type::INSERT, true /* expect_success */); } @@ -748,7 +752,8 @@ TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { // Split keys into batches and remove the last one in reverse order LOGINFO("Step {}-2: Set crash flag, remove some keys in reverse order", i + 1); - int batch_num = 4; { + int batch_num = 4; + { int n = batch_num; auto r = num_entries * n / batch_num - 1; auto l = num_entries * (n - 1) / batch_num; @@ -759,8 +764,7 @@ TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { LOGINFO("Step {}-2-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, r, l); this->set_basic_flip(flip_point); - for (auto [k, _]: ops) { - LOGINFO("Removing key {}", k); + for (auto [k, _] : ops) { this->remove_one(k, true); } this->visualize_keys("tree_merge_before_first_crash.dot"); @@ -781,8 +785,7 @@ TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { LOGINFO("Step {}-3-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, l, r); this->set_basic_flip(flip_point); - for (auto [k, _]: ops) { - LOGINFO("Removing key {}", k); + for (auto [k, _] : ops) { this->remove_one(k, true); } this->visualize_keys("tree_merge_before_second_crash.dot"); @@ -803,8 +806,7 @@ TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { LOGINFO("Step {}-4-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, l, r); this->set_basic_flip(flip_point); - for (auto [k, _]: ops) { - LOGINFO("Removing key {}", k); + for (auto [k, _] : ops) { this->remove_one(k, true); } this->visualize_keys("tree_merge_before_third_crash.dot"); @@ -828,9 +830,8 @@ TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { // vector flips = { // "crash_flush_on_merge_at_parent", "crash_flush_on_merge_at_left_child", // }; -// SequenceGenerator generator(0 /*putFreq*/, 100 /* removeFreq*/, 0 /*start_range*/, num_entries - 1 /*end_range*/); -// OperationList operations; -// for (size_t i = 0; i < flips.size(); ++i) { +// SequenceGenerator generator(0 /*putFreq*/, 100 /* removeFreq*/, 0 /*start_range*/, num_entries - 1 +// /*end_range*/); OperationList operations; for (size_t i = 0; i < flips.size(); ++i) { // this->reset_btree(); // LOGINFO("Step {}-1: Init btree", i + 1); // for (auto k = 0u; k < num_entries; ++k) { diff --git a/src/tests/test_scripts/index_test.py b/src/tests/test_scripts/index_test.py index dd2f8f010..d4734ac82 100755 --- a/src/tests/test_scripts/index_test.py +++ b/src/tests/test_scripts/index_test.py @@ -51,10 +51,10 @@ def parse_arguments(): parser.add_argument('--dev_list', help='Device list', default='') parser.add_argument('--cleanup_after_shutdown', help='Cleanup after shutdown', type=bool, default=False) parser.add_argument('--init_device', help='Initialize device', type=bool, default=True) - parser.add_argument('--max_keys_in_node', help='Maximum num of keys in btree nodes', type=int, default=5) + parser.add_argument('--max_keys_in_node', help='Maximum num of keys in btree nodes', type=int, default=10) parser.add_argument('--min_keys_in_node', help='Minimum num of keys in btree nodes', type=int, default=2) - parser.add_argument('--num_rounds', help='number of rounds for crash test', type=int, default=10000) - parser.add_argument('--num_entries_per_rounds', help='number of rounds for crash test', type=int, default=60) + parser.add_argument('--num_rounds', help='number of rounds for crash test', type=int, default=1000) + parser.add_argument('--num_entries_per_rounds', help='number of rounds for crash test', type=int, 
default=100) # Parse the known arguments and ignore any unknown arguments args, unknown = parser.parse_known_args() @@ -94,10 +94,10 @@ def long_running_clean_shutdown(options, type=0): def long_running_crash_put(options): print("Long running crash put started") - options['num_entries'] = 131072 # 128K + options['num_entries'] = 1310720 # 1280K options['init_device'] = True options['run_time'] = 14400 # 4 hours - options['preload_size'] = 100 + options['preload_size'] = 1024 print(f"options: {options}") run_crash_test(options) print("Long running crash put completed") From b4ddbaa19ed3414bba0b8198436cfe361536ac75 Mon Sep 17 00:00:00 2001 From: yawzhang Date: Mon, 23 Dec 2024 14:54:28 +0800 Subject: [PATCH 048/170] reset rreq time every time we reuse a rreq --- conanfile.py | 2 +- src/lib/replication/repl_dev/raft_repl_dev.cpp | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/conanfile.py b/conanfile.py index 75fb41167..e6bb7fea6 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.2" + version = "6.6.3" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index a39d6035b..c408f82d1 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -489,8 +489,9 @@ repl_req_ptr_t RaftReplDev::applier_create_req(repl_key const& rkey, journal_typ auto rreq = it->second; if (!happened) { - // We already have the entry in the map, check if we are already allocated the blk by previous caller, in - // that case we need to return the req. + // We already have the entry in the map, reset its start time to prevent it from being incorrectly gc'ed while in use. + rreq->set_created_time(); + // Check if we are already allocated the blk by previous caller, in that case we need to return the req. if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { // Do validation if we have the correct mapping // RD_REL_ASSERT(blob_equals(user_header, rreq->header), "User header mismatch for repl_key={}", From b27a240139934e84e80d8ed5535850a1092aa327 Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Wed, 25 Dec 2024 06:32:34 +0800 Subject: [PATCH 049/170] fix HomeRaftLogStore::last_entry (#619) 1. If no log entry exists, return a dummy constant entry with its value set to null and its term set to zero. 2. m_last_durable_lsn is initialized as -1 and is only updated in end_of_batch; 
we should set it to the tail_lsn of log store after all the log entries are replayed --- src/include/homestore/logstore/log_store.hpp | 1 + .../log_store/home_raft_log_store.cpp | 7 ++++-- .../replication/repl_dev/raft_repl_dev.cpp | 24 +++++++++++-------- src/lib/replication/repl_dev/raft_repl_dev.h | 5 ++-- 4 files changed, 23 insertions(+), 14 deletions(-) diff --git a/src/include/homestore/logstore/log_store.hpp b/src/include/homestore/logstore/log_store.hpp index a2091f114..18a806545 100644 --- a/src/include/homestore/logstore/log_store.hpp +++ b/src/include/homestore/logstore/log_store.hpp @@ -231,6 +231,7 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { bool rollback(logstore_seq_num_t to_lsn); auto start_lsn() const { return m_start_lsn.load(std::memory_order_acquire); } + auto tail_lsn() const { return m_tail_lsn.load(std::memory_order_acquire); } nlohmann::json dump_log_store(const log_dump_req& dump_req = log_dump_req()); diff --git a/src/lib/replication/log_store/home_raft_log_store.cpp b/src/lib/replication/log_store/home_raft_log_store.cpp index 1c09afa91..f0f792276 100644 --- a/src/lib/replication/log_store/home_raft_log_store.cpp +++ b/src/lib/replication/log_store/home_raft_log_store.cpp @@ -147,8 +147,11 @@ nuraft::ptr< nuraft::log_entry > HomeRaftLogStore::last_entry() const { auto log_bytes = m_log_store->read_sync(max_seq); nle = to_nuraft_log_entry(log_bytes); } catch (const std::exception& e) { - REPL_STORE_LOG(ERROR, "last_entry() out_of_range={}", max_seq); - throw e; + // all the log entries are truncated, so we should return a dummy log entry. + REPL_STORE_LOG(ERROR, "last_entry() out_of_range={}, {}", max_seq, e.what()); + // according to the contract, we should return a dummy log entry if the index is out of range. + // https://github.com/eBay/NuRaft/blob/50e2f949503081262cb21923e633eaa8dacad8fa/include/libnuraft/log_store.hxx#L56 + nle = m_dummy_log_entry; } return nle; diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index c408f82d1..7f2e07f3b 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -39,7 +39,10 @@ RaftReplDev::RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk m_data_journal = std::make_shared< ReplLogStore >( *this, *m_state_machine, m_rd_sb->logdev_id, m_rd_sb->logstore_id, [this](logstore_seq_num_t lsn, log_buffer buf, void* key) { on_log_found(lsn, buf, key); }, - [this](std::shared_ptr< HomeLogStore > hs, logstore_seq_num_t lsn) { m_log_store_replay_done = true; }); + [this](std::shared_ptr< HomeLogStore > hs, logstore_seq_num_t lsn) { + m_log_store_replay_done = true; + set_log_store_last_durable_lsn(hs->tail_lsn()); + }); m_next_dsn = m_rd_sb->last_applied_dsn + 1; m_commit_upto_lsn = m_rd_sb->durable_commit_lsn; m_last_flushed_commit_lsn = m_commit_upto_lsn; @@ -292,9 +295,10 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& } } - auto status = rreq->init(repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)}, - data.size ? journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true /* is_proposer */, - header, key, data.size, m_listener); + auto status = rreq->init( + repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)}, + data.size ? 
journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true /* is_proposer */, header, + key, data.size, m_listener); // Add the request to the repl_dev_rreq map, it will be accessed throughout the life cycle of this request auto const [it, happened] = m_repl_key_req_map.emplace(rreq->rkey(), rreq); @@ -628,9 +632,7 @@ bool RaftReplDev::wait_for_data_receive(std::vector< repl_req_ptr_t > const& rre if (!all_futs_ready && timeout_rreqs != nullptr) { timeout_rreqs->clear(); for (size_t i{0}; i < futs.size(); ++i) { - if (!futs[i].isReady()) { - timeout_rreqs->emplace_back(only_wait_reqs[i]); - } + if (!futs[i].isReady()) { timeout_rreqs->emplace_back(only_wait_reqs[i]); } } all_futs_ready = timeout_rreqs->empty(); } @@ -926,7 +928,7 @@ void RaftReplDev::handle_rollback(repl_req_ptr_t rreq) { } } - void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { +void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { if (!rreq->has_state(repl_req_state_t::DATA_COMMITTED)) { commit_blk(rreq); } // Remove the request from repl_key map. @@ -972,8 +974,8 @@ void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) // Ensure non-volatile lsn not exist because handle_error should not be called after append entries. auto exist_rreq = m_state_machine->lsn_to_req(rreq->lsn()); if (exist_rreq != nullptr && !exist_rreq->is_volatile()) { - HS_REL_ASSERT(false, "Unexpected: LSN={} is already ready to commit, exist_rreq=[{}]", - rreq->lsn(), exist_rreq->to_string()); + HS_REL_ASSERT(false, "Unexpected: LSN={} is already ready to commit, exist_rreq=[{}]", rreq->lsn(), + exist_rreq->to_string()); } if (err == ReplServiceError::DATA_DUPLICATED) { RD_LOGE("Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); @@ -1458,6 +1460,8 @@ void RaftReplDev::gc_repl_reqs() { } } +void RaftReplDev::set_log_store_last_durable_lsn(store_lsn_t lsn) { m_data_journal->set_last_durable_lsn(lsn); } + void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx) { auto repl_lsn = to_repl_lsn(lsn); // apply the log entry if the lsn is between checkpoint lsn and durable commit lsn diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 28706f716..0521d1aac 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -313,9 +313,10 @@ class RaftReplDev : public ReplDev, */ void handle_error(repl_req_ptr_t const& rreq, ReplServiceError err); - bool wait_for_data_receive(std::vector < repl_req_ptr_t > const &rreqs, uint64_t timeout_ms, - std::vector < repl_req_ptr_t > *timeout_rreqs = nullptr); + bool wait_for_data_receive(std::vector< repl_req_ptr_t > const& rreqs, uint64_t timeout_ms, + std::vector< repl_req_ptr_t >* timeout_rreqs = nullptr); void on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx); + void set_log_store_last_durable_lsn(store_lsn_t lsn); void commit_blk(repl_req_ptr_t rreq); void replace_member(repl_req_ptr_t rreq); void reset_quorum_size(uint32_t commit_quorum); From 5c551f06a2a5a91ef36cf5780116cfbab1b031e4 Mon Sep 17 00:00:00 2001 From: ywz <649521587@qq.com> Date: Mon, 30 Dec 2024 18:38:57 +0800 Subject: [PATCH 050/170] minor fix for homeobject's homestore_test --- conanfile.py | 2 +- src/lib/replication/repl_dev/raft_repl_dev.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conanfile.py b/conanfile.py index e6bb7fea6..03d856077 100644 --- a/conanfile.py +++ b/conanfile.py @@ 
-9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.3" + version = "6.6.4" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 0521d1aac..df2668abc 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -191,7 +191,7 @@ class RaftReplDev : public ReplDev, bool is_destroy_pending() const; bool is_destroyed() const; Clock::time_point destroyed_time() const { return m_destroyed_time; } - bool is_ready_for_traffic() const { + bool is_ready_for_traffic() const override { auto committed_lsn = m_commit_upto_lsn.load(); auto gate = m_traffic_ready_lsn.load(); bool ready = committed_lsn >= gate; From d9d91cfc23e7b0a6508ecf67330d6c5fca6645f2 Mon Sep 17 00:00:00 2001 From: yawzhang Date: Tue, 31 Dec 2024 16:45:34 +0800 Subject: [PATCH 051/170] update last_commit_lsn in commit_config --- conanfile.py | 2 +- src/lib/replication/repl_dev/raft_repl_dev.cpp | 14 ++++++++++++++ src/lib/replication/repl_dev/raft_repl_dev.h | 1 + .../replication/repl_dev/raft_state_machine.cpp | 2 ++ 4 files changed, 18 insertions(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index 03d856077..085ac29aa 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.4" + version = "6.6.5" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 7f2e07f3b..15e01ea6e 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -959,6 +959,20 @@ void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { if (!rreq->is_proposer()) { rreq->clear(); } } +void RaftReplDev::handle_config_commit(const repl_lsn_t lsn, raft_cluster_config_ptr_t& new_conf) { + // when reaching here, the new config has already been applied to the cluster. + // since we didn't create repl req for config change, we just need to update m_commit_upto_lsn here. + + // keep this variable in case it is needed later + (void) new_conf; + auto prev_lsn = m_commit_upto_lsn.load(std::memory_order_relaxed); + RD_DBG_ASSERT_GT(lsn, prev_lsn, + "Out of order commit of lsns, it is not expected in RaftReplDev. 
cur_lsns={}, prev_lsns={}", lsn, prev_lsn); RD_DBG_ASSERT(m_commit_upto_lsn.compare_exchange_strong(prev_lsn, lsn), "Raft Channel: unexpected log {} commited before config {} committed", prev_lsn, lsn); } + void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) { if (err == ReplServiceError::OK) { return; } RD_LOGE("Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index df2668abc..f45ddc61c 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -207,6 +207,7 @@ class RaftReplDev : public ReplDev, //////////////// Methods needed for other Raft classes to access ///////////////// void use_config(json_superblk raft_config_sb); void handle_commit(repl_req_ptr_t rreq, bool recovery = false); + void handle_config_commit(const repl_lsn_t lsn, raft_cluster_config_ptr_t& new_conf); void handle_rollback(repl_req_ptr_t rreq); repl_req_ptr_t repl_key_to_req(repl_key const& rkey) const; repl_req_ptr_t applier_create_req(repl_key const& rkey, journal_type_t code, sisl::blob const& user_header, diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index 10fb9285f..77fd54501 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -230,6 +230,8 @@ void RaftStateMachine::commit_config(const ulong log_idx, raft_cluster_config_pt RD_LOG(INFO, "Raft channel: server ids in new cluster conf : {}, my_id {}, group_id {}", oss.str(), my_id, m_rd.group_id_str()); #endif + + m_rd.handle_config_commit(s_cast< repl_lsn_t >(log_idx), new_conf); } void RaftStateMachine::rollback_config(const ulong log_idx, raft_cluster_config_ptr_t& conf) { From 348e05d65112dcf92104492a79eb994103051454 Mon Sep 17 00:00:00 2001 From: yawzhang Date: Thu, 26 Dec 2024 17:31:26 +0800 Subject: [PATCH 052/170] Remove log store truncation from resource mgr. Currently both resource_mgr and raft can call the log store's truncate, but resource_mgr will not truncate logs whose lsn is less than the compact lsn. That means resource_mgr only re-truncates logs which will be / have been truncated in compact. But if resource_mgr and raft call truncate concurrently, a crash will happen. So this commit removes it.
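To make the intended division of labor concrete, here is a minimal, self-contained sketch of the flow after this change. ToyLogStore, raft_on_compact and resource_mgr_tick are hypothetical stand-ins rather than the real Homestore classes; the only assumption carried over from the message above is that raft's compact path already truncates logs up to compact_lsn:

#include <cstdint>
#include <iostream>

// Hypothetical stand-in for a per-replica log store.
struct ToyLogStore {
    int64_t start_lsn{0};
    void truncate(int64_t upto_lsn) {
        if (upto_lsn < start_lsn) { return; }
        std::cout << "truncating [" << start_lsn << ", " << upto_lsn << "]\n";
        start_lsn = upto_lsn + 1;
    }
};

// After this commit, per-store truncation has a single owner: the raft
// compaction path.
void raft_on_compact(ToyLogStore& ls, int64_t compact_lsn) { ls.truncate(compact_lsn); }

// The periodic resource-manager job no longer truncates each repl dev's log
// store; it only performs the device-wide sweep (a no-op in this sketch).
void resource_mgr_tick() { /* device_truncate() equivalent only */ }

int main() {
    ToyLogStore ls;
    raft_on_compact(ls, 100); // truncates [0, 100]
    resource_mgr_tick();      // no second truncator racing with raft
    raft_on_compact(ls, 200); // truncates [101, 200]
    return 0;
}

With a single owner there is no window in which two callers walk the same [start_lsn, compact_lsn] range concurrently, which is the crash scenario described above.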
--- conanfile.py | 2 +- src/lib/common/resource_mgr.cpp | 18 +++++----- .../log_store/home_raft_log_store.cpp | 4 +++ .../log_store/home_raft_log_store.h | 2 ++ .../replication/repl_dev/raft_repl_dev.cpp | 8 ++--- src/lib/replication/repl_dev/raft_repl_dev.h | 2 ++ src/tests/test_common/raft_repl_test_base.hpp | 2 +- src/tests/test_log_dev.cpp | 34 ++++++++++++++++++- src/tests/test_raft_repl_dev.cpp | 2 +- 9 files changed, 57 insertions(+), 17 deletions(-) diff --git a/conanfile.py b/conanfile.py index 085ac29aa..41e73fe71 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.5" + version = "6.6.6" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/common/resource_mgr.cpp b/src/lib/common/resource_mgr.cpp index 0ba4803c4..36b2a0a17 100644 --- a/src/lib/common/resource_mgr.cpp +++ b/src/lib/common/resource_mgr.cpp @@ -47,14 +47,16 @@ void ResourceMgr::stop() { // void ResourceMgr::trigger_truncate() { if (hs()->has_repl_data_service()) { - // first make sure all repl dev's underlying raft log store make corresponding reservation during - // truncate -- set the safe truncate boundary for each raft log store; - hs()->repl_service().iterate_repl_devs([](cshared< ReplDev >& rd) { - // lock is already taken by repl service layer; - std::dynamic_pointer_cast< RaftReplDev >(rd)->truncate( - HS_DYNAMIC_CONFIG(resource_limits.raft_logstore_reserve_threshold)); - }); - + /* + * DO NOT NEED : raft will truncate logs. + * // first make sure all repl dev's underlying raft log store make corresponding reservation during + * // truncate -- set the safe truncate boundary for each raft log store; + * hs()->repl_service().iterate_repl_devs([](cshared< ReplDev >& rd) { + * // lock is already taken by repl service layer; + * std::dynamic_pointer_cast< RaftReplDev >(rd)->truncate( + * HS_DYNAMIC_CONFIG(resource_limits.raft_logstore_reserve_threshold)); + * }); + */ // next do device truncate which go through all logdevs and truncate them; hs()->logstore_service().device_truncate(); } diff --git a/src/lib/replication/log_store/home_raft_log_store.cpp b/src/lib/replication/log_store/home_raft_log_store.cpp index f0f792276..5b14d3be8 100644 --- a/src/lib/replication/log_store/home_raft_log_store.cpp +++ b/src/lib/replication/log_store/home_raft_log_store.cpp @@ -49,6 +49,9 @@ static uint64_t extract_term(const log_buffer& log_bytes) { return (*r_cast< uint64_t const* >(raw_ptr)); } +#if 0 +// Since truncate_lsn cannot cross the compact_lsn passed down by the raft server +// and compact will truncate logs upto compact_lsn, we don't need to re-truncate in this function now. 
void HomeRaftLogStore::truncate(uint32_t num_reserved_cnt, repl_lsn_t compact_lsn) { auto const last_lsn = last_index(); auto const start_lsn = start_index(); @@ -79,6 +82,7 @@ void HomeRaftLogStore::truncate(uint32_t num_reserved_cnt, repl_lsn_t compact_ls m_log_store->truncate(truncate_lsn); } } +#endif HomeRaftLogStore::HomeRaftLogStore(logdev_id_t logdev_id, logstore_id_t logstore_id, log_found_cb_t const& log_found_cb, log_replay_done_cb_t const& log_replay_done_cb) : diff --git a/src/lib/replication/log_store/home_raft_log_store.h b/src/lib/replication/log_store/home_raft_log_store.h index 3c4c021ef..d2c0fd57b 100644 --- a/src/lib/replication/log_store/home_raft_log_store.h +++ b/src/lib/replication/log_store/home_raft_log_store.h @@ -204,6 +204,7 @@ class HomeRaftLogStore : public nuraft::log_store { */ ulong last_index() const; +#if 0 /** * Truncates the log store * @@ -212,6 +213,7 @@ class HomeRaftLogStore : public nuraft::log_store { * LSN; */ void truncate(uint32_t num_reserved_cnt, repl_lsn_t compact_lsn); +#endif void wait_for_log_store_ready(); void set_last_durable_lsn(repl_lsn_t lsn); diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 15e01ea6e..fb8317b02 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -966,11 +966,9 @@ void RaftReplDev::handle_config_commit(const repl_lsn_t lsn, raft_cluster_config // keep this variable in case it is needed later (void) new_conf; auto prev_lsn = m_commit_upto_lsn.load(std::memory_order_relaxed); - RD_DBG_ASSERT_GT(lsn, prev_lsn, - "Out of order commit of lsns, it is not expected in RaftReplDev. cur_lsns={}, prev_lsns={}", - lsn, prev_lsn); - RD_DBG_ASSERT(m_commit_upto_lsn.compare_exchange_strong(prev_lsn, lsn), - "Raft Channel: unexpected log {} commited before config {} committed", prev_lsn, lsn); + if (prev_lsn >= lsn || !m_commit_upto_lsn.compare_exchange_strong(prev_lsn, lsn)) { + RD_LOGE("Raft Channel: unexpected log {} commited before config {} committed", prev_lsn, lsn); + } } void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) { diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index f45ddc61c..5e66e18f8 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -250,6 +250,7 @@ class RaftReplDev : public ReplDev, */ void on_create_snapshot(nuraft::snapshot& s, nuraft::async_result< bool >::handler_type& when_done); +#if 0 /** * Truncates the replication log by providing a specified number of reserved entries. 
* @@ -258,6 +259,7 @@ class RaftReplDev : public ReplDev, void truncate(uint32_t num_reserved_entries) { m_data_journal->truncate(num_reserved_entries, m_compact_lsn.load()); } +#endif void wait_for_logstore_ready() { m_data_journal->wait_for_log_store_ready(); } diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 2f7ab9f1c..11c6d6bc2 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -420,7 +420,7 @@ class TestReplicatedDB : public homestore::ReplDevListener { void truncate(int num_reserved_entries) { auto raft_repl_dev = std::dynamic_pointer_cast< RaftReplDev >(repl_dev()); - raft_repl_dev->truncate(num_reserved_entries); + // raft_repl_dev->truncate(num_reserved_entries); LOGINFO("Manually truncated"); } diff --git a/src/tests/test_log_dev.cpp b/src/tests/test_log_dev.cpp index da0a7e458..bb6fa8f29 100644 --- a/src/tests/test_log_dev.cpp +++ b/src/tests/test_log_dev.cpp @@ -201,8 +201,12 @@ class LogDevTest : public ::testing::Test { read_all_verify(log_store); } - void truncate_validate(std::shared_ptr< HomeLogStore > log_store) { + void truncate_validate(std::shared_ptr< HomeLogStore > log_store, logstore_seq_num_t* last_lsn = nullptr) { auto upto = log_store->get_contiguous_completed_seq_num(-1); + if (last_lsn) { + ASSERT_EQ(upto, *last_lsn); + } + LOGINFO("truncate_validate upto {}", upto); log_store->truncate(upto); read_all_verify(log_store); @@ -305,6 +309,34 @@ TEST_F(LogDevTest, Rollback) { rollback_records_validate(log_store, 0 /* expected_count */); } +TEST_F(LogDevTest, ReTruncate) { + LOGINFO("Step 1: Create a single logstore to start re-truncate test"); + auto logdev_id = logstore_service().create_new_logdev(); + s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); + auto log_store = logstore_service().create_new_log_store(logdev_id, false); + auto store_id = log_store->get_store_id(); + + LOGINFO("Step 2: Issue sequential inserts with q depth of 10"); + logstore_seq_num_t cur_lsn = 0; + kickstart_inserts(log_store, cur_lsn, 500); + + LOGINFO("Step 3: Truncate all entries"); + logstore_seq_num_t ls_last_lsn = 499; + truncate_validate(log_store, &ls_last_lsn); + ASSERT_EQ(log_store->start_lsn(), ls_last_lsn + 1); + ASSERT_EQ(log_store->tail_lsn(), ls_last_lsn); + ASSERT_EQ(log_store->truncated_upto(), ls_last_lsn); + + LOGINFO("Step 4: Truncate again"); + truncate_validate(log_store, &ls_last_lsn); + ASSERT_EQ(log_store->start_lsn(), ls_last_lsn + 1); + ASSERT_EQ(log_store->tail_lsn(), ls_last_lsn); + ASSERT_EQ(log_store->truncated_upto(), ls_last_lsn); + + LOGINFO("Step 5: Read and verify all entries again"); + read_all_verify(log_store); +} + TEST_F(LogDevTest, CreateRemoveLogDev) { auto num_logdev = SISL_OPTIONS["num_logdevs"].as< uint32_t >(); std::vector< std::shared_ptr< HomeLogStore > > log_stores; diff --git a/src/tests/test_raft_repl_dev.cpp b/src/tests/test_raft_repl_dev.cpp index 51ca8e470..c419e6b1d 100644 --- a/src/tests/test_raft_repl_dev.cpp +++ b/src/tests/test_raft_repl_dev.cpp @@ -409,7 +409,7 @@ TEST_F(RaftReplDevTest, BaselineTest) { // Leader does manual snapshot and truncate LOGINFO("Leader create snapshot and truncate"); this->create_snapshot(); - this->truncate(0); + // this->truncate(0); } } From 7734ec85c62145bfeffb129542d936ca81e8f58c Mon Sep 17 00:00:00 2001 From: yuwmao Date: Mon, 6 Jan 2025 11:06:02 +0800 Subject: [PATCH 053/170] Support async snapshot io config --- conanfile.py | 2 
+- src/lib/common/homestore_config.fbs | 4 ++++ src/lib/replication/service/raft_repl_service.cpp | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index 41e73fe71..d2172ce07 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.6" + version = "6.6.7" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs index da058fdb6..337e551e7 100644 --- a/src/lib/common/homestore_config.fbs +++ b/src/lib/common/homestore_config.fbs @@ -268,6 +268,10 @@ table Consensus { // Log difference from leader's point of view, to determine if the // follower is laggy and if so, leader will stop pushing data until it drops under this threshold. laggy_threshold: int64 = 2000; + + // Reading snapshot objects will be done by a background thread asynchronously + // instead of synchronous read by Raft worker threads + use_bg_thread_for_snapshot_io_: bool = true; } table HomeStoreSettings { diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index 2ed7a3bc1..23ff2db89 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -110,6 +110,7 @@ void RaftReplService::start() { // There is no callback available for handling and localizing the log entries within the pack, which could // result in data corruption. r_params.use_new_joiner_type_ = true; + r_params.use_bg_thread_for_snapshot_io_ = HS_DYNAMIC_CONFIG(consensus.use_bg_thread_for_snapshot_io_); r_params.return_method_ = nuraft::raft_params::async_handler; m_msg_mgr->register_mgr_type(params.default_group_type_, r_params); From 50712d1522b10cb4e7b0b6db784227fe3ded18ba Mon Sep 17 00:00:00 2001 From: yuwmao Date: Thu, 9 Jan 2025 10:11:36 +0800 Subject: [PATCH 054/170] Avoid replaying the last flushed log entry --- conanfile.py | 2 +- src/lib/replication/repl_dev/raft_repl_dev.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/conanfile.py b/conanfile.py index d2172ce07..6e2e44975 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.7" + version = "6.6.8" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index fb8317b02..3fd68ee24 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1477,7 +1477,7 @@ void RaftReplDev::set_log_store_last_durable_lsn(store_lsn_t lsn) { m_data_journ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx) { auto repl_lsn = to_repl_lsn(lsn); // apply the log entry if the lsn is between checkpoint lsn and durable commit lsn - if (repl_lsn < m_rd_sb->checkpoint_lsn) { return; } + if (repl_lsn <= m_rd_sb->checkpoint_lsn) { return; } // 1. 
Get the log entry and prepare rreq auto const lentry = to_nuraft_log_entry(buf); @@ -1489,8 +1489,8 @@ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx RELEASE_ASSERT_EQ(jentry->major_version, repl_journal_entry::JOURNAL_ENTRY_MAJOR, "Mismatched version of journal entry received from RAFT peer"); - RD_LOGT("Raft Channel: Applying Raft log_entry upon recovery: server_id={}, term={}, journal_entry=[{}] ", - jentry->server_id, lentry->get_term(), jentry->to_string()); + RD_LOGT("Raft Channel: Applying Raft log_entry upon recovery: server_id={}, term={}, lsn={}, journal_entry=[{}] ", + jentry->server_id, lentry->get_term(), repl_lsn, jentry->to_string()); auto entry_to_hdr = [](repl_journal_entry* jentry) { return sisl::blob{uintptr_cast(jentry) + sizeof(repl_journal_entry), jentry->user_header_size}; From 16b0e36648bfdb4820c4a145d358de04ed8a9cc1 Mon Sep 17 00:00:00 2001 From: yawzhang Date: Thu, 2 Jan 2025 17:52:53 +0800 Subject: [PATCH 055/170] skip appending dummy logs to log dev 1. directly update indices when there are holes in baseline resync 2. update definition of logdev_key::out_of_bound_ld_key and use it to imply log dev can truncate freely. 3. remove is_active check in HomeLogStore::flush to unblock flush when there are holes in m_records --- conanfile.py | 2 +- src/include/homestore/logstore/log_store.hpp | 10 + .../homestore/logstore/log_store_internal.hpp | 3 +- src/lib/logstore/log_dev.cpp | 18 +- src/lib/logstore/log_dev.hpp | 5 +- src/lib/logstore/log_store.cpp | 33 +- .../log_store/home_raft_log_store.cpp | 10 +- src/tests/test_log_dev.cpp | 288 +++++++++++++++++- 8 files changed, 338 insertions(+), 31 deletions(-) diff --git a/conanfile.py b/conanfile.py index 6e2e44975..5923e4163 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.8" + version = "6.6.10" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/logstore/log_store.hpp b/src/include/homestore/logstore/log_store.hpp index 18a806545..91735be79 100644 --- a/src/include/homestore/logstore/log_store.hpp +++ b/src/include/homestore/logstore/log_store.hpp @@ -173,6 +173,15 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { logdev_key get_trunc_ld_key() const { return m_trunc_ld_key; } + /** + * @brief Get the truncation information for this log store. It is called during log device truncation + * + * @return tuple of (start_lsn, trunc_ld_key, tail_lsn) If the log store is empty, it will return + * an out_of_bound_ld_key as trunc_ld_key. + * + * @note ensure that no new logs are flushed between calling this function and completing the truncation, + * as this could result in an inaccurate out_of_bound_ld_key. 
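+ * (trunc_lsn equal to tail_lsn indicates an empty store, i.e. every log up to the tail
+ * has already been truncated; that is the case in which out_of_bound_ld_key is returned.)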
+ * */ std::tuple< logstore_seq_num_t, logdev_key, logstore_seq_num_t > truncate_info() const; sisl::StreamTracker< logstore_record >& log_records() { return m_records; } @@ -232,6 +241,7 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { auto start_lsn() const { return m_start_lsn.load(std::memory_order_acquire); } auto tail_lsn() const { return m_tail_lsn.load(std::memory_order_acquire); } + auto next_lsn() const { return m_next_lsn.load(std::memory_order_acquire); } nlohmann::json dump_log_store(const log_dump_req& dump_req = log_dump_req()); diff --git a/src/include/homestore/logstore/log_store_internal.hpp b/src/include/homestore/logstore/log_store_internal.hpp index 551f15ea8..9b7019cfb 100644 --- a/src/include/homestore/logstore/log_store_internal.hpp +++ b/src/include/homestore/logstore/log_store_internal.hpp @@ -85,7 +85,8 @@ struct logdev_key { std::string to_string() const { return fmt::format("Logid={} devoffset={}", idx, dev_offset); } static const logdev_key& out_of_bound_ld_key() { - static constexpr logdev_key s_out_of_bound_ld_key{std::numeric_limits< logid_t >::max(), 0}; + static constexpr logdev_key s_out_of_bound_ld_key{std::numeric_limits< logid_t >::max(), + std::numeric_limits< off_t >::max()}; return s_out_of_bound_ld_key; } }; diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp index 01a6b4181..fdebd9bd8 100644 --- a/src/lib/logstore/log_dev.cpp +++ b/src/lib/logstore/log_dev.cpp @@ -135,6 +135,7 @@ void LogDev::stop() { m_log_idx.store(0); m_pending_flush_size.store(0); m_last_flush_idx = -1; + m_last_flush_ld_key = logdev_key{0, 0}; m_last_truncate_idx = -1; m_last_crc = INVALID_CRC32_VALUE; @@ -501,6 +502,7 @@ void LogDev::on_flush_completion(LogGroup* lg) { free_log_group(lg); m_log_records->truncate(upto_indx); m_last_flush_idx = upto_indx; + m_last_flush_ld_key = logdev_key{from_indx, dev_offset}; // since we support out-of-order lsn write, so no need to guarantee the order of logstore write completion for (auto const& [idx, req] : req_map) { @@ -530,20 +532,18 @@ uint64_t LogDev::truncate() { auto lstore = store.log_store; if (lstore == nullptr) { continue; } auto const [trunc_lsn, trunc_ld_key, tail_lsn] = lstore->truncate_info(); - if (trunc_lsn == tail_lsn) { - THIS_LOGDEV_LOG(DEBUG, "Store_id={} didn't have any writes since last truncation, skipping ", store_id); - m_logdev_meta.remove_all_rollback_records(store_id, m_stopped /* persist_now */); - continue; - } - HS_DBG_ASSERT_GE(trunc_ld_key.idx, m_last_truncate_idx, "Trying to truncate logid which is already truncated"); m_logdev_meta.update_store_superblk(store_id, logstore_superblk(trunc_lsn + 1), m_stopped /* persist_now */); - // We found a new minimum logdev_key that we can truncate to - if (trunc_ld_key.idx > 0 && trunc_ld_key.idx < min_safe_ld_key.idx) { min_safe_ld_key = trunc_ld_key; } + if (trunc_ld_key.idx < min_safe_ld_key.idx) { min_safe_ld_key = trunc_ld_key; } + } + + // All log stores are empty, we can truncate logs depends on the last flushed logdev_key + if (min_safe_ld_key == logdev_key::out_of_bound_ld_key()) { + min_safe_ld_key = m_last_flush_ld_key; } // There are no writes or no truncation called for any of the store, so we can't truncate anything - if (min_safe_ld_key == logdev_key::out_of_bound_ld_key() || min_safe_ld_key.idx <= m_last_truncate_idx) return 0; + if (min_safe_ld_key.idx <= 0 || min_safe_ld_key.idx <= m_last_truncate_idx) return 0; uint64_t const num_records_to_truncate = uint64_cast(min_safe_ld_key.idx - 
m_last_truncate_idx); diff --git a/src/lib/logstore/log_dev.hpp b/src/lib/logstore/log_dev.hpp index 2875d7823..5a8fafc2c 100644 --- a/src/lib/logstore/log_dev.hpp +++ b/src/lib/logstore/log_dev.hpp @@ -795,8 +795,9 @@ class LogDev : public std::enable_shared_from_this< LogDev > { std::multimap< logid_t, logstore_id_t > m_garbage_store_ids; Clock::time_point m_last_flush_time; - logid_t m_last_flush_idx{-1}; // Track last flushed, last device offset and truncated log idx - logid_t m_last_truncate_idx{std::numeric_limits< logid_t >::min()}; // logdev truncate up to this idx + logid_t m_last_flush_idx{-1}; // Track last flushed, last device offset and truncated log idx + logdev_key m_last_flush_ld_key{0,0}; // Left interval of the last flush, 0 indicates the very beginning of logdev + logid_t m_last_truncate_idx{-1}; // Logdev truncate up to this idx crc32_t m_last_crc{INVALID_CRC32_VALUE}; // LogDev Info block related fields diff --git a/src/lib/logstore/log_store.cpp b/src/lib/logstore/log_store.cpp index 1f2a4434b..427207e12 100644 --- a/src/lib/logstore/log_store.cpp +++ b/src/lib/logstore/log_store.cpp @@ -189,12 +189,27 @@ void HomeLogStore::truncate(logstore_seq_num_t upto_lsn, bool in_memory_truncate #endif + // In normal write and compact path, upto_lsn is expected to be no larger than m_tail_lsn after the flush. + // So upto_lsn > m_tail_lsn is expected to exist only in baseline resync path. + // In baseline resync path, we truncate all entries up to upto_lsn, and update m_tail_lsn and m_next_lsn + // to make sure logstore's idx is always = raft's idx - 1. if (upto_lsn > m_tail_lsn) { THIS_LOGSTORE_LOG(WARN, - "Truncating issued on lsn={} which is greater than tail_lsn={}, truncating upto tail_lsn", + "Truncating issued on lsn={} which is greater than tail_lsn={}", upto_lsn, m_tail_lsn.load(std::memory_order_relaxed)); - m_trunc_ld_key = m_records.at(m_tail_lsn).m_trunc_key; - upto_lsn = m_tail_lsn; + // update m_tail_lsn if it is less than upto_lsn + auto current_tail_lsn = m_tail_lsn.load(std::memory_order_relaxed); + while (current_tail_lsn < upto_lsn && + !m_tail_lsn.compare_exchange_weak(current_tail_lsn, upto_lsn, std::memory_order_relaxed)) {} + + // update m_next_lsn if it is less than upto_lsn + 1 + auto current_next_lsn = m_next_lsn.load(std::memory_order_relaxed); + while (current_next_lsn < upto_lsn + 1 && + !m_next_lsn.compare_exchange_weak(current_next_lsn, upto_lsn + 1, std::memory_order_relaxed)) {} + + // insert an empty record to make sure m_records has enough size to truncate + logdev_key empty_ld_key; + m_records.create_and_complete(upto_lsn, logstore_record(empty_ld_key, empty_ld_key)); } else { m_trunc_ld_key = m_records.at(upto_lsn).m_trunc_key; THIS_LOGSTORE_LOG(TRACE, "Truncating logstore upto lsn={} , m_trunc_ld_key index {} offset {}", upto_lsn, @@ -207,7 +222,12 @@ void HomeLogStore::truncate(logstore_seq_num_t upto_lsn, bool in_memory_truncate std::tuple< logstore_seq_num_t, logdev_key, logstore_seq_num_t > HomeLogStore::truncate_info() const { auto const trunc_lsn = m_start_lsn.load(std::memory_order_relaxed) - 1; - return std::make_tuple(trunc_lsn, m_trunc_ld_key, m_tail_lsn.load(std::memory_order_relaxed)); + auto const tail_lsn = m_tail_lsn.load(std::memory_order_relaxed); + + // If the store is empty, return out_of_bound_ld_key as trunc_ld_key, allowing the caller to truncate freely. + // Otherwise, return the actual trunc_ld_key. + return (trunc_lsn == tail_lsn) ? 
std::make_tuple(trunc_lsn, logdev_key::out_of_bound_ld_key(), tail_lsn) + : std::make_tuple(trunc_lsn, m_trunc_ld_key, tail_lsn); } void HomeLogStore::fill_gap(logstore_seq_num_t seq_num) { @@ -277,10 +297,7 @@ void HomeLogStore::flush(logstore_seq_num_t upto_lsn) { return; } - if (upto_lsn == invalid_lsn()) { upto_lsn = m_records.active_upto(); } - - // if we have flushed already, we are done, else issue a flush - if (m_records.status(upto_lsn).is_active) m_logdev->flush_under_guard(); + m_logdev->flush_under_guard(); } bool HomeLogStore::rollback(logstore_seq_num_t to_lsn) { diff --git a/src/lib/replication/log_store/home_raft_log_store.cpp b/src/lib/replication/log_store/home_raft_log_store.cpp index 5b14d3be8..d32fc61d6 100644 --- a/src/lib/replication/log_store/home_raft_log_store.cpp +++ b/src/lib/replication/log_store/home_raft_log_store.cpp @@ -361,12 +361,10 @@ bool HomeRaftLogStore::compact(ulong compact_lsn) { // release this assert if for some use case, we should tolorant this case; // for now, don't expect this case to happen. // RELEASE_ASSERT(false, "compact_lsn={} is beyond the current max_lsn={}", compact_lsn, cur_max_lsn); - REPL_STORE_LOG(DEBUG, "Adding dummy entries during compact from={} upto={}", cur_max_lsn + 1, - to_store_lsn(compact_lsn)); - // We need to fill the remaining entries with dummy data. - for (auto lsn{cur_max_lsn + 1}; lsn <= to_store_lsn(compact_lsn); ++lsn) { - append(m_dummy_log_entry); - } + + // if compact_lsn is beyond the current max_lsn, it indicates a hole from cur_max_lsn to compact_lsn. + // we directly compact and truncate up to compact_lsn assuming there are dummy logs. + REPL_STORE_LOG(DEBUG, "Compact with log holes from {} to={}", cur_max_lsn + 1, to_store_lsn(compact_lsn)); } m_log_store->truncate(to_store_lsn(compact_lsn)); return true; } diff --git a/src/tests/test_log_dev.cpp b/src/tests/test_log_dev.cpp index bb6fa8f29..189bc5a86 100644 --- a/src/tests/test_log_dev.cpp +++ b/src/tests/test_log_dev.cpp @@ -201,10 +201,11 @@ class LogDevTest : public ::testing::Test { read_all_verify(log_store); } - void truncate_validate(std::shared_ptr< HomeLogStore > log_store, logstore_seq_num_t* last_lsn = nullptr) { + void truncate_validate(std::shared_ptr< HomeLogStore > log_store, logstore_seq_num_t* trunc_lsn = nullptr) { auto upto = log_store->get_contiguous_completed_seq_num(-1); - if (last_lsn) { - ASSERT_EQ(upto, *last_lsn); + if (trunc_lsn && *trunc_lsn != upto) { + LOGWARN("Truncate issued upto {} but real upto lsn in log store is {}", *trunc_lsn, upto); + upto = *trunc_lsn; } LOGINFO("truncate_validate upto {}", upto); @@ -217,6 +218,24 @@ class LogDevTest : public ::testing::Test { auto actual_count = log_store->get_logdev()->log_dev_meta().num_rollback_records(log_store->get_store_id()); ASSERT_EQ(actual_count, expected_count); } + + logid_t get_last_truncate_idx(logdev_id_t logdev_id) { + auto status = logstore_service().get_logdev(logdev_id)->get_status(0); + if (status.contains("last_truncate_log_idx")) { + return s_cast<logid_t>(status["last_truncate_log_idx"]); + } + LOGERROR("Failed to get last_truncate_log_idx from logdev status for logdev_id {}", logdev_id); + return static_cast<logid_t>(-1); + } + + logid_t get_current_log_idx(logdev_id_t logdev_id) { + auto status = logstore_service().get_logdev(logdev_id)->get_status(0); + if (status.contains("current_log_idx")) { + return s_cast<logid_t>(status["current_log_idx"]); + } + LOGERROR("Failed to get current_log_idx from logdev status for logdev_id {}", logdev_id); + return static_cast<logid_t>(-1); + } };
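For review context, a minimal self-contained sketch of the truncation rule the LogDev::truncate() and truncate_info() hunks above implement; Key and Store are simplified stand-ins rather than the real logdev_key/HomeLogStore types, so this is an illustration, not the actual API:

#include <cstdint>
#include <limits>
#include <vector>

struct Key { int64_t idx; };
struct Store { bool empty; Key trunc_key; };

// Pick the minimum safe key across stores. An empty store reports the out-of-bound
// sentinel so it never holds truncation back; if every store is empty, fall back to
// the last flushed key so the log device can still truncate freely.
inline Key min_safe_key(std::vector<Store> const& stores, Key last_flush_key) {
    constexpr int64_t out_of_bound = std::numeric_limits<int64_t>::max();
    Key min_key{out_of_bound};
    for (auto const& s : stores) {
        int64_t const idx = s.empty ? out_of_bound : s.trunc_key.idx;
        if (idx < min_key.idx) { min_key.idx = idx; }
    }
    if (min_key.idx == out_of_bound) { min_key = last_flush_key; }
    return min_key;
}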
TEST_F(LogDevTest, WriteSyncThenRead) { @@ -314,7 +333,6 @@ TEST_F(LogDevTest, ReTruncate) { auto logdev_id = logstore_service().create_new_logdev(); s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); auto log_store = logstore_service().create_new_log_store(logdev_id, false); - auto store_id = log_store->get_store_id(); LOGINFO("Step 2: Issue sequential inserts with q depth of 10"); logstore_seq_num_t cur_lsn = 0; @@ -337,6 +355,268 @@ TEST_F(LogDevTest, ReTruncate) { read_all_verify(log_store); } +TEST_F(LogDevTest, TruncateWithExceedingLSN) { + LOGINFO("Step 1: Create a single logstore to start truncate with exceeding LSN test"); + auto logdev_id = logstore_service().create_new_logdev(); + s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); + auto log_store = logstore_service().create_new_log_store(logdev_id, false); + + LOGINFO("Step 2: Insert 500 entries"); + logstore_seq_num_t cur_lsn = 0; + kickstart_inserts(log_store, cur_lsn, 500); + + LOGINFO("Step 3: Read and verify all entries"); + read_all_verify(log_store); + + LOGINFO("Step 4: Truncate 100 entries"); + logstore_seq_num_t trunc_lsn = 99; + truncate_validate(log_store, &trunc_lsn); + ASSERT_EQ(log_store->start_lsn(), trunc_lsn + 1); + ASSERT_EQ(log_store->tail_lsn(), 499); + ASSERT_EQ(log_store->next_lsn(), 500); + ASSERT_EQ(log_store->truncated_upto(), trunc_lsn); + + LOGINFO("Step 5: Read and verify all entries"); + read_all_verify(log_store); + + LOGINFO("Step 6: Truncate all with exceeding lsn"); + trunc_lsn = 1999999; + truncate_validate(log_store, &trunc_lsn); + ASSERT_EQ(log_store->start_lsn(), trunc_lsn + 1); + ASSERT_EQ(log_store->tail_lsn(), trunc_lsn); + ASSERT_EQ(log_store->next_lsn(), 2000000); + ASSERT_EQ(log_store->truncated_upto(), trunc_lsn); + + LOGINFO("Step 7 Read and verify all entries"); + read_all_verify(log_store); + + LOGINFO("Step 8: Append 500 entries"); + cur_lsn = log_store->next_lsn(); + kickstart_inserts(log_store, cur_lsn, 500); + ASSERT_EQ(log_store->next_lsn(), 2000500); + + LOGINFO("Step 9: Read and verify all entries"); + read_all_verify(log_store); +} + +TEST_F(LogDevTest, TruncateAfterRestart) { + LOGINFO("Step 1: Create a single logstore to start truncate with overlapping LSN test"); + auto logdev_id = logstore_service().create_new_logdev(); + s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); + auto log_store = logstore_service().create_new_log_store(logdev_id, false); + auto store_id = log_store->get_store_id(); + + auto restart = [&]() { + std::promise< bool > p; + auto starting_cb = [&]() { + logstore_service().open_logdev(logdev_id); + logstore_service().open_log_store(logdev_id, store_id, false /* append_mode */).thenValue([&](auto store) { + log_store = store; + p.set_value(true); + }); + }; + start_homestore(true /* restart */, starting_cb); + p.get_future().get(); + }; + + LOGINFO("Step 2: Insert 500 entries"); + logstore_seq_num_t cur_lsn = 0; + kickstart_inserts(log_store, cur_lsn, 500); + + LOGINFO("Step 3: Read and verify all entries"); + read_all_verify(log_store); + + LOGINFO("Step 4: Truncate 100 entries"); + logstore_seq_num_t trunc_lsn = 99; + truncate_validate(log_store, &trunc_lsn); + ASSERT_EQ(log_store->start_lsn(), trunc_lsn + 1); + ASSERT_EQ(log_store->tail_lsn(), 499); + ASSERT_EQ(log_store->next_lsn(), 500); + ASSERT_EQ(log_store->truncated_upto(), trunc_lsn); + + LOGINFO("Step 5: Read and verify all entries"); + read_all_verify(log_store); + + 
LOGINFO("Step 6: Restart and verify all entries"); + restart(); + read_all_verify(log_store); + auto const [last_trunc_lsn, trunc_ld_key, tail_lsn] = log_store->truncate_info(); + ASSERT_EQ(last_trunc_lsn, trunc_lsn); + ASSERT_EQ(trunc_ld_key.idx, 0); + ASSERT_EQ(tail_lsn, log_store->tail_lsn()); + + LOGINFO("Step 7: call log dev truncate again and read verify") + logstore_service().device_truncate(); + read_all_verify(log_store); +} + +TEST_F(LogDevTest, TruncateAcrossMultipleStores) { + LOGINFO("Step 1: Create 3 log stores to start truncate across multiple stores test"); + auto logdev_id = logstore_service().create_new_logdev(); + s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); + auto store1 = logstore_service().create_new_log_store(logdev_id, false); + auto store2 = logstore_service().create_new_log_store(logdev_id, false); + auto store3 = logstore_service().create_new_log_store(logdev_id, false); + + + LOGINFO("Step 2: Insert 100 entries to store {}", store1->get_store_id()); + logstore_seq_num_t cur_lsn = 0; + kickstart_inserts(store1, cur_lsn, 100); + ASSERT_EQ(get_current_log_idx(logdev_id), 100); + + LOGINFO("Step 3: Insert 200 entries to store {}", store2->get_store_id()); + cur_lsn = 0; + kickstart_inserts(store2, cur_lsn, 200); + ASSERT_EQ(get_current_log_idx(logdev_id), 300); + + LOGINFO("Step 4: Insert 200 entries to store {}", store3->get_store_id()); + cur_lsn = 0; + kickstart_inserts(store3, cur_lsn, 200); + ASSERT_EQ(get_current_log_idx(logdev_id), 500); + + LOGINFO("Step 5: Read and verify all stores"); + read_all_verify(store1); + ASSERT_EQ(store1->start_lsn(), 0); + ASSERT_EQ(store1->tail_lsn(), 99); + ASSERT_EQ(store1->truncated_upto(), -1); + read_all_verify(store2); + ASSERT_EQ(store2->start_lsn(), 0); + ASSERT_EQ(store2->tail_lsn(), 199); + ASSERT_EQ(store2->truncated_upto(), -1); + read_all_verify(store3); + ASSERT_EQ(store3->start_lsn(), 0); + ASSERT_EQ(store3->tail_lsn(), 199); + ASSERT_EQ(store3->truncated_upto(), -1); + // log dev should not truncate any logs due to no truncate in log stores happened + ASSERT_EQ(get_last_truncate_idx(logdev_id), -1); + + LOGINFO("Step 6: Truncate 100 entries in store {}", store2->get_store_id()); + logstore_seq_num_t trunc_lsn = 99; + truncate_validate(store2, &trunc_lsn); + + LOGINFO("Step 7: Read and verify all stores"); + read_all_verify(store1); + ASSERT_EQ(store1->start_lsn(), 0); + ASSERT_EQ(store1->tail_lsn(), 99); + ASSERT_EQ(store1->truncated_upto(), -1); + read_all_verify(store2); + ASSERT_EQ(store2->start_lsn(), 100); + ASSERT_EQ(store2->tail_lsn(), 199); + ASSERT_EQ(store2->truncated_upto(), 99); + read_all_verify(store3); + ASSERT_EQ(store3->start_lsn(), 0); + ASSERT_EQ(store3->tail_lsn(), 199); + ASSERT_EQ(store3->truncated_upto(), -1); + // log dev should not truncate any logs due to store1 has valid logs + ASSERT_EQ(get_last_truncate_idx(logdev_id), -1); + + LOGINFO("Step 8: Truncate 500 entries in store {}", store3->get_store_id()); + trunc_lsn = 499; + truncate_validate(store3, &trunc_lsn); + + LOGINFO("Step 9: Read and verify all stores"); + read_all_verify(store1); + ASSERT_EQ(store1->start_lsn(), 0); + ASSERT_EQ(store1->tail_lsn(), 99); + ASSERT_EQ(store1->truncated_upto(), -1); + read_all_verify(store2); + ASSERT_EQ(store2->start_lsn(), 100); + ASSERT_EQ(store2->tail_lsn(), 199); + ASSERT_EQ(store2->truncated_upto(), 99); + read_all_verify(store3); + ASSERT_EQ(store3->start_lsn(), 500); + ASSERT_EQ(store3->tail_lsn(), 499); + ASSERT_EQ(store3->truncated_upto(), 
499); + + // log dev should not truncate any logs due to store1 has valid logs + ASSERT_EQ(get_last_truncate_idx(logdev_id), -1); + + LOGINFO("Step 10: Truncate 100 entries in store {}", store1->get_store_id()); + trunc_lsn = 99; + truncate_validate(store1, &trunc_lsn); + + LOGINFO("Step 11: Read and verify all stores"); + read_all_verify(store1); + ASSERT_EQ(store1->start_lsn(), 100); + ASSERT_EQ(store1->tail_lsn(), 99); + ASSERT_EQ(store1->truncated_upto(), 99); + read_all_verify(store2); + ASSERT_EQ(store2->start_lsn(), 100); + ASSERT_EQ(store2->tail_lsn(), 199); + ASSERT_EQ(store2->truncated_upto(), 99); + read_all_verify(store3); + ASSERT_EQ(store3->start_lsn(), 500); + ASSERT_EQ(store3->tail_lsn(), 499); + ASSERT_EQ(store3->truncated_upto(), 499); + + // log dev should truncate logs upto 199, as store2 has valid logs + ASSERT_EQ(get_last_truncate_idx(logdev_id), 199); + + LOGINFO("Step 12: Truncate 300 entries in store {}", store2->get_store_id()); + trunc_lsn = 299; + truncate_validate(store2, &trunc_lsn); + + LOGINFO("Step 13: Read and verify all stores"); + read_all_verify(store1); + ASSERT_EQ(store1->start_lsn(), 100); + ASSERT_EQ(store1->tail_lsn(), 99); + ASSERT_EQ(store1->truncated_upto(), 99); + read_all_verify(store2); + ASSERT_EQ(store2->start_lsn(), 300); + ASSERT_EQ(store2->tail_lsn(), 299); + ASSERT_EQ(store2->truncated_upto(), 299); + read_all_verify(store3); + ASSERT_EQ(store3->start_lsn(), 500); + ASSERT_EQ(store3->tail_lsn(), 499); + ASSERT_EQ(store3->truncated_upto(), 499); + + // log dev should truncate all logs as all stores are empty + ASSERT_EQ(get_last_truncate_idx(logdev_id), 499); + + LOGINFO("Step 14: Insert 100 entries in store {}", store1->get_store_id()); + cur_lsn = 100; + kickstart_inserts(store1, cur_lsn, 100); + ASSERT_EQ(get_current_log_idx(logdev_id), 600); + + LOGINFO("Step 15: Read and verify all stores"); + read_all_verify(store1); + ASSERT_EQ(store1->start_lsn(), 100); + ASSERT_EQ(store1->tail_lsn(), 199); + ASSERT_EQ(store1->truncated_upto(), 99); + read_all_verify(store2); + ASSERT_EQ(store2->start_lsn(), 300); + ASSERT_EQ(store2->tail_lsn(), 299); + ASSERT_EQ(store2->truncated_upto(), 299); + read_all_verify(store3); + ASSERT_EQ(store3->start_lsn(), 500); + ASSERT_EQ(store3->tail_lsn(), 499); + ASSERT_EQ(store3->truncated_upto(), 499); + + // log dev should not truncate since no new truncate happened + ASSERT_EQ(get_last_truncate_idx(logdev_id), 499); + + LOGINFO("Step 16: Truncate 500 entries in store {}", store1->get_store_id()); + trunc_lsn = 499; + truncate_validate(store1, &trunc_lsn); + + LOGINFO("Step 17: Read and verify all stores"); + read_all_verify(store1); + ASSERT_EQ(store1->start_lsn(), 500); + ASSERT_EQ(store1->tail_lsn(), 499); + ASSERT_EQ(store1->truncated_upto(), 499); + read_all_verify(store2); + ASSERT_EQ(store2->start_lsn(), 300); + ASSERT_EQ(store2->tail_lsn(), 299); + ASSERT_EQ(store2->truncated_upto(), 299); + read_all_verify(store3); + ASSERT_EQ(store3->start_lsn(), 500); + ASSERT_EQ(store3->tail_lsn(), 499); + ASSERT_EQ(store3->truncated_upto(), 499); + + // make sure new logs can truncate successfully when there are empty log stores + ASSERT_EQ(get_last_truncate_idx(logdev_id), 599); +} + TEST_F(LogDevTest, CreateRemoveLogDev) { auto num_logdev = SISL_OPTIONS["num_logdevs"].as< uint32_t >(); std::vector< std::shared_ptr< HomeLogStore > > log_stores; From c739f116b07413345f58c70ac82a2f56fe9a30e3 Mon Sep 17 00:00:00 2001 From: Hooper <62418134+Hooper9973@users.noreply.github.com> Date: Thu, 16 Jan 2025
16:33:01 +0800 Subject: [PATCH 056/170] Adjust cp_io num_fiber to Prevent Deadlock (#630) Description: Resolved a potential deadlock issue with sync_io fibers. When multiple sync_io fibers are active, a fiber (e.g., fiber1) may acquire a thread-level mutex and perform synchronous I/O using io_uring. This causes fiber1 to call boost::fibers::promise::get_future(), blocking itself and allowing other fibers in the same thread to be scheduled. If another fiber (e.g., fiber2) is scheduled and attempts to acquire the same mutex, a deadlock occurs. By adjusting the num_fiber in cp_io, we prevent this deadlock scenario. --- conanfile.py | 2 +- src/lib/checkpoint/cp_mgr.cpp | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/conanfile.py b/conanfile.py index 5923e4163..6a0208560 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.10" + version = "6.6.11" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/checkpoint/cp_mgr.cpp b/src/lib/checkpoint/cp_mgr.cpp index 7fd6f7460..cf89d1adf 100644 --- a/src/lib/checkpoint/cp_mgr.cpp +++ b/src/lib/checkpoint/cp_mgr.cpp @@ -297,8 +297,12 @@ void CPManager::start_cp_thread() { }; auto ctx = std::make_shared< Context >(); - // Start a reactor with 9 fibers (8 for sync io) - iomanager.create_reactor("cp_io", iomgr::INTERRUPT_LOOP, 8u, [this, ctx](bool is_started) { + // Start a reactor with 2 fibers (1 for sync io) + // Prevent deadlock with sync_io fibers. + // Multiple sync_io fibers may acquire a thread-level mutex and perform synchronous I/O using io_uring. + // This can block the fiber and allow other fibers to be scheduled. + // If another fiber tries to acquire the same mutex, a deadlock can occur. + iomanager.create_reactor("cp_io", iomgr::INTERRUPT_LOOP, 2u, [this, ctx](bool is_started) { if (is_started) { { std::unique_lock< std::mutex > lk{ctx->mtx}; From edbc307e963e1a72c777bb3d778ecb1cfd5114c2 Mon Sep 17 00:00:00 2001 From: yawzhang Date: Thu, 16 Jan 2025 15:53:30 +0800 Subject: [PATCH 057/170] Ensure end_of_append_batch is Called for All Raft Log Types Previously, HomeRaftLogStore::end_of_append_batch was only invoked for app_logs, which required requests in m_lsn_req_map. This behavior caused issues when only non-app raft logs (e.g., conf logs) were appended, as the function would not be called, leaving m_last_durable_lsn outdated. Consequently, next_slot() could return incorrect values based on the stale m_last_durable_lsn. 
This update ensures that HomeRaftLogStore::end_of_append_batch is called for all raft log types, guaranteeing that all logs are flushed and m_last_durable_lsn is consistently updated everytime log_store's end_of_append_batch is executed --- conanfile.py | 2 +- .../replication/log_store/repl_log_store.cpp | 44 +++++++++---------- 2 files changed, 21 insertions(+), 25 deletions(-) diff --git a/conanfile.py b/conanfile.py index 6a0208560..3b6edff8e 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.11" + version = "6.6.12" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/log_store/repl_log_store.cpp b/src/lib/replication/log_store/repl_log_store.cpp index 97d70ff92..072d06b99 100644 --- a/src/lib/replication/log_store/repl_log_store.cpp +++ b/src/lib/replication/log_store/repl_log_store.cpp @@ -44,11 +44,8 @@ void ReplLogStore::end_of_append_batch(ulong start_lsn, ulong count) { auto proposer_reqs = sisl::VectorPool< repl_req_ptr_t >::alloc(); for (int64_t lsn = int64_cast(start_lsn); lsn <= end_lsn; ++lsn) { auto rreq = m_sm.lsn_to_req(lsn); - // Skip this call in proposer, since this method will synchronously flush the data, which is not required for - // leader. Proposer will call the flush as part of commit after receiving quorum, upon which time, there is a - // high possibility the log entry is already flushed. Skip it for rreq == nullptr which is the case for raft - // config entries. - if ((rreq == nullptr) /*|| rreq->is_proposer()*/) { + // Skip it for rreq == nullptr which is the case for raft config entries. + if ((rreq == nullptr)) { continue; } else if (rreq->is_proposer()) { proposer_reqs->emplace_back(std::move(rreq)); @@ -60,41 +57,40 @@ void ReplLogStore::end_of_append_batch(ulong start_lsn, ulong count) { RD_LOGT("Raft Channel: end_of_append_batch start_lsn={} count={} num_data_to_be_written={} {}", start_lsn, count, reqs->size(), proposer_reqs->size()); - // All requests are from proposer for data write, so as mentioned above we can skip the flush for now if (!reqs->empty()) { // Check the map if data corresponding to all of these requsts have been received and written. If not, schedule // a fetch and write. Once all requests are completed and written, these requests are poped out of the map and // the future will be ready. + auto cur_time = std::chrono::steady_clock::now(); auto fut = m_rd.notify_after_data_written(reqs); // Wait for the fetch and write to be completed successfully. // It is essential to complete the data write before appending to the log. If the logs are flushed // before the data is written, a restart and subsequent log replay occurs, as the in-memory state is lost, // it leaves us uncertain about whether the data was actually written, potentially leading to data inconsistency. std::move(fut).wait(); + HISTOGRAM_OBSERVE(m_rd.metrics(), data_channel_wait_latency_us, get_elapsed_time_us(cur_time)); + } - // Flushing log now. - auto cur_time = std::chrono::steady_clock::now(); - HomeRaftLogStore::end_of_append_batch(start_lsn, count); - HISTOGRAM_OBSERVE(m_rd.metrics(), raft_end_of_append_batch_latency_us, get_elapsed_time_us(cur_time)); + // Flushing logs now. 
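+ // This flush now runs unconditionally (even when both reqs and proposer_reqs are empty),
+ // so batches carrying only non-app raft logs (e.g. conf logs) still reach
+ // HomeRaftLogStore::end_of_append_batch and keep m_last_durable_lsn up to date.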
+ auto cur_time = std::chrono::steady_clock::now(); + HomeRaftLogStore::end_of_append_batch(start_lsn, count); + HISTOGRAM_OBSERVE(m_rd.metrics(), raft_end_of_append_batch_latency_us, get_elapsed_time_us(cur_time)); - cur_time = std::chrono::steady_clock::now(); - HISTOGRAM_OBSERVE(m_rd.metrics(), data_channel_wait_latency_us, get_elapsed_time_us(cur_time)); + // Mark all the reqs completely written + for (auto const& rreq : *reqs) { + if (rreq) { rreq->add_state(repl_req_state_t::LOG_FLUSHED); } + } - // Mark all the reqs also completely written - for (auto const& rreq : *reqs) { - if (rreq) { rreq->add_state(repl_req_state_t::LOG_FLUSHED); } - } - } else if (!proposer_reqs->empty()) { - RD_LOGT("Raft Channel: end_of_append_batch, I am proposer, only flush log s from {} , count {}", start_lsn, - count); - // Mark all the reqs also completely written - HomeRaftLogStore::end_of_append_batch(start_lsn, count); - for (auto const& rreq : *proposer_reqs) { - if (rreq) { rreq->add_state(repl_req_state_t::LOG_FLUSHED); } + // Data corresponding to proposer reqs have already been written before propose reqs to raft, + // so skip waiting data written and mark reqs as flushed here. + for (auto const& rreq : *proposer_reqs) { + if (rreq) { + RD_LOGT("Raft Channel: end_of_append_batch, I am proposer for lsn {}, only flushed log for it", rreq->lsn()); + rreq->add_state(repl_req_state_t::LOG_FLUSHED); } } - // Convert volatile logs to non-volatile logs in state machine + // Convert volatile logs to non-volatile logs in state machine. for (int64_t lsn = int64_cast(start_lsn); lsn <= end_lsn; ++lsn) { auto rreq = m_sm.lsn_to_req(lsn); if (rreq != nullptr) { From 4714820e1187cc9e07d48488bc95ddb3d3eeda30 Mon Sep 17 00:00:00 2001 From: yawzhang Date: Thu, 2 Jan 2025 17:52:53 +0800 Subject: [PATCH 058/170] Fix logic for setting flush_ld_key while loading logs, makes it consistent with the logic in on_write_completion This change prevents truncating more logs than expected. --- conanfile.py | 2 +- src/lib/logstore/log_dev.cpp | 3 +- src/tests/test_log_dev.cpp | 82 ++++++++++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+), 3 deletions(-) diff --git a/conanfile.py b/conanfile.py index 3b6edff8e..e0db8d331 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.12" + version = "6.6.13" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp index fdebd9bd8..bcb933e1a 100644 --- a/src/lib/logstore/log_dev.cpp +++ b/src/lib/logstore/log_dev.cpp @@ -205,8 +205,7 @@ void LogDev::do_load(off_t device_cursor) { // Loop through each record within the log group and do a callback decltype(header->nrecords()) i{0}; HS_REL_ASSERT_GT(header->nrecords(), 0, "nrecords greater then zero"); - const auto flush_ld_key = - logdev_key{header->start_idx() + header->nrecords(), group_dev_offset + header->total_size()}; + const auto flush_ld_key = logdev_key{header->start_idx(), group_dev_offset}; while (i < header->nrecords()) { const auto* rec = header->nth_record(i); const uint32_t data_offset = (rec->offset + (rec->get_inlined() ? 
0 : header->oob_data_offset)); diff --git a/src/tests/test_log_dev.cpp b/src/tests/test_log_dev.cpp index 189bc5a86..7bde7bc12 100644 --- a/src/tests/test_log_dev.cpp +++ b/src/tests/test_log_dev.cpp @@ -158,6 +158,31 @@ class LogDevTest : public ::testing::Test { } } + void insert_batch_sync(std::shared_ptr< HomeLogStore > log_store, logstore_seq_num_t& lsn, int64_t batch, uint32_t fixed_size = 0) { + bool io_memory{false}; + std::vector data_vector; + + for (int64_t i = 0; i < batch; ++i) { + auto* d = prepare_data(lsn + i, io_memory, fixed_size); + data_vector.push_back(d); // Store the pointer in the vector + log_store->write_async(lsn + i, {uintptr_cast(d), d->total_size(), false}, nullptr, nullptr); + LOGINFO("Written async data for LSN -> {}:{}", log_store->get_store_id(), lsn + i); + } + + log_store->flush(); + LOGINFO("Flush data from {} to {}", lsn, lsn + batch); + lsn += batch; + + // Free all the allocated memory after the batch insert + for (auto* d : data_vector) { + if (io_memory) { + iomanager.iobuf_free(uintptr_cast(d)); + } else { + std::free(voidptr_cast(d)); + } + } + } + void kickstart_inserts(std::shared_ptr< HomeLogStore > log_store, logstore_seq_num_t& cur_lsn, int64_t batch, uint32_t fixed_size = 0) { auto last = cur_lsn + batch; @@ -617,6 +642,63 @@ TEST_F(LogDevTest, TruncateAcrossMultipleStores) { ASSERT_EQ(get_last_truncate_idx(logdev_id), 599); } +TEST_F(LogDevTest, TruncateLogsAfterFlushAndRestart) { + LOGINFO("Step 1: Create a single logstore to start truncate-logs-after-flush-and-restart test"); + auto logdev_id = logstore_service().create_new_logdev(); + s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); + auto log_store = logstore_service().create_new_log_store(logdev_id, false); + auto store_id = log_store->get_store_id(); + + auto restart = [&]() { + std::promise < bool > p; + auto starting_cb = [&]() { + logstore_service().open_logdev(logdev_id); + logstore_service().open_log_store(logdev_id, store_id, false /* append_mode */).thenValue([&](auto store) { + log_store = store; + p.set_value(true); + }); + }; + start_homestore(true /* restart */, starting_cb); + p.get_future().get(); + }; + + LOGINFO("Step 2: Insert 100 entries"); + logstore_seq_num_t cur_lsn = 0; + insert_batch_sync(log_store, cur_lsn, 100, 0); + + LOGINFO("Step 3: Read and verify all entries"); + read_all_verify(log_store); + ASSERT_EQ(log_store->get_contiguous_issued_seq_num(-1), 99); + + LOGINFO("Step 4: Append 100 entries"); + insert_batch_sync(log_store, cur_lsn, 100, 0); + ASSERT_EQ(log_store->get_contiguous_issued_seq_num(-1), 199); + + LOGINFO("Step 5: Read and verify all entries"); + read_all_verify(log_store); + + LOGINFO("Step 6: restart and verify"); + restart(); + read_all_verify(log_store); + ASSERT_EQ(log_store->get_contiguous_issued_seq_num(-1), 199); + + LOGINFO("Step 7: Truncate 50 entries"); + logstore_seq_num_t trunc_lsn = 49; + truncate_validate(log_store, &trunc_lsn); + ASSERT_EQ(log_store->start_lsn(), trunc_lsn + 1); + ASSERT_EQ(log_store->tail_lsn(), 199); + ASSERT_EQ(log_store->truncated_upto(), trunc_lsn); + ASSERT_EQ(log_store->get_contiguous_issued_seq_num(-1), 199); + + LOGINFO("Step 8: restart and verify"); + restart(); + read_all_verify(log_store); + ASSERT_EQ(log_store->start_lsn(), trunc_lsn + 1); + ASSERT_EQ(log_store->tail_lsn(), 199); + ASSERT_EQ(log_store->truncated_upto(), trunc_lsn); + ASSERT_EQ(log_store->get_contiguous_issued_seq_num(-1), 199); +} + TEST_F(LogDevTest, CreateRemoveLogDev) { auto num_logdev 
= SISL_OPTIONS["num_logdevs"].as< uint32_t >(); std::vector< std::shared_ptr< HomeLogStore > > log_stores; From e853f25d1b59991345ca1f9e962216065fd1ea97 Mon Sep 17 00:00:00 2001 From: yuwmao Date: Wed, 22 Jan 2025 21:38:27 +0800 Subject: [PATCH 059/170] Add flush meta for single log store --- conanfile.py | 2 +- src/lib/replication/log_store/home_raft_log_store.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conanfile.py b/conanfile.py index e0db8d331..9495749f5 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.13" + version = "6.6.14" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/log_store/home_raft_log_store.cpp b/src/lib/replication/log_store/home_raft_log_store.cpp index d32fc61d6..55cd690e4 100644 --- a/src/lib/replication/log_store/home_raft_log_store.cpp +++ b/src/lib/replication/log_store/home_raft_log_store.cpp @@ -366,7 +366,7 @@ bool HomeRaftLogStore::compact(ulong compact_lsn) { // we directly compact and truncate up to compact_lsn assuming there are dummy logs. REPL_STORE_LOG(DEBUG, "Compact with log holes from {} to={}", cur_max_lsn + 1, to_store_lsn(compact_lsn)); } - m_log_store->truncate(to_store_lsn(compact_lsn)); + m_log_store->truncate(to_store_lsn(compact_lsn), false); return true; } From dbb30c0d8ea7b52086339c3cbde278443febf7fb Mon Sep 17 00:00:00 2001 From: yuwmao Date: Thu, 23 Jan 2025 14:13:58 +0800 Subject: [PATCH 060/170] Rename apply_snp_resync_data to save_snp_resync_data - apply_snp_resync_data is similar to apply_snapshot in raft --- src/lib/replication/repl_dev/raft_repl_dev.cpp | 2 +- src/lib/replication/repl_dev/raft_repl_dev.h | 2 +- src/lib/replication/repl_dev/raft_state_machine.cpp | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 3fd68ee24..b8ea8a8fd 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1570,7 +1570,7 @@ void RaftReplDev::create_snp_resync_data(raft_buf_ptr_t& data_out) { std::memcpy(data_out->data_begin(), &msg, msg_size); } -bool RaftReplDev::apply_snp_resync_data(nuraft::buffer& data) { +bool RaftReplDev::save_snp_resync_data(nuraft::buffer& data) { auto msg = r_cast< snp_repl_dev_data* >(data.data_begin()); if (msg->magic_num != HOMESTORE_RESYNC_DATA_MAGIC || msg->protocol_version != HOMESTORE_RESYNC_DATA_PROTOCOL_VERSION_V1) { diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 5e66e18f8..619da7843 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -324,7 +324,7 @@ class RaftReplDev : public ReplDev, void replace_member(repl_req_ptr_t rreq); void reset_quorum_size(uint32_t commit_quorum); void create_snp_resync_data(raft_buf_ptr_t& data_out); - bool apply_snp_resync_data(nuraft::buffer& data); + bool save_snp_resync_data(nuraft::buffer& data); }; } // namespace homestore diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index 77fd54501..e0193ef03 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -354,9 +354,9 @@ void RaftStateMachine::save_logical_snp_obj(nuraft::snapshot& s, ulong& obj_id, bool is_last_obj) { 
if (is_hs_snp_obj(obj_id)) { // Homestore preserved msg - if (m_rd.apply_snp_resync_data(data)) { + if (m_rd.save_snp_resync_data(data)) { obj_id = snp_obj_id_type_app; - LOGDEBUG("apply_snp_resync_data success, next obj_id={}", obj_id); + LOGDEBUG("save_snp_resync_data success, next obj_id={}", obj_id); } return; } From d7fc2b42cca40c7e065d4be1d12224096b2a082b Mon Sep 17 00:00:00 2001 From: koujl <108138320+koujl@users.noreply.github.com> Date: Fri, 24 Jan 2025 16:03:44 +0800 Subject: [PATCH 061/170] Fix bugs in snapshot transmission (#632) * Create a snapshot after adding a new member to prevent transmitting a snapshot with outdated configuration. * Trigger cp_flush on last_obj in case apply_snapshot() is skipped due to crash. --- conanfile.py | 2 +- src/lib/replication/repl_dev/raft_repl_dev.cpp | 9 +++++++++ src/lib/replication/repl_dev/raft_state_machine.cpp | 11 ++++++++--- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/conanfile.py b/conanfile.py index 9495749f5..11524e12d 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.14" + version = "6.6.15" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index b8ea8a8fd..7508139ee 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -178,6 +178,15 @@ AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_ RD_LOGI("Replace member added member={} to group_id={}", boost::uuids::to_string(member_in.id), group_id_str()); + // If enabled, create a snapshot here to ensure the new member will use the latest snapshot with itself in the config + if (raft_server()->get_current_params().snapshot_distance_ > 0) { + if (auto idx = raft_server()->create_snapshot(); idx > 0) { + RD_LOGI("Created snapshot idx={} after adding member", idx); + } else { + RD_LOGW("Failed to create snapshot after adding member"); + } + } + // Step 3. Append log entry to mark the old member is out and new member is added. auto rreq = repl_req_ptr_t(new repl_req_ctx{}); replace_members_ctx members; diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index e0193ef03..47912898e 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -367,11 +367,14 @@ void RaftStateMachine::save_logical_snp_obj(nuraft::snapshot& s, ulong& obj_id, snp_data->is_last_obj = is_last_obj; // We are doing a copy here. - sisl::io_blob_safe blob{s_cast< size_t >(data.size())}; + sisl::io_blob_safe blob{static_cast<size_t>(data.size())}; std::memcpy(blob.bytes(), data.data_begin(), data.size()); snp_data->blob = std::move(blob); m_rd.m_listener->write_snapshot_obj(snp_ctx, snp_data); + if (is_last_obj) { + hs()->cp_mgr().trigger_cp_flush(true).wait(); // ensure DSN is flushed to disk + } // Update the object offset.
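// (snp_data->offset is set by the listener inside write_snapshot_obj; handing it back as obj_id tells the leader which snapshot object to request next)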
obj_id = snp_data->offset; @@ -380,17 +383,19 @@ void RaftStateMachine::save_logical_snp_obj(nuraft::snapshot& s, ulong& obj_id, if (iomgr_flip::instance()->test_flip("baseline_resync_restart_new_follower")) { LOGINFO("Hit flip baseline_resync_restart_new_follower crashing"); hs()->crash_simulator().crash(); - return; } #endif } bool RaftStateMachine::apply_snapshot(nuraft::snapshot& s) { + // NOTE: Currently, NuRaft considers the snapshot applied once compaction and truncation are completed, even if a + // crash occurs before apply_snapshot() is called. Therefore, the LSN must be updated here to ensure it is + // persisted AFTER log truncation. m_rd.set_last_commit_lsn(s.get_last_log_idx()); m_rd.m_data_journal->set_last_durable_lsn(s.get_last_log_idx()); + auto snp_ctx = std::make_shared< nuraft_snapshot_context >(s); auto res = m_rd.m_listener->apply_snapshot(snp_ctx); - // make sure the changes are flushed. hs()->cp_mgr().trigger_cp_flush(true /* force */).get(); return res; } From b0ee4aa88631bfafb0001edca4bbc63281a60aae Mon Sep 17 00:00:00 2001 From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com> Date: Tue, 4 Feb 2025 14:33:00 -0800 Subject: [PATCH 062/170] Change long index setting (#640) --- conanfile.py | 2 +- src/tests/test_scripts/index_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conanfile.py b/conanfile.py index 11524e12d..be71a281b 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.15" + version = "6.6.16" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/tests/test_scripts/index_test.py b/src/tests/test_scripts/index_test.py index d4734ac82..564bd61c5 100755 --- a/src/tests/test_scripts/index_test.py +++ b/src/tests/test_scripts/index_test.py @@ -78,7 +78,7 @@ def long_runnig_index(options, type=0): def long_running_clean_shutdown(options, type=0): print("Long running clean shutdown started") - + options['run_time'] = options['run_time'] // 10 try: run_test(options, type) options['init_device'] = False From 36b33523bc42a1fc169b0436872c441e3b7ddb95 Mon Sep 17 00:00:00 2001 From: yawzhang Date: Sat, 8 Feb 2025 10:08:26 +0800 Subject: [PATCH 063/170] Persist logstore superblock when logdev truncation is unnecessary This change ensures that the logstore can get the correct start LSN during recovery. Avoid the following scenario: T1: Follower1 appends logs up to 100, then is stopped by a sigkill. T2: Upon restart, since the leader's log range is 1000-2500, a baseline resync is triggered using snapshot 2000. T3: Follower1 completes the baseline resync (start_lsn=2001, tail_lsn=2000), but m_trunc_ld_key is not updated since we cannot get a valid device offset for LSN 2000. T4: Follower1 appends logs from 2001 to 2500, making tail_lsn greater than 2000. T5: During logdev truncation, the truncation info is found at first. Since trunc_lsn < tail_lsn, it returns m_trunc_ld_key (still {0,0}), then exits without persist the logstore sb. T6: Follower1 is killed again, and upon restart, its start index in the store superblock remains 0, incorrectly interpreting the range as [1,2500]. 
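To make the failure mode concrete, a minimal sketch under stated assumptions — superblk_view and recovered_range are hypothetical stand-ins, not the real recovery API (the actual path reads logstore_superblk while loading the log device):

#include <cstdint>
#include <utility>

// Hypothetical, simplified view of the persisted per-store superblock.
struct superblk_view {
    int64_t start_lsn{0}; // written as trunc_lsn + 1 when a truncation is persisted
};

// On restart the store's valid range is rebuilt from the persisted superblock and the
// last log found on disk. If truncation returned without persisting (T5 above), start_lsn
// still holds the stale pre-resync value, which is how T6 ends up misreading the range
// as [1,2500] instead of [2001,2500].
inline std::pair<int64_t, int64_t> recovered_range(superblk_view const& sb, int64_t tail_lsn) {
    return {sb.start_lsn, tail_lsn};
}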
--- conanfile.py | 2 +- src/lib/logstore/log_dev.cpp | 13 ++++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/conanfile.py b/conanfile.py index be71a281b..c17158cde 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.16" + version = "6.6.17" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp index bcb933e1a..740580bb5 100644 --- a/src/lib/logstore/log_dev.cpp +++ b/src/lib/logstore/log_dev.cpp @@ -542,7 +542,18 @@ uint64_t LogDev::truncate() { } // There are no writes or no truncation called for any of the store, so we can't truncate anything - if (min_safe_ld_key.idx <= 0 || min_safe_ld_key.idx <= m_last_truncate_idx) return 0; + if (min_safe_ld_key.idx <= 0 || min_safe_ld_key.idx <= m_last_truncate_idx) { + // Persist the logstore superblock to ensure correct start LSN during recovery. Avoid such scenario: + // 1. Follower1 appends logs up to 100, then is stopped by a sigkill. + // 2. Upon restart, a baseline resync is triggered using snapshot 2000. + // 3. Baseline resync completed with start_lsn=2001, but m_trunc_ld_key remains {0,0} since we cannot get a valid + // device offset for LSN 2000 to update it. + // 4. Follower1 appends logs from 2001 to 2500, making tail_lsn > 2000. + // 5. Get m_trunc_ld_key={0,0}, goto here and return 0 without persist. + // 6. Follower1 is killed again, after restart, its start index remains 0, misinterpreting the range as [1,2500]. + m_logdev_meta.persist(); + return 0; + } uint64_t const num_records_to_truncate = uint64_cast(min_safe_ld_key.idx - m_last_truncate_idx); From 3e38fa8a54104ea1541e6cc2d5d6836b24a209e1 Mon Sep 17 00:00:00 2001 From: yuwmao Date: Sat, 8 Feb 2025 17:59:23 +0800 Subject: [PATCH 064/170] Add timeout cfg for snapshot sync context --- conanfile.py | 2 +- src/lib/common/homestore_config.fbs | 6 +++++- src/lib/replication/service/raft_repl_service.cpp | 3 ++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/conanfile.py b/conanfile.py index c17158cde..c86cbf553 100644 --- a/conanfile.py +++ b/conanfile.py @@ -54,7 +54,7 @@ def build_requirements(self): def requirements(self): self.requires("iomgr/[^11.3]@oss/master", transitive_headers=True) self.requires("sisl/[^12.2]@oss/master", transitive_headers=True) - self.requires("nuraft_mesg/[^3.7.1]@oss/main", transitive_headers=True) + self.requires("nuraft_mesg/[^3.7.2]@oss/main", transitive_headers=True) self.requires("farmhash/cci.20190513@", transitive_headers=True) if self.settings.arch in ['x86', 'x86_64']: diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs index 337e551e7..ef27b3a5c 100644 --- a/src/lib/common/homestore_config.fbs +++ b/src/lib/common/homestore_config.fbs @@ -259,6 +259,10 @@ table Consensus { // ReplDev Reqs timeout in seconds. repl_req_timeout_sec: uint32 = 300; + // Timeout for snapshot sync context in ms. If the follower doesn't response + // within this timeout during snapshot resync, the leader will release snapshot sync context. 
+ snapshot_sync_ctx_timeout_ms: int32 = 60000; + // Frequency to flush durable commit LSN in millis flush_durable_commit_interval_ms: uint64 = 500; @@ -271,7 +275,7 @@ table Consensus { // Reading snapshot objects will be done by a background thread asynchronously // instead of synchronous read by Raft worker threads - use_bg_thread_for_snapshot_io_: bool = true; + use_bg_thread_for_snapshot_io: bool = true; } table HomeStoreSettings { diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index 23ff2db89..6206c3dde 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -105,12 +105,13 @@ void RaftReplService::start() { .with_snapshot_enabled(HS_DYNAMIC_CONFIG(consensus.snapshot_freq_distance)) .with_leadership_expiry(HS_DYNAMIC_CONFIG(consensus.leadership_expiry_ms)) .with_reserved_log_items(HS_DYNAMIC_CONFIG(consensus.num_reserved_log_items)) + .with_snapshot_sync_ctx_timeout(HS_DYNAMIC_CONFIG(consensus.snapshot_sync_ctx_timeout_ms)) .with_auto_forwarding(false); // new_joiner_type fully disabled log pack behavior. // There is no callback available for handling and localizing the log entries within the pack, which could // result in data corruption. r_params.use_new_joiner_type_ = true; - r_params.use_bg_thread_for_snapshot_io_ = HS_DYNAMIC_CONFIG(consensus.use_bg_thread_for_snapshot_io_); + r_params.use_bg_thread_for_snapshot_io_ = HS_DYNAMIC_CONFIG(consensus.use_bg_thread_for_snapshot_io); r_params.return_method_ = nuraft::raft_params::async_handler; m_msg_mgr->register_mgr_type(params.default_group_type_, r_params); From 3a340409747e5fd7e0f502392bcf1651c8ac20b6 Mon Sep 17 00:00:00 2001 From: yuwmao Date: Tue, 11 Feb 2025 21:38:23 +0800 Subject: [PATCH 065/170] Remove snapshot creation when add_member done Nuraft Reconfigure issue has been fixed, we don't need to create snapshot. --- conanfile.py | 2 +- src/lib/replication/repl_dev/raft_repl_dev.cpp | 9 --------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/conanfile.py b/conanfile.py index c86cbf553..7d930b3af 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.17" + version = "6.6.18" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 7508139ee..b8ea8a8fd 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -178,15 +178,6 @@ AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_ RD_LOGI("Replace member added member={} to group_id={}", boost::uuids::to_string(member_in.id), group_id_str()); - // If enabled, create a snapshot here to ensure the new member will use the latest snapshot with itself in the config - if (raft_server()->get_current_params().snapshot_distance_ > 0) { - if (auto idx = raft_server()->create_snapshot(); idx > 0) { - RD_LOGI("Created snapshot idx={} after adding member", idx); - } else { - RD_LOGW("Failed to create snapshot after adding member"); - } - } - // Step 3. Append log entry to mark the old member is out and new member is added. 
auto rreq = repl_req_ptr_t(new repl_req_ctx{}); replace_members_ctx members; From fb6fd087ed4eda475baee597c8e4527b1ad0ecf9 Mon Sep 17 00:00:00 2001 From: ywz <649521587@qq.com> Date: Wed, 12 Feb 2025 16:54:43 +0800 Subject: [PATCH 066/170] Add function to support purging existing logs. (#643) This change is necessary for baseline resync and can be called by the upper layer to purge existing logs, which resolves the following issue: If a follower restarts during baseline resync, it will replay the remaining logs first. However, shard info has already been cleared at the beginning of resync (from the HO side), making it impossible to retrieve shard info while replaying logs, which results in errors. Co-authored-by: yawzhang --- src/include/homestore/replication/repl_dev.h | 3 +++ src/lib/replication/log_store/home_raft_log_store.cpp | 7 +++++++ src/lib/replication/log_store/home_raft_log_store.h | 6 ++++++ src/lib/replication/repl_dev/raft_repl_dev.h | 4 ++++ src/lib/replication/repl_dev/solo_repl_dev.h | 1 + 5 files changed, 21 insertions(+) diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index d05be3fde..937450336 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -477,6 +477,9 @@ class ReplDev { /// @return true if ready, false otherwise virtual bool is_ready_for_traffic() const = 0; + /// @brief Clean up resources on this repl dev. + virtual void purge() = 0; + virtual void attach_listener(shared< ReplDevListener > listener) { m_listener = std::move(listener); } virtual void detach_listener() { diff --git a/src/lib/replication/log_store/home_raft_log_store.cpp b/src/lib/replication/log_store/home_raft_log_store.cpp index 55cd690e4..be7039059 100644 --- a/src/lib/replication/log_store/home_raft_log_store.cpp +++ b/src/lib/replication/log_store/home_raft_log_store.cpp @@ -380,6 +380,13 @@ ulong HomeRaftLogStore::last_durable_index() { return to_repl_lsn(m_last_durable_lsn); } +void HomeRaftLogStore::purge_all_logs() { + auto last_lsn = m_log_store->get_contiguous_issued_seq_num(m_last_durable_lsn); + REPL_STORE_LOG(INFO, "Store={} LogDev={}: Purging all logs in the log store, last_lsn={}", + m_logstore_id, m_logdev_id, last_lsn); + m_log_store->truncate(last_lsn, false /* in_memory_truncate_only */); +} + void HomeRaftLogStore::wait_for_log_store_ready() { m_log_store_future.wait(); } void HomeRaftLogStore::set_last_durable_lsn(repl_lsn_t lsn) { m_last_durable_lsn = to_store_lsn(lsn); } diff --git a/src/lib/replication/log_store/home_raft_log_store.h b/src/lib/replication/log_store/home_raft_log_store.h index d2c0fd57b..7fb96a5d4 100644 --- a/src/lib/replication/log_store/home_raft_log_store.h +++ b/src/lib/replication/log_store/home_raft_log_store.h @@ -215,6 +215,12 @@ class HomeRaftLogStore : public nuraft::log_store { void truncate(uint32_t num_reserved_cnt, repl_lsn_t compact_lsn); #endif + /** + * Purge all logs in the log store + * It is a dangerous operation and is only used in baseline resync now (purge all logs and restore by snapshot). 
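+ * Callers (e.g. baseline resync) are expected to invoke it before restoring state from a
+ * snapshot, so that stale logs are not replayed after a follower restart.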
+ */
+ void purge_all_logs();
+
 void wait_for_log_store_ready();
 void set_last_durable_lsn(repl_lsn_t lsn);

diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h
index 619da7843..b6cd9d744 100644
--- a/src/lib/replication/repl_dev/raft_repl_dev.h
+++ b/src/lib/replication/repl_dev/raft_repl_dev.h
@@ -198,6 +198,10 @@ class RaftReplDev : public ReplDev,
         if (!ready) { RD_LOGD("Not yet ready for traffic, committed to {} but gate is {}", committed_lsn, gate); }
         return ready;
     }
+    void purge() override {
+        // clean up existing logs in log store
+        m_data_journal->purge_all_logs();
+    }

     //////////////// Accessor/shortcut methods ///////////////////////
     nuraft_mesg::repl_service_ctx* group_msg_service();
diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h
index e5f33fb63..f252dd209 100644
--- a/src/lib/replication/repl_dev/solo_repl_dev.h
+++ b/src/lib/replication/repl_dev/solo_repl_dev.h
@@ -54,6 +54,7 @@ class SoloReplDev : public ReplDev {
         return std::vector< peer_info >{peer_info{.id_ = m_group_id, .replication_idx_ = 0, .last_succ_resp_us_ = 0}};
     }
     bool is_ready_for_traffic() const override { return true; }
+    void purge() override {}

     uuid_t group_id() const override { return m_group_id; }

From cabbc4e7d84c5534264504f54b1093496a644579 Mon Sep 17 00:00:00 2001
From: yuwmao
Date: Thu, 13 Feb 2025 14:32:26 +0800
Subject: [PATCH 067/170] Improve active peer determination logic

Exclude the possibility of a peer performing baseline resync to avoid potential conflicts.
---
 conanfile.py                                   | 2 +-
 src/lib/replication/repl_dev/raft_repl_dev.cpp | 8 ++++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/conanfile.py b/conanfile.py
index 7d930b3af..a37363bac 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@ class HomestoreConan(ConanFile):
     name = "homestore"
-    version = "6.6.18"
+    version = "6.6.19"
     homepage = "https://github.com/eBay/Homestore"
     description = "HomeStore Storage Engine"

diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp
index b8ea8a8fd..ed3a1a4a2 100644
--- a/src/lib/replication/repl_dev/raft_repl_dev.cpp
+++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp
@@ -1088,13 +1088,17 @@ std::set< replica_id_t > RaftReplDev::get_active_peers() const {
     uint64_t least_active_repl_idx = my_committed_idx > HS_DYNAMIC_CONFIG(consensus.laggy_threshold)
         ? my_committed_idx - HS_DYNAMIC_CONFIG(consensus.laggy_threshold)
         : 0;
+    // A peer's last log idx should also be >= the leader's start_index - 1 (so that the previous entry still
+    // exists); otherwise the leader can't append log entries to it and baseline resync will be triggered. This
+    // tries to avoid conflicts between baseline resync and normal replication.
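+    // e.g. with laggy_threshold=2000, my_committed_idx=10000 and start_index()=9500, the floor is
+    // max(8000, 9499) = 9499, so only peers whose replication_idx_ is at least 9499 count as active.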
+ least_active_repl_idx = std::max(least_active_repl_idx, m_data_journal->start_index() - 1); for (auto p : repl_status) { if (p.id_ == m_my_repl_id) { continue; } if (p.replication_idx_ >= least_active_repl_idx) { res.insert(p.id_); } else { - RD_LOGW("Excluding peer {} from active_peers, lag {}, my lsn {}, peer lsn {}", p.id_, - my_committed_idx - p.replication_idx_, my_committed_idx, p.replication_idx_); + RD_LOGW("Excluding peer {} from active_peers, lag {}, my lsn {}, peer lsn {}, least_active_repl_idx {}", + p.id_, + my_committed_idx - p.replication_idx_, my_committed_idx, p.replication_idx_, least_active_repl_idx); } } return res; From 8d8eaa4eca89d301cba2525cbacaa3385708dd62 Mon Sep 17 00:00:00 2001 From: Ravi Nagarjun Akella Date: Thu, 13 Feb 2025 16:12:42 -0700 Subject: [PATCH 068/170] Return grpc error if a non originator receives fetch data request --- conanfile.py | 2 +- .../replication/repl_dev/raft_repl_dev.cpp | 54 ++++++++++--------- 2 files changed, 31 insertions(+), 25 deletions(-) diff --git a/conanfile.py b/conanfile.py index a37363bac..b0c3d3640 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.19" + version = "6.6.20" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index ed3a1a4a2..2d7da6c72 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -776,32 +776,38 @@ void RaftReplDev::on_fetch_data_received(intrusive< sisl::GenericRpcData >& rpc_ auto const& originator = req->blkid_originator(); auto const& remote_blkid = req->remote_blkid(); - // release this assert if in the future we want to fetch from non-originator; - RD_REL_ASSERT_EQ(originator, server_id(), - "Not expect to receive fetch data from remote when I am not the originator of this request"); + // Edit this check if in the future we want to fetch from non-originator; + if (originator != server_id()) { + auto const error_msg = fmt::format("Did not expect to receive fetch data from " + "remote when I am not the originator of this request, originator={}, my_server_id={}" + , originator, server_id()); + RD_LOGW("{}", error_msg); + auto status = ::grpc::Status(::grpc::INVALID_ARGUMENT, error_msg); + rpc_data->set_status(status); + rpc_data->send_response(); + return; + } // fetch data based on the remote_blkid - if (originator == server_id()) { - // We are the originator of the blkid, read data locally; - MultiBlkId local_blkid; - - // convert remote_blkid serialized data to local blkid - local_blkid.deserialize(sisl::blob{remote_blkid->Data(), remote_blkid->size()}, true /* copy */); - - RD_LOGD("Data Channel: FetchData received: dsn={} lsn={} my_blkid={}", req->dsn(), lsn, - local_blkid.to_string()); - - // prepare the sgs data buffer to read into; - auto const total_size = local_blkid.blk_count() * get_blk_size(); - sisl::sg_list sgs; - sgs.size = total_size; - sgs.iovs.emplace_back( - iovec{.iov_base = iomanager.iobuf_alloc(get_blk_size(), total_size), .iov_len = total_size}); - - // accumulate the sgs for later use (send back to the requester)); - sgs_vec.push_back(sgs); - futs.emplace_back(async_read(local_blkid, sgs, total_size)); - } + // We are the originator of the blkid, read data locally; + MultiBlkId local_blkid; + + // convert remote_blkid serialized data to local blkid + 
local_blkid.deserialize(sisl::blob{remote_blkid->Data(), remote_blkid->size()}, true /* copy */); + + RD_LOGD("Data Channel: FetchData received: dsn={} lsn={} my_blkid={}", req->dsn(), lsn, + local_blkid.to_string()); + + // prepare the sgs data buffer to read into; + auto const total_size = local_blkid.blk_count() * get_blk_size(); + sisl::sg_list sgs; + sgs.size = total_size; + sgs.iovs.emplace_back( + iovec{.iov_base = iomanager.iobuf_alloc(get_blk_size(), total_size), .iov_len = total_size}); + + // accumulate the sgs for later use (send back to the requester)); + sgs_vec.push_back(sgs); + futs.emplace_back(async_read(local_blkid, sgs, total_size)); } folly::collectAllUnsafe(futs).thenValue( From fb28db428177d4d268784a8c61ce214293c3c108 Mon Sep 17 00:00:00 2001 From: yawzhang Date: Tue, 11 Feb 2025 19:58:37 +0800 Subject: [PATCH 069/170] Remove optimization on blk free operation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The optimization in blk free may cause the following issue, just remove it and wait for GC handling: - T1: blob1 is written with LSN 1 — [blkid=10, chunk=1, cnt=5]. - T2: blob1 is deleted with LSN 10, causing the last_append_offset to revert to 10. - T3: blob2 is written with LSN 11 — [blkid=10, chunk=1, cnt=5]. - T4: The SM is terminated and restarted. - T5: LSN 1 is replayed, committing block [blkid=10, chunk=1, cnt=5]. - T6: LSN 11 is replayed, committing block [blkid=10, chunk=1, cnt=5]. - T7: LSN 10 is committed, freeing block [blkid=10, chunk=1, cnt=5]. - T8: LSN 11 is committed again, but since the blocks have already been freed, they are not available for LSN 11. --- conanfile.py | 2 +- src/lib/blkalloc/append_blk_allocator.cpp | 28 ++--------------------- 2 files changed, 3 insertions(+), 27 deletions(-) diff --git a/conanfile.py b/conanfile.py index a37363bac..b0c3d3640 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.19" + version = "6.6.20" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/blkalloc/append_blk_allocator.cpp b/src/lib/blkalloc/append_blk_allocator.cpp index 1380a5ff6..eca445381 100644 --- a/src/lib/blkalloc/append_blk_allocator.cpp +++ b/src/lib/blkalloc/append_blk_allocator.cpp @@ -127,33 +127,9 @@ void AppendBlkAllocator::cp_flush(CP* cp) { } } -// -// free operation does: -// 1. book keeping "total freeable" space -// 2. 
if the blk being freed happens to be last block, move last_append_offset backwards accordingly;
-//
+// free operation only does book keeping of the "total freeable" space
 void AppendBlkAllocator::free(const BlkId& bid) {
-    // If we are freeing the last block, just move the offset back
-    blk_num_t cur_last_offset = m_last_append_offset.load();
-    auto const input_last_offset = bid.blk_num() + bid.blk_count();
-    blk_num_t new_last_offset;
-    bool freeing_in_middle{false};
-    do {
-        if (input_last_offset == cur_last_offset) {
-            new_last_offset = bid.blk_num();
-            freeing_in_middle = false;
-        } else {
-            new_last_offset = cur_last_offset;
-            freeing_in_middle = true;
-        }
-    } while (!m_last_append_offset.compare_exchange_weak(cur_last_offset, new_last_offset));
-
-    if (freeing_in_middle) {
-        // Freeing something in the middle, increment the count
-        m_freeable_nblks.fetch_add(bid.blk_count());
-    } else {
-        m_commit_offset.store(m_last_append_offset.load());
-    }
+    m_freeable_nblks.fetch_add(bid.blk_count());
     m_is_dirty.store(true);
 }

From d31858b6e626456ca99ec521a5d9a65f9695625a Mon Sep 17 00:00:00 2001
From: Jie Yao
Date: Thu, 20 Feb 2025 08:03:23 +0800
Subject: [PATCH 070/170] graceful shutdown (#636)

Graceful shutdown: this PR introduces graceful shutdown to homestore. The basic idea
is to introduce a counter and a flag for each service or component, to make sure all
pending requests are handled and to reject later API calls from the upper layer
before stopping.
---
 conanfile.py                                  |   2 +-
 src/include/homestore/blkdata_service.hpp     |  15 +++
 .../homestore/btree/detail/btree_internal.hpp |   2 +-
 .../homestore/index/index_internal.hpp        |   3 +-
 src/include/homestore/index/index_table.hpp   |  45 ++++++-
 src/include/homestore/index_service.hpp       |  26 +++-
 src/include/homestore/logstore/log_store.hpp  |  35 ++++--
 src/include/homestore/logstore_service.hpp    |  14 +++
 .../homestore/replication/repl_decls.h        |   1 +
 src/include/homestore/replication/repl_dev.h  |  32 ++++-
 src/lib/blkdata_svc/blkdata_service.cpp       |  61 ++++++++--
 src/lib/homestore.cpp                         |  35 ++++--
 src/lib/index/index_service.cpp               |  41 ++++++-
 src/lib/logstore/log_dev.cpp                  | 115 +++++++++++-------
 src/lib/logstore/log_dev.hpp                  |  34 +++---
 src/lib/logstore/log_store.cpp                |  99 ++++++++++++---
 src/lib/logstore/log_store_service.cpp        |  57 +++++++--
 .../replication/repl_dev/raft_repl_dev.cpp    |  61 ++++++++--
 src/lib/replication/repl_dev/raft_repl_dev.h  |   3 +-
 .../replication/repl_dev/solo_repl_dev.cpp    |   8 +-
 src/lib/replication/repl_dev/solo_repl_dev.h  |   4 +-
 .../replication/service/generic_repl_svc.cpp  |  12 +-
 .../replication/service/generic_repl_svc.h    |  18 ++-
 .../replication/service/raft_repl_service.cpp |  60 ++++++++-
 .../replication/service/raft_repl_service.h   |   3 +-
 25 files changed, 621 insertions(+), 165 deletions(-)

diff --git a/conanfile.py b/conanfile.py
index b0c3d3640..b9275769f 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@ class HomestoreConan(ConanFile):
     name = "homestore"
-    version = "6.6.20"
+    version = "6.6.21"
     homepage = "https://github.com/eBay/Homestore"
     description = "HomeStore Storage Engine"

diff --git a/src/include/homestore/blkdata_service.hpp b/src/include/homestore/blkdata_service.hpp
index fff670f44..33a5fe2ac 100644
--- a/src/include/homestore/blkdata_service.hpp
+++ b/src/include/homestore/blkdata_service.hpp
@@ -200,6 +200,8 @@ class BlkDataService {

     uint64_t get_used_capacity() const;

+    void stop();
+
 private:
     /**
      * @brief Initializes the block data service.
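// Editor's note: the hunks below all repeat one idiom across the services. It is shown here
// in isolation as a minimal, self-contained sketch; the ServiceStopGuard name is illustrative
// only and not part of this patch. A flag rejects new calls, a counter drains in-flight ones.
#include <atomic>
#include <chrono>
#include <thread>

class ServiceStopGuard {
    std::atomic_bool m_stopping{false};
    mutable std::atomic_uint64_t pending_request_num{0};

public:
    // Every public API begins with this gate: reject late callers, otherwise count
    // ourselves as in-flight until the matching exit().
    bool enter() const {
        if (m_stopping.load()) return false;
        pending_request_num++;
        return true;
    }
    void exit() const { pending_request_num--; }

    // stop() flips the flag so no new caller can enter, then spins until every request
    // that made it past the gate has drained.
    void stop() {
        m_stopping = true;
        while (pending_request_num.load() != 0) {
            std::this_thread::sleep_for(std::chrono::milliseconds(1000));
        }
    }
};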
@@ -224,6 +226,19 @@ class BlkDataService { std::unique_ptr< BlkReadTracker > m_blk_read_tracker; std::shared_ptr< ChunkSelector > m_custom_chunk_selector; uint32_t m_blk_size; + +private: + // graceful shutdown related + std::atomic_bool m_stopping{false}; + mutable std::atomic_uint64_t pending_request_num{0}; + + bool is_stopping() const { return m_stopping.load(); } + void start_stopping() { m_stopping = true; } + + uint64_t get_pending_request_num() const { return pending_request_num.load(); } + + void incr_pending_request_num() const { pending_request_num++; } + void decr_pending_request_num() const { pending_request_num--; } }; extern BlkDataService& data_service(); diff --git a/src/include/homestore/btree/detail/btree_internal.hpp b/src/include/homestore/btree/detail/btree_internal.hpp index 14533a8e5..7dbc50c0a 100644 --- a/src/include/homestore/btree/detail/btree_internal.hpp +++ b/src/include/homestore/btree/detail/btree_internal.hpp @@ -202,7 +202,7 @@ VENUM(btree_store_type, uint8_t, MEM = 0, SSD = 1) #endif ENUM(btree_status_t, uint32_t, success, not_found, retry, has_more, node_read_failed, already_exists, filtered_out, - space_not_avail, cp_mismatch, merge_not_required, merge_failed, crc_mismatch, not_supported, node_freed) + space_not_avail, cp_mismatch, merge_not_required, merge_failed, crc_mismatch, not_supported, node_freed, stopping) /*ENUM(btree_node_write_type, uint8_t, new_node, // Node write whenever a new node is created. diff --git a/src/include/homestore/index/index_internal.hpp b/src/include/homestore/index/index_internal.hpp index 989e650c4..1ff444650 100644 --- a/src/include/homestore/index/index_internal.hpp +++ b/src/include/homestore/index/index_internal.hpp @@ -71,7 +71,8 @@ class IndexTableBase { virtual void recovery_completed() = 0; virtual uint32_t ordinal() const = 0; virtual uint64_t used_size() const = 0; - virtual void destroy() = 0; + virtual btree_status_t destroy() = 0; + virtual void stop() = 0; virtual void repair_node(IndexBufferPtr const& buf) = 0; virtual void repair_root_node(IndexBufferPtr const& buf) = 0; }; diff --git a/src/include/homestore/index/index_table.hpp b/src/include/homestore/index/index_table.hpp index 83411b5c0..a693ddc9e 100644 --- a/src/include/homestore/index/index_table.hpp +++ b/src/include/homestore/index/index_table.hpp @@ -36,7 +36,28 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { superblk< index_table_sb > m_sb; shared< MetaIndexBuffer > m_sb_buffer; + // graceful shutdown +private: + std::atomic_bool m_stopping{false}; + mutable std::atomic_uint64_t pending_request_num{0}; + + bool is_stopping() const { return m_stopping.load(); } + void start_stopping() { m_stopping = true; } + + uint64_t get_pending_request_num() const { return pending_request_num.load(); } + + void incr_pending_request_num() const { pending_request_num++; } + void decr_pending_request_num() const { pending_request_num--; } + public: + void stop() { + start_stopping(); + while (true) { + if (!get_pending_request_num()) break; + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + } + IndexTable(uuid_t uuid, uuid_t parent_uuid, uint32_t user_sb_size, const BtreeConfig& cfg) : Btree< K, V >{cfg}, m_sb{"index"} { // Create a superblk for the index table and create MetaIndexBuffer corresponding to that @@ -77,10 +98,14 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } } - void destroy() override { + btree_status_t destroy() override { + if (is_stopping()) return btree_status_t::stopping; + 
incr_pending_request_num(); auto cpg = cp_mgr().cp_guard(); Btree< K, V >::destroy_btree(cpg.context(cp_consumer_t::INDEX_SVC)); m_sb.destroy(); + decr_pending_request_num(); + return btree_status_t::success; } uuid_t uuid() const override { return m_sb->uuid; } @@ -92,6 +117,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { template < typename ReqT > btree_status_t put(ReqT& put_req) { + if (is_stopping()) return btree_status_t::stopping; + incr_pending_request_num(); auto ret = btree_status_t::success; do { auto cpg = cp_mgr().cp_guard(); @@ -99,11 +126,14 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { ret = Btree< K, V >::put(put_req); if (ret == btree_status_t::cp_mismatch) { LOGTRACEMOD(wbcache, "CP Mismatch, retrying put"); } } while (ret == btree_status_t::cp_mismatch); + decr_pending_request_num(); return ret; } template < typename ReqT > btree_status_t remove(ReqT& remove_req) { + if (is_stopping()) return btree_status_t::stopping; + incr_pending_request_num(); auto ret = btree_status_t::success; do { auto cpg = cp_mgr().cp_guard(); @@ -111,6 +141,16 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { ret = Btree< K, V >::remove(remove_req); if (ret == btree_status_t::cp_mismatch) { LOGTRACEMOD(wbcache, "CP Mismatch, retrying remove"); } } while (ret == btree_status_t::cp_mismatch); + decr_pending_request_num(); + return ret; + } + + template < typename ReqT > + btree_status_t get(ReqT& greq) const { + if (is_stopping()) return btree_status_t::stopping; + incr_pending_request_num(); + auto ret = Btree< K, V >::get(greq); + decr_pending_request_num(); return ret; } @@ -260,8 +300,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { btree_status_t on_root_changed(BtreeNodePtr const& new_root, void* context) override { // todo: if(m_sb->root_node == new_root->node_id() && m_sb->root_link_version == new_root->link_version()){ // return btree_status_t::success;} - LOGTRACEMOD(wbcache, "root changed for index old_root={} new_root={}", m_sb->root_node, - new_root->node_id()); + LOGTRACEMOD(wbcache, "root changed for index old_root={} new_root={}", m_sb->root_node, new_root->node_id()); m_sb->root_node = new_root->node_id(); m_sb->root_link_version = new_root->link_version(); diff --git a/src/include/homestore/index_service.hpp b/src/include/homestore/index_service.hpp index 87ad63672..801cace13 100644 --- a/src/include/homestore/index_service.hpp +++ b/src/include/homestore/index_service.hpp @@ -56,6 +56,7 @@ class IndexService { public: IndexService(std::unique_ptr< IndexServiceCallbacks > cbs); + ~IndexService(); // Creates the vdev that is needed to initialize the device void create_vdev(uint64_t size, HSDevType devType, uint32_t num_chunks); @@ -63,7 +64,10 @@ class IndexService { // Open the existing vdev which is represnted by the vdev_info_block shared< VirtualDev > open_vdev(const vdev_info& vb, bool load_existing); - // Start the Index Service + // for now, we don't support start after stop and there is no use case for this. 
+ // TODO: support start after stop if necessary + + // Start the Index Service void start(); // Stop the Index Service @@ -71,8 +75,8 @@ class IndexService { // Add/Remove Index Table to/from the index service uint64_t num_tables(); - void add_index_table(const std::shared_ptr< IndexTableBase >& tbl); - void remove_index_table(const std::shared_ptr< IndexTableBase >& tbl); + bool add_index_table(const std::shared_ptr< IndexTableBase >& tbl); + bool remove_index_table(const std::shared_ptr< IndexTableBase >& tbl); std::shared_ptr< IndexTableBase > get_index_table(uuid_t uuid) const; std::shared_ptr< IndexTableBase > get_index_table(uint32_t ordinal) const; @@ -81,6 +85,9 @@ class IndexService { uint64_t used_size() const; uint32_t node_size() const; + + // the following methods are used wb_cache , which will not used by upper layer. so graceful shutdown just skips + // them for now. void repair_index_node(uint32_t ordinal, IndexBufferPtr const& node_buf); void update_root(uint32_t ordinal, IndexBufferPtr const& node_buf); @@ -88,6 +95,19 @@ class IndexService { if (!m_wb_cache) { throw std::runtime_error("Attempted to access a null pointer wb_cache"); } return *m_wb_cache; } + +private: + // graceful shutdown related + std::atomic_bool m_stopping{false}; + mutable std::atomic_uint64_t pending_request_num{0}; + + bool is_stopping() const { return m_stopping.load(); } + void start_stopping() { m_stopping = true; } + + uint64_t get_pending_request_num() const { return pending_request_num.load(); } + + void incr_pending_request_num() const { pending_request_num++; } + void decr_pending_request_num() const { pending_request_num--; } }; extern IndexService& index_service(); diff --git a/src/include/homestore/logstore/log_store.hpp b/src/include/homestore/logstore/log_store.hpp index 91735be79..cfeecc05f 100644 --- a/src/include/homestore/logstore/log_store.hpp +++ b/src/include/homestore/logstore/log_store.hpp @@ -89,7 +89,7 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { * @param cb [OPTIONAL] Callback if caller wants specific callback as against common/default callback registed. * The callback returns the request back with status of execution */ - void write_async(logstore_req* req, const log_req_comp_cb_t& cb = nullptr); + logstore_seq_num_t write_async(logstore_req* req, const log_req_comp_cb_t& cb = nullptr); /** * @brief Write the blob at the user specified seq number @@ -99,7 +99,8 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { * @param cookie : Any cookie or context which will passed back in the callback * @param cb Callback upon completion which is called with the status, seq_num and cookie that was passed. */ - void write_async(logstore_seq_num_t seq_num, const sisl::io_blob& b, void* cookie, const log_write_comp_cb_t& cb); + logstore_seq_num_t write_async(logstore_seq_num_t seq_num, const sisl::io_blob& b, void* cookie, + const log_write_comp_cb_t& cb); /** * @brief This method appends the blob into the log and makes a callback at the end of the append. @@ -125,7 +126,7 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { * @param seq_num: Seq number to write to * @param b : Blob of data */ - void write_and_flush(logstore_seq_num_t seq_num, const sisl::io_blob& b); + logstore_seq_num_t write_and_flush(logstore_seq_num_t seq_num, const sisl::io_blob& b); /** * @brief Read the log provided the sequence number synchronously. 
This is not the most efficient way to read @@ -150,9 +151,11 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { * completed, a device truncation can be triggered for all the logstores. The device truncation is more * expensive and grouping them together yields better results. * + * @return True on success + * * Note: this flag currently is not used, meaning all truncate is in memory only; */ - void truncate(logstore_seq_num_t upto_seq_num, bool in_memory_truncate_only = true); + bool truncate(logstore_seq_num_t upto_seq_num, bool in_memory_truncate_only = true); /** * @brief Fill the gap in the seq_num with a dummy value. This ensures that get_contiguous_issued and completed @@ -160,8 +163,9 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { * result in out_of_range exception. * * @param seq_num: Seq_num to fill to. + * @return True on success */ - void fill_gap(logstore_seq_num_t seq_num); + bool fill_gap(logstore_seq_num_t seq_num); /** * @brief Get the last truncated seqnum upto which we have truncated. If called after recovery, it returns the @@ -192,8 +196,9 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { * @param start_idx idx to start with; * @param cb called with current idx and log buffer. * Return value of the cb: true means proceed, false means stop; + * @return True on success */ - void foreach (int64_t start_idx, const std::function< bool(logstore_seq_num_t, log_buffer) >& cb); + bool foreach (int64_t start_idx, const std::function< bool(logstore_seq_num_t, log_buffer) >& cb); /** * @brief Get the store id of this HomeLogStore @@ -227,8 +232,9 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { * * @param seq_num Sequence number upto which logs are to be flushed. If not provided, will wait to flush all seq * numbers issued prior. 
+ * @return True on success */ - void flush(logstore_seq_num_t upto_seq_num = invalid_lsn()); + bool flush(logstore_seq_num_t upto_seq_num = invalid_lsn()); /** * @brief Rollback the given instance to the given sequence number @@ -277,6 +283,8 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { auto get_comp_cb() const { return m_comp_cb; } + void stop(); + private: logstore_id_t m_store_id; std::shared_ptr< LogDev > m_logdev; @@ -295,5 +303,18 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { LogStoreServiceMetrics& m_metrics; logdev_key m_trunc_ld_key{0, 0}; + +private: + // graceful shutdown related fields + std::atomic_bool m_stopping{false}; + mutable std::atomic_uint64_t pending_request_num{0}; + + bool is_stopping() const { return m_stopping.load(); } + void start_stopping() { m_stopping = true; } + + uint64_t get_pending_request_num() const { return pending_request_num.load(); } + + void incr_pending_request_num() const { pending_request_num++; } + void decr_pending_request_num() const { pending_request_num--; } }; } // namespace homestore diff --git a/src/include/homestore/logstore_service.hpp b/src/include/homestore/logstore_service.hpp index 18c1e75e3..fe65c7c13 100644 --- a/src/include/homestore/logstore_service.hpp +++ b/src/include/homestore/logstore_service.hpp @@ -68,6 +68,7 @@ class LogStoreService { public: LogStoreService(); + ~LogStoreService(); LogStoreService(const LogStoreService&) = delete; LogStoreService(LogStoreService&&) noexcept = delete; LogStoreService& operator=(const LogStoreService&) = delete; @@ -194,6 +195,19 @@ class LogStoreService { LogStoreServiceMetrics m_metrics; std::unordered_set< logdev_id_t > m_unopened_logdev; superblk< logstore_service_super_block > m_sb; + +private: + // graceful shutdown related + std::atomic_bool m_stopping{false}; + mutable std::atomic_uint64_t pending_request_num{0}; + + bool is_stopping() const { return m_stopping.load(); } + void start_stopping() { m_stopping = true; } + + uint64_t get_pending_request_num() const { return pending_request_num.load(); } + + void incr_pending_request_num() const { pending_request_num++; } + void decr_pending_request_num() const { pending_request_num--; } }; extern LogStoreService& logstore_service(); diff --git a/src/include/homestore/replication/repl_decls.h b/src/include/homestore/replication/repl_decls.h index 160733c0d..83f806c40 100644 --- a/src/include/homestore/replication/repl_decls.h +++ b/src/include/homestore/replication/repl_decls.h @@ -30,6 +30,7 @@ VENUM(ReplServiceError, int32_t, SERVER_IS_LEAVING = -10, TERM_MISMATCH = -11, RETRY_REQUEST = -12, + STOPPING = -13, RESULT_NOT_EXIST_YET = -10000, NOT_IMPLEMENTED = -10001, NO_SPACE_LEFT = -20000, diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index 937450336..a14dd8824 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -36,8 +36,8 @@ VENUM(repl_req_state_t, uint32_t, DATA_WRITTEN = 1 << 2, // Data has been written to the storage LOG_RECEIVED = 1 << 3, // Log is received and waiting for data LOG_FLUSHED = 1 << 4, // Log has been flushed - ERRORED = 1 << 5, // Error has happened and cleaned up - DATA_COMMITTED = 1 << 6 // Data has already been committed, used in duplication handling, will skip commit_blk + ERRORED = 1 << 5, // Error has happened and cleaned up + DATA_COMMITTED = 1 << 6 // Data has already been committed, used in duplication handling, will skip 
commit_blk ) VENUM(journal_type_t, uint16_t, @@ -144,7 +144,7 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: repl_req_ctx() { m_start_time = Clock::now(); } virtual ~repl_req_ctx(); ReplServiceError init(repl_key rkey, journal_type_t op_code, bool is_proposer, sisl::blob const& user_header, - sisl::blob const& key, uint32_t data_size, cshared< ReplDevListener >& listener); + sisl::blob const& key, uint32_t data_size, cshared< ReplDevListener >& listener); /////////////////////// All getters /////////////////////// repl_key const& rkey() const { return m_rkey; } @@ -444,7 +444,7 @@ class ReplDev { /// /// @param lsn - LSN of the old blkids that is being freed /// @param blkids - blkids to be freed. - virtual void async_free_blks(int64_t lsn, MultiBlkId const& blkid) = 0; + virtual folly::Future< std::error_code > async_free_blks(int64_t lsn, MultiBlkId const& blkid) = 0; /// @brief Try to switch the current replica where this method called to become a leader. /// @return True if it is successful, false otherwise. @@ -489,8 +489,32 @@ class ReplDev { } } + // we have no shutdown for repl_dev, since shutdown repl_dev is done by repl_service + void stop() { + start_stopping(); + while (true) { + auto pending_request_num = get_pending_request_num(); + if (!pending_request_num) break; + + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + } + protected: shared< ReplDevListener > m_listener; + + // graceful shutdown related +protected: + std::atomic_bool m_stopping{false}; + mutable std::atomic_uint64_t pending_request_num{0}; + + bool is_stopping() const { return m_stopping.load(); } + void start_stopping() { m_stopping = true; } + + uint64_t get_pending_request_num() const { return pending_request_num.load(); } + + void incr_pending_request_num() const { pending_request_num++; } + void decr_pending_request_num() const { pending_request_num--; } }; } // namespace homestore diff --git a/src/lib/blkdata_svc/blkdata_service.cpp b/src/lib/blkdata_svc/blkdata_service.cpp index 5e80ac7e0..58cc36c61 100644 --- a/src/lib/blkdata_svc/blkdata_service.cpp +++ b/src/lib/blkdata_svc/blkdata_service.cpp @@ -79,6 +79,8 @@ static auto collect_all_futures(std::vector< folly::Future< std::error_code > >& folly::Future< std::error_code > BlkDataService::async_read(MultiBlkId const& blkid, uint8_t* buf, uint32_t size, bool part_of_batch) { + if (is_stopping()) return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); + incr_pending_request_num(); auto do_read = [this](BlkId const& bid, uint8_t* buf, uint32_t size, bool part_of_batch) { m_blk_read_tracker->insert(bid); @@ -89,6 +91,7 @@ folly::Future< std::error_code > BlkDataService::async_read(MultiBlkId const& bl }; if (blkid.num_pieces() == 1) { + decr_pending_request_num(); return do_read(blkid.to_single_blkid(), buf, size, part_of_batch); } else { static thread_local std::vector< folly::Future< std::error_code > > s_futs; @@ -100,13 +103,15 @@ folly::Future< std::error_code > BlkDataService::async_read(MultiBlkId const& bl s_futs.emplace_back(do_read(*bid, buf, sz, part_of_batch)); buf += sz; } - + decr_pending_request_num(); return collect_all_futures(s_futs); } } folly::Future< std::error_code > BlkDataService::async_read(MultiBlkId const& blkid, sisl::sg_list& sgs, uint32_t size, bool part_of_batch) { + if (is_stopping()) return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); + incr_pending_request_num(); // TODO: 
sg_iovs_t should not be passed by value. We need to pass it as const&, but that is failing because
    // iovs.data() will then return "const iovec*", but unfortunately all the way down to iomgr, we take iovec*
    // instead, even though it could easily take "const iovec*". Until we change that, this is made as a copy by value
@@ -121,6 +126,7 @@ folly::Future< std::error_code > BlkDataService::async_read(MultiBlkId const& bl
     };

     if (blkid.num_pieces() == 1) {
+        decr_pending_request_num();
         return do_read(blkid.to_single_blkid(), sgs.iovs, size, part_of_batch);
     } else {
         static thread_local std::vector< folly::Future< std::error_code > > s_futs;
@@ -132,7 +138,7 @@ folly::Future< std::error_code > BlkDataService::async_read(MultiBlkId const& bl
             uint32_t const sz = bid->blk_count() * m_blk_size;
             s_futs.emplace_back(do_read(*bid, sg_it.next_iovs(sz), sz, part_of_batch));
         }
-
+        decr_pending_request_num();
         return collect_all_futures(s_futs);
     }
 }
@@ -140,17 +146,25 @@ folly::Future< std::error_code > BlkDataService::async_read(MultiBlkId const& bl

 folly::Future< std::error_code > BlkDataService::async_alloc_write(const sisl::sg_list& sgs,
                                                                    const blk_alloc_hints& hints,
                                                                    MultiBlkId& out_blkids, bool part_of_batch) {
+    if (is_stopping()) return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled));
+    incr_pending_request_num();
     const auto status = alloc_blks(sgs.size, hints, out_blkids);
     if (status != BlkAllocStatus::SUCCESS) {
+        decr_pending_request_num();
         return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::resource_unavailable_try_again));
     }
-    return async_write(sgs, out_blkids, part_of_batch);
+    auto ret = async_write(sgs, out_blkids, part_of_batch);
+    decr_pending_request_num();
+    return ret;
 }

 folly::Future< std::error_code > BlkDataService::async_write(const char* buf, uint32_t size, MultiBlkId const& blkid,
                                                              bool part_of_batch) {
+    if (is_stopping()) return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled));
+    incr_pending_request_num();
     if (blkid.num_pieces() == 1) {
         // Shortcut to most common case
+        decr_pending_request_num();
         return m_vdev->async_write(buf, size, blkid.to_single_blkid(), part_of_batch);
     } else {
         static thread_local std::vector< folly::Future< std::error_code > > s_futs;
@@ -163,17 +177,21 @@ folly::Future< std::error_code > BlkDataService::async_write(const char* buf, ui
             s_futs.emplace_back(m_vdev->async_write(ptr, sz, *bid, part_of_batch));
             ptr += sz;
         }
+        decr_pending_request_num();
         return collect_all_futures(s_futs);
     }
 }

 folly::Future< std::error_code > BlkDataService::async_write(sisl::sg_list const& sgs, MultiBlkId const& blkid,
                                                              bool part_of_batch) {
+    if (is_stopping()) return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled));
+    incr_pending_request_num();
    // TODO: Async write should pass down the sgs.size parameter as well; currently the vdev write routine
    // walks through all the iovs again just to compute the len it passes down to iomgr.
This defeats the purpose of
    // taking size parameters (which was done exactly to avoid this walk through)
     if (blkid.num_pieces() == 1) {
         // Shortcut to most common case
+        decr_pending_request_num();
         return m_vdev->async_writev(sgs.iovs.data(), sgs.iovs.size(), blkid.to_single_blkid(), part_of_batch);
     } else {
         static thread_local std::vector< folly::Future< std::error_code > > s_futs;
@@ -185,31 +203,47 @@ folly::Future< std::error_code > BlkDataService::async_write(sisl::sg_list const
             const auto iovs = sg_it.next_iovs(bid->blk_count() * m_blk_size);
             s_futs.emplace_back(m_vdev->async_writev(iovs.data(), iovs.size(), *bid, part_of_batch));
         }
+        decr_pending_request_num();
         return collect_all_futures(s_futs);
     }
 }

 BlkAllocStatus BlkDataService::alloc_blks(uint32_t size, const blk_alloc_hints& hints, MultiBlkId& out_blkids) {
+    if (is_stopping()) return BlkAllocStatus::FAILED;
+    incr_pending_request_num();
     HS_DBG_ASSERT_EQ(size % m_blk_size, 0, "Non aligned size requested");
     blk_count_t nblks = static_cast< blk_count_t >(size / m_blk_size);
-    return m_vdev->alloc_blks(nblks, hints, out_blkids);
+    auto ret = m_vdev->alloc_blks(nblks, hints, out_blkids);
+    decr_pending_request_num();
+    return ret;
 }

 BlkAllocStatus BlkDataService::commit_blk(MultiBlkId const& blkid) {
+    if (is_stopping()) return BlkAllocStatus::FAILED;
+    incr_pending_request_num();
+
     if (blkid.num_pieces() == 1) {
         // Shortcut to most common case
-        return m_vdev->commit_blk(blkid);
+        auto ret = m_vdev->commit_blk(blkid);
+        decr_pending_request_num();
+        return ret;
     }

     auto it = blkid.iterate();
     while (auto const bid = it.next()) {
         auto alloc_status = m_vdev->commit_blk(*bid);
-        if (alloc_status != BlkAllocStatus::SUCCESS) return alloc_status;
+        if (alloc_status != BlkAllocStatus::SUCCESS) {
+            decr_pending_request_num();
+            return alloc_status;
+        }
     }
+    decr_pending_request_num();
     return BlkAllocStatus::SUCCESS;
 }

 folly::Future< std::error_code > BlkDataService::async_free_blk(MultiBlkId const& bids) {
+    if (is_stopping()) return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled));
+    incr_pending_request_num();
     // create blk read waiter instance;
     folly::Promise< std::error_code > promise;
     auto f = promise.getFuture();
@@ -225,7 +259,7 @@ folly::Future< std::error_code > BlkDataService::async_free_blk(MultiBlkId const
             p.setValue(std::error_code{});
         });
     }
-
+    decr_pending_request_num();
     return f;
 }

@@ -235,6 +269,19 @@ void BlkDataService::start() {
                                   std::move(std::make_unique< DataSvcCPCallbacks >(m_vdev)));
 }

+void BlkDataService::stop() {
+    start_stopping();
+    // we have no way to track the completion of each async io in detail (that should be done at the iomanager
+    // level), so we just wait for 3 seconds and expect each io to be completed within this time.
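+    // (The 3000 ms sleep below is a heuristic grace period, not a guarantee; the drain loop
+    // after it still waits on the pending-request counter.)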
+
+    // TODO: find a better solution to track the completion of these async calls
+    std::this_thread::sleep_for(std::chrono::milliseconds(3000));
+    while (true) {
+        if (!get_pending_request_num()) break;
+        std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+    }
+}
+
 uint64_t BlkDataService::get_total_capacity() const { return m_vdev->size(); }

 uint64_t BlkDataService::get_used_capacity() const { return m_vdev->used_size(); }
diff --git a/src/lib/homestore.cpp b/src/lib/homestore.cpp
index 35782bf8d..793bc90d8 100644
--- a/src/lib/homestore.cpp
+++ b/src/lib/homestore.cpp
@@ -294,30 +294,39 @@ void HomeStore::shutdown() {

     LOGINFO("Homestore shutdown is started");

-    m_cp_mgr->shutdown();
-    m_cp_mgr.reset();
-    m_resource_mgr->stop();
-    if (has_repl_data_service()) {
+    // 1. Stop all the services, after which all upper layer API calls are rejected and there are no on-going
+    // requests. Note that, after stopping, all the services are still alive.
+    if (has_repl_data_service())
         // Log and Data services are stopped by repl service
         s_cast< GenericReplService* >(m_repl_service.get())->stop();
+    else {
+        if (has_log_service()) m_log_service->stop();
+        if (has_data_service()) m_data_service->stop();
+    }
+
+    if (has_index_service()) m_index_service->stop();
+
+    // 2. Call cp_manager shutdown, which will trigger a cp flush to make sure all the in-memory data of all the
+    // services is flushed to disk. Since all upper layer API calls are rejected and there are no on-going
+    // requests, after the cp flush is done we can guarantee all the necessary data is persisted to disk.
+    m_cp_mgr->shutdown();
+    m_cp_mgr.reset();
+
+    // 3. Call reset/shutdown to clear all the services; after that all the services are dead, excluding the
+    // metaservice.
+    if (has_repl_data_service()) {
         m_log_service.reset();
         m_data_service.reset();
         m_repl_service.reset();
     } else {
-        if (has_log_service()) {
-            m_log_service->stop();
-            m_log_service.reset();
-        }
-        if (has_data_service()) { m_data_service.reset(); }
+        if (has_log_service()) m_log_service.reset();
+        if (has_data_service()) m_data_service.reset();
     }

-    if (has_index_service()) {
-        m_index_service->stop();
-        // m_index_service.reset();
-    }
+    if (has_index_service()) m_index_service.reset();

+    // 4. Close the metaservice and device_manager.
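+    // The metaservice goes last: the other services persist their superblocks through it,
+    // so it must stay alive until everything above has been stopped and reset.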
if (has_meta_service()) { m_meta_service->stop(); m_meta_service.reset(); diff --git a/src/lib/index/index_service.cpp b/src/lib/index/index_service.cpp index 73b96b064..8e8f47bef 100644 --- a/src/lib/index/index_service.cpp +++ b/src/lib/index/index_service.cpp @@ -39,7 +39,9 @@ IndexService::IndexService(std::unique_ptr< IndexServiceCallbacks > cbs) : m_svc meta_service().register_handler( "wb_cache", - [this](meta_blk* mblk, sisl::byte_view buf, size_t size) { m_wbcache_sb = std::pair{mblk, std::move(buf)}; }, + [this](meta_blk* mblk, sisl::byte_view buf, size_t size) { + m_wbcache_sb = std::pair{mblk, std::move(buf)}; + }, nullptr); } @@ -92,35 +94,62 @@ void IndexService::start() { hs()->cp_mgr().trigger_cp_flush(true /* force */); } -void IndexService::stop() { m_wb_cache.reset(); } +IndexService::~IndexService() { m_wb_cache.reset(); } + +void IndexService::stop() { + start_stopping(); + while (true) { + if (!get_pending_request_num()) break; + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + std::unique_lock lg(m_index_map_mtx); + for (auto& [_, table] : m_index_map) + table->stop(); +} uint64_t IndexService::num_tables() { std::unique_lock lg(m_index_map_mtx); return m_index_map.size(); } -void IndexService::add_index_table(const std::shared_ptr< IndexTableBase >& tbl) { +bool IndexService::add_index_table(const std::shared_ptr< IndexTableBase >& tbl) { + if (is_stopping()) return false; + incr_pending_request_num(); std::unique_lock lg(m_index_map_mtx); m_index_map.insert(std::make_pair(tbl->uuid(), tbl)); m_ordinal_index_map.insert(std::make_pair(tbl->ordinal(), tbl)); + decr_pending_request_num(); + return true; } -void IndexService::remove_index_table(const std::shared_ptr< IndexTableBase >& tbl) { +bool IndexService::remove_index_table(const std::shared_ptr< IndexTableBase >& tbl) { + if (is_stopping()) return false; + incr_pending_request_num(); std::unique_lock lg(m_index_map_mtx); m_index_map.erase(tbl->uuid()); m_ordinal_index_map.erase(tbl->ordinal()); + decr_pending_request_num(); + return true; } std::shared_ptr< IndexTableBase > IndexService::get_index_table(uuid_t uuid) const { + if (is_stopping()) return nullptr; + incr_pending_request_num(); std::unique_lock lg(m_index_map_mtx); auto const it = m_index_map.find(uuid); - return (it != m_index_map.cend()) ? it->second : nullptr; + auto ret = (it != m_index_map.cend()) ? it->second : nullptr; + decr_pending_request_num(); + return ret; } std::shared_ptr< IndexTableBase > IndexService::get_index_table(uint32_t ordinal) const { + if (is_stopping()) return nullptr; + incr_pending_request_num(); std::unique_lock lg(m_index_map_mtx); auto const it = m_ordinal_index_map.find(ordinal); - return (it != m_ordinal_index_map.cend()) ? it->second : nullptr; + auto ret = (it != m_ordinal_index_map.cend()) ? it->second : nullptr; + decr_pending_request_num(); + return ret; } void IndexService::repair_index_node(uint32_t ordinal, IndexBufferPtr const& node_buf) { diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp index 740580bb5..3716cb70e 100644 --- a/src/lib/logstore/log_dev.cpp +++ b/src/lib/logstore/log_dev.cpp @@ -47,8 +47,6 @@ LogDev::LogDev(logdev_id_t id, flush_mode_t flush_mode) : m_logdev_id{id}, m_flu m_flush_size_multiple = HS_DYNAMIC_CONFIG(logstore->flush_size_multiple_logdev); } -LogDev::~LogDev() = default; - void LogDev::start(bool format, std::shared_ptr< JournalVirtualDev > vdev) { // Each logdev has one journal descriptor. 
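    // (m_vdev_jd); journal reads and writes for this logdev go through that descriptor.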
m_vdev = vdev; @@ -62,7 +60,6 @@ void LogDev::start(bool format, std::shared_ptr< JournalVirtualDev > vdev) { m_log_group_pool[i].start(m_flush_size_multiple, m_vdev->align_size()); } m_log_records = std::make_unique< sisl::StreamTracker< log_record > >(); - m_stopped = false; // First read the info block if (format) { @@ -106,31 +103,13 @@ void LogDev::start(bool format, std::shared_ptr< JournalVirtualDev > vdev) { } } -void LogDev::stop() { +LogDev::~LogDev() { THIS_LOGDEV_LOG(INFO, "Logdev stopping id {}", m_logdev_id); HS_LOG_ASSERT((m_pending_flush_size.load() == 0), "LogDev stop attempted while writes to logdev are pending completion"); - { - std::unique_lock lg = flush_guard(); - m_stopped = true; - // waiting under lock to make sure no new flush is started - while (m_pending_callback.load() > 0) { - THIS_LOGDEV_LOG(INFO, "Waiting for pending callbacks to complete, pending callbacks {}", - m_pending_callback.load()); - std::this_thread::sleep_for(std::chrono::milliseconds{1000}); - } - } - // after we call stop, we need to do any pending device truncations - truncate(); if (allow_timer_flush()) stop_timer(); - - { - folly::SharedMutexWritePriority::WriteHolder holder(m_store_map_mtx); - m_id_logstore_map.clear(); - } - - m_log_records = nullptr; + m_log_records.reset(nullptr); m_logdev_meta.reset(); m_log_idx.store(0); m_pending_flush_size.store(0); @@ -147,9 +126,29 @@ void LogDev::stop() { m_hs.reset(); } -bool LogDev::is_stopped() { - std::unique_lock lg = flush_guard(); - return m_stopped; +void LogDev::stop() { + start_stopping(); + while (true) { + if (!get_pending_request_num()) break; + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + { + std::unique_lock lg = flush_guard(); + // waiting under lock to make sure no new flush is started + while (m_pending_callback.load() > 0) { + THIS_LOGDEV_LOG(INFO, "Waiting for pending callbacks to complete, pending callbacks {}", + m_pending_callback.load()); + std::this_thread::sleep_for(std::chrono::milliseconds{1000}); + } + } + + folly::SharedMutexWritePriority::ReadHolder holder(m_store_map_mtx); + for (auto& [_, store] : m_id_logstore_map) + store.log_store->stop(); + + // after we call stop, we need to do any pending device truncations + truncate(); + m_id_logstore_map.clear(); } void LogDev::destroy() { @@ -256,14 +255,19 @@ void LogDev::assert_next_pages(log_stream_reader& lstream) { int64_t LogDev::append_async(logstore_id_t store_id, logstore_seq_num_t seq_num, const sisl::io_blob& data, void* cb_context) { + if (is_stopping()) return -1; + incr_pending_request_num(); const auto idx = m_log_idx.fetch_add(1, std::memory_order_acq_rel); m_pending_flush_size.fetch_add(data.size(), std::memory_order_relaxed); m_log_records->create(idx, store_id, seq_num, data, cb_context); if (allow_inline_flush()) flush_if_necessary(); + decr_pending_request_num(); return idx; } log_buffer LogDev::read(const logdev_key& key) { + if (is_stopping()) return -1; + incr_pending_request_num(); std::unique_lock lg = flush_guard(); auto buf = sisl::make_byte_array(initial_read_size, m_flush_size_multiple, sisl::buftag::logread); auto ec = m_vdev_jd->sync_pread(buf->bytes(), initial_read_size, key.dev_offset); @@ -288,11 +292,13 @@ log_buffer LogDev::read(const logdev_key& key) { m_vdev_jd->sync_pread(new_buf->bytes(), rounded_size, key.dev_offset + rounded_data_offset); ret_view = sisl::byte_view{new_buf, s_cast< uint32_t >(data_offset - rounded_data_offset), record_header->size}; } - + decr_pending_request_num(); return ret_view; } void 
LogDev::read_record_header(const logdev_key& key, serialized_log_record& return_record_header) { + if (is_stopping()) return; + incr_pending_request_num(); std::unique_lock lg = flush_guard(); auto buf = sisl::make_byte_array(initial_read_size, m_flush_size_multiple, sisl::buftag::logread); auto ec = m_vdev_jd->sync_pread(buf->bytes(), initial_read_size, key.dev_offset); @@ -305,6 +311,7 @@ void LogDev::read_record_header(const logdev_key& key, serialized_log_record& re return_record_header = serialized_log_record(record_header->size, record_header->offset, record_header->get_inlined(), record_header->store_seq_num, record_header->store_id); + decr_pending_request_num(); } void LogDev::verify_log_group_header(const logid_t idx, const log_group_header* header) { @@ -342,7 +349,9 @@ void LogDev::unreserve_store_id(logstore_id_t store_id) { m_garbage_store_ids.emplace(log_id, store_id); } -void LogDev::get_registered_store_ids(std::vector< logstore_id_t >& registered, std::vector< logstore_id_t >& garbage) { +bool LogDev::get_registered_store_ids(std::vector< logstore_id_t >& registered, std::vector< logstore_id_t >& garbage) { + if (is_stopping()) return false; + incr_pending_request_num(); std::unique_lock lg{m_meta_mutex}; for (const auto& id : m_logdev_meta.reserved_store_ids()) { registered.push_back(id); @@ -352,6 +361,8 @@ void LogDev::get_registered_store_ids(std::vector< logstore_id_t >& registered, for (const auto& elem : m_garbage_store_ids) { garbage.push_back(elem.second); } + decr_pending_request_num(); + return true; } /* @@ -389,9 +400,12 @@ bool LogDev::can_flush_in_this_thread() { } bool LogDev::flush_if_necessary(int64_t threshold_size) { + if (is_stopping()) return false; + incr_pending_request_num(); if (!can_flush_in_this_thread()) { iomanager.run_on_forget(logstore_service().flush_thread(), [this, threshold_size]() { flush_if_necessary(threshold_size); }); + decr_pending_request_num(); return false; } @@ -407,10 +421,11 @@ bool LogDev::flush_if_necessary(int64_t threshold_size) { if (flush_by_size || flush_by_time) { std::unique_lock lck(m_flush_mtx, std::try_to_lock); if (lck.owns_lock()) { - if (m_stopped) return false; + decr_pending_request_num(); return flush(); } } + decr_pending_request_num(); return false; } @@ -437,9 +452,9 @@ bool LogDev::flush() { return false; } - // the amount of logs which one logGroup can flush has a upper limit. here we want to make sure all the logs that - // need to be flushed will definitely be flushed to physical dev, so we need this loop to create multiple log groups - // if necessary + // the amount of logs which one logGroup can flush has a upper limit. here we want to make sure all the logs + // that need to be flushed will definitely be flushed to physical dev, so we need this loop to create multiple + // log groups if necessary for (; m_last_flush_idx < new_idx;) { LogGroup* lg = prepare_flush(new_idx - m_last_flush_idx + 4); // Estimate 4 more extra in case of parallel writes @@ -517,6 +532,8 @@ void LogDev::on_flush_completion(LogGroup* lg) { } uint64_t LogDev::truncate() { + auto stopping = is_stopping(); + incr_pending_request_num(); // Order of this lock has to be preserved. We take externally visible lock which is flush lock first. This // prevents any further update to tail_lsn and also flushes conurrently with truncation. Then we take the store // map lock, which is contained in this class and then meta_mutex. 
Reason for this is, we take meta_mutex under @@ -531,15 +548,13 @@ uint64_t LogDev::truncate() { auto lstore = store.log_store; if (lstore == nullptr) { continue; } auto const [trunc_lsn, trunc_ld_key, tail_lsn] = lstore->truncate_info(); - m_logdev_meta.update_store_superblk(store_id, logstore_superblk(trunc_lsn + 1), m_stopped /* persist_now */); + m_logdev_meta.update_store_superblk(store_id, logstore_superblk(trunc_lsn + 1), stopping /* persist_now */); // We found a new minimum logdev_key that we can truncate to if (trunc_ld_key.idx < min_safe_ld_key.idx) { min_safe_ld_key = trunc_ld_key; } } // All log stores are empty, we can truncate logs depends on the last flushed logdev_key - if (min_safe_ld_key == logdev_key::out_of_bound_ld_key()) { - min_safe_ld_key = m_last_flush_ld_key; - } + if (min_safe_ld_key == logdev_key::out_of_bound_ld_key()) { min_safe_ld_key = m_last_flush_ld_key; } // There are no writes or no truncation called for any of the store, so we can't truncate anything if (min_safe_ld_key.idx <= 0 || min_safe_ld_key.idx <= m_last_truncate_idx) { @@ -552,6 +567,7 @@ uint64_t LogDev::truncate() { // 5. Get m_trunc_ld_key={0,0}, goto here and return 0 without persist. // 6. Follower1 is killed again, after restart, its start index remains 0, misinterpreting the range as [1,2500]. m_logdev_meta.persist(); + decr_pending_request_num(); return 0; } @@ -562,7 +578,7 @@ uint64_t LogDev::truncate() { // Update the start offset to be read upon restart m_last_truncate_idx = min_safe_ld_key.idx; - m_logdev_meta.set_start_dev_offset(min_safe_ld_key.dev_offset, min_safe_ld_key.idx, m_stopped /* persist_now */); + m_logdev_meta.set_start_dev_offset(min_safe_ld_key.dev_offset, min_safe_ld_key.idx, stopping /* persist_now */); // When a logstore is removed, it unregisteres the store and keeps the store id in garbage list. We can capture // these store_ids upto the log_idx which is truncated and then unreserve those. 
Now on we can re-use the @@ -572,22 +588,27 @@ uint64_t LogDev::truncate() { HS_PERIODIC_LOG(DEBUG, logstore, "Garbage collecting log_store={} in log_dev={} log_idx={}", it->second, m_logdev_id, it->first); - m_logdev_meta.unreserve_store(it->second, m_stopped /* persist_now */); + m_logdev_meta.unreserve_store(it->second, stopping /* persist_now */); it = m_garbage_store_ids.erase(it); } // We can remove the rollback records of those upto which logid is getting truncated - m_logdev_meta.remove_rollback_record_upto(min_safe_ld_key.idx, m_stopped /* persist_now */); + m_logdev_meta.remove_rollback_record_upto(min_safe_ld_key.idx, stopping /* persist_now */); THIS_LOGDEV_LOG(DEBUG, "LogDev::truncate remove rollback {}", min_safe_ld_key.idx); // All logdev meta information is updated in-memory, persist now m_logdev_meta.persist(); + decr_pending_request_num(); return num_records_to_truncate; } -void LogDev::rollback(logstore_id_t store_id, logid_range_t id_range) { +bool LogDev::rollback(logstore_id_t store_id, logid_range_t id_range) { + if (is_stopping()) return false; + incr_pending_request_num(); std::unique_lock lg{m_meta_mutex}; m_logdev_meta.add_rollback_record(store_id, id_range, true); + decr_pending_request_num(); + return true; } /////////////////////////////// LogStore Section /////////////////////////////////////// @@ -615,6 +636,8 @@ void LogDev::handle_unopened_log_stores(bool format) { } std::shared_ptr< HomeLogStore > LogDev::create_new_log_store(bool append_mode) { + if (is_stopping()) return nullptr; + incr_pending_request_num(); auto const store_id = reserve_store_id(); std::shared_ptr< HomeLogStore > lstore; lstore = std::make_shared< HomeLogStore >(shared_from_this(), store_id, append_mode, 0); @@ -626,6 +649,7 @@ std::shared_ptr< HomeLogStore > LogDev::create_new_log_store(bool append_mode) { m_id_logstore_map.insert(std::pair(store_id, logstore_info{.log_store = lstore, .append_mode = append_mode})); } HS_LOG(DEBUG, logstore, "Created log store log_dev={} log_store={}", m_logdev_id, store_id); + decr_pending_request_num(); return lstore; } @@ -648,17 +672,22 @@ folly::Future< shared< HomeLogStore > > LogDev::open_log_store(logstore_id_t sto return it->second.promise.getFuture(); } -void LogDev::remove_log_store(logstore_id_t store_id) { +bool LogDev::remove_log_store(logstore_id_t store_id) { + if (is_stopping()) return false; + incr_pending_request_num(); LOGINFO("Removing log_dev={} log_store={}", m_logdev_id, store_id); { folly::SharedMutexWritePriority::WriteHolder holder(m_store_map_mtx); auto ret = m_id_logstore_map.erase(store_id); if (ret == 0) { LOGWARN("try to remove invalid store_id {}-{}", m_logdev_id, store_id); - return; + decr_pending_request_num(); + return false; } } unreserve_store_id(store_id); + decr_pending_request_num(); + return true; } void LogDev::on_log_store_found(logstore_id_t store_id, const logstore_superblk& sb) { @@ -734,7 +763,7 @@ nlohmann::json LogDev::get_status(int verbosity) const { js["last_truncate_log_idx"] = m_last_truncate_idx; js["time_since_last_log_flush_ns"] = get_elapsed_time_ns(m_last_flush_time); if (verbosity == 2) { - js["logdev_stopped?"] = m_stopped; + js["logdev_stopped?"] = is_stopping(); js["logdev_sb_start_offset"] = m_logdev_meta.get_start_dev_offset(); js["logdev_sb_num_stores_reserved"] = m_logdev_meta.num_stores_reserved(); } diff --git a/src/lib/logstore/log_dev.hpp b/src/lib/logstore/log_dev.hpp index 5a8fafc2c..719a58861 100644 --- a/src/lib/logstore/log_dev.hpp +++ b/src/lib/logstore/log_dev.hpp @@ 
-604,17 +604,11 @@ class LogDev : public std::enable_shared_from_this< LogDev > { void start(bool format, std::shared_ptr< JournalVirtualDev > vdev); /** - * @brief Stop the logdev. It resets all the parameters it is using and thus can be started later + * @brief Stop the logdev. it waits for all the pending writes to be completed and reject new api calls. * */ void stop(); - /** - * @brief return whether the logdev is stopped or not - * - */ - bool is_stopped(); - /** * @brief Destroy the logdev metablks. * @@ -678,7 +672,7 @@ class LogDev : public std::enable_shared_from_this< LogDev > { * @param store_id : Store id whose logids are to be rolled back or invalidated * @param id_range : Log id range to rollback/invalidate */ - void rollback(logstore_id_t store_id, logid_range_t id_range); + bool rollback(logstore_id_t store_id, logid_range_t id_range); /** * @brief This method get all the store ids that are registered already and out of them which are being garbaged @@ -687,7 +681,7 @@ class LogDev : public std::enable_shared_from_this< LogDev > { * @param registered out - Reference to the vector where all registered ids are pushed * @param garbage out - Reference to the vector where all garbage ids */ - void get_registered_store_ids(std::vector< logstore_id_t >& registered, std::vector< logstore_id_t >& garbage); + bool get_registered_store_ids(std::vector< logstore_id_t >& registered, std::vector< logstore_id_t >& garbage); nlohmann::json dump_log_store(const log_dump_req& dum_req); nlohmann::json get_status(int verbosity) const; @@ -716,7 +710,7 @@ class LogDev : public std::enable_shared_from_this< LogDev > { /// @brief Remove the log store and its associated resources /// @param store_id Store id that was created/opened - void remove_log_store(logstore_id_t store_id); + bool remove_log_store(logstore_id_t store_id); /// @return externally visible lock to avoid flush concurrently auto flush_guard() { return std::unique_lock(m_flush_mtx); } @@ -781,7 +775,6 @@ class LogDev : public std::enable_shared_from_this< LogDev > { std::unique_ptr< sisl::StreamTracker< log_record > > m_log_records; // Container stores all in-memory log records std::atomic< logid_t > m_log_idx{0}; // Generator of log idx std::atomic< int64_t > m_pending_flush_size{0}; // How much flushable logs are pending - bool m_stopped{false}; // Is Logdev stopped. We don't need lock here, because it is updated under flush lock logdev_id_t m_logdev_id; std::shared_ptr< JournalVirtualDev > m_vdev; shared< JournalVirtualDev::Descriptor > m_vdev_jd; // Journal descriptor. @@ -795,9 +788,9 @@ class LogDev : public std::enable_shared_from_this< LogDev > { std::multimap< logid_t, logstore_id_t > m_garbage_store_ids; Clock::time_point m_last_flush_time; - logid_t m_last_flush_idx{-1}; // Track last flushed, last device offset and truncated log idx - logdev_key m_last_flush_ld_key{0,0}; // Left interval of the last flush, 0 indicates the very beginning of logdev - logid_t m_last_truncate_idx{-1}; // Logdev truncate up to this idx + logid_t m_last_flush_idx{-1}; // Track last flushed, last device offset and truncated log idx + logdev_key m_last_flush_ld_key{0, 0}; // Left interval of the last flush, 0 indicates the very beginning of logdev + logid_t m_last_truncate_idx{-1}; // Logdev truncate up to this idx crc32_t m_last_crc{INVALID_CRC32_VALUE}; // LogDev Info block related fields @@ -816,6 +809,19 @@ class LogDev : public std::enable_shared_from_this< LogDev > { // same thread. 
iomgr::FiberManagerLib::mutex m_flush_mtx; std::atomic_uint64_t m_pending_callback{0}; + +private: + // graceful shutdown related fields + std::atomic_bool m_stopping{false}; + mutable std::atomic_uint64_t pending_request_num{0}; + + bool is_stopping() const { return m_stopping.load(); } + void start_stopping() { m_stopping = true; } + + uint64_t get_pending_request_num() const { return pending_request_num.load(); } + + void incr_pending_request_num() const { pending_request_num++; } + void decr_pending_request_num() const { pending_request_num--; } }; // LogDev } // namespace homestore diff --git a/src/lib/logstore/log_store.cpp b/src/lib/logstore/log_store.cpp index 427207e12..26a5dba5f 100644 --- a/src/lib/logstore/log_store.cpp +++ b/src/lib/logstore/log_store.cpp @@ -44,7 +44,9 @@ HomeLogStore::HomeLogStore(std::shared_ptr< LogDev > logdev, logstore_id_t id, b m_fq_name{fmt::format("{} log_dev={}", id, logdev->get_id())}, m_metrics{logstore_service().metrics()} {} -void HomeLogStore::write_async(logstore_req* req, const log_req_comp_cb_t& cb) { +logstore_seq_num_t HomeLogStore::write_async(logstore_req* req, const log_req_comp_cb_t& cb) { + if (is_stopping()) return 0; + incr_pending_request_num(); HS_LOG_ASSERT((cb || m_comp_cb), "Expected either cb is not null or default cb registered"); req->cb = (cb ? cb : m_comp_cb); req->start_time = Clock::now(); @@ -58,43 +60,59 @@ void HomeLogStore::write_async(logstore_req* req, const log_req_comp_cb_t& cb) { m_records.create(req->seq_num); COUNTER_INCREMENT(m_metrics, logstore_append_count, 1); HISTOGRAM_OBSERVE(m_metrics, logstore_record_size, req->data.size()); - m_logdev->append_async(m_store_id, req->seq_num, req->data, static_cast< void* >(req)); + auto ret = m_logdev->append_async(m_store_id, req->seq_num, req->data, static_cast< void* >(req)); + decr_pending_request_num(); + return ret; } -void HomeLogStore::write_async(logstore_seq_num_t seq_num, const sisl::io_blob& b, void* cookie, - const log_write_comp_cb_t& cb) { +logstore_seq_num_t HomeLogStore::write_async(logstore_seq_num_t seq_num, const sisl::io_blob& b, void* cookie, + const log_write_comp_cb_t& cb) { + if (is_stopping()) return 0; + incr_pending_request_num(); // Form an internal request and issue the write auto* req = logstore_req::make(this, seq_num, b); req->cookie = cookie; - write_async(req, [cb](logstore_req* req, logdev_key written_lkey) { + auto ret = write_async(req, [cb](logstore_req* req, logdev_key written_lkey) { if (cb) { cb(req->seq_num, req->data, written_lkey, req->cookie); } logstore_req::free(req); }); + decr_pending_request_num(); + return ret; } logstore_seq_num_t HomeLogStore::append_async(const sisl::io_blob& b, void* cookie, const log_write_comp_cb_t& cb) { + if (is_stopping()) return 0; + incr_pending_request_num(); HS_DBG_ASSERT_EQ(m_append_mode, true, "append_async can be called only on append only mode"); const auto seq_num = m_next_lsn.fetch_add(1, std::memory_order_acq_rel); write_async(seq_num, b, cookie, cb); + decr_pending_request_num(); return seq_num; } -void HomeLogStore::write_and_flush(logstore_seq_num_t seq_num, const sisl::io_blob& b) { +logstore_seq_num_t HomeLogStore::write_and_flush(logstore_seq_num_t seq_num, const sisl::io_blob& b) { + if (is_stopping()) return 0; + incr_pending_request_num(); HS_LOG_ASSERT(iomanager.am_i_sync_io_capable(), "Write and flush is a blocking IO, which can't run in this thread, please reschedule to a fiber"); if (seq_num > m_next_lsn.load(std::memory_order_relaxed)) m_next_lsn.store(seq_num + 1, 
std::memory_order_relaxed); - write_async(seq_num, b, nullptr /* cookie */, nullptr /* cb */); + auto ret = write_async(seq_num, b, nullptr /* cookie */, nullptr /* cb */); m_logdev->flush_under_guard(); + decr_pending_request_num(); + return ret; } log_buffer HomeLogStore::read_sync(logstore_seq_num_t seq_num) { + if (is_stopping()) return log_buffer{}; + incr_pending_request_num(); HS_LOG_ASSERT(iomanager.am_i_sync_io_capable(), "Read sync is a blocking IO, which can't run in this thread, reschedule to a fiber"); // If seq_num has not been flushed yet, but issued, then we flush them before reading auto const s = m_records.status(seq_num); if (s.is_out_of_range || s.is_hole) { + decr_pending_request_num(); throw std::out_of_range("key not valid since it has been truncated"); } else if (!s.is_completed) { THIS_LOGSTORE_LOG(TRACE, "Reading lsn={}:{} before flushed, doing flush first", m_store_id, seq_num); @@ -105,6 +123,7 @@ log_buffer HomeLogStore::read_sync(logstore_seq_num_t seq_num) { const logdev_key ld_key = record.m_dev_key; if (!ld_key.is_valid()) { THIS_LOGSTORE_LOG(ERROR, "ld_key not valid {}", seq_num); + decr_pending_request_num(); throw std::out_of_range("key not valid"); } @@ -112,6 +131,7 @@ log_buffer HomeLogStore::read_sync(logstore_seq_num_t seq_num) { COUNTER_INCREMENT(m_metrics, logstore_read_count, 1); const auto b = m_logdev->read(ld_key); HISTOGRAM_OBSERVE(m_metrics, logstore_read_latency, get_elapsed_time_us(start_time)); + decr_pending_request_num(); return b; } @@ -175,8 +195,15 @@ void HomeLogStore::on_log_found(logstore_seq_num_t seq_num, const logdev_key& ld if (m_found_cb != nullptr) { m_found_cb(seq_num, buf, nullptr); } } -void HomeLogStore::truncate(logstore_seq_num_t upto_lsn, bool in_memory_truncate_only) { - if (upto_lsn < m_start_lsn) { return; } +bool HomeLogStore::truncate(logstore_seq_num_t upto_lsn, bool in_memory_truncate_only) { + if (is_stopping()) return false; + incr_pending_request_num(); + if (upto_lsn < m_start_lsn) { + decr_pending_request_num(); + THIS_LOGSTORE_LOG(WARN, "Truncating logstore upto lsn={} , start_lsn={}, upto_lsn < m_start_lsn", upto_lsn, + m_start_lsn.load(std::memory_order_relaxed)); + return false; + } flush(); #ifndef NDEBUG auto cs = get_contiguous_completed_seq_num(0); @@ -194,9 +221,8 @@ void HomeLogStore::truncate(logstore_seq_num_t upto_lsn, bool in_memory_truncate // In baseline resync path, we truncate all entries up to upto_lsn, and update m_tail_lsn and m_next_lsn // to make sure logstore's idx is always = raft's idx - 1. 
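The truncate code continuing below advances m_tail_lsn with a compare-exchange loop so a concurrent writer can never move the tail backwards. The idiom in isolation looks like this (a standalone sketch, not the patch's exact code):

    #include <atomic>
    #include <cstdint>

    // Advance `tail` to at least `upto`, never backwards, even under concurrency.
    void advance_tail(std::atomic< int64_t >& tail, int64_t upto) {
        int64_t cur = tail.load(std::memory_order_relaxed);
        while (cur < upto &&
               !tail.compare_exchange_weak(cur, upto, std::memory_order_relaxed)) {
            // On failure, `cur` is reloaded; the loop exits once tail >= upto.
        }
    }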
if (upto_lsn > m_tail_lsn) { - THIS_LOGSTORE_LOG(WARN, - "Truncating issued on lsn={} which is greater than tail_lsn={}", - upto_lsn, m_tail_lsn.load(std::memory_order_relaxed)); + THIS_LOGSTORE_LOG(WARN, "Truncating issued on lsn={} which is greater than tail_lsn={}", upto_lsn, + m_tail_lsn.load(std::memory_order_relaxed)); // update m_tail_lsn if it is less than upto_lsn auto current_tail_lsn = m_tail_lsn.load(std::memory_order_relaxed); while (current_tail_lsn < upto_lsn && @@ -218,6 +244,8 @@ void HomeLogStore::truncate(logstore_seq_num_t upto_lsn, bool in_memory_truncate m_records.truncate(upto_lsn); m_start_lsn.store(upto_lsn + 1); if (!in_memory_truncate_only) { m_logdev->truncate(); } + decr_pending_request_num(); + return true; } std::tuple< logstore_seq_num_t, logdev_key, logstore_seq_num_t > HomeLogStore::truncate_info() const { @@ -230,16 +258,30 @@ std::tuple< logstore_seq_num_t, logdev_key, logstore_seq_num_t > HomeLogStore::t : std::make_tuple(trunc_lsn, m_trunc_ld_key, tail_lsn); } -void HomeLogStore::fill_gap(logstore_seq_num_t seq_num) { +bool HomeLogStore::fill_gap(logstore_seq_num_t seq_num) { + if (is_stopping()) return false; + incr_pending_request_num(); HS_DBG_ASSERT_EQ(m_records.status(seq_num).is_hole, true, "Attempted to fill gap lsn={} which has valid data", seq_num); logdev_key empty_ld_key; m_records.create_and_complete(seq_num, logstore_record(empty_ld_key, empty_ld_key)); + decr_pending_request_num(); + return true; +} + +void HomeLogStore::stop() { + start_stopping(); + while (true) { + if (!get_pending_request_num()) break; + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } } nlohmann::json HomeLogStore::dump_log_store(const log_dump_req& dump_req) { nlohmann::json json_dump{}; // create root object + if (is_stopping()) return json_dump; + incr_pending_request_num(); json_dump["store_id"] = this->m_store_id; int64_t start_idx = std::max(dump_req.start_seq_num, start_lsn()); @@ -272,14 +314,19 @@ nlohmann::json HomeLogStore::dump_log_store(const log_dump_req& dump_req) { }); json_dump["log_records"] = std::move(json_records); + decr_pending_request_num(); return json_dump; } -void HomeLogStore::foreach (int64_t start_idx, const std::function< bool(logstore_seq_num_t, log_buffer) >& cb) { +bool HomeLogStore::foreach (int64_t start_idx, const std::function< bool(logstore_seq_num_t, log_buffer) >& cb) { + if (is_stopping()) return false; + incr_pending_request_num(); m_records.foreach_all_completed(start_idx, [&](int64_t cur_idx, homestore::logstore_record& record) -> bool { auto log_buf = m_logdev->read(record.m_dev_key); return cb(cur_idx, log_buf); }); + decr_pending_request_num(); + return true; } logstore_seq_num_t HomeLogStore::get_contiguous_issued_seq_num(logstore_seq_num_t from) const { @@ -290,24 +337,34 @@ logstore_seq_num_t HomeLogStore::get_contiguous_completed_seq_num(logstore_seq_n return (logstore_seq_num_t)m_records.completed_upto(from + 1); } -void HomeLogStore::flush(logstore_seq_num_t upto_lsn) { +bool HomeLogStore::flush(logstore_seq_num_t upto_lsn) { + if (is_stopping()) return false; + incr_pending_request_num(); if (!m_logdev->allow_explicit_flush()) { HS_LOG_ASSERT(false, "Explicit flush is turned off or calling flush on wrong thread for this logdev, ignoring flush"); - return; + decr_pending_request_num(); + return false; } m_logdev->flush_under_guard(); + decr_pending_request_num(); + return true; } bool HomeLogStore::rollback(logstore_seq_num_t to_lsn) { - //Fast path + if (is_stopping()) return false; + 
incr_pending_request_num(); + // Fast path if (to_lsn == m_tail_lsn.load()) { - return true; + decr_pending_request_num(); + return true; } if (to_lsn > m_tail_lsn.load() || to_lsn < m_start_lsn.load()) { - HS_LOG_ASSERT(false, "Attempted to rollback to {} which is not in the range of [{}, {}]", to_lsn, m_start_lsn.load(), m_tail_lsn.load()); + HS_LOG_ASSERT(false, "Attempted to rollback to {} which is not in the range of [{}, {}]", to_lsn, + m_start_lsn.load(), m_tail_lsn.load()); + decr_pending_request_num(); return false; } @@ -343,17 +400,21 @@ bool HomeLogStore::rollback(logstore_seq_num_t to_lsn) { if (do_flush) m_logdev->flush_under_guard(); } while (do_flush); + decr_pending_request_num(); return true; } nlohmann::json HomeLogStore::get_status(int verbosity) const { nlohmann::json js; + if (is_stopping()) return js; + incr_pending_request_num(); js["append_mode"] = m_append_mode; js["start_lsn"] = m_start_lsn.load(std::memory_order_relaxed); js["next_lsn"] = m_next_lsn.load(std::memory_order_relaxed); js["tail_lsn"] = m_tail_lsn.load(std::memory_order_relaxed); js["logstore_records"] = m_records.get_status(verbosity); js["logstore_sb_first_lsn"] = m_logdev->log_dev_meta().store_superblk(m_store_id).m_first_seq_num; + decr_pending_request_num(); return js; } diff --git a/src/lib/logstore/log_store_service.cpp b/src/lib/logstore/log_store_service.cpp index 375f892b3..86f404e8c 100644 --- a/src/lib/logstore/log_store_service.cpp +++ b/src/lib/logstore/log_store_service.cpp @@ -120,14 +120,20 @@ void LogStoreService::start(bool format) { } void LogStoreService::stop() { - // device_truncate(nullptr, true, false); + start_stopping(); + while (true) { + if (!get_pending_request_num()) break; + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + for (auto& [id, logdev] : m_id_logdev_map) { logdev->stop(); } - { - folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); - m_id_logdev_map.clear(); - } +} + +LogStoreService::~LogStoreService() { + folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); + m_id_logdev_map.clear(); } logdev_id_t LogStoreService::get_next_logdev_id() { @@ -137,16 +143,21 @@ logdev_id_t LogStoreService::get_next_logdev_id() { } logdev_id_t LogStoreService::create_new_logdev() { + if (is_stopping()) return 0; + incr_pending_request_num(); folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); logdev_id_t logdev_id = get_next_logdev_id(); auto logdev = create_new_logdev_internal(logdev_id); logdev->start(true /* format */, m_logdev_vdev); COUNTER_INCREMENT(m_metrics, logdevs_count, 1); HS_LOG(INFO, logstore, "Created log_dev={}", logdev_id); + decr_pending_request_num(); return logdev_id; } void LogStoreService::destroy_log_dev(logdev_id_t logdev_id) { + if (is_stopping()) return; + incr_pending_request_num(); folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); const auto it = m_id_logdev_map.find(logdev_id); if (it == m_id_logdev_map.end()) { @@ -156,10 +167,7 @@ void LogStoreService::destroy_log_dev(logdev_id_t logdev_id) { // Stop the logdev and release all the chunks from the journal vdev. auto& logdev = it->second; - if (!logdev->is_stopped()) { - // Stop the logdev if its started. - logdev->stop(); - } + logdev->stop(); // First release all chunks. 
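Stepping back, the LogStoreService::stop()/destructor split above gives a two-phase teardown: gate and drain the service, stop each logdev (now idempotent, so destroy_log_dev can call it unconditionally), and defer container destruction to the destructor. A condensed restatement of those hunks, not compilable standalone:

    // Condensed view of the stop()/~LogStoreService() split above.
    void LogStoreService::stop() {
        start_stopping();                          // 1. reject new API calls
        while (get_pending_request_num() != 0) {   // 2. drain in-flight calls
            std::this_thread::sleep_for(std::chrono::milliseconds(1000));
        }
        for (auto& [id, logdev] : m_id_logdev_map) // 3. stop each logdev
            logdev->stop();
    }

    LogStoreService::~LogStoreService() {          // 4. container teardown last
        folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx);
        m_id_logdev_map.clear();
    }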
m_logdev_vdev->destroy(logdev_id); @@ -170,6 +178,7 @@ void LogStoreService::destroy_log_dev(logdev_id_t logdev_id) { m_id_logdev_map.erase(it); COUNTER_DECREMENT(m_metrics, logdevs_count, 1); HS_LOG(INFO, logstore, "Removed log_dev={}", logdev_id); + decr_pending_request_num(); } void LogStoreService::delete_unopened_logdevs() { @@ -201,11 +210,15 @@ void LogStoreService::open_logdev(logdev_id_t logdev_id) { } std::vector< std::shared_ptr< LogDev > > LogStoreService::get_all_logdevs() { - folly::SharedMutexWritePriority::ReadHolder holder(m_logdev_map_mtx); std::vector< std::shared_ptr< LogDev > > res; + if (is_stopping()) return res; + incr_pending_request_num(); + folly::SharedMutexWritePriority::ReadHolder holder(m_logdev_map_mtx); + for (auto& [id, logdev] : m_id_logdev_map) { res.push_back(logdev); } + decr_pending_request_num(); return res; } @@ -265,11 +278,15 @@ void LogStoreService::rollback_super_blk_found(const sisl::byte_view& buf, void* } std::shared_ptr< HomeLogStore > LogStoreService::create_new_log_store(logdev_id_t logdev_id, bool append_mode) { + if (is_stopping()) return nullptr; + incr_pending_request_num(); folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); COUNTER_INCREMENT(m_metrics, logstores_count, 1); const auto it = m_id_logdev_map.find(logdev_id); HS_REL_ASSERT((it != m_id_logdev_map.end()), "logdev id {} doesnt exist", logdev_id); - return it->second->create_new_log_store(append_mode); + auto ret = it->second->create_new_log_store(append_mode); + decr_pending_request_num(); + return ret; } folly::Future< shared< HomeLogStore > > LogStoreService::open_log_store(logdev_id_t logdev_id, logstore_id_t store_id, @@ -283,6 +300,8 @@ folly::Future< shared< HomeLogStore > > LogStoreService::open_log_store(logdev_i } void LogStoreService::remove_log_store(logdev_id_t logdev_id, logstore_id_t store_id) { + if (is_stopping()) return; + incr_pending_request_num(); folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); COUNTER_INCREMENT(m_metrics, logstores_count, 1); const auto it = m_id_logdev_map.find(logdev_id); @@ -291,19 +310,25 @@ void LogStoreService::remove_log_store(logdev_id_t logdev_id, logstore_id_t stor return; } it->second->remove_log_store(store_id); + decr_pending_request_num(); COUNTER_DECREMENT(m_metrics, logstores_count, 1); } void LogStoreService::device_truncate() { // TODO: make device_truncate_under_lock return future and do collectAllFutures; + if (is_stopping()) return; + incr_pending_request_num(); for (auto& [id, logdev] : m_id_logdev_map) logdev->truncate(); + decr_pending_request_num(); } void LogStoreService::flush() { - for (auto& [id, logdev] : m_id_logdev_map) { + if (is_stopping()) return; + incr_pending_request_num(); + for (auto& [id, logdev] : m_id_logdev_map) logdev->flush_under_guard(); - } + decr_pending_request_num(); } void LogStoreService::start_threads() { @@ -334,6 +359,8 @@ void LogStoreService::start_threads() { nlohmann::json LogStoreService::dump_log_store(const log_dump_req& dump_req) { nlohmann::json json_dump{}; // create root object + if (is_stopping()) return json_dump; + incr_pending_request_num(); if (dump_req.log_store == nullptr) { for (auto& [id, logdev] : m_id_logdev_map) { json_dump[logdev->get_id()] = logdev->dump_log_store(dump_req); @@ -344,14 +371,18 @@ nlohmann::json LogStoreService::dump_log_store(const log_dump_req& dump_req) { nlohmann::json val = logdev->dump_log_store(dump_req); json_dump[logdev->get_id()] = std::move(val); } + decr_pending_request_num(); return 
json_dump; } nlohmann::json LogStoreService::get_status(const int verbosity) const { nlohmann::json js; + if (is_stopping()) return js; + incr_pending_request_num(); for (auto& [id, logdev] : m_id_logdev_map) { js[logdev->get_id()] = logdev->get_status(verbosity); } + decr_pending_request_num(); return js; } diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 2d7da6c72..eb9fa8dc0 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -135,6 +135,13 @@ bool RaftReplDev::join_group() { AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum) { + + if (is_stopping()) { + LOGINFO("repl dev is being shutdown!"); + return make_async_error<>(ReplServiceError::STOPPING); + } + incr_pending_request_num(); + LOGINFO("Replace member group_id={} member_out={} member_in={}", group_id_str(), boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); @@ -150,6 +157,7 @@ AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_ raft_server()->yield_leadership(true /* immediate */, -1 /* successor */); RD_LOGI("Replace member leader is the member_out so yield leadership"); reset_quorum_size(0); + decr_pending_request_num(); return make_async_error<>(ReplServiceError::NOT_LEADER); } @@ -171,6 +179,7 @@ AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_ } else { RD_LOGE("Replace member error in add member : {}", e.error()); reset_quorum_size(0); + decr_pending_request_num(); return make_async_error<>(RaftReplService::to_repl_error(e.error())); } } @@ -193,6 +202,7 @@ AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_ if (err != ReplServiceError::OK) { LOGERROR("Replace member propose to raft failed {}", err); reset_quorum_size(0); + decr_pending_request_num(); return make_async_error<>(std::move(err)); } @@ -214,6 +224,7 @@ AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_ // of replace member is idempotent. RD_LOGE("Replace member failed to remove member : {}", e.error()); reset_quorum_size(0); + decr_pending_request_num(); return make_async_error<>(ReplServiceError::RETRY_REQUEST); } } else { @@ -223,6 +234,7 @@ AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_ // Revert the quorum size back to 0. reset_quorum_size(0); + decr_pending_request_num(); return make_async_success<>(); }); }); @@ -280,6 +292,12 @@ void RaftReplDev::on_create_snapshot(nuraft::snapshot& s, nuraft::async_result< if (when_done) { when_done(ret_val, null_except); } } +// 1 before repl_dev.stop() is called, the upper layer should make sure that there is no pending request. so graceful +// shutdown can consider when stopping repl_dev, there is no pending request. +// 2 before the log is appended to log store, repl_dev will guarantee the corresponding data is persisted on disk. so +// even if we do not care about this when stop, it will be ok, since log will replayed after restart. + +// we do not have shutdown for async_alloc_write according to the two points above. 
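One observation on the guard pattern used throughout these hunks: incr_pending_request_num() and decr_pending_request_num() are paired by hand on every early return, which is easy to miss when a new exit path is added. A scope guard gives the same drain semantics automatically; a minimal sketch of that alternative style (not what the patch does):

    #include <atomic>

    // RAII alternative to hand-paired incr/decr: the destructor decrements on
    // every exit path. Usage: PendingGuard g{pending_request_num}; then early
    // returns stay balanced without explicit decrements.
    class PendingGuard {
        std::atomic_uint64_t& m_cnt;

    public:
        explicit PendingGuard(std::atomic_uint64_t& cnt) : m_cnt{cnt} { ++m_cnt; }
        ~PendingGuard() { --m_cnt; }
        PendingGuard(const PendingGuard&) = delete;
        PendingGuard& operator=(const PendingGuard&) = delete;
    };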
void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& data, repl_req_ptr_t rreq) { if (!rreq) { auto rreq = repl_req_ptr_t(new repl_req_ctx{}); } @@ -493,7 +511,8 @@ repl_req_ptr_t RaftReplDev::applier_create_req(repl_key const& rkey, journal_typ auto rreq = it->second; if (!happened) { - // We already have the entry in the map, reset its start time to prevent it from being incorrectly gc during use. + // We already have the entry in the map, reset its start time to prevent it from being incorrectly gc during + // use. rreq->set_created_time(); // Check if we are already allocated the blk by previous caller, in that case we need to return the req. if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { @@ -778,9 +797,10 @@ void RaftReplDev::on_fetch_data_received(intrusive< sisl::GenericRpcData >& rpc_ // Edit this check if in the future we want to fetch from non-originator; if (originator != server_id()) { - auto const error_msg = fmt::format("Did not expect to receive fetch data from " - "remote when I am not the originator of this request, originator={}, my_server_id={}" - , originator, server_id()); + auto const error_msg = + fmt::format("Did not expect to receive fetch data from " + "remote when I am not the originator of this request, originator={}, my_server_id={}", + originator, server_id()); RD_LOGW("{}", error_msg); auto status = ::grpc::Status(::grpc::INVALID_ARGUMENT, error_msg); rpc_data->set_status(status); @@ -970,7 +990,7 @@ void RaftReplDev::handle_config_commit(const repl_lsn_t lsn, raft_cluster_config // since we didn't create repl req for config change, we just need to update m_commit_upto_lsn here. // keep this variable in case it is needed later - (void) new_conf; + (void)new_conf; auto prev_lsn = m_commit_upto_lsn.load(std::memory_order_relaxed); if (prev_lsn >= lsn || !m_commit_upto_lsn.compare_exchange_strong(prev_lsn, lsn)) { RD_LOGE("Raft Channel: unexpected log {} commited before config {} committed", prev_lsn, lsn); @@ -1047,23 +1067,41 @@ repl_req_ptr_t RaftReplDev::repl_key_to_req(repl_key const& rkey) const { return it->second; } +// async_read and async_free_blks graceful shutdown will be handled by data_service. + folly::Future< std::error_code > RaftReplDev::async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, bool part_of_batch) { + if (is_stopping()) { + LOGINFO("repl dev is being shutdown!"); + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); + } return data_service().async_read(bid, sgs, size, part_of_batch); } -void RaftReplDev::async_free_blks(int64_t, MultiBlkId const& bid) { +folly::Future< std::error_code > RaftReplDev::async_free_blks(int64_t, MultiBlkId const& bid) { // TODO: For timeline consistency required, we should retain the blkid that is changed and write that to another // journal. 
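The change continuing below turns async_free_blks from void into folly::Future< std::error_code >, so callers can observe shutdown instead of having the free silently dropped: a stopping dev resolves the future with operation_canceled. A hypothetical caller-side fragment (rdev, lsn and blkid assumed in scope):

    // Hypothetical caller reacting to the new future-returning signature.
    rdev->async_free_blks(lsn, blkid).thenValue([](std::error_code ec) {
        if (ec == std::errc::operation_canceled) {
            // The repl dev was stopping; the free was never issued.
        }
    });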
- data_service().async_free_blk(bid); + if (is_stopping()) { + LOGINFO("repl dev is being shutdown!"); + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); + } + return data_service().async_free_blk(bid); } AsyncReplResult<> RaftReplDev::become_leader() { + if (is_stopping()) { + LOGINFO("repl dev is being shutdown!"); + return make_async_error<>(ReplServiceError::STOPPING); + } + incr_pending_request_num(); + return m_msg_mgr.become_leader(m_group_id).via(&folly::InlineExecutor::instance()).thenValue([this](auto&& e) { if (e.hasError()) { RD_LOGE("Error in becoming leader: {}", e.error()); + decr_pending_request_num(); return make_async_error<>(RaftReplService::to_repl_error(e.error())); } + decr_pending_request_num(); return make_async_success<>(); }); } @@ -1094,8 +1132,9 @@ std::set< replica_id_t > RaftReplDev::get_active_peers() const { uint64_t least_active_repl_idx = my_committed_idx > HS_DYNAMIC_CONFIG(consensus.laggy_threshold) ? my_committed_idx - HS_DYNAMIC_CONFIG(consensus.laggy_threshold) : 0; - // peer's last log idx should also >= leader's start_index-1(ensure existence), otherwise leader can't append log entries to it - // and baseline resync will be triggerred. Try to avoid conflict between baseline resync and normal replication. + // peer's last log idx should also >= leader's start_index-1(ensure existence), otherwise leader can't append log + // entries to it and baseline resync will be triggerred. Try to avoid conflict between baseline resync and normal + // replication. least_active_repl_idx = std::max(least_active_repl_idx, m_data_journal->start_index() - 1); for (auto p : repl_status) { if (p.id_ == m_my_repl_id) { continue; } @@ -1103,8 +1142,8 @@ std::set< replica_id_t > RaftReplDev::get_active_peers() const { res.insert(p.id_); } else { RD_LOGW("Excluding peer {} from active_peers, lag {}, my lsn {}, peer lsn {}, least_active_repl_idx {}", - p.id_, - my_committed_idx - p.replication_idx_, my_committed_idx, p.replication_idx_, least_active_repl_idx); + p.id_, my_committed_idx - p.replication_idx_, my_committed_idx, p.replication_idx_, + least_active_repl_idx); } } return res; diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index b6cd9d744..53731012f 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -175,7 +175,7 @@ class RaftReplDev : public ReplDev, repl_req_ptr_t ctx) override; folly::Future< std::error_code > async_read(MultiBlkId const& blkid, sisl::sg_list& sgs, uint32_t size, bool part_of_batch = false) override; - void async_free_blks(int64_t lsn, MultiBlkId const& blkid) override; + folly::Future< std::error_code > async_free_blks(int64_t lsn, MultiBlkId const& blkid) override; AsyncReplResult<> become_leader() override; bool is_leader() const override; replica_id_t get_leader_id() const override; @@ -192,6 +192,7 @@ class RaftReplDev : public ReplDev, bool is_destroyed() const; Clock::time_point destroyed_time() const { return m_destroyed_time; } bool is_ready_for_traffic() const override { + if (is_stopping()) return false; auto committed_lsn = m_commit_upto_lsn.load(); auto gate = m_traffic_ready_lsn.load(); bool ready = committed_lsn >= gate; diff --git a/src/lib/replication/repl_dev/solo_repl_dev.cpp b/src/lib/replication/repl_dev/solo_repl_dev.cpp index 4a6a92144..72d4fda48 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.cpp +++ b/src/lib/replication/repl_dev/solo_repl_dev.cpp @@ 
-41,7 +41,9 @@ void SoloReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& HS_REL_ASSERT(!err, "Error in writing data"); // TODO: Find a way to return error to the Listener write_journal(std::move(rreq)); }); - } else { write_journal(std::move(rreq)); } + } else { + write_journal(std::move(rreq)); + } } void SoloReplDev::write_journal(repl_req_ptr_t rreq) { @@ -96,7 +98,9 @@ folly::Future< std::error_code > SoloReplDev::async_read(MultiBlkId const& bid, return data_service().async_read(bid, sgs, size, part_of_batch); } -void SoloReplDev::async_free_blks(int64_t, MultiBlkId const& bid) { data_service().async_free_blk(bid); } +folly::Future< std::error_code > SoloReplDev::async_free_blks(int64_t, MultiBlkId const& bid) { + return data_service().async_free_blk(bid); +} uint32_t SoloReplDev::get_blk_size() const { return data_service().get_blk_size(); } diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index f252dd209..e734b3477 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -39,13 +39,15 @@ class SoloReplDev : public ReplDev { SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existing); virtual ~SoloReplDev() = default; + // TODO: implement graceful shutdown for solo repl dev + void async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, repl_req_ptr_t ctx) override; folly::Future< std::error_code > async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, bool part_of_batch = false) override; - void async_free_blks(int64_t lsn, MultiBlkId const& blkid) override; + folly::Future< std::error_code > async_free_blks(int64_t lsn, MultiBlkId const& blkid) override; AsyncReplResult<> become_leader() override { return make_async_error(ReplServiceError::OK); } bool is_leader() const override { return true; } diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index f5671cb16..c401a883a 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -45,11 +45,9 @@ GenericReplService::GenericReplService(cshared< ReplApplication >& repl_app) : m nullptr); } -void GenericReplService::stop() { - { - std::unique_lock lg{m_rd_map_mtx}; - m_rd_map.clear(); - } +GenericReplService::~GenericReplService() { + std::unique_lock lg{m_rd_map_mtx}; + m_rd_map.clear(); } ReplResult< shared< ReplDev > > GenericReplService::get_repl_dev(group_id_t group_id) const { @@ -80,6 +78,7 @@ hs_stats GenericReplService::get_cap_stats() const { ///////////////////// SoloReplService specializations and CP Callbacks ///////////////////////////// SoloReplService::SoloReplService(cshared< ReplApplication >& repl_app) : GenericReplService{repl_app} {} +SoloReplService::~SoloReplService(){}; void SoloReplService::start() { for (auto const& [buf, mblk] : m_sb_bufs) { @@ -95,8 +94,7 @@ void SoloReplService::start() { } void SoloReplService::stop() { - GenericReplService::stop(); - hs()->logstore_service().stop(); + // TODO: Implement graceful shutdown for soloReplService } AsyncReplResult< shared< ReplDev > > SoloReplService::create_repl_dev(group_id_t group_id, diff --git a/src/lib/replication/service/generic_repl_svc.h b/src/lib/replication/service/generic_repl_svc.h index acdff7bd4..a7325ceca 100644 --- a/src/lib/replication/service/generic_repl_svc.h +++ b/src/lib/replication/service/generic_repl_svc.h @@ -47,8 +47,8 @@ 
class GenericReplService : public ReplicationService { static std::shared_ptr< GenericReplService > create(cshared< ReplApplication >& repl_app); GenericReplService(cshared< ReplApplication >& repl_app); + virtual ~GenericReplService(); virtual void start() = 0; - virtual void stop(); meta_sub_type get_meta_blk_name() const override { return "repl_dev"; } ReplResult< shared< ReplDev > > get_repl_dev(group_id_t group_id) const override; @@ -57,15 +57,31 @@ class GenericReplService : public ReplicationService { hs_stats get_cap_stats() const override; replica_id_t get_my_repl_uuid() const { return m_my_uuid; } // void resource_audit() override; + virtual void stop() = 0; protected: virtual void add_repl_dev(group_id_t group_id, shared< ReplDev > rdev); virtual void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) = 0; + + // graceful shutdown related +protected: + std::atomic_bool m_stopping{false}; + mutable std::atomic_uint64_t pending_request_num{0}; + + bool is_stopping() const { return m_stopping.load(); } + void start_stopping() { m_stopping = true; } + + uint64_t get_pending_request_num() const { return pending_request_num.load(); } + + void incr_pending_request_num() const { pending_request_num++; } + void decr_pending_request_num() const { pending_request_num--; } }; +// TODO: implement graceful shutdown for soloReplService class SoloReplService : public GenericReplService { public: SoloReplService(cshared< ReplApplication >& repl_app); + ~SoloReplService() override; void start() override; void stop() override; diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index 6206c3dde..0d0391e34 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -179,10 +179,36 @@ void RaftReplService::start() { } void RaftReplService::stop() { - stop_reaper_thread(); - GenericReplService::stop(); + start_stopping(); + while (true) { + auto pending_request_num = get_pending_request_num(); + if (!pending_request_num) break; + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + + // stop all repl_devs + std::unique_lock lg(m_rd_map_mtx); + for (auto it = m_rd_map.begin(); it != m_rd_map.end(); ++it) { + auto rdev = std::dynamic_pointer_cast< RaftReplDev >(it->second); + rdev->stop(); + } + + // this will stop and shutdown all the repl_dev and grpc server(data channel). + // for each raft_repl_dev: + // 1 Cancel snapshot requests if exist. + // 2 Terminate background commit thread. + // 3 Cancel all scheduler tasks. + // after m_msg_mgr is reset , no further data will hit data service and no futher log will hit log store. m_msg_mgr.reset(); + hs()->logstore_service().stop(); + hs()->data_service().stop(); +} + +RaftReplService::~RaftReplService() { + stop_reaper_thread(); + + // the base class destructor will clear the m_rd_map } void RaftReplService::monitor_cert_changes() { @@ -296,6 +322,8 @@ shared< nuraft_mesg::mesg_state_mgr > RaftReplService::create_state_mgr(int32_t AsyncReplResult< shared< ReplDev > > RaftReplService::create_repl_dev(group_id_t group_id, std::set< replica_id_t > const& members) { + if (is_stopping()) return make_async_error< shared< ReplDev > >(ReplServiceError::STOPPING); + incr_pending_request_num(); // TODO: All operations are made sync here for convenience to caller. However, we should attempt to make this async // and do deferValue to a seperate dedicated hs thread for these kind of operations and wakeup the caller. 
It // probably needs iomanager executor for deferValue. @@ -303,6 +331,7 @@ AsyncReplResult< shared< ReplDev > > RaftReplService::create_repl_dev(group_id_t // Create a new RAFT group and add all members. create_group() will call the create_state_mgr which will create // the repl_dev instance and add it to the map. if (auto const status = m_msg_mgr->create_group(group_id, "homestore_replication").get(); !status) { + decr_pending_request_num(); return make_async_error< shared< ReplDev > >(to_repl_error(status.error())); } @@ -318,6 +347,7 @@ AsyncReplResult< shared< ReplDev > > RaftReplService::create_repl_dev(group_id_t } else if (result.error() != nuraft::CONFIG_CHANGING) { LOGWARNMOD(replication, "Groupid={}, add member={} failed with error={}", boost::uuids::to_string(group_id), boost::uuids::to_string(member), result.error()); + decr_pending_request_num(); return make_async_error< shared< ReplDev > >(to_repl_error(result.error())); } else { LOGWARNMOD(replication, @@ -330,6 +360,7 @@ AsyncReplResult< shared< ReplDev > > RaftReplService::create_repl_dev(group_id_t } auto result = get_repl_dev(group_id); + decr_pending_request_num(); return result ? make_async_success< shared< ReplDev > >(result.value()) : make_async_error< shared< ReplDev > >(ReplServiceError::SERVER_NOT_FOUND); } @@ -362,10 +393,18 @@ AsyncReplResult< shared< ReplDev > > RaftReplService::create_repl_dev(group_id_t // purge any unopened logstores. // folly::SemiFuture< ReplServiceError > RaftReplService::remove_repl_dev(group_id_t group_id) { + if (is_stopping()) return folly::makeSemiFuture< ReplServiceError >(ReplServiceError::STOPPING); + incr_pending_request_num(); + auto rdev_result = get_repl_dev(group_id); - if (!rdev_result) { return folly::makeSemiFuture< ReplServiceError >(ReplServiceError::SERVER_NOT_FOUND); } + if (!rdev_result) { + decr_pending_request_num(); + return folly::makeSemiFuture< ReplServiceError >(ReplServiceError::SERVER_NOT_FOUND); + } - return std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value())->destroy_group(); + auto ret = std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value())->destroy_group(); + decr_pending_request_num(); + return ret; } void RaftReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) { @@ -414,14 +453,23 @@ void RaftReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum) const { + if (is_stopping()) return make_async_error<>(ReplServiceError::STOPPING); + incr_pending_request_num(); auto rdev_result = get_repl_dev(group_id); - if (!rdev_result) { return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); } + if (!rdev_result) { + decr_pending_request_num(); + return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); + } return std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value()) ->replace_member(member_out, member_in, commit_quorum) .via(&folly::InlineExecutor::instance()) .thenValue([this](auto&& e) mutable { - if (e.hasError()) { return make_async_error<>(e.error()); } + if (e.hasError()) { + decr_pending_request_num(); + return make_async_error<>(e.error()); + } + decr_pending_request_num(); return make_async_success<>(); }); } diff --git a/src/lib/replication/service/raft_repl_service.h b/src/lib/replication/service/raft_repl_service.h index 9a53ad07d..953ba95e9 100644 --- a/src/lib/replication/service/raft_repl_service.h +++ 
b/src/lib/replication/service/raft_repl_service.h @@ -54,6 +54,7 @@ class RaftReplService : public GenericReplService, public: RaftReplService(cshared< ReplApplication >& repl_app); + ~RaftReplService() override; static ReplServiceError to_repl_error(nuraft::cmd_result_code code); @@ -101,7 +102,7 @@ class ReplSvcCPContext : public CPContext { std::map< ReplDev*, cshared< ReplDevCPContext > > m_cp_ctx_map; public: - ReplSvcCPContext(CP* cp) : CPContext(cp) {}; + ReplSvcCPContext(CP* cp) : CPContext(cp){}; virtual ~ReplSvcCPContext() = default; int add_repl_dev_ctx(ReplDev* dev, cshared< ReplDevCPContext > dev_ctx); cshared< ReplDevCPContext > get_repl_dev_ctx(ReplDev* dev); From bb39a22ceb8dfcebe7cf742a15d943a6bcc7db4b Mon Sep 17 00:00:00 2001 From: koujl <108138320+koujl@users.noreply.github.com> Date: Fri, 21 Feb 2025 09:38:11 +0800 Subject: [PATCH 071/170] Modify snapshot_context structure for persistency (#650) Previous snapshot_context interface is insufficient for decoupling homestore from customers on snapshot implementations. This commit replaces deserialize() with a virtual function of ReplDev for constructing particular snapshot_context instance from byte buffer. --- conanfile.py | 2 +- src/include/homestore/homestore_decl.hpp | 1 + src/include/homestore/replication/repl_dev.h | 39 +++----------------- src/lib/replication/repl_dev/raft_repl_dev.h | 39 +++++++++++++++++++- src/lib/replication/repl_dev/solo_repl_dev.h | 4 ++ 5 files changed, 49 insertions(+), 36 deletions(-) diff --git a/conanfile.py b/conanfile.py index b9275769f..93d78cc53 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.21" + version = "6.6.22" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/homestore_decl.hpp b/src/include/homestore/homestore_decl.hpp index 59d51f300..96c26ac09 100644 --- a/src/include/homestore/homestore_decl.hpp +++ b/src/include/homestore/homestore_decl.hpp @@ -20,6 +20,7 @@ #include #include +#include #include #include #include diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index a14dd8824..2b422d647 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -72,14 +72,13 @@ struct repl_key { using repl_snapshot = nuraft::snapshot; using repl_snapshot_ptr = nuraft::ptr< nuraft::snapshot >; -// Consumers of the ReplDevListener dont have to know what underlying -// snapshot implementation is used. Consumers can export and save the state -// of the snapshot using serialize and load the state using deserialize. +// Consumers of ReplDevListener don't have to know what underlying snapshot context implementation is used by the +// ReplDev. The state of the snapshot can be exported with serialize() and loaded with +// repl_dev.deserialize_snapshot_context(). 
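In practice the reworked contract reads as follows: the consumer persists the opaque blob produced by serialize() and later asks the ReplDev, which alone knows the concrete subclass, to rebuild the context. A usage sketch assuming the homestore namespace and the declarations in this hunk (error handling elided):

    // Round-trip sketch for the new interface.
    sisl::io_blob_safe export_snapshot(homestore::snapshot_context& ctx) {
        return ctx.serialize();  // concrete context type stays hidden from the caller
    }

    std::shared_ptr< homestore::snapshot_context > import_snapshot(homestore::ReplDev& rd,
                                                                   sisl::io_blob_safe& blob) {
        // Only the ReplDev knows which snapshot_context subclass to construct.
        return rd.deserialize_snapshot_context(blob);
    }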
class snapshot_context { public: snapshot_context(int64_t lsn) : lsn_(lsn) {} virtual ~snapshot_context() = default; - virtual void deserialize(const sisl::io_blob_safe& snp_ctx) = 0; virtual sisl::io_blob_safe serialize() = 0; int64_t get_lsn() { return lsn_; } @@ -87,36 +86,6 @@ class snapshot_context { int64_t lsn_; }; -class nuraft_snapshot_context : public snapshot_context { -public: - nuraft_snapshot_context(nuraft::snapshot& snp) : snapshot_context(snp.get_last_log_idx()) { - auto snp_buf = snp.serialize(); - snapshot_ = nuraft::snapshot::deserialize(*snp_buf); - } - - void deserialize(const sisl::io_blob_safe& snp_ctx) override { - // Load the context from the io blob to nuraft buffer. - auto snp_buf = nuraft::buffer::alloc(snp_ctx.size()); - nuraft::buffer_serializer bs(snp_buf); - bs.put_raw(snp_ctx.cbytes(), snp_ctx.size()); - snapshot_ = nuraft::snapshot::deserialize(bs); - lsn_ = snapshot_->get_last_log_idx(); - } - - sisl::io_blob_safe serialize() override { - // Dump the context from nuraft buffer to the io blob. - auto snp_buf = snapshot_->serialize(); - sisl::io_blob_safe blob{s_cast< size_t >(snp_buf->size())}; - std::memcpy(blob.bytes(), snp_buf->data_begin(), snp_buf->size()); - return blob; - } - - nuraft::ptr< nuraft::snapshot > nuraft_snapshot() { return snapshot_; } - -private: - nuraft::ptr< nuraft::snapshot > snapshot_; -}; - struct snapshot_obj { void* user_ctx{nullptr}; uint64_t offset{0}; @@ -480,6 +449,8 @@ class ReplDev { /// @brief Clean up resources on this repl dev. virtual void purge() = 0; + virtual std::shared_ptr deserialize_snapshot_context(sisl::io_blob_safe &snp_ctx) = 0; + virtual void attach_listener(shared< ReplDevListener > listener) { m_listener = std::move(listener); } virtual void detach_listener() { diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 53731012f..46bf18b28 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -113,6 +113,38 @@ struct ReplDevCPContext { uint64_t last_applied_dsn; }; +class nuraft_snapshot_context : public snapshot_context { +public: + nuraft_snapshot_context(nuraft::snapshot &snp) : snapshot_context(snp.get_last_log_idx()) { + auto snp_buf = snp.serialize(); + snapshot_ = nuraft::snapshot::deserialize(*snp_buf); + } + + nuraft_snapshot_context(sisl::io_blob_safe const &snp_ctx) : snapshot_context(0) { deserialize(snp_ctx); } + + sisl::io_blob_safe serialize() override { + // Dump the context from nuraft buffer to the io blob. + auto snp_buf = snapshot_->serialize(); + sisl::io_blob_safe blob{s_cast(snp_buf->size())}; + std::memcpy(blob.bytes(), snp_buf->data_begin(), snp_buf->size()); + return blob; + } + + void deserialize(const sisl::io_blob_safe &snp_ctx) { + // Load the context from the io blob to nuraft buffer. 
+ auto snp_buf = nuraft::buffer::alloc(snp_ctx.size()); + snp_buf->put_raw(snp_ctx.cbytes(), snp_ctx.size()); + snp_buf->pos(0); + snapshot_ = nuraft::snapshot::deserialize(*snp_buf); + lsn_ = snapshot_->get_last_log_idx(); + } + + nuraft::ptr<nuraft::snapshot> nuraft_snapshot() { return snapshot_; } + +private: + nuraft::ptr<nuraft::snapshot> snapshot_; +}; + class RaftReplDev : public ReplDev, public nuraft_mesg::mesg_state_mgr, public std::enable_shared_from_this< RaftReplDev > { @@ -204,8 +236,13 @@ class RaftReplDev : public ReplDev, m_data_journal->purge_all_logs(); } + std::shared_ptr<snapshot_context> deserialize_snapshot_context(sisl::io_blob_safe &snp_ctx) override { + return std::make_shared<nuraft_snapshot_context>(snp_ctx); + } + //////////////// Accessor/shortcut methods /////////////////////// - nuraft_mesg::repl_service_ctx* group_msg_service(); + nuraft_mesg::repl_service_ctx *group_msg_service(); + nuraft::raft_server* raft_server(); RaftReplDevMetrics& metrics() { return m_metrics; } diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index e734b3477..88d6174c7 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -58,6 +58,10 @@ class SoloReplDev : public ReplDev { bool is_ready_for_traffic() const override { return true; } void purge() override {} + std::shared_ptr<snapshot_context> deserialize_snapshot_context(sisl::io_blob_safe &snp_ctx) override { + return nullptr; + } + uuid_t group_id() const override { return m_group_id; } repl_lsn_t get_last_commit_lsn() const override { return 0; } From 6cacae047cc583f2f1fb17773054ff89061a8a16 Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Mon, 24 Feb 2025 23:43:53 +0800 Subject: [PATCH 072/170] fix deadlock in graceful shutdown (#652) m_rd_map_mtx is locked while cp_flush runs on the repl service, so we need to release it ASAP in case another component triggers a CP --- conanfile.py | 2 +- src/lib/replication/service/raft_repl_service.cpp | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/conanfile.py b/conanfile.py index 93d78cc53..0d66c297b 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.22" + version = "6.6.23" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index 0d0391e34..06aae86d0 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -187,10 +187,12 @@ void RaftReplService::stop() { } // stop all repl_devs - std::unique_lock lg(m_rd_map_mtx); - for (auto it = m_rd_map.begin(); it != m_rd_map.end(); ++it) { - auto rdev = std::dynamic_pointer_cast< RaftReplDev >(it->second); - rdev->stop(); + { + std::unique_lock lg(m_rd_map_mtx); + for (auto it = m_rd_map.begin(); it != m_rd_map.end(); ++it) { + auto rdev = std::dynamic_pointer_cast< RaftReplDev >(it->second); + rdev->stop(); + } } // this will stop and shutdown all the repl_dev and grpc server(data channel). From 77a885273205250c2d4aeea3a67e60e2360f4c35 Mon Sep 17 00:00:00 2001 From: yawzhang Date: Mon, 24 Feb 2025 15:23:12 +0800 Subject: [PATCH 073/170] Minor fixes 1. fix char* copy logic, add \0 at the end of array 2.
update nuraft_mesg version which has https://github.com/eBay/nuraft_mesg/pull/117 --- conanfile.py | 4 ++-- src/lib/device/device.h | 8 ++++++-- src/lib/device/device_manager.cpp | 3 ++- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/conanfile.py b/conanfile.py index 0d66c297b..6ae16c1b1 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.23" + version = "6.6.24" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" @@ -54,7 +54,7 @@ def build_requirements(self): def requirements(self): self.requires("iomgr/[^11.3]@oss/master", transitive_headers=True) self.requires("sisl/[^12.2]@oss/master", transitive_headers=True) - self.requires("nuraft_mesg/[^3.7.2]@oss/main", transitive_headers=True) + self.requires("nuraft_mesg/[^3.7.3]@oss/main", transitive_headers=True) self.requires("farmhash/cci.20190513@", transitive_headers=True) if self.settings.arch in ['x86', 'x86_64']: diff --git a/src/lib/device/device.h b/src/lib/device/device.h index beefdfc7f..1c3843534 100644 --- a/src/lib/device/device.h +++ b/src/lib/device/device.h @@ -36,6 +36,7 @@ VENUM(vdev_multi_pdev_opts_t, uint8_t, // Indicates the style of vdev when multi struct vdev_info { static constexpr size_t size = 512; static constexpr size_t user_private_size = 256; + static constexpr size_t max_name_len = 64; uint64_t vdev_size{0}; // 0: Size of the vdev uint32_t vdev_id{0}; // 8: Id for this vdev. It is unique per homestore instance @@ -48,7 +49,7 @@ struct vdev_info { uint8_t failed{0}; // 30: set to true if disk is replaced uint8_t hs_dev_type{0}; // 31: PDev dev type (as in fast or data) uint8_t multi_pdev_choice{0}; // 32: Choice when multiple pdevs are present (vdev_multi_pdev_opts_t) - char name[64]; // 33: Name of the vdev + char name[max_name_len]; // 33: Name of the vdev uint16_t checksum{0}; // 97: Checksum of this entire Block uint8_t alloc_type; // 98: Allocator type of this vdev uint8_t chunk_sel_type; // 99: Chunk Selector type of this vdev_id @@ -59,7 +60,10 @@ struct vdev_info { uint32_t get_vdev_id() const { return vdev_id; } uint64_t get_size() const { return vdev_size; } - void set_name(const std::string& n) { std::strncpy(charptr_cast(name), n.c_str(), 63); } + void set_name(const std::string& n) { + std::strncpy(charptr_cast(name), n.c_str(), max_name_len - 1); + name[max_name_len - 1] = '\0'; + } std::string get_name() const { return std::string{c_charptr_cast(name)}; } void set_allocated() { slot_allocated = s_cast< uint8_t >(0x01); }; diff --git a/src/lib/device/device_manager.cpp b/src/lib/device/device_manager.cpp index cac91237f..28eb37e33 100644 --- a/src/lib/device/device_manager.cpp +++ b/src/lib/device/device_manager.cpp @@ -99,7 +99,8 @@ void DeviceManager::format_devices() { ++m_first_blk_hdr.gen_number; m_first_blk_hdr.version = first_block_header::CURRENT_SUPERBLOCK_VERSION; std::strncpy(m_first_blk_hdr.product_name, first_block_header::PRODUCT_NAME, - first_block_header::s_product_name_size); + first_block_header::s_product_name_size - 1); + m_first_blk_hdr.product_name[first_block_header::s_product_name_size - 1] = '\0'; m_first_blk_hdr.num_pdevs = uint32_cast(m_dev_infos.size()); m_first_blk_hdr.max_vdevs = hs_super_blk::MAX_VDEVS_IN_SYSTEM; m_first_blk_hdr.max_system_chunks = hs_super_blk::MAX_CHUNKS_IN_SYSTEM; From 08567d8701ec3874e84e466ab6da978947b0aead Mon Sep 17 00:00:00 2001 From: koujl <108138320+koujl@users.noreply.github.com> Date: Thu, 27 Feb 2025 11:50:28 
+0800 Subject: [PATCH 074/170] Fix missing fields in RaftReplDev::save_state (#654) Newly added fields in nuraft::srv_state should also be persisted/loaded. Signed-off-by: Jilong Kou --- conanfile.py | 2 +- src/lib/replication/repl_dev/raft_repl_dev.cpp | 13 +++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/conanfile.py b/conanfile.py index 6ae16c1b1..daf9b5326 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.24" + version = "6.6.25" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index eb9fa8dc0..9829cb2b0 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1238,7 +1238,10 @@ void RaftReplDev::save_config(const nuraft::cluster_config& config) { void RaftReplDev::save_state(const nuraft::srv_state& state) { std::unique_lock lg{m_config_mtx}; - (*m_raft_config_sb)["state"] = nlohmann::json{{"term", state.get_term()}, {"voted_for", state.get_voted_for()}}; + (*m_raft_config_sb)["state"] = nlohmann::json{ + {"term", state.get_term()}, {"voted_for", state.get_voted_for()}, + {"election_timer_allowed", state.is_election_timer_allowed()}, {"catching_up", state.is_catching_up()} + }; m_raft_config_sb.write(); RD_LOGI("Saved state {}", (*m_raft_config_sb)["state"].dump()); } @@ -1248,11 +1251,17 @@ nuraft::ptr< nuraft::srv_state > RaftReplDev::read_state() { auto& js = *m_raft_config_sb; auto state = nuraft::cs_new< nuraft::srv_state >(); if (js["state"].empty()) { - js["state"] = nlohmann::json{{"term", state->get_term()}, {"voted_for", state->get_voted_for()}}; + js["state"] = nlohmann::json{ + {"term", state->get_term()}, {"voted_for", state->get_voted_for()}, + {"election_timer_allowed", state->is_election_timer_allowed()}, + {"catching_up", state->is_catching_up()} + }; } else { try { state->set_term(uint64_cast(js["state"]["term"])); state->set_voted_for(static_cast< int >(js["state"]["voted_for"])); + state->allow_election_timer(static_cast(js["state"]["election_timer_allowed"])); + state->set_catching_up(static_cast(js["state"]["catching_up"])); } catch (std::out_of_range const&) { LOGWARN("State data was not in the expected format [group_id={}]!", m_group_id) } From cd242fd3d95dec68220fe17c4c164961e7fc685f Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Wed, 12 Mar 2025 08:33:22 +0800 Subject: [PATCH 075/170] Add event callbacks into listener for upper layer (#657) some events need to be handled by upper layer. 
This PR adds three events: 1 fetch_data: the upper layer can decide which data should be returned 2 no_space_left: this error should be handled by the upper layer if necessary 3 on_log_replay_done: after log replay is done and before joining the raft group, the upper layer might do something --- conanfile.py | 2 +- src/include/homestore/replication/repl_dev.h | 24 ++++++- .../replication/repl_dev/raft_repl_dev.cpp | 64 +++++++++---------- src/lib/replication/repl_dev/raft_repl_dev.h | 18 +++--- .../replication/service/raft_repl_service.cpp | 25 +++++--- src/tests/test_raft_repl_dev.cpp | 26 ++++++-- 6 files changed, 100 insertions(+), 59 deletions(-) diff --git a/conanfile.py b/conanfile.py index daf9b5326..97512fad3 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.25" + version = "6.7.0" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index 2b422d647..dfa241f7a 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -11,6 +11,7 @@ #include #include #include +#include #include namespace nuraft { @@ -367,6 +368,25 @@ class ReplDevListener { /// @brief Free up user-defined context inside the snapshot_obj that is allocated during read_snapshot_obj. virtual void free_user_snp_ctx(void*& user_snp_ctx) = 0; + /// @brief ask upper layer to decide which data should be returned. + // @param header - header of the log entry. + // @param blkid - original blkid of the log entry + // @param sgs - sgs to be filled with data + // @param lsn - lsn of the log entry + virtual folly::Future< std::error_code > on_fetch_data(const int64_t lsn, const sisl::blob& header, + const MultiBlkId& blkid, sisl::sg_list& sgs) { + // default implementation is reading by blkid directly + return data_service().async_read(blkid, sgs, sgs.size); + } + + /// @brief ask upper layer to handle no_space_left event + virtual folly::Future< std::error_code > on_no_space_left(uint32_t pdev_id, chunk_num_t chunk_id) { + return folly::makeFuture< std::error_code >(std::error_code{}); + } + + /// @brief when restart, after all the logs are replayed and before joining raft group, notify the upper layer + virtual void on_log_replay_done(const group_id_t& group_id){}; + private: std::weak_ptr< ReplDev > m_repl_dev; }; @@ -449,7 +469,7 @@ class ReplDev { /// @brief Clean up resources on this repl dev.
virtual void purge() = 0; - virtual std::shared_ptr deserialize_snapshot_context(sisl::io_blob_safe &snp_ctx) = 0; + virtual std::shared_ptr< snapshot_context > deserialize_snapshot_context(sisl::io_blob_safe& snp_ctx) = 0; virtual void attach_listener(shared< ReplDevListener > listener) { m_listener = std::move(listener); } @@ -460,6 +480,8 @@ class ReplDev { } } + virtual shared< ReplDevListener > get_listener() { return m_listener; } + // we have no shutdown for repl_dev, since shutdown repl_dev is done by repl_service void stop() { start_stopping(); diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 9829cb2b0..330bca99e 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -335,7 +335,15 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& handle_error(rreq, ReplServiceError::DATA_DUPLICATED); return; } + +#ifdef _PRERELEASE + if (iomgr_flip::instance()->test_flip("disable_leader_push_data")) { + RD_LOGD("Simulating push data failure, so that all the follower will have to fetch data"); + } else + push_data_to_all_followers(rreq, data); +#else push_data_to_all_followers(rreq, data); +#endif COUNTER_INCREMENT(m_metrics, total_write_cnt, 1); COUNTER_INCREMENT(m_metrics, outstanding_data_write_cnt, 1); @@ -794,30 +802,8 @@ void RaftReplDev::on_fetch_data_received(intrusive< sisl::GenericRpcData >& rpc_ auto const& lsn = req->lsn(); auto const& originator = req->blkid_originator(); auto const& remote_blkid = req->remote_blkid(); - - // Edit this check if in the future we want to fetch from non-originator; - if (originator != server_id()) { - auto const error_msg = - fmt::format("Did not expect to receive fetch data from " - "remote when I am not the originator of this request, originator={}, my_server_id={}", - originator, server_id()); - RD_LOGW("{}", error_msg); - auto status = ::grpc::Status(::grpc::INVALID_ARGUMENT, error_msg); - rpc_data->set_status(status); - rpc_data->send_response(); - return; - } - - // fetch data based on the remote_blkid - // We are the originator of the blkid, read data locally; MultiBlkId local_blkid; - - // convert remote_blkid serialized data to local blkid local_blkid.deserialize(sisl::blob{remote_blkid->Data(), remote_blkid->size()}, true /* copy */); - - RD_LOGD("Data Channel: FetchData received: dsn={} lsn={} my_blkid={}", req->dsn(), lsn, - local_blkid.to_string()); - // prepare the sgs data buffer to read into; auto const total_size = local_blkid.blk_count() * get_blk_size(); sisl::sg_list sgs; @@ -827,7 +813,18 @@ void RaftReplDev::on_fetch_data_received(intrusive< sisl::GenericRpcData >& rpc_ // accumulate the sgs for later use (send back to the requester)); sgs_vec.push_back(sgs); - futs.emplace_back(async_read(local_blkid, sgs, total_size)); + + if (originator != server_id()) { + RD_LOGD("non-originator FetchData received: dsn={} lsn={} originator={}, my_server_id={}", req->dsn(), lsn, + originator, server_id()); + } else { + RD_LOGD("Data Channel: FetchData received: dsn={} lsn={}", req->dsn(), lsn); + } + + auto const& header = req->user_header(); + sisl::blob user_header = sisl::blob{header->Data(), header->size()}; + RD_LOGD("Data Channel: FetchData handled, my_blkid={}", local_blkid.to_string()); + futs.emplace_back(std::move(m_listener->on_fetch_data(lsn, user_header, local_blkid, sgs))); } folly::collectAllUnsafe(futs).thenValue( @@ -1238,10 +1235,10 @@ void RaftReplDev::save_config(const 
@@ -1238,10 +1235,10 @@ void RaftReplDev::save_config(const nuraft::cluster_config& config) {
 
 void RaftReplDev::save_state(const nuraft::srv_state& state) {
     std::unique_lock lg{m_config_mtx};
-    (*m_raft_config_sb)["state"] = nlohmann::json{
-        {"term", state.get_term()}, {"voted_for", state.get_voted_for()},
-        {"election_timer_allowed", state.is_election_timer_allowed()}, {"catching_up", state.is_catching_up()}
-    };
+    (*m_raft_config_sb)["state"] = nlohmann::json{{"term", state.get_term()},
+                                                  {"voted_for", state.get_voted_for()},
+                                                  {"election_timer_allowed", state.is_election_timer_allowed()},
+                                                  {"catching_up", state.is_catching_up()}};
     m_raft_config_sb.write();
     RD_LOGI("Saved state {}", (*m_raft_config_sb)["state"].dump());
 }
@@ -1251,17 +1248,16 @@ nuraft::ptr< nuraft::srv_state > RaftReplDev::read_state() {
     auto& js = *m_raft_config_sb;
     auto state = nuraft::cs_new< nuraft::srv_state >();
     if (js["state"].empty()) {
-        js["state"] = nlohmann::json{
-            {"term", state->get_term()}, {"voted_for", state->get_voted_for()},
-            {"election_timer_allowed", state->is_election_timer_allowed()},
-            {"catching_up", state->is_catching_up()}
-        };
+        js["state"] = nlohmann::json{{"term", state->get_term()},
+                                     {"voted_for", state->get_voted_for()},
+                                     {"election_timer_allowed", state->is_election_timer_allowed()},
+                                     {"catching_up", state->is_catching_up()}};
     } else {
         try {
             state->set_term(uint64_cast(js["state"]["term"]));
             state->set_voted_for(static_cast< int >(js["state"]["voted_for"]));
-            state->allow_election_timer(static_cast<bool>(js["state"]["election_timer_allowed"]));
-            state->set_catching_up(static_cast<bool>(js["state"]["catching_up"]));
+            state->allow_election_timer(static_cast< bool >(js["state"]["election_timer_allowed"]));
+            state->set_catching_up(static_cast< bool >(js["state"]["catching_up"]));
         } catch (std::out_of_range const&) {
             LOGWARN("State data was not in the expected format [group_id={}]!", m_group_id)
         }
diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h
index 46bf18b28..bb0a72815 100644
--- a/src/lib/replication/repl_dev/raft_repl_dev.h
+++ b/src/lib/replication/repl_dev/raft_repl_dev.h
@@ -115,22 +115,22 @@ struct ReplDevCPContext {
 
 class nuraft_snapshot_context : public snapshot_context {
 public:
-    nuraft_snapshot_context(nuraft::snapshot &snp) : snapshot_context(snp.get_last_log_idx()) {
+    nuraft_snapshot_context(nuraft::snapshot& snp) : snapshot_context(snp.get_last_log_idx()) {
         auto snp_buf = snp.serialize();
         snapshot_ = nuraft::snapshot::deserialize(*snp_buf);
     }
 
-    nuraft_snapshot_context(sisl::io_blob_safe const &snp_ctx) : snapshot_context(0) { deserialize(snp_ctx); }
+    nuraft_snapshot_context(sisl::io_blob_safe const& snp_ctx) : snapshot_context(0) { deserialize(snp_ctx); }
 
     sisl::io_blob_safe serialize() override {
         // Dump the context from nuraft buffer to the io blob.
         auto snp_buf = snapshot_->serialize();
-        sisl::io_blob_safe blob{s_cast<size_t>(snp_buf->size())};
+        sisl::io_blob_safe blob{s_cast< size_t >(snp_buf->size())};
         std::memcpy(blob.bytes(), snp_buf->data_begin(), snp_buf->size());
         return blob;
     }
 
-    void deserialize(const sisl::io_blob_safe &snp_ctx) {
+    void deserialize(const sisl::io_blob_safe& snp_ctx) {
         // Load the context from the io blob to nuraft buffer.
         auto snp_buf = nuraft::buffer::alloc(snp_ctx.size());
         snp_buf->put_raw(snp_ctx.cbytes(), snp_ctx.size());
@@ -139,10 +139,10 @@ class nuraft_snapshot_context : public snapshot_context {
         lsn_ = snapshot_->get_last_log_idx();
     }
 
-    nuraft::ptr<nuraft::snapshot> nuraft_snapshot() { return snapshot_; }
+    nuraft::ptr< nuraft::snapshot > nuraft_snapshot() { return snapshot_; }
 
 private:
-    nuraft::ptr<nuraft::snapshot> snapshot_;
+    nuraft::ptr< nuraft::snapshot > snapshot_;
 };
 
 class RaftReplDev : public ReplDev,
@@ -236,12 +236,12 @@ class RaftReplDev : public ReplDev,
         m_data_journal->purge_all_logs();
     }
 
-    std::shared_ptr<snapshot_context> deserialize_snapshot_context(sisl::io_blob_safe &snp_ctx) override {
-        return std::make_shared<nuraft_snapshot_context>(snp_ctx);
+    std::shared_ptr< snapshot_context > deserialize_snapshot_context(sisl::io_blob_safe& snp_ctx) override {
+        return std::make_shared< nuraft_snapshot_context >(snp_ctx);
     }
 
     //////////////// Accessor/shortcut methods ///////////////////////
-    nuraft_mesg::repl_service_ctx *group_msg_service();
+    nuraft_mesg::repl_service_ctx* group_msg_service();
     nuraft::raft_server* raft_server();
 
     RaftReplDevMetrics& metrics() { return m_metrics; }
diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp
index 06aae86d0..a8c4fff37 100644
--- a/src/lib/replication/service/raft_repl_service.cpp
+++ b/src/lib/replication/service/raft_repl_service.cpp
@@ -156,16 +156,21 @@ void RaftReplService::start() {
     LOGINFO("Starting DataService");
    hs()->data_service().start();
 
-    // Step 6: Iterate all the repl dev and ask each one of the join the raft group.
-    for (auto it = m_rd_map.begin(); it != m_rd_map.end();) {
-        auto rdev = std::dynamic_pointer_cast< RaftReplDev >(it->second);
-        rdev->wait_for_logstore_ready();
-        if (!rdev->join_group()) {
-            HS_REL_ASSERT(false, "FAILED TO JOIN GROUP, PANIC HERE");
-            it = m_rd_map.erase(it);
-        } else {
-            ++it;
-        }
+    // Step 6: Iterate all the repl devs and ask each one of them to join the raft group concurrently.
+    std::vector< std::future< bool > > join_group_futures;
+    for (const auto& [_, repl_dev] : m_rd_map) {
+        join_group_futures.emplace_back(std::async(std::launch::async, [&repl_dev]() {
+            auto rdev = std::dynamic_pointer_cast< RaftReplDev >(repl_dev);
+            rdev->wait_for_logstore_ready();
+
+            // upper layer can register a callback to be notified when log replay is done.
+            if (auto listener = rdev->get_listener(); listener) listener->on_log_replay_done(rdev->group_id());
+            return rdev->join_group();
+        }));
+    }
+
+    for (auto& future : join_group_futures) {
+        if (!future.get()) HS_REL_ASSERT(false, "FAILED TO JOIN GROUP, PANIC HERE");
     }
 
     // Step 7: Register to CPManager to ensure we can flush the superblk.
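A minimal, self-contained sketch of the fan-out/join pattern used in the hunk above, assuming only the C++ standard library. FakeReplDev is a hypothetical stand-in for RaftReplDev, not a HomeStore type; the point is only that one std::async task per device keeps a slow log replay on one group from serializing the startup of the others, while the second loop blocks until every group has joined.

#include <future>
#include <iostream>
#include <vector>

// Hypothetical stand-in for RaftReplDev; join_group() here just pretends to succeed.
struct FakeReplDev {
    int id;
    bool join_group() const { return true; }
};

int main() {
    std::vector< FakeReplDev > devs{{1}, {2}, {3}};

    // Fan-out: std::launch::async forces a dedicated thread per device.
    std::vector< std::future< bool > > futures;
    for (auto const& dev : devs) {
        futures.emplace_back(std::async(std::launch::async, [&dev] { return dev.join_group(); }));
    }

    // Join: startup proceeds only after every group reports success.
    for (auto& f : futures) {
        if (!f.get()) {
            std::cerr << "join_group failed\n";
            return 1;
        }
    }
    std::cout << "all " << devs.size() << " repl devs joined\n";
    return 0;
}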
diff --git a/src/tests/test_raft_repl_dev.cpp b/src/tests/test_raft_repl_dev.cpp
index c419e6b1d..cdcfa9b1e 100644
--- a/src/tests/test_raft_repl_dev.cpp
+++ b/src/tests/test_raft_repl_dev.cpp
@@ -33,8 +33,8 @@ TEST_F(RaftReplDevTest, Write_Duplicated_Data) {
         stored_key = dbs_[0]->inmem_db_.cbegin()->first;
         ASSERT_EQ(id, stored_key.id_);
     } else {
-        LOGINFO("I am not leader, leader_uuid={} my_uuid={}, do nothing",
-                boost::uuids::to_string(leader_uuid), boost::uuids::to_string(g_helper->my_replica_id()));
+        LOGINFO("I am not leader, leader_uuid={} my_uuid={}, do nothing", boost::uuids::to_string(leader_uuid),
+                boost::uuids::to_string(g_helper->my_replica_id()));
     }
 
     wait_for_commits(total_writes);
@@ -45,12 +45,12 @@
        if duplication found in leader proposal, reject it;
        if duplication found in the followers, skip it.
     */
-    //1. write the same data again on leader, should fail
+    // 1. write the same data again on leader, should fail
     if (leader_uuid == g_helper->my_replica_id()) {
         auto err = this->write_with_id(id, true /* wait_for_commit */);
         ASSERT_EQ(ReplServiceError::DATA_DUPLICATED, err);
 
-        //2. delete it from the db to simulate duplication in followers(skip the duplication check in leader side)
+        // 2. delete it from the db to simulate duplication in followers (skip the duplication check in leader side)
         dbs_[0]->inmem_db_.erase(stored_key);
         LOGINFO("data with id={} has been deleted from db", id);
         err = this->write_with_id(id, true /* wait_for_commit */);
@@ -109,6 +109,24 @@ TEST_F(RaftReplDevTest, Follower_Fetch_OnActive_ReplicaGroup) {
 
     g_helper->sync_for_cleanup_start();
 }
+
+TEST_F(RaftReplDevTest, Write_With_Disabled_Leader_Push_Data) {
+    g_helper->set_basic_flip("disable_leader_push_data");
+    LOGINFO("Homestore replica={} setup completed, all the push_data from leader are disabled",
+            g_helper->replica_num());
+    g_helper->sync_for_test_start();
+
+    this->write_on_leader(100, true /* wait_for_commit */);
+
+    g_helper->sync_for_verify_start();
+
+    LOGINFO("Validate all data written so far by reading them");
+    this->validate_data();
+
+    g_helper->sync_for_cleanup_start();
+}
+
 #endif
 
 // do some io before restart;

From 43992eb7d33099cda767fd04ad59bce089a19f71 Mon Sep 17 00:00:00 2001
From: ywz <649521587@qq.com>
Date: Wed, 12 Mar 2025 14:16:02 +0800
Subject: [PATCH 076/170] Destroy upper resources after Raft server shutdown
 (#658)

Previously, upper layer resources were destroyed when a member was removed
from the cluster, while resources on the repl dev were garbage collected in
the reaper thread. During this period, the commit thread could still be
active, potentially leading to new commits accessing already destroyed
resources.

This change moves the destruction of upper layer resources into the garbage
collection thread. The steps are now:
1. Shutdown Raft server
2. Destroy upper resources
3. Destroy other resources on the repl dev

authored-by: yawzhang
---
 conanfile.py                                      |  2 +-
 .../replication/repl_dev/raft_repl_dev.cpp        |  6 +--
 .../replication/service/raft_repl_service.cpp     | 39 ++++++++++++-------
 3 files changed, 29 insertions(+), 18 deletions(-)

diff --git a/conanfile.py b/conanfile.py
index 97512fad3..bda41e82f 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@ class HomestoreConan(ConanFile):
     name = "homestore"
-    version = "6.7.0"
+    version = "6.7.1"
     homepage = "https://github.com/eBay/Homestore"
     description = "HomeStore Storage Engine"

diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp
index 330bca99e..2d2e5743f 100644
--- a/src/lib/replication/repl_dev/raft_repl_dev.cpp
+++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp
@@ -1283,6 +1283,8 @@ std::shared_ptr< nuraft::state_machine > RaftReplDev::get_state_machine() { retu
 
 void RaftReplDev::permanent_destroy() {
     RD_LOGI("Permanent destroy for raft repl dev group_id={}", group_id_str());
+    // let the listener know first, so that it can clean up persistent structures before the raft repl dev is destroyed
+    m_listener->on_destroy(group_id());
     m_raft_config_sb.destroy();
     m_data_journal->remove_store();
     logstore_service().destroy_log_dev(m_data_journal->logdev_id());
@@ -1310,10 +1312,6 @@ void RaftReplDev::leave() {
     m_stage.update([](auto* stage) { *stage = repl_dev_stage_t::DESTROYED; });
     m_destroyed_time = Clock::now();
 
-    // We let the listener know right away, so that they can cleanup persistent structures soonest. This will
-    // reduce the time window of leaked resources if any
-    m_listener->on_destroy(group_id());
-
     // Persist that destroy pending in superblk, so that in case of crash before cleanup of resources, it can be done
     // post restart.
     m_rd_sb->destroy_pending = 0x1;

diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp
index a8c4fff37..5a75093b6 100644
--- a/src/lib/replication/service/raft_repl_service.cpp
+++ b/src/lib/replication/service/raft_repl_service.cpp
@@ -558,21 +558,34 @@ void RaftReplService::gc_repl_reqs() {
 }
 
 void RaftReplService::gc_repl_devs() {
-    std::unique_lock lg(m_rd_map_mtx);
-    for (auto it = m_rd_map.begin(); it != m_rd_map.end();) {
-        auto rdev = std::dynamic_pointer_cast< RaftReplDev >(it->second);
-        if (rdev->is_destroy_pending() &&
-            (get_elapsed_time_sec(rdev->destroyed_time()) >=
-             HS_DYNAMIC_CONFIG(generic.repl_dev_cleanup_interval_sec))) {
-            LOGINFOMOD(replication,
-                       "ReplDev group_id={} was destroyed, shutting down the raft group in delayed fashion now",
-                       rdev->group_id());
-            m_msg_mgr->leave_group(rdev->group_id());
-            it = m_rd_map.erase(it);
-        } else {
-            ++it;
+    incr_pending_request_num();
+    std::vector< group_id_t > groups_to_leave;
+    {
+        std::shared_lock lg(m_rd_map_mtx);
+        for (auto it = m_rd_map.begin(); it != m_rd_map.end(); ++it) {
+            auto rdev = std::dynamic_pointer_cast< RaftReplDev >(it->second);
+            if (rdev->is_destroy_pending() &&
+                (get_elapsed_time_sec(rdev->destroyed_time()) >=
+                 HS_DYNAMIC_CONFIG(generic.repl_dev_cleanup_interval_sec))) {
+                LOGINFOMOD(replication,
+                           "ReplDev group_id={} was destroyed, shutting down the raft group in delayed fashion now",
+                           rdev->group_id());
+                groups_to_leave.push_back(rdev->group_id());
+            }
+        }
+    }
+
+    // Call leave_group to shut down the raft server and destroy all resources on the repl dev.
+    // This operation may require acquiring the m_rd_map_mtx lock for some steps (e.g., trigger cp flush).
+    // Therefore, we perform it outside the lock scope and then remove the group from m_rd_map.
+    for (const auto& group_id : groups_to_leave) {
+        m_msg_mgr->leave_group(group_id);
+        {
+            std::unique_lock lg(m_rd_map_mtx);
+            m_rd_map.erase(group_id);
+        }
     }
+
+    decr_pending_request_num();
 }
 
 void RaftReplService::flush_durable_commit_lsn() {
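The gc_repl_devs() rework above follows a common two-phase locking pattern: collect candidates under a shared lock, then act on them with no lock held, because leave_group() may re-enter paths that acquire m_rd_map_mtx. A standalone sketch of that pattern follows; map_mtx, groups, and leave_group() are hypothetical stand-ins, not HomeStore APIs.

#include <iostream>
#include <map>
#include <mutex>
#include <shared_mutex>
#include <string>
#include <vector>

std::shared_mutex map_mtx;
std::map< std::string, bool > groups;   // group_id -> destroy_pending

void leave_group(std::string const& id) {
    // In the real service this call may re-acquire the map lock (e.g. via a
    // cp flush), which is exactly why it must run outside the lock below.
    std::cout << "leaving " << id << "\n";
}

void gc_groups() {
    std::vector< std::string > to_leave;
    {
        std::shared_lock lg(map_mtx);    // phase 1: scan only, no mutation
        for (auto const& [id, pending] : groups) {
            if (pending) to_leave.push_back(id);
        }
    }                                    // shared lock released here

    for (auto const& id : to_leave) {
        leave_group(id);                 // phase 2: heavy work, no lock held
        std::unique_lock lg(map_mtx);    // short exclusive section per erase
        groups.erase(id);
    }
}

int main() {
    groups = {{"g1", true}, {"g2", false}, {"g3", true}};
    gc_groups();
    std::cout << groups.size() << " group(s) remain\n";
    return 0;
}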
From e23574a330274baebc836d62192ae11780d38337 Mon Sep 17 00:00:00 2001
From: yawzhang
Date: Thu, 13 Mar 2025 14:47:38 +0800
Subject: [PATCH 077/170] Fixes on gc repl devs.

1. Skip gc repl devs when raft repl service is stopping, to avoid concurrency
   issues between repl_dev's stop and destroy ops.
2. Skip flush ops on a repl dev if the repl dev is destroyed.

---
 conanfile.py                                      |  2 +-
 src/lib/replication/repl_dev/raft_repl_dev.cpp    | 10 ++++++++++
 src/lib/replication/service/raft_repl_service.cpp |  7 +++++++
 3 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/conanfile.py b/conanfile.py
index bda41e82f..1b997771b 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@ class HomestoreConan(ConanFile):
     name = "homestore"
-    version = "6.7.1"
+    version = "6.7.2"
     homepage = "https://github.com/eBay/Homestore"
     description = "HomeStore Storage Engine"

diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp
index 2d2e5743f..4b4a22213 100644
--- a/src/lib/replication/repl_dev/raft_repl_dev.cpp
+++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp
@@ -1409,6 +1409,11 @@ nuraft::cb_func::ReturnCode RaftReplDev::raft_event(nuraft::cb_func::Type type,
 }
 
 void RaftReplDev::flush_durable_commit_lsn() {
+    if (is_destroyed()) {
+        RD_LOGI("Raft repl dev is destroyed, ignore flush durable commit lsn");
+        return;
+    }
+
     auto const lsn = m_commit_upto_lsn.load();
     std::unique_lock lg{m_sb_mtx};
     m_rd_sb->durable_commit_lsn = lsn;
@@ -1417,6 +1422,11 @@
 
 /////////////////////////////////// Private methods ////////////////////////////////////
 void RaftReplDev::cp_flush(CP* cp, cshared< ReplDevCPContext > ctx) {
+    if (is_destroyed()) {
+        RD_LOGI("Raft repl dev is destroyed, ignore cp flush");
+        return;
+    }
+
     auto const lsn = ctx->cp_lsn;
     auto const clsn = ctx->compacted_to_lsn;
     auto const dsn = ctx->last_applied_dsn;

diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp
index 5a75093b6..ef0aaecc5 100644
--- a/src/lib/replication/service/raft_repl_service.cpp
+++ b/src/lib/replication/service/raft_repl_service.cpp
@@ -559,6 +559,13 @@ void RaftReplService::gc_repl_reqs() {
 
 void RaftReplService::gc_repl_devs() {
     incr_pending_request_num();
+    // Skip gc when raft repl service is stopping to avoid concurrency issues between repl_dev's stop and destroy ops.
+    if (is_stopping()) {
+        LOGINFOMOD(replication, "ReplSvc is stopping, skipping GC");
+        decr_pending_request_num();
+        return;
+    }
+
     std::vector< group_id_t > groups_to_leave;
     {
         std::shared_lock lg(m_rd_map_mtx);

From f61a1d18ff8697df094cd3033d9cd15a9db7d6c7 Mon Sep 17 00:00:00 2001
From: yawzhang
Date: Mon, 17 Mar 2025 10:03:37 +0800
Subject: [PATCH 078/170] Send response if failed to save pushed_data

The previous implementation missed sending a response when storing pushed_data
failed. As a result, a large amount of RPC data was held in memory and only
released when the connection timed out, leading to increased memory usage.
This change fixes the issue.
The issue occurs in the test case `full pg recovery in 5 replicas env,
blob_size=512KB`: when a batch of old shards is sealed and a batch of new
shards is created, many pushed_data requests fail to save because the shards
are not yet committed. Memory usage then grows until the pod goes OOM during
the test.

---
 conanfile.py                                   | 2 +-
 src/lib/replication/repl_dev/raft_repl_dev.cpp | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/conanfile.py b/conanfile.py
index 1b997771b..0e4b79f60 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@ class HomestoreConan(ConanFile):
     name = "homestore"
-    version = "6.7.2"
+    version = "6.7.3"
     homepage = "https://github.com/eBay/Homestore"
     description = "HomeStore Storage Engine"

diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp
index 4b4a22213..9bca6d63d 100644
--- a/src/lib/replication/repl_dev/raft_repl_dev.cpp
+++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp
@@ -452,6 +452,7 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d
         LOGINFO("Data Channel: Flip is enabled, skip on_push_data_received to simulate fetch remote data, "
                 "server_id={}, term={}, dsn={}",
                 push_req->issuer_replica_id(), push_req->raft_term(), push_req->dsn());
+        rpc_data->send_response();
         return;
     }
 #endif
@@ -463,11 +464,13 @@
             "Data Channel: Creating rreq on applier has failed, will ignore the push and let Raft channel send "
             "trigger a fetch explicitly if needed. rkey={}",
             rkey.to_string());
+        rpc_data->send_response();
         return;
     }
 
     if (!rreq->save_pushed_data(rpc_data, incoming_buf.cbytes() + fb_size, push_req->data_size())) {
         RD_LOGD("Data Channel: Data already received for rreq=[{}], ignoring this data", rreq->to_string());
+        rpc_data->send_response();
         return;
     }
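One way to make such handlers robust by construction is an RAII guard that responds on every exit path. This is not what the patch above does (it adds explicit send_response() calls at each early return); the sketch below is only a hedged illustration of the same lesson, with RpcData as a hypothetical stand-in for sisl::GenericRpcData.

#include <iostream>
#include <string>

// Hypothetical stand-in: an RPC whose buffers stay pinned until a response is sent.
struct RpcData {
    std::string payload;
    bool responded = false;
    void send_response() { responded = true; }
};

// RAII guard: whichever early-return path the handler takes, a response goes
// out and the RPC buffers can be released, instead of waiting for a timeout.
class ResponseGuard {
    RpcData& rpc_;
public:
    explicit ResponseGuard(RpcData& rpc) : rpc_(rpc) {}
    ~ResponseGuard() {
        if (!rpc_.responded) rpc_.send_response();
    }
};

bool save_pushed_data(RpcData const& rpc) {
    return rpc.payload != "duplicate";   // pretend duplicates fail to save
}

void on_push_data_received(RpcData& rpc) {
    ResponseGuard guard{rpc};            // covers every return below
    if (!save_pushed_data(rpc)) {
        std::cout << "save failed, responding anyway\n";
        return;                          // guard still sends the response
    }
    std::cout << "data saved\n";
}

int main() {
    RpcData ok{"fresh"}, dup{"duplicate"};
    on_push_data_received(ok);
    on_push_data_received(dup);
    std::cout << "ok responded=" << ok.responded << " dup responded=" << dup.responded << "\n";
    return 0;
}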
"https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/docs/imgs/HomeStore_Disk_Layout2.png b/docs/imgs/HomeStore_Disk_Layout2.png new file mode 100644 index 0000000000000000000000000000000000000000..8775927ee41926ca4ebdfa97c5cb34afbf3c7bd1 GIT binary patch literal 345352 zcmb@u2Uru^`Ys+ih=PEENZk;MB3*hH5D)nrc~u7p3*RJn5L<|PmabVW^7Ne2WX zfrCHA_DMm6csD^rxJ!VKEFQz( z`}gs#k9cnw?!QqWaYQv1fu1@eel$Dz6#_S&J zyXtE^l(cY!@;$S3d~U_(33bBV2PEw&3G70xT%WOeLLm+?NlzKJiz_67ecWMwHr9(v zT^l z&(Xz(Ur<6qf?q(0Ur2}-xPlkv<>30vlh*;p{^w2px{s0-%)-Ua$<@x$ffaY(XU`qo zTxHnUa1Z+X@6Uc(dD{K!Ne-~zuLZmyKkf{_AfEvL-}eTZO5=`7KC<()f*2^-K>?Wo z&ybZ65R$&Q{(n33uP6SarT)KK3Q7oz|Fh{oPW@-oCon4)MMo&`Ojp@|jo0tS|2+A- zp)^14)&C)iKjnOJ6cDuRC29V@2Tk^pCO%pj7)M4sC9TK67SK4{AKq8sANQYIU>`r1 zg2ti)&ErR36BP$uFaN)O*=xN7NzZ@${p2_Rd%QDu zzCG7q5hGEEv0ER*p6_CJP9`cWI)^)x2wp#5G{YX6HDWOc?3VlXdzZ z6IX4Iz0V~4eh>+BZqXL(+3srXOg+rz*5q&*l^g0arT^d%lu~jnr=Yc!@#?chv-4#$ zmnmkq@7mwCfGd{6$`$0eYHZP3*aNMZTIf^*dyM#VJEk7IL;?m9W>0g}$Z(lr#joaF zzp|+oKOc{ig;0KzP?xp40|}#({O>Dbu_v+hCX_MSX$M{KO z>uF=tk@(K1#iz^(>>HnNhDp3t)`FCuKf7Zra<)Bp4(*hl65>}%wBQy!UvWI=EE}N@ zeyUNtdc%hgmfBnVB9|c|y7hdsbw^y!5N)SqqE93Ay5hBgi1PT!fJe(nrsvOa4Jdu9khPfz>=kWDu^QN5x4YfLbeTLPzN^YyDNazO$(3YmXeoheZ{S}?*`1WkG z&yNq5a(H8WykoK_@vqGZSSZF^xH6o+aF|k}CC?L*>;*7aqFxotJ=LIEz7cT+&Ao`O z=wxpEz&L>rrfJvy0viq9>=Ju+aJt?>$r z@VtXO#R@=1+t2?shJfmB^j#tDuJL2{9is9-d>T-z-~t^H0p9&h$hSWtt1_iLHsq2# zJC(bX=}f*e`R;vIXDnce9+zeR(gBR4@#T05$u-vNKH(mKK!I0S2o+jTPGtXbf3rQZ z0T|^@KzD&T@`T*wV#@tFJ^qy3ze~Ao)p_9+ojYK#Mg#8rqR1v0V+v*VUZYl7R|$Ro z6&Sv6z%VD(xmJQO5Y>%#eW$M{| z`_K~?S`114%KcbZ%hzm=e)0dpG8vos*l+$zTX6P~9bN-v$8-VH0T}rUk7IE(0Ei&J zD>V%Io*i-nY~L7VhZ6^-o z-a!7fPm3`|6U1(Iyq%9AgQZcMfOFXD@xE-O*AwOfbXd$hCa;Xe?_dZW%Q36GRZjmb z*t;5CWv~m|aq(+>k;_N7S@Q`|0{ zpq**u^}*pf&8IK6;n)FhTZa4c=D-=h&Tpc0$>QCHpe+tPJYI>ITMm~XL2WO7!Ou$9 zJ}c9capBt;1a>&NgPb(z7b2W||!iLAni$Jvtb9iaS2gVzwz#Ks4(;6G52RSygL zojO4I+=v@AlgZU@WDzE3D$HJ%5|-J`LY>Z;S5y~5VX^xoDm~G5)1*&2>a8XzjkJ7! 
z(lnO&F7r&R@H%?+?!2Y+Tl*qg9>HvT9z^MzBZce}hSlF%B{$uB`9@*iI@-xOTEmxe z)iE;9p{l_cey$}|hrg|^<0r<$TBg{9d-DGCCR+ zID~tx0e?5B+-dw1%G3C-L9AS z7vIm)qPOF1cCSl$Jv5cSzY0%|#QOOiUmqaJP>4~@%DQSX`7#*U;Op3H1;_BXx7{~;8vO-GGkk6 zE@MRPEzVZ|Zpf-@Y%I89UIO8zNqp>kiR64F`~2XFN9BzNO8dH5`-zNK`rIC^^JjR- zTbA0qBOjwts@tBZDw5(*zJ}}(q044>PH1CWQ=9Yop2z&7`RDbru2pav<|qI8&#h0s zPH1H_BMvtD(_R1LM&S%=Og*aKaklxWrfi$fXQG#OM7>#u0*%fkH7@-)i_AJ++WpGC z*-RpL@_FV>sVfI6CK}R3BIc>sd_!)~>&G=*9RI!GIzB;%c&DcHuXYhXn~&&6F;7n< zuQ6m4(V2QatMfVDDR~;uzq*Dn_NZ>_csp*s_0!P7Oa-j6reM~Y{heC+t9Kh!cTMjI zf_{uCUiwFf^HmK)Ojncwrh2Qq+e@%l)BgKyyEwL<`K*= zZGI~b6J;aK9(;}MMz{dsXkwV^*=Wr!8>>@DbR;>LV1D*jjEC#4@2XT|>q%s5mjzdh z2z44>u&3EPE9l^ZUh@l&#;KXo#@EXT)ZVh_(ERZtOKX z-$zM8Wa2PuDsl~{?Od|)r*kHNx?N5p59FIuW@!coF3Ft)%XL9}Ev`7Y<}tXHdH?u$ z8LrM=ik!rUuO~pF&nFQ6qH|5Z^5oEYGlyjXXKpQai*l4i_$rCPf}KF{P=^Zv8IeG| z{GsG_;q~UDF^38h zjl`wtuXbH-U&mUGYIbl~fK=`q=O&u|`_(7wMcK1V0hnLZ>ne$)VxXplD~?@lQQP6+ z`*&5dj=UJ59Biqi7~ui&ywl!ROs}uZ%f$^iDICwZO`FYUO{LAWltVwgJjNQ@mU&#P zh}SD`l{1^U<65-CDOTV8h1(n?*hqdb@}|t!YTX`#V688El83lJ2 zG~mJ>NB(af8d_?WkHB9)XXky*xvu#=>BB>Ko(24--8IqS5HYn(_v|qa-f6CNLdb2E zkZ4@9Dx2(N_FTTTi4S)8MG1G;Mb;JQ-)X7e1keg%zkCiscY)W{`ESQn2ZqQjUZ~#$ z@Snw$B13AJdc(UHg7K@9qb^F{IlkO~S=0~X@aF6^ODI$;I$o@#+!y?X}%|4S!VZrKTjW|jWt=r zYu{yn<3#FzlsJf>FspK{kBurgAA*YCL)jr20q3axGuJ$qN1as&Ss#~;_eGk+ulB%D z8ELgCQZ3n|ir8fl^C6H%Y8id+>~YJn#;|)+LiIO`Q8B2SjM`Q2eu;j`g+D00y1)wG zx&doG*FDyHerT?u*6YD&4^ud*nJ_P+7DScmN9pJ zRF}Rv+)%6BWri*=O9KmJoDx8@9PXw(;`irX)N4im=*G>*?%lK5U5()#P{Oq*(rRMY zmt;Gg*I&QtUSz?ZU{FsOUW+a_CTScv_qYKF)ABy%1i;#s(&sQMmDbkTMT`fr%HgP= z9s$Q5NDu8@LIjr?dN_I1*Oi^YUbbm~D7j>}*5Yw4jsy z;W=c~tTF5{- zF*sq2wf-m;XLs+S?9?ux2P(iCqyx}HSh(0lvMhwN82^l`J6rI(+bH~<_K{zeS|di;{>`FyGzUa@|D+#2w@S>7kmF(Sf* zV5TCd1P%%YVirZNb^0iJ@4|OWm+{mMJiiS<9et+MA{sKM0T-B_Z@U7i^DP|TXidT% z)^K~B*HNkRbtdU;CHc;E_?poAm14aheMQQmD{@s6K9G4|PgJj2*5~IDJ6ZW%dzYIG zTFjCh9PHPYx+22$f4Qk^1^QQcJ4s{8C>1#K_-3cVDK@d-)6CfGQ8aUP|AwBt3yP(z z7u++nv4u+}Eumh%YPg))mk9VzSy~1-xWCZ}5@gL^w%p;NzuO8ikbAaJwXFRK*%%p( zaU(_Qt3h2xk=v6s_Wjsebf_y8yzY(#dDWAVN059X`Pr3jglbl;Wy7NZC=f>#dE<`V zcN{`a)lMFdfD@qAkXtg>80_%k>Nun^dtd7e&qT7->E2HePqE<8R`%!QF>k+f@PZna z+6Ux5)~XwVmP&YOEcJi1n`b#Pz74IbEuCSNc)mZ$t}I)MnMPNO_aP=9&VRgVD%5#% zEhm^_!w?EU%#ou{bX~qLtEg7@k$*3!VN|Sec6Nkj^7&aCi6SB9wJ)vCz8d%XX6IgO zL4%A?mCBCz*HEb$s|CC{BheyWGRoODy|!6HYCgXv zmYQ~2Fw6X_HVVilqmFb%N}1}@1(x%Lrn46>DZEbCl+O|?bq5@M7a36X9k*@@%ZPM_st=WtpW5MnnPF!^b?qp!pdZazr`jCgf~>yf&11W}x26P=TXD4m|5=Lk z-RF^;+35G#8>K!oo9--UYbvn%lj(%4gZv$wBk`zfoGEGi#_JGYCsa5KgjoG)Nw82M zn5i!F4`k9x^^|q^>S)WB760ZeW<_k(J!cMSyq0}hpPkP1K<0dp z5-U(g7Lf0|PW;NiXirR5QOX~)$bwkqUnCuK1EVjm%131E`mcrht4_vab*;0OGaaJV z#A8G~x9!au%g=$Zh{u;M{9N9w3H_riE1PCuj+=b0+19V`*APN=<@AQwIBg~@S}>~R z4ai}K#P$KUfCUz9gm%L3XEKu24WJIN=`)YIZKrMW$zfIeT;+LlSABN{_HEu>e+qL;=kDfgYN2?5M7_*$2pf(>Cv8Z->izSEbJsH_aTp*|o@c4$&96@{M>KG2K4i{32H_kKdy2 zgB`c%+L&>#;M3x?*E)0qegk zYYbE#ngHC^_h$W`lB?-aGyn++?mRE@Gty~Lh0gwTC&j^s%|K@Gq4OJkdr-_+#*$0hC)edRKajLwLJ$$c~ zSo4^hJ)K;u$6yHwm->-cmbD5YHuzx)Tym&xuhZ$JY9n3u+}Dn<^r@x& ziUJ|3ipY-JD?Tk%*)4mo_#X~P?KUyG$k&UK0A zDk1*iyt4kM`(&;fJFx82rAqBOLUk^jy@DyZxI9?m?MRIkdjMHB-q4Zl^!-{pe(;B@ z%||=7z*V`<)dT|fef!oUdoO#Ktj|}5S%Kqv^eM?lao=UQ<+!o$DwclS=4#x%@>IH~ooB47El}rbS?{OJXai>T5)fhNHO?isa7L>-{0Ivn(X<0$hwWfpqNS zvh1&JEq|9nrcO>!`l)||vpGBvxg_SlD29I0oxMHws=;|6KV8FR?q`Z#)6eTwNc{Bm zj*uolRP5P!>~bf$6CaE@xq^2lN4kFFNm<&|T^R;ncL?fv2b|CkhE-mannMksq)wN) zvzJe)i?cB8S{NlhN_U#20sC=64ARr(!K7&BQIX?YqFo6wYJu#g9k()`!rFreW}nRl zfdD<&O%@4NzlS9pGkmyo#PagWin|JUtgu~gkR^08pf<~*|FfG4E({-4?knL1vfC;N zKjBlw3Wdg6bUgdU7q6)gw?t`6e8lV{&$0Ah_lRop!(1+ZBSs$TW4emfZcrM*foUV7 
z0u)_lCk+q7_CD63tLl*!NdhY+B{l%Yym?!B06?%=Eet^b)-`n#xX@U-e9u9RipEq) zf(@5#t=YyEH+TP4^uu9m#u=S;_>w2#;oEUC_9-p#S zzZ)1Q>?4%@HF52_l$v(5{XB*J7nIO724q>_{lod5B3q8owLB*O^VI-@&a#F+lu{tg1JTSM2sJ_7zcQ%Au zmm~JbQ7qW9B{ZGcf1fKrsO}bPCIhY0-dDg`_q=hq3&HD7j9lfc(vVyw z5wpth*{QF}%@t!!#r{f_RV~!2v_c0nALOV-==UGyv}9;WhtiXFd-Z8tJ|IWquZqgo zfVUip;i$`E0_K$5JrX@A57UPD=d#+I=+0b1&q)04L^9i6T4dDqjMhNQfifuoeW*Wg zgLMc@(rpidm4grw_p|f12VV&Wrt6^x=1>iTPjCH{QPdxJlJ%>vUNu*GdismT<)}~$ zOXer4VCV4N6|X~S-R6wm1MRKDPTJAMUGK5R{b=qemh>RY4v`LnNwMuV0Wa+7T=uE= zvKbx259`?IzoU<4808^ld3g~?ypvbuT4z5}O zl$wul^-ElR`=UfeMFhvQ348{+w1=-Bo=^uyFW|B=dTv5inrL$U#Q@MvwzJv2*J4Xy zW~}4_^s4vhPv`RjEptfbli!KYYffs4s65!u(8*22y9HwCA#5TlpI+DWNW+Xeo{i=! z9pir-7X3`$zE{6#yegT1!%Ok;oabeZ>k<8j1Tmnza8+<65-=f}d@OMpszy1~gT#Y` z0?$h~l9@t<%_%^=$9RT=L`?cdNCW>QyduWl<7}^$RN+Qx9~6+Tnh|GUV!7{421nZD zDurKLIt>q77KaEpxDW^+EI4(N-3foa$byBa?tuX0!V@VhSTnwObkAcOgJ{f=Y&G?wE}omN-E`PKqF*7c0Mmc?^p!|xIM z<8=%Lx2gnL2GSu|iFs!Yq2CDbMoSOt%~f;V3yO zr1Wa~;w=`#A$~fJyL`2)SS_zSd7&|t44!FE1-V=~!PkPL&JIrJAl>lqt4;S<%%$S7 z#Yok_h3X&f%!=hf{&qYk1{JHDU!3j8k-mI`j%dlBdY(WuEyhE+$SOwa#Ef@pN9hpw z%$U$snC~suP(P*>+>u!gJqJS-vm9F{E~&0qCwRp@tQN7jV!^-2ZJwS<9d>(l7bCF4 zm#{_-WJ!W(;q6(R754`Mok@6KeDNzo9l}9oTspN)_PeV4`KIk1|0PnhYM=lR@VhYn zCj;S6MZOGIj`4lgr8W&`3v|a-VQ>XEp#2XVQ-RPoI%SWohw7pPZvq^(z$IsYmo;VL z`a}y3m^zaB{Rh;wUxM1A<%RY_-rb$YXQeft_6Q>&4v{=3W1R z^bh(IBI7K7^tuF^j_Z=BY7@S25PP6{4)<01^6s>f>_7%cHhBs$wYF%-5ghbV34<>d zE=pMZR*i*v-(ZV6NIg;5ili_RY8J_Big&xN^=w=CJKhprde}AJb7XdQ-f5t;(rzx% z;k?gE>qo~}>1+aGTKOxg#BA4PdRBvas|l`>rjTFW8LRQ7u-V6%qRaP(DCHk89OXfV z$gWZduveyaJ)ivGNNM{SP5bFd?v3!DgfYCs<$GZ6(1CsoxCzjQGTrL-jb~9e1PBHA8ZYQh}68abOy08g{O&h=@Gyx*j=_{LO+kv8gF>-> zi$8{bW*qy*w2KGdhbT0^3A1$g@(F;SGJD%U$%nwy63GQ-NxJR{I5K#UR{U=BZo+YWziid&-;qCpG~1w_ zQVKys#X`xQ$k81Y~YG*jmP1Ry^KeQ4PY_vzrY$@3! z={rR~_+U@`vt)l2+sN<`NNs7~KYywNR_);QTXgVfG}?g$N(3_bk;Zk^w_hQ~oNZMh zvVx{`lB`UP6ohV0Z<-$WoW-=rV^d~|T4ZJIykw3WEq_8tTE69lTC4__(gm?ry~@gio0 z6R$%XBv97GAAYU@78dh!a&S&i^xA>7@_{b1t=In|=Dp!FULVx%Ml=ayClL+&k)fO4 z5{(HxAou%Z5Qq$Csr}A3HlG-p?%_FlXw8}`>0&`FHv33SUP#P4(K60BQU4V1;%QDy zwtsnX3t1d3>~BICP++g(d?pP6aHbDgzo|k1AM8*3pYsC$?~2KAq56oOKsS=+FbNtc zkKY)<5A1Gr!xJbZqvTy<=j{eK3*P@HpI$)IK{hO4n)rty{GA6-^K|a}J@~eNT%z1FFgL(9re+7-Q6eTlkE81Y zZ>^Z-kXx-5$wSTiz2zVro2aCIHDjU(vzWjS1e0Q794rh^GZ0%mAqI11Tqk*$6nYM8c4Z^%z zfovk(ZJ`iUtHtG#d>*wE>-{}1&GC){QFpO=y+JVY&5?!ui3?5Jeb&?#6{F!g3B7Tl zR8wt?LCIl3UrQMW+I2 zULP-qwi5J8ka))IX<)_hQ&vf{fe`hZl5CdEi>h8cNz>)qda25XN1OM2oX&@AJc)&* z{s6J7VPc`Bb_8t#2<9TFsviT2a-GhC(Jzzfx;UL@jI>c)!U;>G*dCJ+UUwTL%SU4( zXqvd&phF`7$Vij`9U-x}w0W4+KCavG>&Z5d)z|}sp2>)6dwY3mi$|$kAC{M0GU}*A zMGbu%3UJpC17_^cq(`ChgBWM4qy-63yZ~3}0L_4ruML1W zIf2AS00p4Y9d{SrDsP4WRuggE<)Y%p7iOpv0$8-@G2?|R?IhV?biA_^4B6n}`%9@^ zZSTDp0^w7$N3{}ZvAnP5u;<78jL1}yFd@Q}eLp3epCPPx8i7`cOtud~ZpD$Z*3VLE zhuD%XJ?niTTgUu(aWXJWJ@*4%@v$aV8xBK8BJJO^$GvN_+Q(Hegu~1Ti^K@JmF_%e zh$Ad!&vmJi*b^!9(m21f4R~4h@Ke^*Bd!fsZJGCSkK%ThC;Ac-j*PYB&If9nAyk?( z6D@Q%8oS&b^4Fy#yJZf6}SA49mr8+e)e$jH9ZCsQ|3uR*z)f@oi?8CYc8aXM*tY1M5f*msl z0YEdQJ*@vveFyz##?L}QAjvVZ0zO+M!n;QENcw@(7G`5%p`#(tOapm3Td00Pj z4{n0No1ngvc37oa_36Z|ds%DNE%tdfU|l|QiPbj>E@lh;NI-$87DE7xiZde=7iFz* z|1(wi&EMpODSl+}!eQXAakSW)*gq*9K!xH@`vs7adg6BrNoZ^HapenQ8fKu1^1GO9 z;G^3f(s|EPn5)5mbJF92*!jQ;$Yw15NuQz(TCz9UG?ZYwcD3;h3)@SeV4IAX-;q|e zHqw(PWuvZk(bAt7eD!Fa*)FwoF_Ir%57;yJHu&hW=q@Cs|c95BVp8&BiM z1D z_jMz{b*2ZyOkLQOhv@V_F{>6%N6{;NO)BXbq99envU+DQu!WstzZc1hJlmEm8;&paS@9y6P zo4teIo#Anj**dRL3h-$ke}9G5bwE=rQg08nqSDU$HEw74WCX974715_i*0|tawRbw z_ECwtDrWgL6?)B@m40eVz&W{b+R8fAIUn58iBRQxvBm@KsULx_Ax;tuhacf6Q9M6! 
zyy-LnJ|G*$@h<-+GzW|5Y#K`SaLrzl0cdd-Jj&`9fQwyKzhE%`oj*_jl+>@RUYfCj z8lD%_;ceFcqlaWxU8!#}Y91mcA!f~0B0~gQLhy`0;SgaApg>O1+g_xXGeB*iB+z3& zQe$}`ECD)c`+NloWRcQQJ0AmsTPfjq5kz1D&7jUU<`&AF%aDn+P(&hvicH;=H_$3I zq)?tAV*Z%P#|CFjqH|+$#U-lIV##@0S-s z=MYRG2!iVEUT!Db8MSd963xR1KvQ3`VFMB_Er2x8#E9{gjJMfuwp)8)!Rqy+J_XVl z{E^gtoR`4D%+zju6hwiro}Q}6`p+R15rR86$3ETn=aX{#IlU?zDep3LH3ebw$zTzv zBO2}ae1BE3l^lRdhE44;p5QnbIx^_1+o7`{d$hA!-7L0V()Jf;EiNJJ2(N`bU_&=*?e6=2v!CekE#NMl-LVqpn06} zl;{^aCS1MMCYBalH9oV&cQ~052?MMfIbHWzXT!<6iAUtL7bvY=9 zxx_8J9AKJK$^eDeLq;p3Z4W9M$HeFo{D{L1T7YHwu&Iw6cMZUkl_6SH+|1}#mWpXj zx@Kh|JYJAmcGfP+KuH4!B;zW@9m{bpHHSru>3)1k(@L5W-=2PZFsfku>d= zjfoO;gG>UMItfDYeAA))>n7!L<7x8n&CtQ^{4w^rg zi~R*zE6IW>$ydF)OO*tH4$!tJc_BSaPvzR2*zE1h(A0i~7;F=v(dv{fpnT4+(;355 z3(NK&%-Q&CD{D6tWY(bpuFU%p64|{V#%)8kX|Gb4T6)mZz66>=EFAPA`?zU!xui0D z*2d>Wj#E%ihTN`arb$ijjCC63Q`6iJaT-4 zJglk6_kga0!+`<>8tJ@Yc4Nv@RLTo2oozx0ABq#)!!b=tr5ZV<&ws@t4(HLKC3E58 zEB8qqSpT@xkp0Oz-!V_{>?38`kcTOyYtBPkja1N>UtHhlIHaPPUbRDXm1ru}A~eiy z-dpV;T<|`4b!btd9X2QTf@sl|wK&79h#A%mT#g zH7eHn8piJhhu%V8h!45NG)gBCY&c@1?Oam3S1kWK4(()sE;4R@2f|AN^9L+hfJQ|` z?RP6uQ6Q~8v@KCjyy6N~Gxo?)Q_tlqS$!J}ss%QO z+b0U7B73v>ew5k9)6UMsFiG+KkWB0;3a?YA!Ra`J9`zcEg8j94#&lE>W_uWuMk`$JnA2S# z!?CJp>I068N2IDZGTXh)7<)$c!ep|b>rjPqbe$~3Rm-bKw}DWgfv~B85ZOS;RCl+q z++EFx)dS%@TGg<0vPpe)BMa4M<4*R}eJvv^ep&|At+g`@b7)M^l9ia!jI;WRx}y-o zhd03|gZ-qw$3CTLG-vHY=HO0-abNc;o;lkM!1Cdx@CE%4*+v1FvrK@acdczQ+uKGP zxzmZ6CjduFWEF3Whu^DPy@Ak8vLohN{4~eAapwVW`8bE?v>#9v-NmtgG0p3!67{kl zDXnU@PfY7M^{pxLTF8vu$nLw7t+|ptc8~Qx$b(5#q}JAL57T}#ak=42wzcAGmQ0w} zK-k?#xZFs%(Xc9eTyNsN^XsOJ3m*1Z4?l21GveNsPKe#{OS+~U_4pPqnR^npdOdI2 zbR5PAtr`jI(3}l~Sq;_KU6OMw_;~!-WrV%d{XROrvmM7y<@!eC?qmT~ttFwt?bpc7 zT*RB!0vjohzPGpb{f-Pa_P)@yl&e2_ypd1}Ya~4Gc}s4#u9ZPa7P5j2s#SF=QAdny zQ)7D53c{Q!8V3i710K$c*0GFf9|OzUkf_=po9d-v!%yex{fld8Fxy#xt?;{Yh~-!aR!YL*6uF>&RO}q88B=fq*`!0t=8T21a)|> znqIhF+_);eDp8-6K_2VXXVGxYOPYavkXNCFPuX+TIT0(oQota%@4=bFw{pvA!(kZ}LQo#Qqf^I|Qo=7t{cgaI z4q&UR6jWGLStVnkOEhf*t1b;{`HpO>c2ekH7t{b8t3X!nhMumg>$#kTVZ$@_W0scC z`jD0$V;4KEWyZGw;%;6P7SlD%BeQLQAUYeqzTX5mxTb;37>HZeSYGEbj)tpy`r`>@ zhYtL#OV?yDxaJeEmD$qE-gx9s`K2t2FW35(3FaXj_6)~GYiBZ6tiW52{giM= zK#UrV1`>enML*@$9uB+@bIA{?K{@1+GEA$9h5#Ji2Bm4g<*=oRTj|0F(I`@av-XwU z8&cvnxM1lr2b;t_%9RHz?~~f6hrP$#E{fZY<3_6w4j$5N^Y+81EX}5F7y-4)T9KOY zy4t6xTsFAaE^JY!ib)o+KncJc@RX0Sfm-f3-WTgmy z{7f-Q^Z{8p0>A@Y#h?Zo_p4pPRjs9r(Zjx}6`5_i{_cF*8A{Ted)4bF(vCLq%$sl0 z&z+~8JD0t6#`^u&RFu?@?v59*1}#qOh3jzUX0 zQ$pvp;FZ*{uqBj#9VB&)OR-!b8xpO`{J>=GxMa*cBh(^XHD)l_UVPO3KN)PxhOHdW z@(yoi3PSDx++c43v*R+7r*_$-EPIb2SEswsA2_-RZx@mZhe(nnn3>$(izRFei4G5rszUdS(?x`|ssc&imySCk8s zo&H`{`AJ+}2r01_RidkVS9zAgeRg3ev@I(9*1o9Y=A{Er7Ononb={@y%R)uAeOpn) zSqIR)dVF|L8z>b=AX!L3+yrVyY%xfrfv)RH-%t#a;vKNK9)jnHwp#F6p)u+b;#_=4 zB%erl$HIPES4&3Z>jbXuU$uKH`&L;o*?V0%yzL#JXlFeh%xA?jy`KsM#$$gl5|9$4 zRen2Q(|^3>{B}Teu9|*u?O;R;v3aD!mzCXD+aM3kVjx%I!SaZD5>aHun{&7aCR2L0 z5s3bXEAc(O`SY5gWg}LGBRe;U>(++E+|@P>yk(LUqy1$OWha|=6iX}g-3P(@ayclB zhXXNOm+%a%+->8ho4>sT;0gxrp84-EmzCEQAv_Q1*MW7oPvnLC;#*SL^pK-{)eQ3S z3^&o?h!PUGfw1#VR&uRa$EY_e{mc!c@%;=@n@DSOj)2Z{Y1MOpg`XPPhMl;S#lXA< zJ9si6;cCL~+~<8JehFMP%HxJN?Xt7qe5TYaPI5z7XUV@**iD}j47+1NnZwXV-oD&% zLw>8o(QmV9`H+C+r0tc!t9=1YK|aA2y!uF8zj$WfbDYb)Oq=IuCp7@ex4*ErxdkeC*TK;War4An@(_&w&KLO%Djp56z2#@+NMf(w_^*`o@rBPtf80 za}~6#U#}^9^52}(Z_Ei|5Vl{4ffdx32zjPZ|msP`hQL$%arEj};U_U7-_Ele=2*L2J!c-@yP5x@2 z{iLiU0X19fNKz&BHi0U!CFo1wb0TwEhHus^;38-RQaFe%8U+--w}{Uds@b)(aCj5_ zIsPkLf!|7MvwwYj^`!dx-qJUI|6fbNEEse5ookn4* zgwnh)0kv(53l#1zmKuG+}!2)T=G=HCuMl&zq 
zwZf+F^srK6y^cg+WG3d((;VqGxXn~jn0xMUQ2arw|4(G3yikDF)R=>o5Qg9W^tAEvH>O7g8WQ^UwmvZLUrc6p9wkF>B0Ll>)U8<*ixM6p0`UFiA}5K!G;dnCo%_62Gm|}J=B3$(>E;Le zIRP49=I3LS1>Rsf2l+D1~Gagg|8_^KWNKV8v znl>J+`+RH>dfqqw)IB68quNR>gZ(V*0%Jn?p|F)@zFdpmIEslYusW324yQ4lC>(m8THcd4sP9GMDX*{G$Vg0 z9C~laB@I-VByr$syz(Q3R=dUD7}y%H6r1axV4JgZpk>RiNHfgy+1a#rk3mnpYj>6a zUh@m)VCMtaP;4I4BZbG@dtqxDfkNi>lu|)&eU7%eAB94Tg1|(ydxij>P}P6OsF5!% z5;aR5f7wz3Sd3(~DLX$1c%w2#ZS?~dc{0+j>4D=nEFCVjC7KBG{&+w6q=PMO14=xf zVc~ov_WfKmzbmrakL^pWt_PnN|q`W_ilaiYF&@0D}uNp<@QRHi_Rj zcqQj6V<^+g!=7;7+ENa5d;@-ME5JGHpSOKaK4nhOTTb%H+Q@z~sPYY=Dlzvjzd&Wz zpC_o0U~(U&ja-(Y)@e{&){l&-JYTLd+e^M&F6j;UBLlLhThg6)-GjK@7FJ5c!Gxw`$v5LT9vZmnMe)Te=17+4(go~s+013+Hxr=uXNnLh7$ zZl}p$Xc*ondDMaNim~XV{pvs_Jt^(r43=-Bh7_zzckpvMkFs!8X0>)<8em*aQ579G z&K{O_1E_KTv|Ql()eM;C`Au1X`@EufaEhZ!p&AE;Wu~O(28YI)5>s9J6%qS#JtCBF ziLo2UXt~wwfOD`6rOElz^Vp;)2Bm7)>|lTd_QuGAOcb-LIV9JSAY>@C5=NlmJ$kZL zbtbDIQPJPeRZZ4*wm=!+Ih&T5euaB8H{?o>3Rq}a4N;Hq%B`nItdS>1yHqegK(uF- zH;ZQ%IA*1{Iji}`RNOexn47;Jc5r8H;!_WnUQs->UK`(2qy4U%5h0wOA!rDb&0zK^ zJVX*Gj8kMj3_mRq>V_rYaNj<^3)~l|ZT^cQf?GL{tJ*K4<*^OZ zjlb9PDUa>1yWx2X$*nUxg*@&`Ar&hz7DSF(c?ScWF zEa_KgKUS(!jXS5B%oo|b z;c(&_!y_ATw`*MI`<%@O9h$^3YPPAJ`oF5cH-w%lXD+75+3Yh%?@(umMi<~ zWUSQZ^VO-^y<}3d7N$6yq)#ae6)$h#x_N3J14JIM?mDK3B<;ho)&&8CT zJfiFm{92@-$1@x$?Ar;f?xQ@Pe>S;4h@YFODf8Qo*;S>QHi@`7V5RklXN;~fgM3_K zyLv=&5^sA@R$*;tX4l`t0YSI`n^Q$7Bq3jW=kWWHGLSuWBYWDvVK|_TCRud$9<@K8 zB@{zWMtSGI+e&>MC5&4W(m=@Acz_u##>(t^AuuzH2z02(%!l`hd5*#U>46T(^Y+Q} zcFAlaHrujT8L`HR{)TJsQxUO60JYTQ`)TfYxl8F9U?0lHt7dOlL?!<)jvEOY+Uo?a;a8X=bXRsmr!%+pLFGnGA)5m8EmY)*TYy%pJeMN0N| zAMca8y5{Hmz3%&a-;c-d@%yLiO6Pcc&Et3;&*S;D@!5>FtgasCa69LYHDC-Ia7_{0 z@cVY(W|GVySGN6*z`P4v7(a$8b`3qO++`^qM{KJ6wMV0U&g1-*;-GK7cD@;XLo_)P z6EEjwQsP=ki8_UcHGC(VhFLbmPx+2FMdS;(CPZJ#pMNtNXd{Y0#M>(#x<))|91vy6 zUNc>LOIk{~fM`V6Kxb5kr3E|w*x^b9w)rcibmK79wBOlz!eXzR-KAhLvjP_?CL(bhvpO^xRw-(h6L$X*&^?|C~Yiy*mbaFyD*?BVdqRuK=kV_B_&@H*-obJVV&f1#9+O zCl6T$(c_YEy%&%3B=w8>GqBH5I%=4Ta~W4Up(_J&qR)h9IX`nJ?B(mHkXU=WFY1J- z^2O499J7z}!V-=EU0m#Cu0i3E1bj>bjbU9ENcWr=T%oTis#K=xGr_VedB)2P%qQlJ zevbSS^J`F6aEEO#txND8s#Qhh(}e3Rtk%#yWHWvd;i4Xw;;+hL9}B;s+NRIil$|{4 z{Yv>RDNw>OT9i&N{Jzjo<*6)Yld__UAF>M5dx6}JRvM968v#R&N4lBNtWBVz`ii&N zaN)UNJ_Yhw|N33boXnduUOD1=$3@bo`CFdtWgj&WVjUjq_I321jXy#CPQDd< zN=2>v4ByDWpN;jwQRBb_rcd#r-S3DV&3L+e{#J6MEGFkZM@E>UMmrQ-y=7uKm7m+B zE5uG%Vk>y34z%}MD4#K8?f+;+9M9>aIC7gRvfky+9!$I8-s)-%j$m=WvqvsOU;@`x zBgo%NiPAjwxo|}5k!=FWOb>a-iFH2ODp_Ydt+0Pfwe8Y1&@#UT^c?7e(02XW0hxVszhNg(_}lHZKlZz%^IK# z(VWONwbGt#beH_rQDL0GkhTJQszZUy>TNANXdZugZu>t9hw4!wJ?{<9IGjzWNkI#o zD>y&|sR3V}0N4Ke&9Cdnr3hLpvrzxL$L3s2;j#hvO&FYxAm%>Rd;z*U# zLDL%46#C2abJhkmY*STU`p@LpsZ3YkBIQzecm6ah<@?1q+WGIp7f3=P<%A+^9(pUW z`i=_?t#eEU9=*ouu(EmMTgX=wXRU9wemsNE)dGmk3yx17#=MSUyPJOMgjyG`sDk5D zE()2wR}7!}C~^oqT6rLMaVY-i)QISGtKen3EJpM9yG|9XJvSLzd6l}0JwL0>B9KF+ zJLl$Jn(IZO29gS%lxSIMVbxMo?QZABhZ!RT^&6*~ZZ=XRBE3olS;>~|ce2UH&F?68 za;V;uOVJm>&_)Xl>Rf5Eyd#sxBMBGa$Wf(Lblgixhr(e-;|)o4iR}k|)pEHS%Iwo( z2R`}Smpe|CeERtbiqyTl4G)X!D7>uP*1G4kVu!thRq$@9d^o=0Sx~&Pf8n$U{m7!i z+qnz#toHKStY$vgr~L<#&#<#i*V(u*nD5J|kmAmeDLL|M^w)C($zo90qG^Aas;K>F zd{Lf+)#<&;zC1?CSDz;yd!n@XRR+wmgh-#ny?jnRqL?EpshrWTY~Ran%0e@RAR7! 
zO-4#*w|a6KRaD>l+jVfXibxm(Ee0)#Ubd6)=^E2v?}}=|B%C|$${$Wy({%aQlK9kT z2j{4#8zPmUFmAXX9o-c^mv?rnR5D~5N>NSk zi&4ragtea1ccs54z*M2ty8@f6!qO^ov)Y>YGK65GQMr^;)Vx=ZZxUn8Tf98xQb{#7 zb%|>HxSiPeXQZX-va&#m7^`DfsK2L4`61J`cXaQX;um_rd5SD zQ1Q!SdRcEpVZ|>()z0C2gFK!X;g^o|p8SS`ffj?ICw5}|dHB8R?_x8td{EWLXX3gb z7r^(8w_JXgXoYIkZcV>AmU!r61a&a6^zX!WRcTvI&-pFCVK=5X^ILkeDD4>HQLDnC zfhgjd(Jj%6WlPjl#o>3Wbc$uDiI?3C8Z9jMLLmLY3=5v0v zcMV8i62>K*5#_ZROVPL~+2nXiZzwl|_vQL-X9CI7=+*v`qfO%Fr+9{G7H&_&hNpRZG$4I`FV z?GL!`Y}|67%FrX$nXfTzbDXyG=wi-lel?NN1m}F8iHduQ1%4l&_SSOpi0N>66?TS^ zYxTA+O9vm}IA4B-WF+65{OA+6C8~{Uo3~IFa5wcIov!X*=54&T+Tc`muy;j#Tx!EW z!oB>^ql`r=qcY*W48X43tfI@*?5ZYOn9?f{Jo|*a+d0CvMbD#H$Kbux>3BUgjTBS5 zbD}R-KCkE2=0(Ilh{*}R&V5_U>5R0~=C|-BvDzlOGh)_~OCRr!>h#sF9uWKB6HPcV z1(a;h)>wa;n1azU)Fe9f+-ord5|P*EH9ohFr@uvG`#Q9sUs=|plwwp`#B-!I<9#$I z(%@_qqiuzH6Q?63LcIs2O4cp|h5h6xw&v$jv~63KJXLh>&K=wJ!bKYsW%=d@zMX>!#nb|Cjcw+O(fD~PksS-s0 zf;mlQmM9rO#vvrXMR;(7AzB=L-#H`_dOx>dyS#k*OZI{Tj@66G5q+Wk<>pxY<5QD` zZylz5Z_?L)Hb*p=^hLY!nl1D2u1P5LcO?qyC?9tbp=OXKmS&sLZ+qvILT;^*E=bK# zs)5Sm8F!J>rIBeevJXg;`7pS(Td|MU-ly5%C<{HO<}rWEY>x+ln=L`YSFpM(&T?qc z@piwg-qTU9l#@uWvJERRvs;Z8*lcFJ$8V|r{Gn7Z)R9+EjNwga{A4+*EuJgQW98;C z)wZcNL4tv8R8>Gk7f@G~nS;k^4zZsT8eDzEpVxEz;=?W{UV@nImx#?#J!yjBtiy>G zMx=jW$%_^Iu3iTEfv3J}Rb;Dt9EqH{%#MYURhY?;w{4oJp-p9PCr&mJR2GFnToG8- zOhC{99U+{i62gqOCk4LPA2~k12|VqzTPL|21|B~BFu4&)>c*z@vD4Sr#xEUCk(uqf ziC=DQH<-vpAP-80tH9_$)T_!Ce)MJ|a*cEkpdCNeG8mL=;BPbS?YI~5-xdiIs{yq$ zP>6u{Rc-4Gf_&IdEe7!VHHEuK&$k)?Ab+dz2YvOn9)+LrWZv<18_eyW$ZR~BG~C%K zF1xIB&gp_HZ=P2FYQZdH(w&J5wCglHNRuY!`VNS)^T^*Zm46i6!s)W;-VX$36i)L? zr~9D}NU{L^_jfHl+g%mE=S}@-Oc8=<8`Mp5!~n>e#l|4S!u%%ZF+oSa{<me!B){^y5p0;x&i5C@_fR+yXGmH?bu zSpcY&fv|}R+kaC|u}58I7Sx4`zaZquC8$RnV_-SCa#|z*!ghN%P-0pT-9eQQ5Kz%< z?@Yz1q=~XRwzH!R{ygOW7X%>Ti3KQXpeJrhthDs77pnEK)8y;k0ML>7lJ)*T4rqQ_qp?=^^itL?yWsHi-0D7sp>|cp@@=IH{9Cfrz{_4g(&Op6vE} z_bf=#2~=fya!ImutW)&&NZh~bxf{<2J^AXH`fGW~`+f&_Z)(qbXO!K$UH6!u8}C(J zhyqjRa|Ls>wMDSjB;_^F-KC(e_9%u9QTjz$m)1Tat01DNS)0z$bLhC{m8?Ts$;{D5 zBN1<5(O+C-AXajT${|!WDBRc?VdorgYT@MpvR4^_J33`lKg#wz`DCWqA>RNJzQW^6 z0pX5uyD41WoC-S1;9oW0c8T)*<6>8+Oec${s0D=!oVRS3u5U|fXR9S*aRj7UWtheO zsTF@LqT?`phlIC!YCfFbe*tB1&W}k|*n7sHPfrNH9Svf>T*2OXuM-eahRv062N1lp zG}E4J&A9oL$4j3ut=PCJ_ELd{WZ;4;$&2FQT3>Km1FqJwuIE_f%BnA?xlUZNT`$@bW~ zXP!44B<0$sZ|Fud)g7MSe9r4Cp&Maw5RLU|-L&_%Jc?2Y7>a+WsW zMY8Z^HUm$F1$C8qzSiPJAB8Z&;Cdmax@_W6lZ7O~k7Q-E0eXxj3U&Fx`B|$W@LPjE z@VPoIUvxnQXEBd`E4|`%>+38gT+NA`n+JkqNr;xi?{yxS$~KE?_N*x|;?JCPApa&x zWXw?*e(D|W=X95q+6@v}JO!z&>h!OhnW9Qs;FY_5gJgY7d2(|NWj+viRz4@DjhrW) zYB5sSZ%$5A$8+A9d&AQY(|(QTi8Z&@dGh(IkNXHbs-6=!60@8o*~RpLNfc`%emTfA zX~^F+Z_w_Hi)a4jpsskGxBRYIqP$7j%C2M*k$PIFhzRF{W~6w#->7I?Qod^ET|qlb z>lDBnwKEWnx1o;WktJU(!ue8>>r%59sn#dgJA%a9-nuS0r34S_TnkLsb4m2usP2+# z_M8%33wfE}5%gxmGn-Db+~t$TQhPhlt^*4uMSZltu=}}#mUUUh5t8_Du^QG%N_=ZJ zkYgE$PaHj)fgA?jk%cyA`3JK_4OuxFo_5R#h8l@ZvtkFlFU!F{x|c$O^co@~;sjmM zaQz`_i=e^T1_E%~&C_s~ULu!1ykYRQuV^9Y*hsGCQ=?>Nd~W!bcSh?bK9_@VGCX(s z<98Df-ifr*X3b(70N8zCj)gUfkx5-;d2j9)6ORDdbPsYb^Stmq(;#R<;51})mD^Au z?7L7yU~1ZtU&RQDVY`YveWC>7ItUm5!q86`2TP1x%Z)oT-Wwyd7vLs9SK$c*F7v11 z3{r^b_Kq5Ka@i$U`HU2SKj;gOR2(Kb)17m4&tL$%_4s!j}^;${U2=y9bLM*hL{ z1Ev9H+l{iI;rfaGy~bbFL2y^NZ5tuy&(6fKMc|>GK3l=1hA;Oiv z_Xwd!e<#5HtlfzeZvHZAF>L!MSz3TZuvT~Zph`c5V8I%;a8lR&JW`5_vik>*cBjSP&=tAbh$OERIO6Le&`4vm-i7fqxdrI4G8LNZaOnKMFG&M2#DNOwk=Zr(7bB*RzL#`(_?X7vkMvf0Q^4?`p`c`4)u; zDV~kfXx{vZ_%8#VP~`abGLvy8Hv8=9q@hy*X@ z03x)@UjUVVC~-A6De*iCG%aVKRJ442i;W{`4+G-sP(Vl~`b2ELSvErQTrEB#zFj7s z*YCPEx*C?jgEZax>fRc>muHn#Pg>tOpT4owuZKW{wUr*kOp$BVlzI|tN6$azpXe|M z(sexZf{DlI$#Du{Yt->n>*6~7 
z<00GH*yVoggp5=IH_mQD_!Y6;TbnAz95en5CfQdcE{tvlelO<9}`cN2E;%m6zMba6IF(2E*J*=ak|yz!v(Ldf$En`mqKcg5X;nUWST z5O`e?$7tj^#kqbwiDP|IhXnhE=e4pq%;Iq}>}uvWw&5ic&G)Kg>rBi0G*ocSFxu;# z-3Z02Nm}5J0A02ynL37=*?7lKW}r3ReOwD;I;FZGO%nQ1UrnV)sZ%yZ586;f>>6I1N73}frH3jQUR<{+vR|<`>0m;*ibfaiVKAJK|C2; z3yAEFI8By{&Z65(`;2Sm53KJ35nwsM z?1+llQ${NfJO!JM=~fD-X3+laR568`H-fjx$+3gh>|FRFrZYjXJjh>X0i0~v_tr#}M?;_C@947d` z$>U&y|B~egu?5H>{1)TFM-Ug^Nw>ftop}uS)`-^tKmoW&-QD~rNbXPi9}$O1j)4P$ zJetQa7k>)wOd zp&-qSh$9S$qW|cih1Y%ub7&FRteuQfmPtun-n&mZax^==2E%Rjjvq=25#zYN1bRs45m9T3za&MZtg zWnWN44f!4qbN8%(QIyxZOOyx&kku~P;1(335QFLd&8S9yC;;iq!!SHWw2c6`{!8y3 z9Mtufi{u}4mwxDmSs~BJ3YHnCzyl!}(GEjPK%RDr8`(jga$pJpp*v!3A`tk$Z&rf@ z$H+JTQR?_-7nXO3^xwAAN@4awRsd<}OE@lNqkahHKmyxZfH)fcO>X!XG{2p^1VD9! z5coaQL~M-Um-_A@gdBRv;Q=N_+qM)tXWs*f^Rowq1wgnj%)!Uz7ax$DLxv%jj@8k8%9Mct4z;5tKRMr| z5Y;3&sObHyF8e(n-cE@AX0C8~Jls3KlLxyA)*L&IN{|xapxOGqQx?jM*PtDG2L%6r zWf+3gFW_dZ1=A25GW{2a894BOFV;^@AwNy{4`})?_UrEw-Zwau3h7iCBZBei1~6f} z{r9(v4sLb8GVn_)4uE>fA8mF3TDc*l_&?_25+vh3a)UyMtD+p;*p#AMfY4NpS{m+E zmk~1;&Zm#M1D_4QRN5K|qT_is4qb}yI}Rmk_wFWA6id_5#m(C zx$&CktJ#9M+q737`b6A-q5$uo{zQp!Arb7t8ts3^4x`^eunRgYj)&#Tg?W;kS(PV2 z`5mn~U=il0+^(NA$$!^L?y=QGnRvJDv+~G!JjHZgOrFx!Gfo4}c#7GjEBv24<~$fy zwZ>|<|4%Zx>ITn!CZ52$S2iwtT>J%&yJqzq5a*DvZWipv;{I>OF-_gdW+YIF{0Z?# zY`drWw7{Z)buC0Svrcj$eL3XyIdiFOX8MEXP4RC)IyZJ9g2wC2PiZ6x+5A;^E00*wDNsw~&DE5<`fRr_S%v&N z8Waq)A3a`wVsMaMgSPgwUsWJzH-n=J#3zt&9&tLt1xY&yxQ0aT&3MFsCuh^<9%?3g zx9|nB6fW)dD)`zV0oN=!69ptkurIS}iRXF}3`%6bM%xYyn0c7PqHOZ)we1+Z)^<0} z-8_OOSD2b%`B`qbxlc3Fu{X7RPV3Goq%6WJ;7zG$G2L*!eZY4 zQ_?@2(|__r9GF95RJ^MQr(<~^^5^$R0=erT2)Er;DufG?w$V$3jsf;^-|a{slnAuM zsQ_XCF>hld7?A(Xa_ip)oj=_z|IHX2s-D01xq^!7|AA`@Zoi%I4g};8bGW~c!9#Pf zvDkDlXz`l8y@Q1#!nA-vJAIsQ`&U@-qTqiaz7wPQf-?wQ%rO$gMc*Ki2kG1&qi@6} z5)u_`z`sJqVYbQPV-eB2$1;N4nnGO`)7mz`Z6;f(M!XYSXL2B47 z&y5Gc&Ld>IY}XMWJp7N07I3=zQPtmx|9{GBk=P%SQ!E9Nw?b;Axvir7|1cZ+o9%*# zJi{VPhbkUoXjUP?Bf0~8mwal>r;pzGY1SnZ@qUj^R1neD z^H980MZ{z%wy7xjD94cy7N-z*LFC)E-IVd3j6d~a(n`#kKSul7B!6pjZO3366mGmO(6*Ci z@nAB|@@L9P?D|bK#o$eQgxx1&a&X3gcEVG%V-Ldt$u_?~-JO%#^+xwDsq}2)W3lMU zVT5K-8uBZ%Rm=Pi&&4L#793{Tn(UHeC~qL0=+k+NHtp?wDly{-kC zwNUF3&Q1Gp#^xrXR`Mh5FV<9Kp?( zOwmA>xaKK*O~*L7*sR>xH|0&0Iw00%LR`bE2;s%-QU-b6CGtjXSFIBPGyMYhzZsOQ zW{EY6m}sYrtE-@z)|-c#K=!y@HKqW`))H+ECA1fSQ$exW{VF+`!fwB-J){H^;n|Dl zoq_ZUsE$Or8;Rm|l!YcOHrzn_q^U7Wl*hxk@CqqTK3X}Rz_6zs(j+vLJfzU%1aZ95 zoqG;-<=jY)Q}#qr&kD&0NETSKV{-P(qxv&)8R|Mb!a=wmFOBIHe;eVgR``+HR_P$| z9>LMiL?r>Pu#cPF4!LUJg2W8h~giEz3Pj@acyeoV#shXW&iaWm-iQ zAz~a7?o2w}T0x)a?l3J&YFYNt!K;jYQ=45!Ot;dQ5p!FNl&z}%s+q(^$c@^(Y&Q%_ z@)yb5nHJeJRPIBtFBocm`%TSY$My5(c@O+g?V@GnNwYaB3^bjP4ku2{B&IWk`)7IF zdCQ3!btYSfp5UCHa2=uim@*G0(QDdII2%s_`G>@khN1&UHlax0U3Gr4T%Ty5lRyH8 zjDz>!?xuXIg4SYt2CStx;)2n%w{H{o;(XDqF-L`G6l{1tQa+Uk%WwYpEiP-9-@vh} z+~SA{{4^7qHGO-N%SaicATJn;8Gd=h#OQZjYh5{~-UHobj>@IT=w;JZQ=emLw`$mi z+pmW{Njwj3u6@NO_FW6$*K_W>_OR?x^lFyeq* z_KaKjP)$$_SY73XIqKF2xNy~mloQ94%d@Stp~-DC9sKolm9@J~<@c31y&=EM(uwuy zq*N5PU9LGyb;x6cL-&v=L8-d}6!y%zsj2YbCp>doTtwm=tVVcK>4bcb_OOpjJo|Ks z1wC;%7?s7s8Emm|KoRS-hh1A@S7d z)4;I<@aG^FZ13^uL%8(%ZBl1jp*t@YR{P@lAi0`Qh zuk70sQ+{7M1Jg~7jJL|}zx>j_oy6ZgSc{^op~r6z0t2Ohte!1(iXU^NtgVgC%Cf)5 z6mVi7Arg5OvQYg}Wqc2{4sXU=s`BtkH+YP{`Q86(kAX}Y^4kT;gL9j(iRn~xF_&`- z*ZO4oriI5y^9vqZ7zs*bn2PO+V%k%=tu0og=_@*&!v7T1T~Id&72Ox zLR`<0neEB5V|hv7B=H99Dr0x&&@9**3$B5Vnu>3H$kkMsZ2{{;I(Jg$BS(ByHEYW*sWjY?K73-a(KyHHp1jPd=BKi+6vNl7JPk|nMh6=;SS+r@cG%+C($Cpg;@`9@ zc#$Dw!Tk!Q1_TUMW8O?8S-!0An^C=d7eyO-$fb*I6r6)f)CWZG1$21Os!)f-EcxR$ 
zag`ebd&z9=u`gg9>%oAweZJP7Q{2W8TwgcBpo@k2L83*Tb%CBHz8SmPB~!fu?U{lj>HXP(N0b>fa+G-R`IzTDmG$Z(?@GHwbM&kpv1^)ua?xHr{JM03{x>2CM-?=Yb#jFT#%o|z z0hVbivdKr9Q${W$dID(i+7}$ra z-+G*7%~)LcsJF!8_|BVxiJdQRBy>}-(mCGB=gV&C8Qpm2PL69{!>fQ}&$pzDO=mrb z;~NbrT_+l5uQM87OSdX65xMZT1jVjKV?k4QHY%;^*r+2|ZTB9D&>6yZUT0VWd#O%O zL&>VA4-_cJ?Q81Fiyg~};^ycbDdPx3_=@H)C@_}HC-4b%H!LNM9xM=Lq&J=u7|&0W zDp>*@se5?w1A24@ptv}m6J5$7UkF+r^Mv882 zQUy7|5b%=X*=y;H>D@3x$fZSGOaQkHjPY-`4IJ1b)UNi83HA5%Ekn<$22x87Pz&!0 zo&Lu9eSa8}uy+gdYZKJSlex9<1*1-wKHrN+$fU%Q_u2){lU1HlS!#nkf zubiMzY`iW1J_)|Y;MyHSlm}?Q`a8tV`Sl#FvTXoQt8+zykG&101*}6RuEczfsk1&# z$P}Sv#zlVEyV?bK@*Bo-b-a&xatgc5k{ol>YJW*uPTwUhVK;WB>Knm_p`qXn51rCc zEh?5Mm#7CtD`U}t#lK3>rF!og%;D+==~UyT9=oQzwXY@pnDh#oO*BS4iPYp*`lIAn zQR11N@x!wBZ(Y^ib)EZg!+6heV)r2j>-q;(yzYmhKl|y1&!rrB%2}ea@`{+YE-Wg; zfLP+F6zysE8GeJxTd$P$1_V!V3^A0RS#j?aeeM2hjJC&~51z)H?_EtEytSWqT4>YQ zKzrHZxf78d8VT2Pb=;B|5GuLda*EdRa>JV3r}Ot$8usD_YF8+R{GuEZW1uI>9AR7i zIY^p0h+M~!(E=~u3x-C+u#&lWB2faw9o1ZTDKN3jI}?k$WTmu!glKSrqMe@~Jlh#L zPW^)JmRbh{OaaG`4LlS)ae{Rf7RYZgkt|%;4x?Bh;?Aq#?XyJy4h;Nj~mBIJ@vvL^!0Nu!9mU$CGobdJB$; zN=^^xo9YS?n@jYcf3+E=qiSj-g;08@*PmQ7q7Zq)-i6{JTQm=HX@tnbg0DFxAP& z+92|!`X2fIG0uEGlT@{u2|;djB#(z78~pP_I1GLbl0P&9OJ?|md+JY9rwdb8wKH{6 z(7f{n#8ab!?(7~TV6<870kN$weYd|ZG>w3?^)>u=?%M9`X(6wS;{D*P(jq?$+vJ?y z)!NhSQ-{po3_j)VC;V{5ZDZjJ%ye{+HKT-67IEadWJyKnd-qx`<_GwSv63$N#1$R- zKE_vhLt2-59pZu>HOA4`TZzw<681@^g-vL)%v{pY9c4=q>Aj&wew{uQFGpG=JeZ@L z_5jjPBg8kZa#u*}Dxv!ZlsrWzRX*^QG@0|TtR(03*+%HO=6MybBNpJp3nO`TC2PH3 z#b^4A<6H`3W3(G3mUMF9dR-H+8Qe8fygcORA}64H?%9fYYSCCk_n2d)w9yGpt>}}g z*C*c{o4&`vU{#pTAmv)P)u+r?G-6r2a-BS235H{AVh*^0rPH~?fs&L_P81vdz3msi zhroybQ#7`-;eMVq6L#=rAQZ`HYI=Fqyx&$C;;_)2WB`lN$L~rXMk%(dUjZ(ZE=!nOmEJ7tzcUydl z*4+ibk>!u)r`qk@4F`@>6v>HG*(e(xr#&J!I`nz>B7vSyE|L1z^eU#!9Qz~Zx@0gNImRqIE4S+5Q@m7za7YdBZ98p)}Bt6=lLz8dj%)# zS|sk;^BBo&JXz_%?ELD7cRRyYESyxzMTj-5o*${Aal)~|3Ya-5E| zO&<}Vco&8y&R^e5h`2IYcid)3u7rB|jjVCat${AQkoV^o>xh8gOJ(Ov5lnV!fEW6rX}1xQ%mx<$s0^c zj}`=wtl!}P6Lu-veI@zG1O<#}7+XNfKkdh~Y+5{_AYZwfDP~Bk8aLinPUl8QOjh4- zHh1g+Ag#8MnZhp&I>)Qr%P+E^v8V|Ob2bP? 
zce&dR6ef$Pl3l33KyH*?w#!}4@NyB?DP7zxeEH_XBT93xN}NtSMeQ%-Csfh2z{9*E zu1ju@zOcU#&`IC(3CW{M*VuL|LTuEbw5uy2jsNzdMZbuC<>^`(=4UQbDn@DeuSRos z?`s#kF6eemhs*362>aC%b^<}TTq=of=CuCnVWkHgE93Ztq4z0wL zygt_$ik*0`bc(u)bP(nhtiy48diHEG0%YowOT$ekHqlZ3ooC&S>sATWG*Ba=6xO`(Xf)qrPC69{U=`S(@w>QVmqCbon$626>qU_5=l-p& zIC_=X;rn@JWZ?<{hei*hot}``9XwG_H7>QfN7;oi)aTKdJRxhkd4OUi*WKbQ3`C#- z8FK7`3i4BQ5@}_0QOwZ`6SWNt+y$GbBG_QaJMiAv#d^1dtw<)Zyn8OtF@TcZqT|<> z>%CM)G)?)8SXgw?Unn!)pSXR9Mvh3*$#S%WyxO{mf6JRbRjYWEzbe1oMg|zQ zztt&fkM2t}trYjBt%mM z&n3ZHxM9S5$?6wB(pB{Mf{Z6ooAyDet+8Cc7S9c}4(X2_<`)|8%_fQ16R#4vNC>m| ze0$nwZg4uZ9KbT7xudIeL8lWTFCre78Drsp^dc=}ANw;Sw&gbb8l9?&`fU1Vxr;tP z{0t*C@M6FJX`U*qjU(^B}TWF1!)%QO^r z+FGuXT5Nu(sQZMwf0%%t?o83GxY6`(fyQ2lDP3zYuAUR)@v>3XLv_3b^?c7%&qs5$*AmP`qbd7MF5EJq7rAbM5)@uxN)?Wd?z|N zA(O2g2~7IV;qs*#yrWS7W=t*bg$3j^{F=Z4JVT^3;WwHXn9#H3SK0ivjiPIw-X zA5W8OWpf?yUa_?a?4DQZ-i$DM$A)KE=}s6B`DAnq3{j>r-R9JM1Iu84AI4_n z(ftNtSO>fpCn<-YoQ70R801#yR2WS6lFgUyWgwKWX~#dTk2-LMHI0Ihp-!KzA`?}6 zftEuMuZeCVij@_83SFx&tE|1ImbR6X7LFxah6{!wSFUs{&3bkI6BGN~eqGj3j|bd7 zXGV@Ii-X)uq;l5h3R=C-Y~kMZaDw?YDPAv@oBzV-&@+hbwiF|e_h}z+-gjX4Btf*J zylt|oq_hEwyuK#VZDg3b>Od2kqw7{Q2K%jqc!HM~)6g>oOs{z(Nier}9ucVG88^n` zb1QwXFDC#xRm8{_9E%G%LvOa*UXD@kM$0LtU213JPoYzXrE|YkGW51-=uOw1eJ9Yx zXJTqzN*uT;PI>>Ehj`TiEM6@!vvZn*J5|k@5B$@SO_xlRS;VJ9`wRIoT_@)JM^G2N zu5=1i#RPw?!0VwF#{0xgd*!PN?Fa5tXcIzIZT6yHxUZS%)M=FqCW#5jo{6Y6Ig?Ck z*E6+EG7DW#P$)b8xnXKyQDVEzqq28zzG3=sk6%Dutx~eGJ8}=E*2D7{QOd&#Ag!rO zU#m|M8RQ)3I)eMj=FyZVu70h8q$CEfa zCO;@4L=N*235Qn@jmwSB{h&cHl*%})>PH2vM;R(OPG%`=@E z4|6$}!gco@+Mf*l4cZ?%=KU=x^U%J{=*YYyqrrL-F{b)1rI3`=(Et{IZ0`eEjuQkm zD)~sjgC7(oHW6*sf!+E6*KV>wBDsyg6HCz0C=zKAobd5IqW{oO1BEU9sLY%9j_>7u zATpSTZ!xhv9DQgLYU8>EKC}~YK4(_E>lp#?2y!w~GK{~9WFS&yoWCr8Dfqf*9pP;l zb5zto)ArHmYnxgo$mtrS?PUf?tAwp1k0?4e9v>x$wJqEG{K%@~utkjtt113lX$#gc zk5Qf6_1TD$`C$94jisg05!RuV@wg>rsisK&x1NSq1?(8Ym&bmk4&Dz6l(xlLLiRe4 z<+vCO(p9$KX$1j*0q`sr|N3sw;qv`xKd#A#p8Z8jecj0gU-BjOon=_)XU^r}6ER@2 zLiFy3*|y4m)N?D++qdS`?%2q;}7rxhC5LHG*HyB58k~H1g*%XE16WPZ+_xXB>YZ-`mOm&jkPg zv)Nwa=Y?Y}_ zn$X+B7jpP;JuH3m9>p=b!?TeEQXbKGN-=d|eg!Y_1xpp;FStj!>y!>7X`;-LU}}^7 zub3b^EHPL^kU57?N*t5IVBceN*H)c!(Js2tmWeXq$?fe?ewZyD?AmE05S+isr@Gs)IBa-S!xa) z&1~6I>bBTveEX;hqmNF9^IXF?!>J=Mv|`(MTqhQckp2Os7dy^(QO@=OFETwyg$qhD z-0%RuBC6Ql-iA zRz2xX72(cze6Q>0ZmsanP@gY)l2EqHgX%dawUM>O!&;qdvimYQvt&;qsJbsg9Ududr^svb2i#OZ{8*IC!O)77eo^Yp;K0DOeS~{X} z#j&C!!gESX;~PujC!j*lEBo2;?`nRED!o1btDY-~sPkJ=k!69o7~|Tr-3MgOIC|4v z=e_SbQ7ssc&WR^)DRU-PBp;Dyt~@;HO?t!7(*Gm>KFucfrC^Zh)Ge*?DEa~=#qA+VWNut97fhgMg<64*m_OZ|RoeyanRnVNTVT!;L< zdhl_C&P7wxsqZz;#5S(J=k6Q*iiB5u_qrqL3If9H)Bud2qOyk6Pyp>5nK?$%$WN?) 
zruy3Coh6eMQVTp^b4`N$(LR;0c+x(WME;OiAeGn4L${ls$S~o3Oj6KfvG%o{ps&Hw zZdXP<5O-?m+4fCvAZ zqULeHJZ+orPMJI3L#6{zux1flefRfsg4D78h-n^Wh2m~R$p{BNYXX8__E8Jd%_#B* zt^zW40fC{XRYQKRBf_{KkD+iIuSW2C{Y;(su!-jRAu-_4yX#dt$@F8U=cw^=OgxQM zGIi{Ed`8RX7%2UO%{p_s8~_OT<;U>Kxetc)Y$FXlVQ*0nt0dc^FRZ*1 zyD@Pfd!N=mDO;$i(=$+qpq%XY>D@p|qJY+-vrPR^giM@wdhk1jNQ4Si!%c>$N6*V; zU}m<ZA5g)Sv0>Z_jZYNqgDazpP;soYSbR9bvHi{kvQRHl2JDkm?5g8F zckL{j(T5uC`$KW4m!n~ee9}GLQet>$797g8jXnP*Ou4)d9`(fFsusof8hwfzTtyt; zEAX=YxN1H;Y8%@9ysdWS?~iQLacTrm&sVWa8Vbvv z&wndZgi$DBH~s;r{f(UP4^M&Hp{#Jr;@}G?IsYZ71Sh-GHhL5qjOU|uer%TCb0)TrCkRudG5~kGh^z$!7%jO3 zOPnebF4qyHNKeLt(S51_d=0n*;r9G7M|8Ku>?cR^Z+y}})_I=?iuLJuH4?WM{qsHi z{cd(Ww;OM4zr}z}YXQxv{$fl=gR@XO6x2iB7cI1Fr?_ox7VCcI%(mi>n`%#vp6W<9 zm=?R_oXHKpIp*ziyc*FPqFZV6>P34QplD~yc=E=pk9XrVD!X8r%_0ca1$RS$4c6vA zQSHy+_ae1C`vnG%VQIhKIVjGdCzSrGRR8lZz~X}#AhogMNkX#h+6&y7Ov3IZ%#S|B z&VWWe*vhjTcTz}1s@d?XC2#W(UQzH6@ELuv*?sk_LuRJ z^{uUIR`jv>JWs+OLd_T_*U zgOc@v!V6@92TyRdb8em8)Lyyy?P34en7i45-0%%wtSv)~!z=CVu&pbj#-B+9V$%+*O5^ z!V<~)m7GG07SDcd&(|~3)PhgYp6mAd+G0Fw`ypSig@YyUa>S5v^24C7$Gz&tcRd_^ zWnVI6b` zP>6DF?{XD&OFUw+AEK#_qprvp!ALxGn$hH0`&J~5A8L1d<#xr#75tX*r%@ zO%|V;V^k7P>RO_OPSahNSUv7mKYpz0fX7IDJS;-e@l?(o&WZYpYD0H@U1XA-WU|OZ z9OFa=sRVC;Y~Vnd=QJ^<zrG0A~$<%6t=VPw zzJMsjOx&f(bf{Vukc#xAO}a+t*!tX_DI-pDU2IErZNsFM5nYlh03L7;s)mI#bjDN1o!AcEUjo3@xsGw0CC}w91id#m+U}N(MwWdTkCVZNTrpt zu42;$$~w9B?V2wqC1Dg|p=qxw(e)-$FZecGrLplD;UEM_ZQg2YD&Aa3Ey!-ys(btK zbg(mII2y^_;Mz(mV%d{Rv)JtEw6W6UX8DcY${C*H26PA}uXfTK#|#Fw)NYnVlz_NH z0U#?ylddo0T=Q>(P;-*&&E-L_f;M&S5@ zQia$Yj+0gYQg!*l^nLN-C8@1BslL~j^T@mXb?Y^MSs2KCOD6ValGMQY=9j#2Y;M_| zJm#w1WCdnE=}BGu4G#2mesh&Wo3uLCGHu084&aKND%Ah{e21{ca@|`BR8xRfA*ffQcTDO>dR+Gp%^5W`)Ef ztJE$@T0>g*HZPO1zyBI#M@>~#^CH@Ea=aqOYBv26OWvzc>5NNICci))XcQCCa~!~Z z66z)^fpO>aRzs)?w4=|FaJb)zotK~M*M`LNLWSWAu=;cn(kYH~w?l(LvcuXj4XbIb z#_;@A8Uw3sJTEf*wT>2bjB+X2{5j zLb5BPtdL~y5l(w#CrPqLMrO#$-utACY%)&cJW=-Eo73<0*7)?f?$32y_xJw&@jLG0 z`1MbY;`AP`@q9kkOHzON37`;)b;AbVk+kxeY;nwgN)^EE21p+X#xUQs{1whS<<`MH z3~vdku5mNpWNPdCz?-hOMWZ+NfwN#rca^EJ@2XEAQe2(lgRZ&IF$Vw&nSB#BjvcoY z>?k)8P)rU@UC-F9yq?DupU;a8UPiir`Hchztg7|jKOLfJ5XhT_zIC3&TJHOk_ndnx zn4%xWxjhy#(-X4Htd>d;+i(>*A~ADwIzll)>@mqV`fP%Xj^p?%unUi!9lsT<7^F$`eCyiB>@d&%YKU4^!K7-Y?5rEu#txY| zpKBW8Zi^wM#d4RXpj8f+t9td$+Cs-(2j58cwdV%esh0O1JeEOT2edWYqk}JFH=hT+ zb+G_S9Lv3_sJu!de&$Es?z8oBggHlK{CA6S56k`ZE2bhh@-aEy3aY2HanKJ+bfuT+ek^$F^+F%d)YH8a~Cy#e481A&KtB4#hZ~k5}sqN-AzP-ZOO#pKxJqTMDuQw@RZ4iuq=;9v`>vD1kWI0Og#( zlDBX*n4k|~>+_YAl0Czcu^cxp34z)4Vz~esz(2xoF*lzYe|WC*h}&ZlP&Y!anF>4% z=SfoVT+;Hzb#nM-vKM=EU^nEyDj3jPXUbRzuwc*hMOUWg*B9KWDt27ze;3ra!-}`z zJ21W3xKbL)!oe~Rv@GN@go4U@4D08opFm0Vu$Lb4Z+ePlWvuh#xD?+a8zuT4nYUF=H$RoU?gXPhzopYSSfh>4UKGCz!PwMVu-CNoQO%JBI zsEUwUs#e#$2_uQ8sTypZw-UlZ6(VpRPMgBaXWDy7ik;X1Sx;jp0+b7dgLK~`oL$=Y zeD|nk5~2yV!9NU#7+g(DOv@m!yVI%1O+LbEF)2gdTnsy5K4mnD%q25>e^Mp zrPLJc5q;v$z|=V4V{6^s*d9D$=DnawzqM!Z_VnYddg9M}DYh!1rKY_DwvUS9A@^3m zo1TbSmZp(I#j_`*77N}+a5T$1KxlPAFlFP~H=xT;!rQR#`~d~#_6X!P-^-3_#EVO& zio@$+YzhWnP?g1v7`Sb)9+m|t?CNY>6#o?Z$dwZ!D!m1G0_d3T2>OUtvGd)E;vs+I z*DbFYb)sKwodmjWKRcSqcQ>1YQ$FM@IrBg;6&{4hLogLUL}M@_jPf5So@4TrH3bq@ zaKbR8Oip!qk8=?Dw2oo@xlxcBXGzHno$~a6(h2CR!r%BDI6HU)<1ut|AROWD2ms*z zvZws+{%yOlnYCPTS~BX~d}p2)MQnjulQD#<@>LgbvWTB>JaVg70aPY@Lh}60>F@{1 z6VU>w9jf6p_S#uUT}_)WW;;v%!?^%DI4N+wm(MWTy_gw_?h@XK8v$xR6d+h&pZoIx zvLf+-+w@P3888W)NRt3|@lQ4bZ<-})#v8|wdCvV&A7JcRe_!uUyzN9kJfkWB6sf;S zsUL!RSnMZ=>fi;a&fn0$GXUo}oO}o%#Bcy--n<|^oeKCRHB%C1dHac`K!U)>2An6b z+~dWcCKSkX!SjPa`FFZ5f!Ur>aUQz^AgFQn-BvS+zxuyiXeTqU_515aKi3dnZ^M04*dGR z(7W8c-#i2a?EOT=asVXeT_6tFz<2+G*n?4Wzqq&r2-TP{0&&BvDyGPG=0E0Do*`i> 
z2QuWrS0DA*emuC@Kd^cu0Biwc)W1XKAQ*ekz-c}+-NkoX!DF37g3&kkn;jUm=YONY zC&nT`WZe5TNdB<%KWpZA0Fpf5oebfl5?@QFXRQ8an|v zDUhVAya12=X&3t~F8sGo@}I8d-?-NRNc@A280P=?4+bP&vC>y$2I7Rp+J8BYGWtCe z>@Na=zv_v=|9^Zj<0qT~;4gmo{{D~A>i;{vYtY)YHIrtl4RB?K|7GF&V;=r`Z$QZc zdAz{O0+u<`G14Df0WjLwn)aTEHT+J=;7V6wbd2DUspYv|{IiwzRj-vkZ(zhO!W!A?oY0j+H$|L{zqrTOo4<6ji; z4=Whtas^MG%{Q_%w>yRXWHv^Vj^FeNgw|V7R0FHh*E?Y4I|546e<$<@{1N1mqEw#- zs}|EAs}^`rf7?Z&|KfnbwpjdigmFksEdI_SQBQZqA2@4sT=S{N#KxVpD%pGMf1YWS zomB&Xv!2{Np{y23ICu)j=%a<4HwgOPTUng5YPgiL7%VbykE7(xXnw1Nh2au&6kmBr zJ;l1jh;E^**0avXHLNOHyGNW_m3q?)FAdw2fEg_4G+Mb*oRT_l^&fVHg8==cq0@w5Mee%y2a=x{yr0 zF2Q_a^0Xzow(~G@LqS%n6`ZUgVn5rR;qgGr1*8mE7407>fiPa`8@|bOR<(O+8n!K0 zQ|7~xsf-E~$%K_QgG+~X(frQqhAZ77;FP*UUZjRq&D1L*T5YG}ahHgVC+%yXY1zGe z$qKNmWFPP*VC|Z;!ul|4_fAdrG^*XO=-6`iq@#c;PzWB+{(D{ z+cSNaM0wsd4Ua#04+Azojg@cTyQqy;{nA%Wum>__vURAr4GwWw-M(*bJn`f+AV9y0 zHcx^cKf^#`2NK)Us9jU=Ezzw3=7eObd!}~h{9`7Rf-Z^L`YtBHa9gNu9X8%P&y1lI za3c*A(AJk&QhlwIRtAj=FZb%TeJy2et)DzE$W8u3@i;5~rsC2AMe_aGK>pEL?PuR` z%9ct;Dj^U!6T9r2=NnsV%=#n>ySuU{?F0GwvyZt+c76y@pX=V)lvt?uIX~gh9u0Z} zdB@_{F^B0aQWRk5+7WTm%-1-loZT9ah)t%_)a2=GLN10tF9#fbvGMhiwR&V_DhnPz z%qCkpIPLy9@{wEm`j*e6dpM!*;Nwqj2(saJnV$K?HH>RVzAqKPz%QZ2g6GNY(r4hk zun)Kc#NAedVBvvag*XKR%aGi3Dmf{$T^%w?s-l20?x3X~7nxv~OEWQe(kfxNSi>mo zK0i}~#z=ebnmhQtb`&Cpodw+E3=&nqw|h9R7r-DjHcmw^3M4;bR85+Dp|P7GIyLuw z4@C_)5F9$iI`JEvl`Ct>C*vP38|-qo0@huKX?XrhBH~ovVRJ zZ~^FRyY@A5Yy{}zHt60p?1n1$`AkHLTu6C}f6OrfriJWFM`$F0FoIBE(-46AyNXZ_ zz~bd$%Nbw^D-A-+B_OJZY$r?u(p)49;~hTyi**Vp3#Phn0096?8`QW_zN!}_13uIL zVNL(jfnSFU_RTZrA*0@pfC8Y@>d0WJrhechl$ps3ru(vWy0vTzM@> zP5_dJzw!*?^zT1~;CQIyS05h!kA25~Fvk5)AJnfW^Q!>+Q7k|-{_DyA79v0lS5^RZfY&_W_sAsOy#&>#L1e*J$`HUDq4 z+R(c4+s*??bpGFIKEOzdP`r0yZ2foon{zJ331HdY8xu%w51(s1B)Gmj9D|nq6vOs_G=WG45nT#xju3RU+KwL({cU$h8 zzdV!-0G!Xhe6Ub3v%`S;>_n@{`ARylu-HLF*I{9x!(r}gBcCQAutq;l{UbJ_64a={ z5WMpnDEXuM1G8hchX(kqCil-Wdl=E1y^=N*IjAS{7>_D9uBPTOe0_XvwcH$$n=2Je zgD$Z>HXHd>_d0ZYJ1+NsjDnINz$dc*tZjc3-hVAZ&>H|+Cz$dNcRnOOf=2PXG9`AI zf6aslJQe2U28uO@8z!ChKY^JdLeCP9x&o!#ohkAVhRe>Y_oM5ptMe!O`( z@Dc?61>AwY=LEZ9_t+wR824Yt6Yd!$26h;b&mBB{Q-AI64F(uSiIBwa-wq>fFK>0FWOmNt`xOZXVb_K(NUVg(>u)*+BdS z41X6iLH`Kh0d*_TB-;8wLh>KlsdQEg2qgJoUyULI@9sbLLBUk{K|iXCpf90NU_pUN zFDGl=+zShg5a1~vs|4l@Yp*FrCH=k(MUbBqq%rQq2G38jedGZjc7!^<$7;XZIDr}( zO&y@m0*bjmqH!QJrS-e20%qfh$nrm-8sK(D_!^TkO&B%g{@B0%yBhHM)uWRU)8R0z zC`CP$NJ#m}G-d%zN5TEs6x3^C26d0}=T5)W+joD37Gbr_f!GY-dIWhtKSPj!>#-0F z&XKfLFvvCJ05tm}h~$r;1#m+Gq@kLC`M?#DfAcRB^drmM3;vVjg`Cbmb;$n>08FXz zGSuxsIgKqrLYWNLlDR^+)anfHrmkGuX_YBGP%mB~$L^^p55>;(Zc5aa+V zob*+{8+xIpB&}6<`3fv*3zL=3x&XuGSzeeyklzrHjm$qAcnE}chmAscZ+Wgi z4Jz;=L+bUyJ&#>JelK~=A@v60xlUmS9_?@WfA#jS0;$>qFz>0>@>z5;%o=gk!B1^8 zhuxti@yfslnW)MUpEF`$wbulvaV(V9ue}P4w!J=%0#5$r5`?xL=-o;vg#~g@yqVJdSvsDEw#;;cOT^1+M zHfEQp9eK1D8B9OC{g4GNA@$S4CofW+-;gH9EW zt)&qJBr3^rMz+iOo~2My zMtyXL>T@b80&WD=E`E8W0kFyTG?$kKvVtQ0A(0zxd;fx5AC8tmuDjBp^f&kiPaAno zZ(0gZqkuHl&2gbG&H;NaHGa0MVGi<1(#2r=!lQA&pAvgx(**7g@0_=jEcV+EoUE7SiADI&0*qccX zs*q|wOgYQf{leezS4Wy;q#WVxFVi#>P04hk+6S?92{d-$Vkey;vbd4Zg8O-*ZNp;X z&K3_aU{XrS%Bo$yPsCYTL44Iw^L0-VICQdt;E16qR3oyoPyJl3h7RbQo$7JOcgGlpLB8R#!XnHx=qrDd#>@VuSQPJUKveok)Z)sl)&$!!goF(vkT z#mLI*6`wKHxF#o_^nM-4I1hx+=hPv>4@UsPx*+)W1oZm(1@-3maHg;@gFv1D;y%!n zy>kT`qxG^ws>Cm?RzM)1vbrHYcQ%C{auvR=I`jtRG%lI^2&Qoc>uoWglC?ei6bzeFBX{I z_4hIdV1n~`Ien0yKW@`*|5kvZ;KGBSW#g~F9ASo`*7qhqF#Vs_5xPGj@>)T#5*lEc|E6z)+zU7@ zeX%x%UsLaoh6d8#>L(JIJijtY3)=GYnSm9M1qG_jtXXw$p0uX;C)s)bT=D!!J~+Wo zcZ+`I9+YZz{7TNd5(4&{li0jcn>L>MaT!-#d2;IHCBrh?56^n;?k%7v>MOSpS59f<69kyacpoKDA|yTsx7G-UKlY ze@jA%{7FbayPot(h8X~3{|&cE!I*-D*n(7q6BYW+)J}1fH%&tTNKE6g2N*rN*Pt5w 
zoxcaN_5P&5esv5#x&z?Z0J~i=_=euepIg0L5!fGp`3&-EUKl15X9&)8=|A8eU$Y=c z*a8x}{1f7y2}3AcXnIeR6=V6(V%&b(+4wNjisLUn<`Z)~MP zj9y2!KPb_R@6AD9*o-800ymjmHF%BJDA~C+GPO8vPmW=_S!egTR_pohiOeKMLl$TL z%V#Ng7_;(#q4eBT^i1kk&aJ^D!a0J&P`vDoc0@IdqAPl#-AZ<%%s(^1mUM?10B6W& z+-|$8t=~(Yg2nWGBB!Z*ONGD+QSkv}d^~6XAuCHy(9!7nH6^QLS{qKyhXT_kaDXxL zU9vJr@QO|5EGDokh~Bfor`#l;PUG&XlY{nO3`V;g+KscC`@!u_=slg8VbLaBGWvh?%Rhds86E|NUtTE$}?e(Z;j6%?!r0eJyArfchMT&}*BBEc>2-bjRcYXP0BQN9;$ zfCl%5y~QdyutY=3A(z~JDKN!0^MS1AZg^`&{?Wi;nG*HE;e#54!lI0TG&i5p9I-|a zsl$D}yDqz&{f7!JH)X!v2u#JBse=^6K+mXc37j1p8(pRQ5p9#IGeUy{>*N}$K>Yb? zga1;G)Vv;P#Z|9KO}{s)oQu2u6&ip!2211hpS?F!Nz<)^iWk)$a{~n=d#G}o2@7%a zK{Exq7`v$eiW|rl=EOxy;C=L5&Jm|~|4>h=d#5*DK?mWmmb+ZzG`7`~5r5lZlh_Ew z^JGAen-Vsp+pp@7DyD^R{p|oa9`{ApK^RFp=XsHJ@-x(AZJZY>cWmM-4HfB;;|~#B zrp&B#IK#-BIRlT{(nR}Ra_#*UbcQvTjDCnrS8!FS7o#pp+7ZaCA% zGgCVvyJZLCpVKJs0oo%XK5uMi#`O79xJbz8L`pbY#w#T;d}&PZ;i=C_pJd=$ zQ{wY~tldHTTIK)^Xz-&UpnDu41=)TO)K?C?i+=f{l2hSW$67ek?as||F}B`Shi@`12NtZ# zX%p88_?MOiQFn<%zgKc`w+4=^N%OmHO6J-jmeuO*F^-%xq~HTI7o!h`JduzKCgqE<^EW~k6tG)>ycb2Pl4~i z$__J(+BDtX%Hc=c@tRW*nmQ-E=!qH|-Iv;C@7qBtnXioV3GLoQ*wux7U`fQkf-@Wo zUZKZyuCOuoPH+^>Mo_Hyk!!^qKYEam@~Z+yMZCkSiXlQf&$zX)k?EwwD(6J5Na2(% zgR+rrjVc)r_XmQ?clzJ{^@PB{8^m{u6CPOUv*UDx3+?DTbtBGgdkL0^+^uEzZ~%8J zGd-hPHfG_@24>~t<4qtIy2$hs5dzL?P@DXqS)=v{AA>Nd^w^tE!(5krW{?HUv-`|b2o{6{18nN#s97#N%RY01d?3H+Qcy?#u zMwioy$Ex4R(oAqArc7XNZWYuhSkv)%G6inDsKT<$)ZcJpq+b_s2*yX0tH=nou;O8_u861Z@q|sYVI;b--p5tE4 z#^n$Yvjjgz^Bz4@I|Sys(u%*|GLRj8O8uF->Ilq-E0`;L&C;O5Sy8 zvesh~fm#jyywo{triy!AF=*0R@1YutrU{aRDpHh4L7URX1W0}QUl>r%@TGh7Z@Rrd0_icDk@I3mY~c00qA zMW_t9+674t;jh9w-7tTB-bcniR*@fL-%H}2*L~50l7k^NxD9inTOkB=m|t`F-@dtt zNw77Y$t@Rb5SWV(qm_T?}knr~4JN(B#2$R~`Mlua{ zvx`XrcGKkmRjAUgap_OhDdFnqV%A7 z3YO=F3BiyXr>X_sDz^TfLi({80w3v#I9iUFkhh7oD)thyty(^lR!bvI9F`;jBl$&+Nt`eOeZ!$&EdmH8 z_)9mXF{KI;S`T5V7b2n0TrUlLFyL|VytE}8rlV%(HTgiIm;>5Qel3Fe`K0Hi$)q{A zIe^3LvX9$|W!(+ao&=u!m?aY7kVs#4Y$3R5E=(PI=HAagXD%m2b5scCA`#UR%v9JyE6Bb%GzHk%740qg~6vSv0 z1@@^7-t{&JaoGBB<3Cq1(`x&U_7j6U7_Li(}5WhtO-RX~e z4d%ScWRyZTDt{6IC1f>?JNgS>ulQ$K;I`B;qV+GImX^i>&N%=Ly~geidYW_s3d~y6 zInVX<74=|;AOV!+QdQ|Yzlc2DOu(NUqHjrmzek}>e*)NwY%!40l=@_jTHq9576^|x z&3KUT(G#m>9$P8N1LsuO0z%&PftSyR=VI%hlpK4C*g~pjOJW|d|8*~35-MI)jI`j4 ztL4`WK^d8scSFBngVd~+5|-V5bF;xk8P>ytQ`L_1eXQ4R-6*A^=D`L)4sP;8Z(PDG z6<@M32K~42OXWa24WN9!JcxmT)J{Ni^Ye@UgRui1E#$}oY=3E5;!}{zSZ{nvl~`QE ziXJOU`g`~2a2VFzpof7Z7GmK^4l8-Nb_-o2Oya28gC(K_9 z95|98c}0ipG*DB&k>fgq zJ7{fM{(cj`cY#t-@Rmk&V#^+bY=VJ^o`b)=lT7gDpv9l{kH!BqBl%^%HYjxgnlwh3 zEx(up&;S3u{9io>SV*MNpeNCS2$o`@0jtN9e7^*jRGu&bQqQ$;n`6 zNdBsH&4dD)*9IRmWS6v{c|D4K=Lq8FTJQf^E3<3y3|ovD78#(L*hntfV1(+Hy3gHx z!v|;zS4&m%v|$5R#=LBGQ^3BzrvdYrqH;tzAS$A?U0R@AU`ueq5v(*cnQEo>AQwbO zq%43VN@G1m$1sy>2KD_V=D0;EA$ulDU?y%tUTEoF+1UGoDbbGz$1JcQ3&NhM2>O8) z^1fjNF>A;p@KjQ5Ilnf6jbmp#p5lm1%__EHdE}vxumUy_~+G z4!v#;ApU@;X<}>z96gaEVYMc_`MN~B?($+fI9Qcq5wS06A#l4I1fX}&Q6%A9O3~*& zBUqOd{Ro#b!b?<4UiM86!9lqGO*tcm$H9-LSiEVd*ixg>SA?ryb{s3sidTU|X6j`q zDFL|TD>MLV8@q&fcVFe4M_&8U1k{pL0iQKanEBErLbR~gB(9aNiq(6tLsUNmp>C`X zL27@CL(4#jAjl6gEXq>)W6@ywfF&R;vi^ku1+3Tj=vP?+Mj+#A2(a2rHHjM%~k>7k0?Hi*H5DuU`?g@BY=Wa+dJA23kzg9vzjkXQ?> z>$fzzFPE0z^(8TvjVJ>$UGZ7mJ$!FxkqFxfvG$}xi3ju{hZ0$1x8NOXb=29_jsRoH zYhB-)kt5OW%;e@rM5qC*h^H*5bwC@8-`tuh(L<+Re5v>%%GN6Txkm><{$KcU!pK8k zDRKYt!n!f=C+VEmCtH>W4T5trC4aD>Md~cIiI?E}aRPR3q}J`N=@H(~hX&s&wCPTO zsDN#krffI-8&?V|M`z8vCikdXK_yQ0lhFRF>wV`->rpNU4#q%D>d0BX{*WX3>==8e zr8%)Yp~j^$*T3v&Xz+2xaM>t&W@H&9Fr+jcf4fp#=Ir|TVv&LIw^vQT>EOB77Hc_H<7CH{ef^qu!M7etyXnI=Esw^M^^Z!J%=2f(hiF%g#}i@lNl?9g%vJUX#(d-L+u)7vJ|P^q#yM 
zT-EK3JZy^N9O{s-ados~ctvQfbGhn;hF;7S^ah{G?E{{VshSQO43@_$@e3mEYlT@= z@SS=%N;UX)7l)v2$ibYVYvo2>EdsSQo`^c*t3WE9n-x78@h9RGf>rXfS7gO%k>JJO zD+IxLkW%FzYQ+XiEl0G`a-_M{nsu4&nV>75eanxCM#heA_`A7nXPThMMl8QQR&#&L zcf=T)nb+yoFYI>w*)m44t-N5OxsqSUvp3#zc#j4joAka%#VBVy>Wh(NB}duVSL}tD zt`2+hM27+mM;{~9^1;foW%)er4uT&&zb){L)@zclzM@5IL9k!L&Fgv{&k3GPsojW10l zoNLEORdd2?1AVaa?Fs96ac`HM>td}kZg|-iU4?0^^fcG)ueT8l(<;O*-1AR$So5t( zSp=m=m&X&U6lDl)KKpL&%nWvk#4d~;!UdM+$hDo=IQWs&Hr;%d6`^v(jkKO&!XnoW zsHBnH8XPGyWF?xKMh4p3XeqUO+XS%kRat!a|otaT6Qh6-R?u_E4R3&RneLX2r z(Em3ijQt&Gj{v6PnhRREz&j#1bs#UtYx2%#<6pX@6n46p@C~roFM~*tsZyk^Oc{s- z9ey#8|Mt7{d8`)R)2LT@7-5NSi}%aKaeE^*Yy@aC)sXHDUl|HN_zLoK!2(mZ#?zBbQ#tP!9oxE1I^*FXti^(!k zf>~jU&iVLgd$sMlQwM#oTBLK6a#uN9US+4$I88{sqL|gdTcOE#feoV_@lqi^G5dIj zfmX43>oHL^CD*mH?O3^UzNtGmGWPYhWiQn69u-^~ayy{jDc4D~sf`V(pRXyD9z`D= z?75@2%Z}}$4!6y~*e~(kUK2Yw+7bB@X}$7oeJWRArRHGy0Ei=((JSaN(J&Z%CQ{8M z)pc%4wa6&#cxvE@L4bxwU!lhts|2+}agar|+uV$zv_kirb0>Lv@{9Oiqv5m2@NK|SwJ9p=RJB+pv)w+TfU0pqF1Q$V85lX6B}*ap)^9JP z7AtbPj`jqOA4GK2)nlW|9hxe3cxJZ`6BC`R3`5hwacS0U*@Z7O%ql)0>@AkmKD(#kr zYTQLN>6Et(&(QWbw0ux?eqE_RrdHkaY9U7IF*3?)MA_3JNW(mbmnTH1hmgARWiTK1 z9c&-xrW@ni10&0h@EVIZpqZz3f9cSb4aait5}461b1yC4KR05va$?1yr{}Tnh(&MUp>6MqQMdDa34f)i;lpPw-##PtsRhgVM|U+!UvA^mlwv0Z z3oJ91J&Vd+-m*}$o_^%IBGA*(o^%s555gWxHXanu#5O)q%zE7Ne}EnqKd9s$vRxK) zqR(by2iO9X7dW=6{)xC$L4TA?Vz^=*G=X)eKiO%73|%D+={x75KvRc*M#c?P>|EY##+ z<{FovZNBRaiU_qXL*y`a+(^1J*WPy6aYt12n9QkBj;JaSq1EWn@;E(ugOi`n{d*+U z@R$^1;@(IV(!oh&x--sRjEqpY9aeteXE_!RUqo;AwFR+EAx5L)>*o|uHe-V=IsA?h zRZoXWJOmc6cpp?^4$XKT9Z>OeVvp<`t*JRJSOs0vastqkR(OWkAMK3klOOXSUgKmQ zBw~$6f+)3rt~93ToIiDvAcq(7mMV>V5*r)BY7m;XfwAaWd%R7YWi5v_6&m_r3jQW+ zgHhM-JUcFbPZmG zFSX-y1J+qp?f5B=7%zSL2zV$*N==^`Jt_)DVeY^%Yvtj z$=FhqFq;5g3p+yVRLjmW`dc<9c_npWAx5ER*xQYqGW2p(*ytpi&bHC_8nrz5w_4A3 zCuDn+whg?+!@PH~qOEGV&^x%;b8(Fs6_K5%l*n|+U40wPYzg9gCK+Ndhn-OFiLSHX znh#w;?AEs8c998uY1cjBO3lo&;K9{AOjN(v`WIrE%2>;aUMZ!O%C>Eq;uKW|a7dk` zsYL(#9?~a|#MaBU$G#HIBii!p#qaU7De1Tzc~gv|dO5;5gpd3QEFHY@Ho>WeD!<{Mfc9LB z$I&HKCHEnUOgeMwg7%~jt}7t zudg8%!&h9>31f!ZYn|;z4T}WF1ZQ4dV1!SV%iI?)oOpFlFcm)oE9?2SBI>+MembZVMm*ix5F*-PFO&9nwN_fG58XXCJ_l`>`xZNqZPE z?|lDrF*D3IBJ7s$_M>HXiGH}bW7)Ms{5D|?xt4;HNB|=(+ zLtI9_9fAfN2`4}$Voy53EPung%;QLX@Y9poT3R#RZ0=J+9uKN)NT_uO&1Ib2p+_c$de^xGG}@pJs+1 zF3kEo+X@fGWt6EBkw|E2K(`L_6=DB4;`X&C%*yh|WhIFp1lx^&*sa4#C zn{BSS@dpauOO6BYy)E2ZlkZ|O-A3-Pw5qr2RQGollQ$V`9=*yt!c)j(5a>K&nmd#E zd#FMqRwxJyKkOo-Mjkm%%VS>lg=LI#`KNcq3ypLuvi=;YEMVvo_i|vi_)oL*bVx_! 
z-cxO(5ACnmn3D80xvjhKDud!T7CtxUzUt194OCFy#*I>2QJJSO z>5QbndcDzoKZftFI(u}*_4(?mX=yZLR#yjT++(b3b4 z9Le=Fw?$Xhr}O&0UF5gJsk|LdoDcHs+UWL%w-%y#j3O~JI4ES|<8>WpyA=Z>QWAZIy9e7G^kRh#501N6RBsx)8<9&#j)h0*xbc13WA8y?MSCvqT#e+^YBXKW zVfrp8;^}J`TI=iMLs55NUlugm?9#FBvDhEwdTEvO;Iy=Mv8h0%y`{Nk?|R?)0s1jX z@-5Ok^Y3`GF7t11QrN-h_7L?9XKzKlw)8_-_EcZ_nzrScG*Ce`abJs+Q8bew{V1pn zU+4^-+WbXklMVx~k~1-fA+7u|nTnbfk3t~iQ>yoQI^qOF|7?0%(fvydv5mO z6q{>e=3QVcbFQHMy8A)wIJmMPa5pWdr&sm8#)>#God9euA)05^;gLyk3A<{ZPZEJS z3uW_0714{ytR)fg;BDT(3AO#wg*w*Fefv|_-nRQY=~%py?DFu=G9pM?A&`@E#VUkT ze1aEIZtE>y4=m+No)!JIWd2w=X)>R`FvMvYAkYynR3U7uRHn-p+y|>APrjx~hy?IQQ$bF z-Um}B&pcGpTg)%u3#qn3h$trtyuSX>>^8#k(MW?i!LH-{g(&et(^{j2b8ZJ(o)PSd zeSNf%>Yqz4IiLb~eUuB+3fRH&iuDX9U!Li8WaTHUWOp~WC*;qvZFpolrcRw9iJloV zB)vJuvcN&^MQd&<<0|zewn97payG0QWA5wTL?-7_s2}3d3Iw4 z6^=`{8C%uo>EOqsvA3+p^KI^ynNUwvI(+=B!cZZak`sF~w`dEUL>=c)r%(TaCUf_Un6{BA1I3#*Wm2g)|Do zr7tF#VGZB4Jy%}(yqgSow)UL?lUXE2jl*W;ChbB%!l@9d5RMwJdZmxrnl0LX7d|C& z4-`soziD`~vv1F+-s6tUkhEcxTl87D^4@Kcs968V5VM6^XC>G$?h@~6xMj9h;`#5= zG<-G`2`1GgebQu7w)0^qZs$64rcw3bT3g-*3{x9M&mGHjJgOFz5%)I<)Mn8v=4{er z&kh~CnoPe=Xbe2)^d{1IdZlCSsE^uhyX_IX4xHRy3T{GU_f_;VY$>&;yY{bL{Q0%> zWfvcYWYZ0~_^?TrX6dOp;ng`jpu`X3z;09dH4MS7eljkDqb)1MYz-;}aN0%K>0K7P z3wEemPPL@G3b?cO^&6Oslz2Ksd=8=EOfqloY5Q4+$amfMl>NjidD_W6m#s5Vm=-ha zBjRIFc(c6zYOwbe@DtP~##Dt38U#<`MBQt>T6iz_=~9oJy>xu=j1szghdUxR;)#Y! z8kxbGrM7A>N+*fCIn&;iAMDvsGva;q6S3dD6esFLqh~+-pks_LCzg1hjF0!FgaWzr zvY50+vZ3Og{!{k(I`u*EuGgQg%Q70g+)&mUiEY06Kz*LdNzdKLhQ&Yj+19%)+*~pa zr7vP%ak4vv9Fc}3J~i&mmN(6wjMQmlhEq8cpRe|}E|b`h#C4R>EY>~cRrOgapb@`G z)l)3QEe!F|g5Q8+(#662vPn_7S93soz(By9W&P!DoePYKqMj|~egtIg=5d?w9S0m_cZVmzm17>jQ$Bem!J^39&t>x!w=gX=CsF$^wuW+Y!Cdd8|tF zUE1SSemSXUS@oW){m!yZi1ae;0pXn{*}0y#Np9#DjeH@83rWOHKALo%+5t2-&hX#4 z+mmTvj*V=*NJFk$nC?9(SVmDM6XLwn$uaRNfL0XqJM$)w*bNV?+gnfj;Wk_JTys^@ zarC_3*JcU|kobg69pi1Q@{#y{)#2~_sE_UAs+{K54FOSA2?9U3-QS8ZuRaKP>32!v zm{tJ;pP<4ePsf(&1)0DVKle9@bUniEBrQbNW-~lr_L%R-fHQjiuS~A1J-fR0P4&(3 zA(brNvA1UeX~c~-0n-JkFsGd+ z;td!(aujmpc5!I{6|wufh;lNj*2>l**~1b7#&p&p>@hfkf_hdz(orF<;^n(|v}KbG*u`?Y|cIdC!}cNgw@& z`c~<8=7p!X^tHqb8D5DCzQ-3@!2i7gzYRE5UdLso{&Utq6o$f?r^5>%k$LIAVMuJZ{Q!awEW% zEQW>VQ>oQhWygJGzgsbpD;xHTIFB$@>fYnCNIwp-sJRIWmj3)~K^&~A6xzQ}N$U$> z`_)oQEzKw^E2~LS9K@f(BgLfbanAkdjMKt-6}fR_BMx@!HoMeq8~<}0A;Y4)#&QWh z*skZ2Y&%*{f1%V(T195s6*w;$7dvMWVsYW>52UJPJV{B;Okuaw_xlv7-pV_VxSngg zH%@IYa2iWXA@NpV8Z7osU9s4rrnB99tun{x+m2aYy-KlaBC8K_j4=^%BR6@E?(1W` zceS@MVFOdt|FJ(gSM%T+6|H1AB2Se7c8h85Qo*A7DMqW&!j3)>`EaHZKUk#uhvkBM zPs+@OPVr``@`XHG4GDcxWl8dY_!i!&$p)3t=+N!n3!g+#Il_g;EoW`!Jq}P9*tW`y z3PAu{yYz73cEZ*{^`1ZX%bp%|^GUAgAN? 
zZM>=(v{gHm7J5%6ZzP`rtFAHha0($Ru8p@b)#jb`wylV^ncHEW>cvX?v*{T5a%(g_ z?ja9pEvtJ#(UZ7Ofb;RaXIRGgCAV4q{~_!xgW_PfWno-{yE`Po-95NN2oNkmg9mpf zfx#uXdvJHR!DWEp&cNXAdMEqc`+WCQednHk{Gp1Wo>{9`ukK!h#y;0{_F;#|f3k4S z7q;Y$n!l4MCca1VKfudA5p2<;XWQ=!jUj0^?D))}72$ch2G0nmzb5>&;*Ub(&JEwv zR9hvk1rd4jwOTGB$?v*MNjLb!n$_>Bh6n|94(siKmvW68({Xkvm?FW#O%9|UsUj+b z@H|@PBcUN{3e_eH>CLX&LL=@kZxno{`$ssajQA<^i+n@3#@8eoaj#K6Q$15kJoxWi z!Dg(sAxT0oX;{E%fhz#tr;X%77%qZxv{SI(uBc?WivU{GiIKn?9d`2VQ?gCrS0vOA zgx#47^HeW_;i-Njp7!bYcx>cLAwjO*w1#V3DN9XA7uu&LoxKzz8UniH%R2z%Ve%M) z!`qE`nhyK*>HzQjNHPSh`&S|m#mQr7~jyw z8HX+OZCRZgR5F_0b+!~8(~8h@zIgJIZ&_oyj%qf%VF}~>8i&sD_2%=NrzA|^%SFN+MGWAjXiPG^@=AgTM<2vnJ3)$G1* z98-bZ8$K^?$G4^r^&r6&aWTt*rKr0&b2~2=tBBSogN89rN-Fz1PX8TJHt}Q0 zadCqHcudnYO;oJ<)7rXuA%6>jW-K{k+{=z35QECnntI;Bh{~Kf(f2otZYWj-|`~22VyL%D-?5hwHS~GH$&0xj@T|6PLk|v;NR92j)h|MT<)jF` zO^S<*HImkGTu_m)s$8BPF; zTqMNg=rO&jix~Dta+PkdrwgXB8HI(QnnNqWh#;Sdfi~NgBH9<^teSZk#rKr)VBz|D5AFX7N1!oR}Y*zOBcNB|>w3t#C zF?6$+i0IjtOTs$YOFsBsn>m!2WsNvp?HDai zm_#x5awBhG-|9m23dXf@%VcjjqZgfy#e;OXTG>5U0o;PHnWJ!Wt2HR{YaE+ezpPh@ z7M1uod|RBEBUK_2bgToHV_cuN6l}Avx!EY!19?Jf-#yiBv910|jy~=GQx_UnXX_la zFma|xO37ayj=2GdxP z?}akE=_Pr}v`E{qvU-KYK1Fi0OkS#-?c}7U@cF|BIjnpr)bu0?HiXk6vhAK?_l9O0 zcbNz?jvI#lNynlFG!ZTFmjqlQbcwx--&lpDZ^l?LS9^@JRr1SBl<%L7EeESNDdNB< z6k;LAWcFDi3~G;p`BiuH5pbNwp5-P|x)AT3&4VNo3CM0uc4M0;ofuz_rP1PF^`Ecz zN+k?5zD&MI-#rx@>n)4D(=1jrU+D_;!SWBpi`RN6l3?g$sv5mL(xcP_0!sti2fN5! z2$WD{=k1p)a%?bFr4la?V(R3Di&GnUoL_KRjN1aUP4Q@%#G6ot#5{Ld5PXPOW3edo zYk9Dk8eT3QbuaHX$BD9Jt&=w6nh;U<4U+H~Q>hQ4&F@8HS1u{#lbq1Ez?iQ`*snE{ z8V~f@3Uf7Y{#<#Y>#fls_&xbtP0vuo37UGjyXYJbWEFZS$UMi{c2M5fbS1mFXvbHVYt2kaF0`@#VBmT{OY$P_ zn!7rRn$LHw7UlN>>F^U5iGWXXB{*R@`3vMpa!9Y}9U3-c71<-s>L|^SenUPWyOWi3 ztTXcKgmzHQT(WqA?g4Y7uJxjEPRWyq#ugZ*!z*w~P|QJgZNQjOHoaG;65^zf7((!eA$z4(lO7ix^B@DY+!m0@^T3?{0{`U$KJAjN_SW9VkfxqZRiGrHbg zDX;XNd!81;@Hlk38vJ2%&3iU3{{N(E{ocSHu*`E5V@}1aG~CpXmgM6NccMf zh2Xg$yjJ`Lcw|C%uWa|(TZEhSSGG86sIMk{L`s1*!3mo9<-{(+ObLsgR(!T+`g~QC zi!y(_j6}ng73Wtd68fW&>w@}bvp~;L0$o?k|2+QPZ9>-I_~`Zn_YWmM#zFT1%~8vZ zjaahg2|(_&gQpt%`NBnWMm9a&uo)ffyqI!u0++<6Sc76~83V>|>ywkf+{o1==dmHw z-}Cn*k9>wRTv6(4ukt17_`~tftY(C|;^sHmx)Z2fTTfzRxF0i1^!}ik#|AuOt5dif zw*qbmQ`nv_zP_VT_|-Zx$RN4Drq7d}6;HV<)Fmv!#9a~htN?r&0)MqqBGQ(CHvYW) zf2g5D*?*`ZVVFv-rT>9a-%GlL4(>-!oZ;43>kRTqWFojfG7}p*Tmim_A_VkR56`R! 
znSvp>tm{RG%^@YrhnkN_JqIO*qp>2c7VvC3ujg!s*r68|rnGS5%#kgaeAPT^J8mv7 zd;oQg7BmCrUb_KLD;Fi}SJEOr&vEjBx0ELoDLL8g0h7egz)f z`}*GV3V+!EaJh`l=2i^_aGaT%-#@B&Be??m(c`sfW8ZMJ#^UO6=u2Fre@-3h9MtI45^r=luXt;>g~JAK z*vXfAN&WbhL)=fo?UY{;=uZtrdapEi;!WjQJX!;$ZEt1hzaH=HbahytIIm&usHjQp z1i5-`cx_l6=?jenfUi79NMTq87n^K(++Od=MxgA0Uij?aZ^FjQNNTVwGN@cVGYjcy z@KdJs{GIS-TM&4G1QDQ2!Q+M__F}~)=3kcySXgV`MzLUB4BB;7%GDn&t#RHUt}mq) zMB3Wd(uUQhmV#xLKUr*H)hv8dN5Fr^Ef%17w^5CN?KqP%VB&9 z&O-bW^;8SrruB20Hi|6K7bK-+`F2;>Bp)l?y7w0l)b5SA?e9lmD|jiyBgKzybw~|U zzug*;yLWNm{`j$oI%lCuJ9Mp3$e`~=?3K~Wk$(xAPNMLRjls*=sp~07y>#5V(qmLJ z z$Aw5>_q4;lv|l5YCyrd5h~-|2LI{XoqfXQFsl|oBiQ{WI`Y1jsjQFr~{E%bdC2Zum zSRd+&if(B!HC~U|n;QzDUeY;0iac=00dDtaLuu~U<5FcWwBD;0^cs&pLe9GTF)K&x zdiB^#e0CDwGFhG6UG)DV=D@e_{z1%A0B-73?&{uR;Q0|f?x)V_0)XEtS77&rEH&zS zjUKO0nz_I|O&ePoNM3_Ha7+W8@j&1Q)0mc7epl zxJ_#}R6dw8TjeT81{c3$K<~!W8X^3!8O;Y4N@ie8LqbGMT`Uk21I76N@*9f$$Szwh z>mEl0feL<8za8$5r^_)fEm1sUS7!xqYG@V<{{3Yd&Q&h`yRT_C2Y=9^r##Tb?$7aD z_7@piS7#vuIEeW4gK>diE2ub4;&LtDH1yNZ?>L9ok#4W|@>!SK%mrUq!P9nY`QCgh zu$S&ZRm3+)CsPCIY-%KaiG7ArI~C=J%$V!7$Kt?LQRW)+hud<8Cq_vpJ9ry4d*jFk zQ}GX*UXvi+6}U2#c!!zejFK+|La9om#pk>4)~ll(6@Q)Hi3c6-;85$W3_K>qaB74j zT8j_^&%4(j z(r9%QkY8&;s_XhT-05$2fL%p?)9T1-`Ob)JQHdA)n2Mk~*@>Jf8d5s>9ApED{_=dM z7FVLqyB(mqmxxg8UeTQ;)29&&AyUs}_!P6JBH))fu}>>crXDUDIhb-xvCsy_!navL z2l#4|HH}4;5ji>ynm9-1MH&5c__-3tu+z{9Af>FW(ik7RY<;;7@Pp~#nnwP5Z4ej=QYSw`AJ!w<6NP+|=mhub z!L4ILO{BTC(a{{p2Ps=w*C3KNoD3T`!r5&vWq6_02FvSu&w8U)VriRcaw7`TMly53 zCoROK(}4ei_$?x#Stq7=?@AO0HV|qGtPB9_p-<(?&;@~E-VpPVLiQ{w z2T@MB1|o2kvwO^vYH)}yM<-o?I6y0*~G|;U-6BGNstBATg|$ zM$;Oxf|u-lIj!nKR9ufkO#H1`g^SVSgF9fqrJwFi1k5m80hpN)o<2Y=Ddhu`P$5Rk z9%A^QSwMyp+sR(&9*w%lyZrb0ZO!>q&~2aJJpz3h<)%(BjDemB7oeZQlzD#dYJaB2 z{nUuZdN#~eAtJ>6jPu1+PTWj0x*axbxB%7tYFoqZ>SUD+sw&WkeIlNhHCWS@bTXhc z6m44g)D6Xz<*zkJqOf-7MOApaVqST<0pN5Uc9L7CQe{u#OP|DzJ7fdF1}bgur;wR8 zoiAiPPL92wiRAgBH#K}UejBOv&L`>eKC!NR$#D5YDNfc@6?sVdA`%w8Tq=r;EnC(5 z6Of(jkzgd?|2-kF_TJisx6;_ApyApp_Bq-vEf_tti8Q%xR$6syI|kpXj!p@bR<2?& zCb8^lDlQ8kyp_CYc|NQ$S+IGlo)8T#rKpaFxIb^9&YtjcNufL9Ry}p_mx}Yl=-k&y zmoeq*=IfwNE0IAi{y59k1j8)wOS3gT^UYltuw}2spk;PZE3=uc3|5j31GEaZ+@ykT z@evk{55h;4Z`+WRmah_-c}dcQtuN4vi`zhzsLU28LeopL`rl~MF&jPJ+u^P;dB0Z6+V&9QLgrdT+E4L@$94&RxGlmEYa;_tT`aKq4J z2?K{!U6hH%DIX9vtyDW>$P4i{RbyI-DE{z)J6k0-Qc~7dKv>1~G(U9|Chpj;!{j)j zi{3Y}E+gyf;-#h!-F{oTiB&1xyv6mopt+mPKi_QHC`s+?=KJ1(=HTif=UGR6o|R5L~_vPyr|Kao9Wq$fA%p}@hOE6ir)7~M{ zl%lUWV3kn7s6ozOD^wTL#q61d<>B7yBhA}nF*G0539r#PU8@qN%#@AK8RJfDxgp2e zs97EHAxvq_5>9|!2VC_>68@Myul+k!xU&{u3G4ZiFa*P^J1!&Ng)SqfOqgBL{bb~r zs-zFTw?7aW+V=w`U1r|Bm3K>sU5b3av56*vMsM8iLPE!+GUyaEGVdiO6Pp2BN5sYr z^UyDH0&3VOQ^H4D*1vBT66LkZjfqZ2r+DZUT@mIqC?-B9%3c45j9 zGGce!xiA4mmYJ>SUhB(`6ipscQ0{I} zQeM!|{Xx@VciXSTXaD7-`!*GoBN8Yr-p|CZZ)+4l9K(hDl zw^0pCX%c%d;^-%m6qbf;hPqxsucOe({NeO(>k=rNHZ7Rh zQ$6Y95~}lWS6f{18`wTN|f3i7KwvSV^G4XzFy8c*izq*t?}xLfeG-4 zSb5WQF#HoBA7aOx&s7OeP=wn3gp#gK9kJ+7dr0QAlTd5`6T4O^I};TgH{+wT0Y)lO z2-Z6LYp1ZW22groNH@nRBwql!)hnkQPT}#Bs9Jhvnp2{@fswx{v=J1P^q<;?cX7S_ zLhr$=JITEjvb`4`1Qrmg1_`ht4uh~&vh$!~GOV{}?$mnm2bh$in2y~>?fCL?4#Kv} z9Yl46EW8qp0^1cG?H$$5ldxlk?c}&+EUpn1(Pj>73(q^ZoC__t0fCJzy70Q)yCbgY zZ>W8V<;9ef&tZC}Z4hQ1I?w7INk*f*_4>K#*}ht_YyGT`oKyXB@J7kl*z%X#&wA+E zntYgPGUJCpJUccVdWzpm$<3{V6GzXflQNBbQd$q(w;l2w;wqO@sbq)N@3B7YTy2xg zqETh{&s7g3u^B81CU~6BAKP|;aV)NR?8YsMU;IL_A6YOpxIz0Tt6kJ=CtkeomwAZf znm(w+qzk26FnDG&I%M^u6HLN5z4G{ZG@9Btuh9Y324I6c_Jg7Mfyl%8QFN`=Ke*bh zvZm@;A?(JBb!L2|0B<3+%A?=C3rA+k#w*@u97)_+?QHpN1oULXS5!~}A?)Wr8Ygx# z_wH{{nUf6b;emW(SdTV}KM@CfXLD`b~)lSaqJ7mHTFVG!wLZKzB zVu2>R1_6mSyzCN#_eQBGe0Na;%Z(L^<4{ 
z>N|^6k?9`jG~a80Rn$r~LM{>Xy*=6q%49zdy8;5j5r*m3EDXWRMHpptf?=R>?d;>w z8BI&iCHL~Wr=KMb?XEX!XDU=*@|qIv&A+VoI#jl@%~Bk#FZv#;H}{~+XEwx^__Bz% zNpEyG>{E#}Z5Ew6%{cY-Ptsh6IVV|}Fy@`T>mgY#Xi8D_WPs9qtETzTeCW3F2;0~p{f^vl;krDr=)XR_rH@rGBrxpw%Z$F z=8Z;Ho%?kTf9Ga-YWQR=$+)|YdgyfR<$+3^4OZBbM>keW>nB6STc}d-oVfmyG}%H$ zN(hP+;lH452=CjoPuTe?9@E}sqyIoT)H%9d;9})N_kiDmVWl5U<3FysBJ4NCyZ`#^ z^hxtpYenCQ4V48#SN#3jE9@|Kw_)?^;~EMEw-DovYQI<7u*uFT^h+J~b&BBcBMgLN zi_OT!*4>Kt)onWGdqt8bZ}`BQ z7kYy^+U`v|xgybV4PG(mwgD1pdJoC#=t61IA)bs`?BnHrYfvrcLq zcIcYTZ#$>vnJyvHqf#B|ykFN4|Y;@&3X>6#7 zxM7xE?ctJdDTE?v@)=3LJ$)4JoHX;Ul*u_CLyS<#?iUrcJCTB4VK8d@(zznw0Zs{A zn-7v!{^hZjv3Ni0;o_a$CnJ=9i3skumNh_^Cpt5(Fd@;B_GGkq%WhQj^E%BYhi+2x zzXqy*iOT-N9R4FJi#8kWcGCV&yZS3A`%CR_cP*ir^89fH1`bC}L-*RsZ{**a_67;s z2;(?hFq$gs!mU01+OgPE89ID1Tr*Hf5+T8LM!LQ(n^%b(?szQRW&Gx|Mt^NlpES#= zaEFb3XPjxH98*Spzlo+;V1VKv!Ll8dTL+oQSKi+mN@kA07F z$A!~%z65@#G5Ha<#7i0 zH20G66O1HE!Yt!{f3Dt9oYLkVHOjLCGrsgCEk+LF4Ox|#nYbkkTU{Ugk@)<|v%5Q1 zGGkfewkrEl(G`8s4x2Pc&i`Z))xCEt?Dlp2CAj%RanMS}NIHMO=U-pOn?BY!&Afhv zY=H-%F&7OrH{d*U!q#8^gfXiv6`{cGvKEunh%$*LC$8T`PD2 zF}^BO{HNk-=RgDcAd0NGL~Hv_ER8W@C*gYUIcuc5nK0aU`1g%`)c2njFg_k#6gu*b z^3p_`m8cl(@1HiGDr(WSxRkl8kTEr~_fFgglgOVk?v?x?OcEqXt@!K0~UmAdoTnD#$rApj_Q3ByYq1rle6M{|r!xWvMEWQ#0(Jq}%28 zy23O%(L`qEJK|Mu`jW-+Tb(ur12-n4H6yx4y?vnt^QYQp;LvpXx6ftts|Hwe!Q{fG zylj`xa!a$FD7${S2o+qMCHiI7jq_fx4Q2eNgVDZ&`^C~@`U4QF1K#284Z@QJ6#d|jqVsCsy<>F#tuJ4t6`s5Lv=*0H{PVr`4wvOHm$^v%^`g>N+F3Ox8q z`&QmB<^4m-U$t>1bR{Bwcw6RpsmN2?K4IAElgjob8?{-L-Iq`2I}77UvqoD2 z?33;w$}V{5@`UQ%S=e_JLhi#1fK5Q;_2m)4L}mTL>3 z4SDjePR%iPiS0yxQr#ze@f&kBc&_}HGvOotKZ?dos7~U4Q#8JsC>i7k^PP4>wNO!g zb*{;CZ$Fk}^OY|Jq~cl9QX9h@@K<5^&xPV{v^$Q=vZS>QNz|rcSmzOoi^P!dAwd_L zoZY%#Y9%ZK#}=I|r1#MZZ6EuMy6!gPhQ*;dkEH!g|{&4B;gsFo|^;2OmF# zD9axy@qHpUwO(b7b~_CZRB*U0aKrQ~u?u6q5;6a4@X~LcCZwjSujCFVjV){K&6`9x}!V;<~2H^?{Bj zTnD&BzQJoQOt4_arnu*b^33FH$@BL>`yXT(v7jt|p!IyIMfY9sf#)1u7FxXdFZLSX z`6dSUcehE>pxN|xRos-%LnT}gei(k|O;Scx&lxZ~4xv*bZ|r(VGr-(;GH z|AsHdF6)6+75AqXlbTiK@t900Sqh8el;9|D-`xH>-q;$4od*+om3vbZwZL6%{W<1e z1=4DS9}|xs8VUL*ASL8#M^5uD+lMlKUqVde`Ag>+XUN-I!x`05oLExs7lq}cQAeK6qSCl5(XrT zJx~myCOcawg&L{-k-iyw!QpxlF`;CfFpSyR7NhRY1iZWC-*|Kv&XpP#!Fe1CZ}wk* z^1;Wjes=* zlX>4K^4c$u<>u;6PUe~xPtX@&nB_@x-8~dY4AUZFs1p)77jiM&e@-UGvzUYqNq?4=iqZ@HamjXW8u{g zwQw`}!AbtWrz4FvYnj%t%QQZ-yowJR=#fcY6L_AOxO%K?c7Pd5&ab6_UbfNB@UqVa zAj?VP04xLK72)ImuAgWKZgS%PU48an z?ylFHB$)b{3<^=xr#DC7BQVcX&PtI@I!dfa`6~xLn+7plvhO{OC%1TjDwX)^Y&353 zUv&q)N)o>D%`+~)hjaaxnjoZph3BJSZ>%&vW-g&}SIC@v_PggUU#}V`5yWAv5wEoC zUfF1JweMblc>oqBg;3B$`}r2s`0^3i5%?^)7Gm|M7BzFEg@$c&vu|)Jk?1a?n#$Om|?jXPECg?6u4gDZ+}WTYt6@lE+On25ZEpnWD1-~kyV_1 z+6EipuuxkMF8L&yq_o~1BDLv2pFr1Ghd$;K_V~i#5U?MKO2yW1Bcv_GPOp>Su%Io< zq*fqQeNCX7$mop&Z<@a}w|(S({iK0_oW;0;$rZicAGxQR#fTpNP-OagDu<;MWeU47 z&F_f3-bpgj`Hc8_BtxR~+#0yD6^SV_p+NGLW~e(22p3y1x|87TQO>>F-mY~A>M;<> zJE?K)8Z;Dd^3Mc3-_U-!8O=DYaq(X<6@B+ONGP8)zjwWix!G~bM2x*$;i^r)iD3;p zyahCDb>lZ;khh7`$k4}J1{}g_G!LkAJN@EDx}M)8g&aB$HFy~u;_6yi?!jvtTwiy6WQK2{iXITX6g%NCP# zJ+gth?x3&^-7ZJ%NATe&8UWEV5A*bn&RUL{G>g?`}+pK00j%R z2(=vFXckDnV`7pWE_w3uG7-iVc5V@64AFKFi2EUcjdS6GM8%{BaR@S$B)FxXcU{?m zJO*AYPvuVI6i|in{gDtnb9aB9aVp$RmsP<0FN4xk@W0t3od4Y+ckbX|Xz_Hi)e=z? 
z#i^&mW++)YTQb#p=qqYYpGRI{sXpbIs3353JN&zCO-WJKmKH~JDKvFLq;ABL)~ z`)=2-JP zVcc7YZ0Ow-+-|jj)0)EfTZP?Sh{I(3P-livhC4T!8qD|3_7;^#0^;Re8-Q`!Q0kMh z|79J(G5H1cEb*kU1}GOYMo}P<{kq@m*}f{p#_4 z&JeG9?X*~uaSOT>0={uH>VVzF!NtYW6Ntn@AlX zlqW1c?gpWwO7rKfKN@P#8{er|zz%Cdbb4oEa4$L^O`P%BvHfPAlQqye?B_ni$4_#8 zdbl($+QiR2Ozv2wq1L~%`Rzy?^MIGITRvdv&?uca?Row@B^X&H)=jpYLA{Yqe%dj+ z5?~3?%It7q7Q43!hn-7zr{tv8+vp*0O|fKI>HnOpb4R($4j)$-gImVRISc2{5bwQH z^z0(){-MaFgjV7rg5vPQn`-5Ff$v}zBVQDG`}KmLz@5s>9f89ps$RKgMzW@h*vU4z z05jNKxH-d!qc;0xNuRM!%h43lU@`BxE35q_Q4dJEqeqFe-FCd&@oDraTKsBmUSsb6 zWhaIjkqDr=2dD?hhzYtVwAP)=gbQ2+L7i|(fjg!D@evE4(vLR0R|>~}r%a*mt@i4v z)E)Uj($m}f^>V_$E;I<-g|lnyp}Q51>+9~odK z`9q8pIK(iT92qb)txY6v|EGn3%jxHZAut?Go|4xCpX>W4PpH|G8r?>F;D?OlMR2}H zRHmL5tqBD}Ns6B-HVy{Es4KN;{V-$h1)zx=ZP!$eE_xaob6 z|C$6>gBZ__U^Jj_94V>U8uNNPMol}mwe_|KqDcO)2*2Fp4lbTkeG-eu-5uW{4-KN5~AmS zSCXVn6KxJCo7l+4trA@gxQ^+-khlL_hEH$5)Ve~?^J@HBf+Y}77i{Jk9dL%_pAn+% zV!&=1(gfq~NH2#pTw5h@h#)lD!b)=4={QLtK2Dlx93x$HRWjx|;&<{HpT12=!A|a9 zI{SaA?w|m3Bm3VYW7PjVGJZ#s$3#W#f!-9@&skQhp2lTR{8jDFgQkxbpHCex?T&MM zh6?dfodWVXi4eAYU+IK}44VkF{ zW{$shhqFUL`R9(@SB#h^!r!(k538fK3P}@Vd-;DH3#uLb37=$(qY*^K+i*F;2+8x!L1PphAr|kN+w1I{9N%S2(SzW}GbF z|u-<`Iv0jpSRFN5c$n>H2NgX$)o3bp#GywChDQZ9G#kZe&8z@>or41`Lm! zpz0B8rK=Ywo>p9hCX<4z33#R!QKSm8Uwtw`c7X%ieqM_oWV>FUKeV6Q!Z({#pH9KD zaOkelVca#7HJ*w_&wCtY9b4zH9^#r6EU(d(|nF;Pr(vjS%B;PE(%a(_00yMq&bh~5-c;jq8$W*M=RaC_+sK-8`I9|#9 zce}6szhe>*3;*9S3D8Z=;kkpILcw+KBDzX}>wW2tM!^T1u`HhD0B};KejfGq5B`eP z5OJi}z{B!ndi$;Jh`_hH_0*(|wqav>++<&l4yppc$LURUyqW2m+>63BO)WUg_O1Y6 zl4*gt!FLfM#q29+HOrSf`%3M(Q|gK^EKQ@{XdVH?q4w+vVj$)*Ls&a`4u@j7twjE4 zGCuo(Jekp`;3oUbgP}~}Vn+le{2M4)4b8`py0O*c@W!m#JQfCZ8lvMfe;)c^%q0K4 z>*sF1x7Ta&AXJ2GAr#rC%T%XZBPG9oXhU09h`Iosbq~I27ug3;0Uex34dX}P0!Zr}h+7?4Pe>zi9wz=5&hwgwnG#HF< zcd|NBZtO#ujg?}t#(m|POOjN+`a@&plT3ga&hg-;Gls}Zrvp>i=v_iKqv_=4%-zB9 z&CqzW4eao`LOn0tZ}aSRCU)tm2JH5GTe>m*n#|)0aE*`&TYh=dsbF*s!f+Rk?(4;1 zHV*b8ufX(NjTRB-K`0TY=bG13w;Qt0bQEZL5ktOmVOLsfR6siI%nZXbt9un{NFnC- z)&3=Mq4H93??^mpwNF@1e7i{}S=*er^8{gA4mURy=UDyqd2}5kJ*q(;(_~_5ZI{#i zIiGsbP|;fWzdxPVf`8<|HXr_XIq;LdKH0o&KNyq{m&L1(c)@t95RJ%9v;l=^zct&? zE(pd#^FxVW?;br0F4WAo%H18-qmI)X^9>T6fTfuudN=FUxud>bw{d9T+Zgo^zj4*05s4TgcM%yKu01X3uTujZ_q3<7A2o+ zVg$%BBPxOHy*5keZ29GL^Qa0Oj3W**a_Cgtk*PJ^`X;Z4#Sy)7QU3jYu~5#Z@lyFT zo?$vkr?;7RhjVuQK5$NfI2d~KdU-VoQ=8>V%}^kFt*DNeQT+G)jJHK^=mF*2nLhY6 zs8ohvU(P!w7G#Jnu=;j4(C5?$zp(Hoc)OOx^h8#y)v%M( zW0AC>zz?#Rt%5HYYnO!Hqizkzxwpo4k@5udX0UIiy9-gfmV$ApTci`h_pU!ahZPT< zyHg1e6p{|tp5nBYcTw`-1xog$bf+A5MKmV)Vr4f6I3P{-@08IHtHH1-okgYZK7HaM z^EQTLt7ej6Bqsups&LF2hghqc0)CBLI3|dq4lLS4kx(A z^7SBu^V2r|VB$sSO;@Syw}iSG0*)ri79mvv;ma;o!5cR1o5K(aaVPQy(pjmU&xxDK zOOwr!hSV*lx6yx73Is6%hy5pB@Dk)y_#XM72?X!73ZI%(Ip?DDsqZDN5PKEzf3x7q ziDtV-3A@?;K-*Cyd+Qw(y$*FUb2a=FcDD|OUhb*+d1eRv7&EZTx(qt}13^iC=J0(m z*aj0IJ6u{|#4@@Hd{1(-9LtEi#e`vK$+j+SDB3lU~3pL2*5BDwqxrU-e~n4^_9uiM3UH`A2Lki$7EQJ^zakkgL<1o8nH*{qT!P9b&Zn z(e(i@8A>)&Uv^HrMty`ZK4`l0T{2vG~`# z3MP=<(_%nLdCdxLlr=uTy_F-HajQ>ew=ira@_K*UdNfn>M9sKY<*h9|Gi6s^xQ`Ba`>b-R|(cr+gp6n+CzHMTc?FETgBG z+k9^J5=Ah65`urk%IpCLR<)Qpjk+Ta+sA2;{mZ8O|2?#ze@~A8erW%%Y?Z$)&i1-S zdXN^ADXzDz>6nj$!!j#l$XGzU(y-$vz+V~4pSu|GTL3oR1d2US0*f(+>4-RlEwJ{$)Mk`{Hb5 zJo(2w)#HOA%FAJ`VouMCH+Oe;8>yB;J1Dpg5v^G{0?O1WhIOqRV`w7 z5-r5Pv7n{LrYJK(SOX-G9ij0S%RPOrIrQ~dXQfr@FOd<9O z&5cR~AO;1`CrA6BT0au7+3L2T3jtPkJ-*C^g$&e$cg|?T+%+gPQAs-TiV`nQ5y&+F zcXm%^r}uu7?|suFLKJF=jW$cjGD2T+6_V*>kCtwuJ=T*Z<6!zKS>Cr!TWAT?a2@S* zVOF9_u3ER5(4uyV^U)h`39mfWVs>;;VzqQ8a7A0AEX^EXwb~k9_6=#El*>0XYQk7(6v1zaJ;%Pmp22x! 
zl_efYkfsdhS$ZJj$Y<+!N)+I(lb0XN$yHz0D}9I3=$Jpdd<%S%xgc{tfQI~?2vr7w zkrNG|es!Q)ip7`OCmt4-$XM!J#~%dmQ>YBv>UnEGTXTtC{kYdqW4*Z0tbYBskm^8!szacaV~4D2622Iyll)ye<9&Ii*Cf+zg3#QC{z2x4RaF_$W2V(i`nH1>a$f# zYomlB{*bP0EE;#sj5d$N?|iv{&9za|v5QNG2NwykIrF3)K8oQeFP~u9KT5=*4cB*2 zSEz*uKhrvnl)uC`N+3X6rPY5FAPb0E)?=huHR~`fDK_9xv)GS7QG+Wb{@-Agt77``L{v!ZVK%yg_Tx)hObaoFFAfG`d@Ov|DdpL@qe32NTIU7 z|JjDk!ai}iJKhIPWf{i+gT-p5h(L z;vyrOFqG)9xQ-Pzwm({_#T$Jw_{kzzoALm}HDw)jm3^%ITstzWH1Uf8uM`@W(g4j3 z`x`uU4#bVi>}5javYd?U9YCj(lfa~prgJrc?if7)_RF2<5Rc<0SN1ib3rYqh0#oG% zDw+$0qebU{>TBpY$Rv}s(h{4Xv4@-A>A3v~-hobhzQHiOTCidL`C$;3vm-ceb}M5t zou8k}d<0v=f95)4@Se@TjUnjq?dqS+hIrpNW=l-qcS{aD>V9+I)C9U0`?y2lNDc{@ z%f6&qsAPHYvd?coJ9mHf6@FlZusq43bUsYvVt6wmCr0N)*ONkp7`MIBovuayP_P4m;61=d%`lX z`7+6Y8@t^o@r@ zpHMtkTuTWqQ*m>+8a*i~DcuD2kn5+DLBdu)!at{QfrUk1aDlwC?v9FNJv}`cUwsp5 z5;kLLgdlVfT2QGI_g|0D5;jp{zd5jg3U02Dy+m`h3)+t%;AlU2GAC0y#@I;;I}~|B z6|JI3Kf90aSJIUldM0_H!b>PN8;G@xR|4_NI3n`j> zXX5+@rC5eOQL>dzx*kLR_0j!}A8=O(DC`iBkVtP5SoCW5+~7Hnfq9$#qmn7q_39HLzT2+}*g^u0mVm|-(#{Pj-I*%Th=FGHd9-(da873%ai zIE;tuV8QqmoiIkK;y) zNRsD{C@KG{WO-sfs4g%E1I=B-rGo%HMl{~m( zHudN1Db$&0kmC9Z;-GNgUe^<_(TszOlML5bKhg?0*bO}Y%PWqRV2oCX*}ZS+ybAJM z$Nj<7fBM`s0{j#@Sm%5KJr$g-bF(xD(5Lj6s!Vz>=+pIy$ksx^DH~(0V10g_!PXB4 z2VecyHbTFGND@APcWFMfF_8W>Z~+xvsO6+B`%NcWu>De?pgO4F!~S_TX5@n+&FqNL z&AB<^+)*Mx|AggQIztLxQT@dIyZW-=3kp-S%f4J|k^+C^Qs%G}RefkPPOc zuS?ANJu|HGza|M(9u&==DJ2RLo*r%jmiZni!?9guQw!2*g|5;6WVcoXUtnzs+xIz_ z2C(WPNN+U-OjOex_HVZ-O#e-N$p5e3>Q7VP?@8!h>ieqTO**M4 z*55CIrysFx72A1#2Ns$WV43}|(;%B-em#CZ5TQe-LH>88@&AeS|8LGz!=JFJ1rPH< zTxL`f_0SdisiM1@z+kQ$!?I$wR}k=tWwpt=cD4nb)~HG1-v4)S`A@vr z1$$)8S^wP8IEVIlGpeWcpZ@ja%6RT?4|w#;1OBUH5;NZzI6lw?v5a=a!~OrK7yWbI zPRff;RbchqR+#HDv^v6$<|W2S+2A3&s7~9A^Js;N=qq#oRiqHS_#JM_{x@fBK;h>9 zfE!Su4Y65(st5P3>doI;-@i|j{P%nc2>GEQ2OYN12?7=RG{+q`8D*bKmH&GM^yI;2 z0_D}v^F(w+xz*J9cY8gXr*w35un+f3S5sQgI}d1e0H^D*wp5uk9mOvdlptV5|JRpi zd3mA-jzY1*jvMu3W2c45t1fTf>{mH%o-bKk#=??3iQqPTq&h@>TBXt{eX>2IQh%WwK!+R$0tzQ&S?!I!DoA#=44K#sq}iN0wxZcFY~Vs z3hU&>^+si$JRu-rguMZm0*W7OYbk{fB_9DWzN|?-&|;Qf_x-rZE{9gp zZP|&S&Q~kDJhumSma~wR!W-Hn>{|A3AMGBrQqI{ zXc1tpgLpiZ(eOZ$qR-Sam~Bls$L{hGjB-a9Jx2m%tp~Z~liyIVMONQ@c2)&Ul{G65 z*q)Sw4TrR1%WYM@@+l7>vd?BWhYrD^LPN4@T}ppQLnwbO{h1r2u&O3Z4+RraUiE+G z1~?}1q2tEIj zj{c>{`C~;B$<*BSST6{Spb~@PyRw)#!uS zb8v-O54aMHm8wPVa6sQhNwdQdHRiQnU;h9@QD4u=UN)^nW%R4v&xjr#Wf}4{= zL)5tJe|v+Dv9*m)lPcfEen8xPFHA)jQO$I;a+d$3=Bab_i4DF%XJXGBZ#VJtE1$CK z1Bo-cznaR9E2v(JPCmZ{PHd7cXLC;if%&T(Iy%D9(s#NgZ)zr*H6=k~z$i)TN~aF4 zgr5xutx_K4aH08Pq|>!g<9`Du7AY^KT8cLup44xZ#Evg-R_`iCxlqM!Wa9{i(1?2k zKa#=3#H0uC7zU@8j@ok_Rz$hRTyMI(Oc#`2)}E#Asli|YJ4O@6 z>+sM~bZ>ZS0{t~8WZoK_ZUH3=ihY#TJn>!vgVJCQTe{BUY-_2E^#UAHn!Yb9$jY|-)etTX%+0HR7uHY zs+gP&R)Oo;v|_g$?$0Ewa?8p3;}GxZL;ok3r&}lqb#d8%+--D0uJZx~9+wCFcG8Bqnk$-1z%S=maT*k6(SJL(Gx4-`U0Wv3uc^dZT?N!0R6ocOPUzZ!m+B6l!gI?S3!@1=>w`T9dVg&ei$jN}H zA7AVUjX@^(K_>Nw?krapgSJHU$-tU{te17wb8Ca^W98ONTh|8fYI#>gF46C~uwJJ; z5_h$_WTUoDmIyvl+VVOq~V1%mQ zu}(DP7(>V6bV7T<%@fb$V?@XJK*B`k=_j>L`UVcf_wTVeg3f2z-+xw~9#D0PmZpoj zy9{Yelq*ivo<2V_8?HL2do`pju#-r)p^{uV5JKaqEl>JB(j6B6?XB)jp3B_y^~`mS z8ZBL;_7oFTIu)%UWIUY~6O{v0L+f`1W-{`t`mjDMRVMb^T-*72Xn@2#jib}O%Ik}E z@WT_Ca5|(@^YdK)tb81?DUJ{Y7aeX~oVQ*q-MU!5E?@%_AIe=tY&eba4&BJ1+E2bX zt@B(x-%h?5$Q|FxIBlZB|JvZ~|5fMWMAfpR?d68+6jEgyV}I4tZt5gKb{uIO`<3Hjo948yZPWAOROMzN%)_1` z!Z4Boes|l%y_fd&E`4Lb{Gic38AAjUwG5-Y$WGc8Elhb&d{tW zID_msK#+?LYr`|(-qIz9jnSXXkCUdw>~{UfKQ?%eZ@1ELPz)zs^RLHdNt`|}Ztm9F z{&vmLq>JwG{IF(fDth~1%&O`~fU{me zzDuOMO>}!VQ-gmInVoRo{?&K3^TP^N?UA+6iwBMZ=8KyZJe^6?q2k-_olftrZLReX z2wuC2?6SHTqZ%olv^Z}T78)KpH#^(9P=6Ie~)wuXspLoTExpkRDlNs~;h`Swn 
z&q33i=%#bnDrl;J`(+a-_|4;1Cu^!+e|lP`s_oo-wxU|M!!pDbPDlF04v zy^~$ogZ$b%QvxQb#7ZJPg=`F_9BuQS$W~7~O*Wh5ZaV*a?@O{|riu#bm3c*Q@dMxN z&Jj|z#?4!5QqsDg>}@+-KfpT(Ta3#=o{NY(mTkYSiR|cESif`~*-f*|5cBA&UDwHsGsPs4 z5iN%i!}E_Svpk=DA7~?2$i=LI_ue7#gITISC@s2TcjLC7t&kc0V`g#Oq#QO5?)tY^ z7TGcrBT8?B!ILenD#6!IjG^H)ZKgRy0K*fPNFBn>hcQL_k^o8Ks7+KxuLJQknlYAl>mu<$H9Rtwn7!=OuCLER`u zb7=>ZBs=<|$BKB3$wIh~DaaHNj?Y~OSY@>hN?52n=9Zlo^U_3zA3xApAPa^CV{F7U z#VzoZMqp;F2g*9v@U9Tu!MGdVF#LElvpU#xr_gX(c!wTCq@JFcDz}o;Kgqg+&YwnwQmp?J9=1^_*adcz;&Aq}jWUv8>WfV1hU< zV)O;(!NLf*y~DD4TgLJHc&W-83%`;{KISXP*sk=>iyiOrhH^DL>GyJ7fn#!kDeO_+$phJ{u5YQZRzXdv32tFlVf$!9rr>?NfmN~v zTZt&8B)12iM&03EX#4V($Sn@-HEoY$Lwq{h`w8OAkK?8VvqXAEVyx)CH~(+whkT3&gpJ-r(oo0A0(cARu>6$MucZY+S)F5SbvY@a0FDs z9u6W!(%)qa{0grj2L4(>s(LK%Ligt&Kio#ivRm_{{12&5WP{HnbC+ofx$9Hk|A@cX z!oQdnAehU6arQ&&&Ku;vrj?#q-Or>T)U7Y&r&ebEUY{5}G*-!DC$cuAr?`XW$6nYY zk?;n`roBE9L_L3q_5H;xcJg&SCm(A3q~MC-^M^Eng-b_n)CQ!~DOTYa_#S#lA7|4S zAvUDcMV_3FZ_Ef!uq@=?y>I2+0`8}6@CJ*mC8k|o>r1qE$imTNKqxP`)mUf9rS4s* zY)z%AYZtPODEeTJfrUvZ&rrgODPjcvolX9mrjf$kwxht2d*LKxw?;J%F5y_*+b-xH zB*J%SnYNqLp3RYCaKqeHQ!AWmp>T?WjJXgCOr6D=@`El~a zi13!4Vvbpqq$H+0uI!^=9$$jlHToc-LTZ&z9po&DM>RbL5!t6Zt8ZB|DCLqDd`KA` zIy&#STv42jlYT{QKyd5*_C%^|SUiOk(fF&HSN#|i0eH8$oanvzL)1x&j%I!6EE4!k zqZmShPdDqPB5#2e%pCFsw|jrCn(?F1>`D@K@9AqZc_u5-sh`=}AwF!|A*$XgV%QOc z8zA2D=&|^~XB_^Q&Qbm{>uT%ncx0Xzhn|~MH@PgbfseLLH?h53AbR?hd;>96kafP+ zb>8tKj;j%%k`~!p^%?2xTxIumE*5mo(UZ$*DJhJ+1~(9BACn{XaN*Ug zBFxS7gb30L$&JQmfBTRyXRsFvKWn)*Tk2RV%>_=&JV5Ojz&Wk&$0hN~I*{ z*1O4JF*qs3ZAfd{FD>auh)6$}E+<5wL9&!#{O-`R#bDOfy_Ma1Vif|5ahDVUC}eu8B?WwPBt03L45PmTQ)UXU z=WfE36A|5_9olu9jeCpzTy#&lZoj^4Q{<FltIr} za7SN>53-4cA8avlSPe{|*xgah+v6uG8~^ywdt4RV+WVOpS;)v~7!>_diksE%nIfY% z-mYzNCe;z;u3B(zJ{?0Lx_!N_wf+sd7$XY>-c^P^*0cf{EnWjsIOQS}SiW2H_le+lD8BUi(TNT0dkMu|yDHJ8XaSB!g2zyGZw8C-_3t z3UoeVOiduUt2VvfC1ktkR7X`?O@BPlgXbV?v%E=OYl2tjl5;T_FyEbF0qr~(=k*3)CPZN|zh%xacMu?W++&Twpe#f}h49=!^Qf`#_}`i5OSzqXjvdP!P3_e@c;P1l z^g8dwPxe=rhRPvak*%74P}T>)&-j3<&N@KidzA^>&=v>vVYfgVWGKi(8Z_R$&Y?M~ z$C{^5fLx?O&)q`Grk-*8_@iVIGcpg^KSnSKkIwa)2n@aCYwyjZiq>eLFatdnr3L+QsPM*?syr^W$ncLO(J_A^ z>BHxveISmNzKizK(q)N|yr_S!6ku?EO>Fp!lE}7?>uPR!b;$m4muKz9O(Al|0w{7y zRz>gtGr#<`Roz9@QYqk+uTjwya!y$|vJJ%bGK;~mj&X#H%vD4FIMCEtIDdDO9)}#R z`E5|*#dooc|6v=0fd2vH5MVp70ZQ&v7~#E{@4NePnWTDSwDdc1t6&x<0c@@1TSeP- zldC>}v|^#jJ&7loeFn1L->EO;gD-TeOUjE%yDag0-qr}2%MSNKse}hFEC#;(m1L*i z`WhhOGgI=7tNHd4?`%wd<;W5lHZpi3t+LDdXQ>^)A|MkzM@=(JLm#d^YFSe9JW(ls z*@!9d^4n#{=9{WVA+*c>ywJ9f__OV06JSPjw<_OncN%f*aZ_9g6nv7TC#P%B+z!HC zwe%wceIa;aHO|kLXl}uSjq2Ub`L4fo_+Yxw`h8!73hw{dQs=+4<&Rle%*NRbG-fO#U2t^Q|Aq5`5Lqw2JBS`}pxDO!D@^RCe$NbAQcse*o2<-P?ff5YCE zT%LL8jaku?DtkI4^U{wO*7Xlp%r~HmI391r7hiMOoLpDpk)WL`VW#W^q{`B|DkPY<(TNy@g2^a`*cTqc65TxiGi{(A(pj*yo!J^2CO%K)a1WNf!yP}2M z!7xRQ?7eFvY9TPi^Pocti>)3gdXd5tOEgSg1SH~nY5yvbk(enT2Cy`> z7jw7^gamICd7QZj7)mIo@be#j_9DLnUsx5AHNVt z0E}zgCB(mA3oG_hL)O|SV=WaTgovms`aW8kA#J0t_=1$<>yDZ^kW^;62*<;peXSwn zeC?aJ8wDQ$;gVtR6JBRHz%4sZW7V4FPmP!&gqS2r%}xfqFtEUlz~UPR8xyPGh=3Yb zM^B<_Z-v%HWuS5n72+t-R@PQ2Yn%AbpI$KLbhX0X)Efx%qf^sY=xK;H&fQ z;&DM;%0TY|k;s#2gv1-IbyG@dfXnvU^K|Mw#Hu+R7Yr4g=@j_@ivAn*(G(FO7Tg1E zUVnji$NSX17k136dy$UxBwyfL`GiUuPf_k?|BT)l=xjDv#E%0Lxb*$1<|E{hEuTQX zt|Yv6SlCLvdw;Gt`OJJ3VK|u`4UE?IclOE&G&o+L$x6Izgq!TLlBHlZc{i4jF|!M# zox=7Evf-jigI&n%w)7_}}nI zV*+}+iqJ=lbJZ1_wWrl8$lc_NYJ5*MImYkt8Lr$Q&KdlEW7)|71G@<6N{v*&}Me)T>ds?;alHwsCYLB*x1pk7uB))zN70cr}>cs{u zpLfAqU6uuyl2sdztneL4CS9jRyM*V9Du-N_mCb9Pll zwjp`&L3Sl=LwBXPS&GXn#uk&D7s)*R*@p(l?fMjw-uwU;1wYpUwb_~E?;Q#yXe-ru`fNU@+VeQ998_}xTDAEYbmhClBU%V~Cz)|#{-D|DzFAl2;?PCV3_7r4z 
z$?aEdZC{+M&AGyCW~p|LfX^NzkVZtVmQT853xn1m<7@k*qL@@o18t~&(P;kVjE@;1 zFeQrIYTWpeowt)9_4+aN=&{EHqCuTM;-S3^!NxWU_~V~yL%Et2z1BCcnPIOD;jn+L zKWm^tdfjzvjs)&RaBqhX2c8)9mhXFUm)NQRPETkgi;@G)>2PaAxk}xu0q})r6{rFs zCsO?)(QO9-B_!|#x?JbbHteng1_X8B?;p?PNX8sYB4|(&&emt0=2j zN$eNy@eaeLBaTZ!$T|L=zd5{D<(EmJx*qKB$BaA&7mHTsEn1Ef_6q)GMuc*0W9@z4 z9q82$P*o5FICo%-v9aMlfpxOcHe+$Kj}+ z-gO+emSr_)Pei%u8?ZjV>Ok)R{csx7#O~*zp2!SdyibJ)8 z1z+3HBYJ1hOR${z<~y0}gkXyO%BGUR+ph!FyIz(6QSK4q>6Ad(8p};Hh;F}5uJrSb zOG?7)CG$!Ob$W5z#FDTDX-qXvCvso%Y%HQoCAE$w&9WpCGE%e>g_r^@Y#HaQAQ>Uf z7}GJ()nFblJBgmCVT5%+gg$2gjO)^LxGwWHab4SNB39}cy?gG1E%mZv!wR=}*k)cDv<b(D=$!Pid6z;JB76JHu8q1G!t{F*FM?|Xw|Yx_s`4jL{g z0y9DW+dbk%%puTA8!(*Q8sjzV{fx8zR#f%-fr1J@9@&w|Fj~7?JVa1(aI2-xjmu0K zDeQ60Hw~YFt_L~Z6jqtbU=+mUdvkrSrY1&kafs=L*nCW`XKV?}Q3jhAdqA)o0|MbFws?R)CI6&~fBjXX(81Jbq$ z8V~xSnPw85HTQshb6vbTQ*8rXSH|IJ75|aAe5An=P4V){xyPtYlxGHaJ`;mcAEu#RAKvb(w@QXW zeIhEcD1Q!;r?d|_Q4`hNflOAwh9PSy+w5(BiO?}1+`Hz%Sj(RMR=^0SVqK3(tVkQ$ zi^VWfSx*g-*Q7&zFP-Z#sQha~toJgTy)Uw(O-R~_cv#HZJ}ML_=u@7NFFe2$=bF<7 z?08(MW(TGV%9R}M*M5w{+e2;9j+a0QaUbkS3GPbiwUt)4>2mq@P!>-Qe#Y^;Dx=J&m!5OHV#H@Ky_RXA ze`_2HKM%!OnsqhE-lke&^gBh`d|V?}c(*5cbJN7U&QD;507^ZdkV%V_AI5 z;r>&8(nuF$bZw+qw6Iwthm7JfqU{sJ!f6S=iCFIGliW8hKssVu)-@nT@%JvRU2 z0gU!S2$EyO{Gr&mOaSjQ^OO+-jp^Y#F0>W4Y9hT5&rId#tPxP>@Pn{prNE?9!8WvF;2bh6_siB-{dk-uRB<7S(MMinvMEuxT0t6|)KR%5^m*`>QSfReZK$N9A*dYr{9X+pP`8(trMOE+0OwgJOD25n1;hkhL3kZ+n4+921Zx(PX=>7tK zY8ODIwNp`~Wc;)VxuH$YW*RksJl;KXFK%nZzE0M9oki?BVuDM*#p%*@*%vjOHv8?7 z8I=2CU9JFy`0(CH@|zt9hqG!s9QAGrq4%!kIc#X@S94`r0*Q=pAYH(6S0ZM`PWzn+ zz$&gB%%o9r`8-9Tot7Y!VWsIK@F@kSl?2i!qJtP#8n9^gC?11bqLwn0k~i$Z@BD+e zPQ5B+^OS*KiKx&=ENp$dX49R+`OZK#Skw&thm5;a%LOb+PTzV+3MuldHtu_tVzUgW zwUWZe%e1=Pxmc8V+f3GMg#kEiK~;$;dp=f!_@IB=6XMnewisyc)K%Vf9%BjlRRGpM z|DS;(_TNnu2rAM>#B%6tXuEAo>u6uC2IS9iAK6&B)nO(-xZ_}BGm2^N){JtI&rZ*i zcp1i1=2vIwppw3kk7~IMms=&jD18+ze OHww-r+`UMcIn8MM0+dr1v?>fZ& z+lDYH?zAg(>4hAzKuu!#bp}&?1QWDDCMZpIf$?OPWmluuQ1HLBl)e>q5oVmEG0~<#3h<86%TefQ5-Q?Y^+3M z-?#ybS^h&yC`-|+4?Yolnf;oD!$E36v>g~QCzJ`^A>b`;mUIQ@))5|!=SimL3Qp4$ zovz-7Kn8nTD7?YcbT4-TuDQYjT6V8t9uz1}v{_XbRH98NpF~yAEN(HN#ehu$HDq+z zn{O2XI_GN>0pe(QJ1n)h?`XhKzKsPhS+KGJFg%*YC!f48FXfZbRZRIX(e6s2vVx8`z6j5<9V~P znw2FOiHRuaE0mJOEOkG)Dl*2WXR2YKu>rLO2wE>u9KkyYMoF}beVvO^;oT0Ni4cLw zIOjsZ82pDix8tWLlh{^CK8ac#%1^G7tpI)INyV2`??fdpmZ$3KE@vTFC_T*QJK|Yo z?}7%?oEk9#Tl?tB?#HshKW714rx9CAW!o0WtpuzkZH{dhC|%H%P(4wIdw`Y>hScKn zFj%DZD3ThIf7JNZF!D)Gu$pHXAwn)FN;?R2P6hh`*RV#T9%z7!){MrY3;WQ?7iOB@ z7s13TYI@WJ`loGBUfS~mcIce2oD9du~2!b(WscU?a-M$kp8NAgvnPk>bTtijvfxI^^&QR-Ug z{2LHVK`R?7Ft=__-(jTJ9E>8NDUcBY7ObiNNAF>PR;QQ{Wd>bGpC;5l$3=^}{L>^6=I7%I)IL;FgjHm6K4>yo z*5r|@ijO~T3BnN@Yzz^ik1OcBc0U28u_p(YvM7<%NS1~UL!CAZ=o}Q{I2)38*h-)6 z)`9hi1GMpLRgd z5=6q`AndTVMIlhQ3kD=Mn3a;urFN=l7vQiKx+${yFBBRxf2V&~8gnem$CBMWL^m}0UB++LUMs|RNoh5L+ zA^J(4I&o0=a5pQ;f#AQ4Nq_*ob2|vd=&%;PSr8WdA!I(x>8ZqsQvew4Nzb~NqFL%k ziCM9lHh~w^9MyrKYKOEluTd05LNxkzvElbybq(lPYZXgLt4ck^WS@FIwCyomHSxF& zaT$c*Ie;z{u*o+AVT&8RkeqP?x!dIk1Od*>nU+Fu+`x7OAHOb>%d=0aXV}Ixq4KvJ zfUF7-MR#9xL+ftL6hVALt$gN*o+M?U!d(aY3Li!4K!bp+e2@KX5ASvRA+04ad>y?q zsSjvui+CiUyB~hqdj;4=NpCm95MvM`qoFo7I$%IR-yGV}fOgE8Q8i{~qL+_h^~5d$ z#lRLiP>4KY^uFS5hKOL=0FBh0o`5X@TvE=DDO21Hg-AdFpcu+8KTbY1vPdd6=39bK zRso-P3nV2AJ#5WjN$>@E(%fiIh|+jLM;2|4lqlB?%R+Xi`~HCHsS1~A;7oORJ^2Fg(@+rqnW#T}fU(PBuuz?)lEZ_w zBKeV2{BV##B%aTC%-N7FU*+et!6OjVaocXTY>#3K?L%l)+Ob(F!UgEn>u3IiJ@9EX zzo;EeK$QK5+QIZf^)L64D?8%Z;#`xok>G=*9S7BoE6nmLT9O~r$?OZf$Q4uMNc7Z;bMutk?>(Q}c+kj15|cXOc@?J@rGX-*h~$^XhUimHbp~RV zhbw@t9w^%t96`_Tn7|p)S*YgQ>iCK^yBbn0RFf=(!7}U;r_B~`=sZnzc^4aUE0|ER zx!7yA=sWwaV^ow?VP8RU<^0LZ^-0fn 
z(+#b`->T*`<0x$qBp_OF>$5bYBh3MjPC;#IY9&w`3pv#1;J$4^?jdgoF1(2(&ef@088-WytZm*lZ(EB5^rhq z$;C95_;TZ3yJN3-p~=#`Q|Xgg)Aad=&Rat7!=(o-#+ zy}Nm590(51u(!sn>O`Hhe?ah^2y%5O>F@@qAn`hUA^7uRaJj=*J$0Ry2}MAGRcNIY zBSHq5I{~HG)pb;b37I&R&#edOLr;Xy&s+#C41O*|LU9!8k9`3J&{p=)jtnv?)o>G_z`tM)8k;=E3IJ_+R%OH#7x+N;6}V zE%#Zqpb5x8RGwILj)YHDVagJSEhT1u{gmpSzZneiWNbs}t$%Dfnh#;sfS+A-+m zu4O2G7=0~?nOrCg!V$TXE;09}0R47jdD#zBux-?5UxB!ldJ7&rQ3q0$RpWzT-7edk zKS0D==>)1I+UW_<8AOk0?`xX*uvg*2?SwD!sYmW9F!Us3=wG2BCk$##s{FD3pH$^X zQ}Js=-~xzlSt7uOk3L_Mau@~sZ%VD8r6j(UTd4cp?!nWhrZY_E#82`18oo$mdd zQ+f_G6=@2j<@9VKco+aS2x)^(*RAHDh%_q++Pb}7`RkX+u!!B``}hesmXcia?wWgN z+BD0*hajA`)65l57Ve@Ea~|g%1EuBuQD+dPE`y}~20ZL(0Gu!(R+ea*X8O_-{d8)^leO5y ze-slU?4N)p+*@yJ6Bf{Jq(Y$83;9Ik+mDtO?azH#^Y@|M7-$3=Xp*7=EKuT z?g#2Lw-c(Az|LM(q6;JC`exO{I#9cA1s0MSA_-6g{vB?3N41Zo1@rtfK6!8XO za7~9t{~kQ(JE21f3ZTrIN(fIxvRt%&)c_LU)1y}LBxJ?@K*0QS*3oE~A7z@S?GN;x zz5uaXk|vUQDOQF0#msn@1UR_2oU6|c#_|AISD)EB)GBr`wfj^(Nn!$0827SH&O(tU zc2qw~HVNqq()PZ1VV#YGh5Qf7ydOcM9#g#zp$uX?@tvN7Y+^aMjm>Aw3&Y$r2ugw_ zpD_Z4R^iT^QRO%^mjC%PDb$RM@?79p^|o)l2FvG&i|oU9bl;S0iV7z$23UF^GC45 zIEMzb>AXSGCG>>{+p~=A@qkHuN+KeJoyfBGhd-HuN8WL{wBkliT~4_Xzk3`bO~WGeeQ8`OL_zx?>|*zA@*yQPDQvUANYy>|#E z5CjCTWIAWz=wGfO=wlzQU7jZ;i(+VrNll)Ah-rp#g4N?&#BA_{#QgrKUq!`8p+D}4 z8g53LE_F~Ln|kBJ#Eim`S2O9#p*bL_PPiXVOfJ3`uu9681p|NFhxUUC%VVUU2p$(w z>5W|(<82Ykpe*Fxs+-Qy7yB&&&goGfYe^JL52Isg83b>)X4gXdt1U6j^Qo@21ZNI! zg20wZSKAF>r3d0(a=|p}sgxym&T`0-wGBN7!MS$EYm%CV3Q^w2oU5a8o1lT+a&3KV z(!+lAG6f^1ib1&E;LncqD26Z(dz}hq57TKc4hl~URc1qlW!6ElfzgH7I%Im-1D`1y zK)moRog2C3m$qlo0iK&k3)vPtLi&Q@Eb6E@ea}v*wKV)?t`zr&B)`y~1&`5?O@gDq z2kt)MFT|PnH2u)Hks$DA0ZJ2J@I*$jLqau`G7wjrjo1%6|E@>HTwHkVJXbZKcIH4e z@Q%M2BbljLaFjFd*sX)!hnt=ugK|&#n+?RIqw;=IIsZ+|Evyh5UCuepAg*yfNHjg| zRrHSQSwhx#aMZ+EmdgoI(AV8g;V!`BbXwU3tm{WFs1+Fm3b@fl^c{G%ut{|neoK8G zo1Y_}3-*p99+`OudeEyD%bHDgPq15^!yELjjd?2wsUyCL^m~`}uwn(Vt%b(nT>=O@ z0zTZDUwiW9MS-ptB;y!QlZ$-XMRhl$pf7>Rs=f7^3-{t~TYC`h)q9lq*m(>4j}yA& zwSeM#mPl4Bhw87WbFVXs;)nN@+_h3YO{N=8j(}wDHRz4*QtTEK)_)~C%TtGvOp6;P+bPsgZ490--F>>62I7yuQQW z%Lkkf`XX>Xp>2wRo?8%G@M7GRav3d7DNZN-tM~E91W!%Jpq~hZdolQ^n1dr{4y%H$ zTC`2c31a;8BrOsoeXXI&d?Lh#sggLx`XlZN^TGoe+zP+h58|G+W5vc{`u<{7Iq^VF z|52e{%*ZQH**DlI(Kgy1_;g}yiCM?nFC&!?wj{(}QZk(tsgp%J?L4OOo|cD+Eiydi z@yqrtI|wg5FX|r5eIkfsK~+Gj+TT66{NBoKj>2Q|2)QJleDX{_7nFmgCrZP%1bAGE z3m}aGoo4R6lG3I|i5FNR1%EBmfIbSoXGgeq39={5W4=JY?QprLF`wgVAa2TdKvNne zXg?kl`r%R-!Mm#$6H$pp5vx#QPHD;i@CGE(B^Sd7Pq9_+->AX9*mU26 z_{5|`;|b_CQHjf?R^0BdwYW|Z8bZ?HsU8U$Y4qL>hT365g`IJ8EQ{eX+6E8Y-k>%+ zd+>`B7;Xmn`3vMqa7A2*J2Y(&EOHDlWYBY~FEPu(U3_ZLWIo=@>hFiICfQQBm2EXy zJtAvmkQ5IDNK^gVS;TxRZ{3i1T<7j-3wC`kaO)M6WPX(dz z0f??cM51p2rRrA_ZKnp3yL5W$X)k#tJ{HP<2(J|k`=T%l7~o(+>$^9SI1IdX0K|H; z^NDxN8x{kLO5)uL_^Ub!@E;@h|C60>!yiuoQ60_K+!y61eeTOYd7CVYrMR02aU^do z1tNc!f$x)U02L$(^ZbA~W zZJ3rzOc(Fc5O`Yg&}bbhn#-!_y);7R4n5K&fg##}9RJ5aK{2Rd{yk72LWUOJ#!<5u zk$glKR&jwl3yn}KE_?=G2*lTT>832879hVCcIEaicV}xmmF>>hD{;QmYBOKkKwu;w zzZjAEkCC|H$WgQW3RxQWel1YrdI-FZcg`iFCtIoq+K2^AT+-=0K(K92dq4hN@Byg# z3))g{>56sJM?|u6z>L}7)VrPS51j>i<-4z&&B$lZgE|9`Oj;10STCXFxw|pHiat z1@J0MfI_i9b;^DT5G6yYd5j@ROo;-dEx``G1gcnO*+#T_!umY+6Kd~-ExZuLH@g>5 ztDs5`sCIf*Sg$i#BudJK>jeksXKk$vTpf=fumVv>R-I3>cJNgKplPcmFYdpdep81Z zibpiQ1ZS$28?EGIrY#U`RwXlHdA|J?4$8PpsO$xc52i5Mq_}OzY7Qq?B^f;C8EE9} z|B~U6;^uFLG^kf~_#leX<5t2Y3XI*RmfIW+noU?RIf66I{!B69>9f9K}}U| zHpE_cH6?k{Wi{6l9D7n!#Fo_C@4MObk=}s8(Yz~BNX5q)`0R~7;InG~^x4N})ZmD> z=Wk<|WU_#>s&}M9y3i@>j*0h)9)2*eAhfv;-$8vR##j;vtgpW<-U6KH^{FSqSf*7# zlj7NYJwe^cQp#m8X7O3ua;FSq@gO6vGhpU@6%wY+rEK0x{e&nfWPIjh?DqhTGYykKXMAAJq&r 
z839F%p>Bx*#7#7jqCKW%y=w6Kf+%e}!K(J>t{^|mhlSw=llXoFs`(S2pm}WLIrbTH#N?;1Tp$f?UAtp_H$(sB) z=7r_9j}5)(^KE({e){DvzZt%Ffd3xl_jnZplc6ueG}!6d>4PO64f@1C%P*t7eO!0WBW~w$ceE zq(P?U@|c&3@WU5`dT@#wr6`he4xHJzJr0X8i3&_}5u2X(B(GVNmf`W(*RcO(TAY!X zIzS9HGzc0f53u$ZA*0-S1<5{;_>$FQU$y8v1&}97CAMtiUpIhUPqFC}e^0EwrQ^@0|q)Anhmg1Q10mG8zdp zs;%c=V&|tc5FBAqh@WS#T&A}g^L?5Da_U{{w>Mg?l!Mg#8I2fvc)CR9h+J0zw`3(s z7s?Ah0_3Pa+{E2&UH2a%K+yJ!af2ui!Z}$*M$@A=D3NE{U-J3RbP*$OuFCd+gDD>; zh-KkHe(isdyAX8!qDL@xFkf5gX^!|uK1{$eM#&JIwe+f7`Z zY7Ps@&WrEQ-=2D$ogBM1eJda6G$lm?jp;uV1Y!x#XKJ>-;fSU2&IMUFV9$dQhRQ6J z%#*5jbYi3e>MH3wexHqEDdfsle0}2g; zdR>l@CH+m#c)?3=sBUDTQZoaL#+1^e;u#>yeZD|dyufM#$VbqCq3|FZ&{yoL-YM7s z?+yC55Sfk|{nAH(#^GZWN${s>#LyC!2>XBd`tES7`}c1W%3imfoe|mDGcvL&4I?9D z%gzpE@0G|(M5V~i-djeB>^-vg9?$uqyWjUWj_05Ia96ml&-EVX`C8}qRn%DoP*lu{ z)!mR{>JgmjOj*f!8!zrwBsmq^U=4Ju<={=d0_x&RR7N4!ibtxuw%2w~5h>}IbScTXoWa!mr-~mg$yBxJg za)Q16YoHvDoV(fs_;Pq4*XIR0?T_w)!-suD@g%_0%Mr7?5iG0->{j#!xzP571$6R^ zU#(=g8Ejxk%FeYx*AGv&|2eXJTaTL8lQz{TCC1La!;L1usJ9Pco3x88Z6Erdi*G+1 z;fN1;GR8-}-Ta!PpinlOqN?=J$ocf7p#EYRjl2j0cy}*u2+0!2p*T^b9e$nt3MxnB z9vY{mnp%Hb4kt7&U~%IS`EOb4#?t%4#+-Yz6RT)TC_ig&}>I+j&IWYGt9^D9TX?!?enBKS#WoEXoW!osc z=P?Ikz{b(BZ-!joN6+yI7y@j(&a7!vVaQAj5;JJS9smP{n1_PT%qj4ZU9D%Ex{xXf zm{o(eO5<~M?8CT9z#w%}Bs?fdx$dv6gRn*^4w#WMIJQp{r1O1!RmYA_Jt~4F_8r2R z#sN?N*VB<_h>FLrTJ`1H{Z;}(Z>z-W^?{c>No4er=ncba&7NL5T( zbPv-hCicO+a&GlWDV)yNY+98R06>3?=4i|iZ-eEwy=ZG)xtDSBXf@B xesOcP~Q zX6|D!XK6EWD^bNfBKCNA>%P9$3V-aO>|o|8`|v}+$EZ%oXc8SyYMJKAOueiNwgu~m zIWPI>W=R3I3u9E9cEjG6LV-k`9l*h)uB{vB&~dfJc81NoT?GW zrGb0r)|Pv^>7w?Y%t)#pzr)q^RcSW)e(YJ;YKrV4*t&n#7(7M#&y`!Bt@qsAZQtX9 z6)(3x$8Bd>LJL=bYD50M8_`{jCLT{4Nt@nuX;ZK`xoGf*5-q*wg7Xl6^~l=Dz9n3N8Rw%!!G7U+38IJ$)OJe zA?k(V^LIWkY2ZWxl};o=Y2*Otx$X5w=cR+JhQ4gdAl4AS9o-oA zNJ5Yyo|Nr!;&qOXS%sSGlA*eYXNv>brDNT|HQvNc{pz;!q}Z~b*FUtHGUOWTPC>5O zE8%8n6%WjbS`zu`%{zvjMwjYz^&n*>nfj)^wb{RlgBiTP;)ySn#;O9q*#^U_{p&16 z9xJ1@EN_C~?|kI1dMylbjwv$t2>F(wk)P=I<%?k6x_UHP*1xX6)^bhlD z!Vm@5JVn*yPSN0#jq$XP-*^UOc}v$*{b}rS%GZNV!7T8b9BxsES~9|ZlKgq*fa4fM ztPb+>AnS`#f3nH2Q$RTEr&5kY<3J&P?p9kCb7cDfX?eet$Lg5$ivQR7k6A--=^ef< z9+6`_#mPakxS)Sk^WT%;RCD&<&Bpl1-?=bFJ-*l@nTj|;oXvLMB1cjXy!CT<@VC^* zobjLP+3?hHL{)>=Z@$;L{`B3&!G@FkuPU`dK)A9JMdGvwLU<5uP~ySIrm#!e-7hF5 z5uL)#v{f5!?ra+a#2@1Cg(?5^i!`0RT}pH_!bCt76sZ@n&ZZ(vbUWl;YaK@L`nU7n zniY)hRl+wDcAs9cA5Jldq(Fs?rHiwza)rL%gX_i+JmFN>*%>Y5_R@ zD89%bdey!Y!m?ysIUl}yGss%u0*1VAZ-J~f(}?r*eutae3OPW~$}X;hDvNWx7V-Rd z$^Sh<5!*sz4eP&0b`3p`Kp%@(2$U_+r)9gHG*dAL<*}eQjlRu=O!}Wuop&qqE{wCx zIHddGgm@~{#o6hjaPu_!*!8kv@@f#|+H}0;H_}Jx5*SK-1MhXJ@t@O-iOeIia2Gf=FuYW~bkY=4#XnzL8>X{4 zWahC{*jEa&%JjqSRqIU(>N%dJ=crQTo8-Ro!oO(? 
zM9Way!IX)0qTz9M5aXT)>sVo!YDSq^-vHj7H zcX`xvpJ)2>!}h+qw8%tQa=SJ|ATUp(Zq}E` z@qLiyx?1UnqxxxQv7Zl19nr??4}0qd)R1VW5XbWHvdSCCq?Gc^h;(Xx z5eVC2%-;?)BinpPN%70_P_M8u0Q6}BMVJeRafSW}Ac`i*tI_o9us}0J?eJ@CgEV_& zyX-k3o=dS8y;8_CZ$ZKidMk5A-OM{q0F3h=1=KregxaM?dgs=n$S6)48Lhc}xhzKJ z$v9E zl9s{l;Q&JG?Ojhmp3HDFljP*m*5Px0^8A9xfHQS)sp*(^8WD##21)0 zx{SSX3xA(xIv^J_vL>!^Ie;Due&;76n@|j@H|G7xKso3bDA;Zq6?V#eJOm)y$sg~M zo!5)3~FP}0l!U7vuG^!s2zE>m5{I}ynr|4@|+$(8o5N?HWK^7 zh(G~3qM60A)xSY*IO?F@mQ(Zk#U7(^opauX|p({9eeN9^2Ub z0FH>waKp!bfi@EkbLp+`mW5eR7x_~hj zzlUwXIFtd362>N%FwSE~lG|Mc+mCX$01x()&{F+?E~^DX2ZAOw@vWFa&@i0nG2fLV=L4M_(?3P}AZe57%tU)^u4^~FAVvW4`@#9RfBqxi4e{9wnlCsxfyy!OCP z?dwbkd0u#@@&lDRDBYDeDo?=mT{usQ(dE4eYbPe97$i~S1#xD(uaISaN6Rm}_B5U; zb^(G>T6Q!K?a%_8YuPCa%Bi0h7;n2iI0&0+U73u<> zlVUEl-&>!40#?{(d^{cO0$BMxCS7l&t**2jz#FZ89$h6?;ONZafv21+Y0#Vweia>Q zpSU>Uh24u9PDX2dOz&rJU`SB++D)_cM)PKPd{feyTiDe%P`JA2FjLE)z-~UChS(+E zr0}scC2DjA<3lRoA7ce8CSt6ZOz2x3&C4)qZ?IhKL2MYOvbRoS&W5ny{;%)Akp;fH zkn-K1JwF~p1_K@~x4;pUF-Qe+ey2e!AcwB83STpH^jc$4I(1Vki>}h_hSQ(0q<#oX zGJ6cg#VN~i33{_B#7~&|RipfoSgy1ID5VgO)Ec%(qK9VEW;0h5s#g?U!x#h=f}+#9 zwO0?(A^Q2%X5GV^$pE8h8x@Zaf0Kg#_|vrpvNEu`V3|C=O7Yq2U32{v-O5_PzoAP1 z4+$ScW@tXsUSgTi0enN^-}s9JFdu??!-zrnM;-|SO%v&Z&tr-8wn$<@@u8mk2m+%U z$&fO{cM>){S26rlsHxs4bO2?+QWV+xOIstvEwPVSMgK29;~&wR%UV_@eN=9LKt>VR zIIS@|5`}xL$CUBzyhW<7rL6H_&<-^ro#(&$)$|I`?$HwbD!`OX6!C5caF)mb)h>`Y zFj0b2GV(*}YLF2MBJ6Rce8(o)y&Y)dLhzc0g_EB|B?PCDW>TUgn$MW*13atyWK-Et ziG9|s7)YV6c4??+2gQ)Bj317fy;k>DQM8xgcq<6b0b=ItAN+JgJd&$#^Dd^!&BD#WQfWQ``487F^#4Oo%3^Us6OgGKAPa7qfp4 z;dr(d`pQb|XZMLKi9P^&f|c`n;&KSR9CjE-z!OzEVy&{?e{g>qrRUuEEot}<*B$t@ z-~Tt~#$S@=3~Io022Hu}k8;Xl;|tOnaEw_G0UTyekqzQbT1@2 zLPC0ipLg6Gv8^yhjat#*PV1aTQSD68NVxw}Ns$=Fy(F@!lm5NR;XVU}3A*1^DnXlU ztAT=+%Pd)v>yhERxlNq;9KTH=3S$~Xy!J!E9YfV}(d%<)FCPP-&G1$_2+4>NE`SaW z*uuSC$O{JtiFT9 z1WAK7iWX^fGAIFQISWMlw>3vTy>Li2PzNI2_CjEtIN3uRh`Pu#TMC=P64RpHxPHze z23`ZC)W)Z&FxgMUMf(Dk?nJJ~l=#x;%S^H%bu7pF#b_H4l~YA7652`d6RQTurB;p} z?R@0=7t6hfN@Wij44cY~FVLAVsLzwO(h^ai?SN(2Av4h~NOCxshZ`bSB}ucMzQynq zdo64CB8SvEvqr}KP3sE`TXWzpzx)?;26;^#lV!`LZ7GYTb8myS9NparTL@xc(4u2B zv&wom?Ux=nN<8kpIDhKHG9`v~`j0lNj^?mYuyR~ab26{UDBD+=xfWNzChe3+Gqq+A zuDPfjNK&tL?$Y+G=ke@+(UPFo!>|p?sRoq0=+PoyI1=sjAb);ilp-K|A&zSU|Bl%^(Ew00; zTYL-@B-d-RLvX{9eHsmT!Ji9OBQuZi9TMKs`SRwk+BxdRaTwRZDS2yJ5m}Lu1Ydb> zN6J5;CG#c&zuGg9^YoF5;>z_(DB__mOP}`d;jxh2n{eIttDs}i#;6cV&H#bCx1tr zfg+8DBJ_Ho8IrIFntNE59u+`A`oxTuyW}tY%zA~tAn3fR&x1FpG@g6G5R;OA$u{iY zY`J21Y)$|5ent2q4_n!DzNQ>bu^gZ0cB;;2%Yi$4;Q8LJ^~tvNFcODQ?y@mdVAkUv z@64qH$I)A3MD7WzRaw(8mvaHq({>I0sPQBh^FArQ5c(N$Ta}5$QJQn|i|*0xL!v-w zBmt6@*kUD2iYb3YjY0A5VD+az(Q*|3ynEm-WQk?`Iqn)5(kPJxq^s|}F;Amsvf)>7 zG^eJlR@o`AT6~<9S7iqa-`}W4Hygr0qkXX@WuI%7i>-CfiBR{9;H;sc#4Ip4f+7tG z?_m|Y@Qx|*jWgf0jBdNl{9I+ZLqL!UJ=-aHXU5KsjpcoztJK=^XD z9ufWR8&SyLSfXfS-89c^<8ST*!es*k1C0qQbqyg0Kx+)-&|(H>FN)N|Q+FmM#0`Rp zxkH)BFl)$Z)Pp91%Y7VM!a&K5sD)$-=$z$ld0I!L;VK?}njjfJWQz7CN9rXae7W_7 zeF+w#EE*j3Xvx3sBWD2}RWxzSc+Qr6ga!=0+3u?vqZJzWAr^BXk{NpnbZk35jo4h8 z6+yoNxRs928iDeglhB;1%4WU^k)sq zGPpoU#SioLp>%*rLY;(x0=*{AkfYJ3Qz%xJ_m>Q{t;s$T z;ANbw{iLLUxaG%X0g?bY9S96fY6mWx2NBwy-lE7;RX>?}GJ)iH`BPh2GsN>Qfxb^1 zGA57jj<86qU92LXTKR?~vlvj;e}E<>?yu=LTa$s}&}L)^3^T`Ag}VLd_V|ET4c5x@0n>;?RnPZlNQl<-j|k~K*yR_HUHLbn^nXSqhX1u`S@z#CPJ@eTJ4 z4j*kIQ9<`s+&(ZxdE;GfzP|87T}{RD5ky$!)%|-=FMWq++{k4}oJm?%hS)LbrfnDi zCj1$$Qrq~L{s-}0I5f4e!hIV6?;U=7Be+~9jJT=2q$g9&iI_!o)3J-g|8%QwfpEg? 
zCn9_Lk1ltLzTn#B(^3WL5C3#{4@Y#=A#IDMV=sGIg>)k~34&A_Si zblh-ZHV|H|Mi*O2L7(*V9apOVBLt zx@m)m4{o0m3Qg?m>h&4b1-&pR+)~0VrDV~T@6w7mm?jY4JKd}h#AgkRcNoz4e{6Cb%>6A;13uzG^TY}0V+;_E?5q{RWhf!M*nUkk;-_WAryF7XQS?eWM}ei> zJ5SSzTYg&HF(*?XM>4n~8jl)JFJp0tA3hUIAw}{WxU6{y*CEK{v~oVPB7^2gU7MjY z(Rm^;j5Q_{$)8MpNYHXsuIq%kp_XofMa7Ug<%9cW4kF1Rt9T@-2ZOK`;|{aMbLYpl z^2)oP6qPWhFqf1nYzBx`&HJ)-Boax2e@%vocKFXQaBRh}sSiOK7h6#KtXV?bjgR+r z__^P8@Jvx+FiZz%&V8>^;6XC8ab$^D2H@~2?P@A~y&1}5 z#*tMR)_OG3)7&uDNTHWSJ=D|qgfoPzGB_8ENKT%ezv6fd-A65%WYVb3jBd%z-~JXU zry*LKVDpO4>D$kNZ=ddweF-X2i_1+QfY1+#p+}~6@rFXJ8 zH`VtQdVRtkt9D*NGmIAwP|?i>ix}VuKecE zd<*-l+bbvY6-`ctSxvGTP%Fja^xlHlJ9nW^R)zs|`{dhJ%82^dl{=@2`b)Et4lCvo z)nwH>&s3@}M}O1Io7HFz=L@Xc_qFOg+eAWm)JdvDHtcXReqj`VOnm+qo#qj4h~vOZ z5=KU@XGWo=0C}ffJN6^jPOz)r&o6Y(**fg6+~D8!UGg4fY%FR=bn@PMNo%BrO1HyK zV*~{_@@=sw%p&B>#NP!`7Xq_7{}A8W+58xFIH!=78?V&HL1Yzbtv|_*-wQG(xt> zFo*<}VFuu-GEA?-b?Lr}A9ZACO2NK7PIAnhG*0+_@1$ZQtLa$x}YsvjreTr ztRP0tzWMVlq-Qo+MXtWdi8h$w!4*5rg$g~)uyeTQ`Ihs>8ZjkJUg(AwlHg;-ko=3H z?i{x<%N0CM=Ek_;b6UJ#J?FsYy+!}o_WE_Y{D{Z?n@BN~qgydgSqE}n2mC7bS4E-y zE`jQ(GUi)LhPR8Os_C7N+;-8241^};{o((o=k`CFZ1_E5y1i&X6Ky@J38j5~n34efBoh~lU@?{%x5>BTUkz-T0#2uU=;T)^Fz zKRG_Cqvf0Wb&=NoV}Kp^w&9{7I2`&7Ruoj#@fJG}yAycA`>oHPlgv&onU2*iA5I3c z$N24O98m+0ZzarqvQEp%+LM@;!mpQIxbmuIU>#F(*P9C@MToi8`ld<7Vc#XL0_*-H|c1$dAC21OM<6Pjix1t#_)lK-x!vi>vY*|pY^&(dSPK-+G2 z$>8t3r=vZZlOIMWdw8p-=>27xZLfVq-vh|U8HA#bhbTEw`<*!u$+l*(HI$^EwF+DP zpF>Kr=#jP@6U&8{y6lSjmY&(`{gBWu+5fF&s*SU(9hX|@*0V2*AMVKYYk$b!MP$l0 z<8^;^q12Nk2_0Y_04Hqo=*so9ckO1Z-#)Q`yeQ^!S-NmwA|?^Ja(>K(T zxC`}}6H?E&Tqw*&lz|ffZk=l5pfm*(wdu8jYGu;OjZPL8QOHe>V5_*#=)I@i=4iPL zvvTY(@u*i9(yX6tB&@%9toG9=rXuMI=oWksbsFY^qbOMFA9;kTKwo$fvQ%Y8PYd6F zV=K*fBSirQ(leN(ll;{j(SkUnhRB}z#F+~It1JvF>1DxjdiGlQ>frvk6CWk-0K{ZJ zYSDilQG4s1s*o&_LEr?E6iiephP&05Up<@<+PlpTwN+&IEoUU)z&Ra4YkO~vQ1ZUM z9Ccfn)fa^Fdv4(f{#PaKLm;&U2T1bp_J;}*8f?~?tcGB+gR*Z>5JF^v<_3A>Lku}3 zh`;}9%Y`;yrRr$O`T+Ni;E;_>ohko#G;gN#m_XFpOoDYOlFg)dVmoM6d=Ix|K*5j0 zaf<~iF~kawEG`7``$C5aD^-oD%^F_~nj)50fpm>O6=hKk8BNk;byl@WfDU|Ip;=|! 
zpVv@<%o!=;ed+43<{h1Uq7RcwSZWAFu6SPGc|-As*Nxaje3W(182y8mSx>_lx9p91)Q+|ZcT?4wMk&3|-DGa9TnlcMZ3P-R2Y+O}3zD8tXSIdLMn~8TT5c1{gG9)iO&Xf>x6{0xK z(mQG$eJKp;M>W@9fm3(%{&)TR6^zk`W<^Zp*90ErE54{!VSFJfEgwobZLVVy1!rn0 ze2|0yVVw%!&u|4XqLu=}r+gb?^og0ON%JjJ%OLO`zB>Sd#{TDph@|)z(&|??`Eju_ zUy1pjFu+-!1|&yphXX?K;=BNMUp3N(TSp{c7aJFoDRBw17SI z+L!9u7Jb=UnGCGxR5Hl$FVFBjP;(uM$6fMOFn&nr8g|Kupk=K2P*&U~0Fm@x*RC4; z3Gc-aj-S74Pf1K@I@oG=n#;4l^MmX=GXXP#uV;|B#~CF42Y7E|j7%I#JN&N1wa0!- zg9T2MCM3G1YI|i=@BYLvEJ!6OFIvJ>4L5?4t23Kel}u%f!VZG4SO1T`^u{`)#qLj7 zaz|+ZL%i|UDRbR2ryV9*UVwuADOmkurLL_3ZgQS|+f}BFQFDSVISeDRok1er(juvg zm8%}$qG9_^@mW>jX%az)xm%trG;0$H6=V=SZ+91du4xRZ%;C4|+OHynL915ZoS~B7 ze!)V4F#x5uJCz9%Umha#%D%h&9cOZ}h>3`1pQ$vit;PPHZ@^{}x3IFqiw_Zlwh9SM z`Ln|iBJst7%Kc9o<4-F0X4*yV|8Og?G5)Aq?xfcccEs&o<8Dg~%WZRJBfepO;nc``)t@N15xa&up?iL*c98-GuAIb_?%OHoFw zFACm(EHnLvAw+Idplsg0gL68VBtgN?(gPmGVWL;sBm}Cij*DCvtVuFeoWFn-UP5|@ zr0E+X8ocles)qEl&r3ENzVS`|^R`RNTK3~A65M6cMG2BF>1 zB!0^M41TrJ1#=|Oz5gi?6xJ}HBtI_g(JcbyAb$R`ASWo{F1@XHpY2E+d{Xwgqu9Ur z^b`XJqlwvdGym^xi6Cr3Y)+?Bf7+h876ckqS^--6Otrd2dd@_Z=|M(0K_)Z;)H{$Y zJ`~+7WPYJd90YyH=sH?O`VY}SUq1+&a-*y;Z$>Oz)8aDj_YI9^jRMois@ZrI z=6sXRr*;1L)-pJztmTHcUujw0KHdT0JqL)W2_X@0+7vl8LHA39JA3g(m^%Z-u={M1 zQz?wx5PfT7gMK}>$KWE|ivRnevC8r;bX$8K3_eqxKK}a2f_7#m3Mz{ZGq~f2HY$!L7tMfi%)ae)~ zJ=cNIU>C@?fz(cfHG~AK?b~ZHF#IFmIMi(})t@?e3xPEpe}8WvZgQ}BhRl$W3#0yB zSCN5x@AIFaKnHx!iA0^;Z_75EuihfiISGK&cM|{jniC)r0G~o9%;mNFeKeFzv`L!h zx`ANRzM%7d_M@$rw{-DHK(fIl9_5r9!rr^`NT?9iQkc~(>1{Om80)@TgE@@Y)CBM9g6Ygr@NUaMF$mQ)4bJx4zn8@g`lxvIDv+GJltr-zaaQgc zAdDQXbV!!5#;bHi;EK((skf}Fj*7Iao>i1V6Z2h)?ilud0-R>@s)g*niVX8?!)a=!g(ox@;hL;}1z-JJmA z%0sR&r2P706@_KC)H?GMnS>(zQjCeLqY&S7x9-F29)y}{))}%?Kvv=T!2hszBHJ$R zw0!%kQbAq=Xp-`2vajUemP&&(3x2O#0$EBklBJ`TR;PVh#s1q5aIQ>0b|HOCAR{#Q z^;bf~51&zs7@?QNki;WsP=Dd%gyRl9RZp@+HnZLzTF5f#G@qSFv>#9;)^Z;OU~h^g zBioPazGW2Lj6N|;rt=X`P`HrVsKMwn0PRuN2URA)S`C@;=^HLhS9HI?y4WWFdz2%9 zBP&Z3eG4+KF&haF-B&&~S;{0Q5{jf`f8iEI>$b=$+n(d zc7GNl1Rb(C*4$QODnLRs5jb?t^dVz}fFr~kz9o0VK2a?U1*d3RGSWw&(R%q7;l+KC zgZ#<=G0WqvAy6f!^V`fkCh|7dFAl#^VcNh&#y%RYgU#X)l&otGz!@2nB$w|-!b}ci zW&q;g?CsR#{u2DkyUpjT=gG4j5Z3G+gq7IOf&PZ|V8ubx=sHsqfdeHk_pv5-Qw$oK z*9~SCy}XQ8yDv6|-P%^D{&g=(Sz6ilI&M<5j3&6;z*t9Sod|63tW}Y>>9_j4ymnjg z4g!rqCi)Lk!HAPg%Bt5Nu$SaLr~KWcZaHVc1gXpyTwwWwdZugYYL3P&qs?MhC*C_@ zO#B`T{%yP=W&N3#OORic$ubIS3Di1S>1?V#M{z>Xzs^lwje*8)i+keVnqEd7L@LD z7Mr1QCERTC6}d?8N#%~H=H~kXfG1hUKerxZ2d_8LWOn6yXD$B$Tun&E#EE7@%V-dQF1dAAM!Y0 ze#<()ktKu*j2gN$Qkd*Uw`eY6!cJU(&kpD`7}S@YdOCjbkVgjZ;Ii?OjBuP}UdT>5 zqN`9~Vu>@zpM0pqz;yZRPq%5lPZs;zOWG@=^Ocwxxh&q{zBu!(|0q24bwGmQcx>lC zWCoRvzo6rXc|jEo@X6>`1RU9}x3CBz9}S3hb%MluvmS8Wt~90rRAeA!3qMjbk9iUN zd1$Vi_A`!)C7%L$n(&WfkU)L76|RE3>A#!XU#5PJcFLWMG8NlBG66vK$mHjt;vDOt zl%|J;y9q+LCT@BK^%+{Ba7WtLzIdRJN~Y)DP36atF~`Ai-*y0M^Y?GlLuoZ}*7nF} zA46U=XUyfsZarNx3?XdxFQ1K@^MC0RHJG(Jv3z(ahXmYQGrGk*AYjb%KBdywMsjSGQ{(0!ssBH_ zF_zxMS~IxU326JCcX}31G(KEU-YFQ1UcORyy{1{?95#{RZ`tOB*j(ce0_W!X;W95n zLX(?L5GN=IR$dP5Wx%!Af@HZd+9~v^T$5uuIay3XPrTPK*hE{J@Gf?Fd6_fv2?2qu z<=tB+IYC(Oy%ey%>pXW5RB&=~;v7Dbs4}T&8{X)PXloSsChUS8S4|gNUpZY7{e*UkOB{Ra!+O zLCSO#S~Uqx&?QcL;106ieSb@u?!!B-+FyFo>kBM0((4viR~4o0OvSlPQsKujV@2s% zc&nM(^EmlQG!1(RTv+Kb4k0SPjECNmYT^BSs>_vC0q?WFctlA47$Nzjuro$OZFSJ% z`~a!8OZ3KkZ^@dhNclOY4Lafa82tEJyHn%IuH))U{hT^uS= zA6iJ2;vPro`_V~Ga6TR9jYjTpv4&%8XY=s^yX%c`TJelGS<((W9Xdf&T%73X7v40f z(exCy^_~BISuxp%h-JFsT=W+dOj<9<0DCo*CaPlm)vlvZ7_|5vjPR5Q93ke>M%ZrjhmaBi4rHED==Qu zH>`%!tB68Lto{*d5`CckI%=0QsB_@7qo-KaoAMR7O>TGj=2WHNIhNwhffT(8S2pb6 z?Dg8W8?&8?@fowcRdx5%V7RqmR`p$y=3Xig$r7T>6jEGl@)@4Iq2ZjNnj)ge<#Okm zcG03k!dA_BD=8@yNis7w7b&}-me>!GkuehPyIxOnu@9pcOHjDF<9SVHiY8IH^vhW9 
z>zHZK{z%g(Z*J(tYPI1I$qCqJvmh$JQO#Cre}MPScp+u6-~N`RE>3!l-?_t(;c}N3 z*&7mdK=%{8>O!FhANAAzI~NQ+lw%V36ku|7&!9?AQ8#9uOk5oFKgV?myVR-!w=V?9|fhv?o?s#M^s$ zr1T4DjX5|7F$b>mbt=g=c^^EgOapb&-H&(K`qOB*DF`S%a@nU^Q-W(Xgqz%o zJz{mWdNiqk?vnAORLQwS1NxFOR7Zl+*Db-t_1?3%M~M`-=022<^M<+61en8o`Kr{=FGX=@?^Ms1F_!@$qPD``@20IceAhGP@i7J zchxl#m&2jDi9OSq7WDFCR}I>MvvjZh{#af32jeA?4BePRX<8hwrZ5V{o=nx$Lv*a~ z-HiTw=*HKjR%6SZ87`sF z+61rM6#w#|N-Pr3IuOlOpP|oE{2HGbN>XZ5LKxc0Y*!u5t`xPYWcyTu_C)nAc{Go4 zwSVzcgMtWme34-j9*mQG&>XA#t2RqEkcgKJedXpd2IKE9PWWmK-s_I<^iWV<(5&9{ zZ_(CZe(+iAUb3e0lQeNhK{~GNjJgqVICu*lIm`2-Jukn^m8^i-_}RB>gjyOX9_G>i*2D4`-i94I)4DQoa*DyjDjvs{jh_nX4|+A_VOr`MJ#~YroJ8 zw_AJmzn*>n)f+kw#GfmXZ!|kw^Ppi>yt5RK#0wGc!h0S6ps5U-|Jufos}ePNf z{|!^)%`Wk%Jf@piGFqv+?9y4&7loqAp0_mqC`q&&-1 zn&rHG=(G0W`WDt^i(tyF8O!*(wGPTcE;f{ft(ppq9zDZM^|VHn4>|`s^+BD(VjOx! zJ{b6mYAtszr*f5gNZ(NU=Nk3T$QSaa3M=#l=4)5xw><>s!57>7@wT#RY8~Ikbzf=b z3GV!fZc}OyP(=A&`(l2xIeJ6bemd2lK`E~oMdz;ldcEy_NmF>ia!R2=YS+=M?%hHy zTy^3fe(XtPdR`a5t{kthYn8aiEAyWGm?ZnClbQWNI)%)n&*uB;^G(BIYqO5U_{{*l zxKf8Bzt_d)-=bqucc%HMb7^(B?RHc6&Fi^cPhNz-H`|EKV4gcvAT84r?(DsrBvRRy zT^$JE8vv*Z~?ijz8Kpu!03q(uNDi9z?WX zJ0IEsT7|j6m8y3Fv<0a-Wu8`9BXhYncgMQxCUX7s?H??!ZQK_N#0Rn^w5h(j&>5VwDrs(KmoUKj-vkzV57 zZkqZ5EUVdG%W736R+QD_yHKOuBVqFIl|hQN|IZigUPd9)+~HoHi~~~a4xG=cT+Yd{ zD-^xspp~JY>}+pkW3){aw)cZhM2V`{Dm}-|U1d7!N_Bxc|G2C3`umn26&6nBKRyy3 zb?spG^9et+ejj-db1t8`^R09h-rN1H*In`{?#8X3lP^!U#d2myqwU}REO%|LE}M2e zIWhODWXU1dw#D(*$+LHtsGi;e9{jyxG_&33oaacQO3)3v$&9P-#me%Z@_Vo3DLiwS zSXsRJblqp|$8?r;y}|ePpc|4f0+gNo0Y~Pj=faK0Z7(zNE{WzkD!q;v-9!u2uHG5( zkWZf**?7%&CXuhH4&5XP7y`L}BY$)9<#=n@eYfQu2#onZcYmp5Vn*qAj}QgBAB;^u-8B=MmYSl^+NM%fm3O~dqVDfzg|j`H@bQ~$2;S<{J5dfc=8O`*GFWKAV!L}q z;drfy)@*gG=Eqio^@n>40)AVoJJ|b2BmUCsLfx5W`7?`nOc_-s{kM+q=V(cLxSia5 z`SI?%rz@56YU{aN-mh(f1^3M?=Z1z`&ww4tI*Ub zRgOcaT;X8`#MvgA_kQ{vg&DZ&Q_aaFJVoO6yV|D~{A~40=1`1(uG=Cj{aeic?G~#ki=Z@*b!Q1Bwrh; zT1;@}q*PhhcDrdx;Es;+%xf3jTTLkX-+I-p1JJZ5$4j{LS-xa$3(ED!txW|WrTP;7 zKC6aD36)r6SU|FT(W?iW*t>jX^1S>!0D@Y&zVR2V_b&AoiRCx@{zcJYPc&Pe%0Ldk z-do`3h;!%5dbpQ3(r=V7DMr=5)E*FPBL2NvUs(%nj{YUXRN26g&IYY|q#km{-i#Gm z9r4}~(#GEA*mKe50Qd?A5+@j~w(Cegm;eH>5p78cdSHxTgH9YVo(;eJzGdC_B3zrYxB9s?nC z-`>~0MV3w zwIgko)N$!`qD5uItP1!OQaXb0$p+P>YuXMO+ zcI0lxxtsv+^$jwI$wtTAj;r@SkvlFA#ZwAC;h?Ck|0?<1$OvymJ?sAGw}6570AkUf z$4Su;NYylpwf?hBKj$3CFJrc^J{fY338`=udv+L*z44sF>!PxLQvgNJtd^*%Nbz_l zx&}SImooxJym1nr6A_=?yU~)+(qLl!woBMzGpgq8V*|;rLs7QL$r}v*33nvXDQ!mH z+KzZ!EgvgFn|`s6sSy!#pyN|!xrMu+GIdp}U=HWT@j`s#;6~8X?sMFzhr}D#sriE0 z%Ehb3UrH3z6uU^{qlhO@&UG6?6|7B80w9+~knL+A%>!e9<_qAreGJRTjTypH)6tzAjnjJCEV`WlY z!{DfJiTwa9QV!OofXJJs4!Xnlhmr$ir1P#m-Xo{E{($~V5-UzaDqVka5TV;cX}kLQ zB?~8HEd>7m*Mc=21*yIl5zuoPVtebidyKEX*H|MW5mbG+V7Xv&ui6l+++idG==DitDMo+@?jN4SNS^NZZb+)`Bxq|5}bcV8X_%uVADpZ19?NymQ zk#+v>;TVS_k11Dn^Faj8LhUt+-Y)Q#ut$I1!zOu4bXn>4K*4YKY!R`^mLK_+@%d&w z7$ObHk|>2A``#(oT#DLi)nsTCOhTV`UH_1uVM;4+RL0< zftp7?B1znIyS^GgJiPQwpW}j8~01hudNY%ah=9;UEjR; zxFhi%N26gw*piZFbio8g*|Q_cSA*!9{2K4H9B;gx#R{ZerB+GT?2(m&R)e z%ud-&_J;vAY41XBc8u7V|M=9jc7qD@CPwRGF#uKaQxZ1Q5#zch4LPslTw z?B($6RqmiWy`TC`cb1dHbcHx{#lD}%z>>mr*m{R@0q;#v<>fg!g6a171gteuS{7&Q z3>{IYTjjfJY*(#4yq~u9&0khGSQD}fJ$MB}kZzT3{&snID>!~YU8~3;mxEfl4%-}J zT3WE@+O0e`nOmtX34@~`!BVEr*}bdM!wrT;=CCrBl?f1qbYu+%4H6?q43y#O5tj1g zl9He3`a7|I8Eum=m9f0(?zjm@OA_1(XJ-s@*kD#&QFxjixkYh|5d|7+rK-bq?0>H6 zM!BB!c79m6WpQ;g|BAkx_X8aI%K3HSZ^>K58hMXuU3cHx#z=ai=2G(z7wcwW?M9!A z!@=y@Kg-bX$a@Vhi9CH< zywEFX={u!jfMTGW$Q?V=`TuBp>#(Zcc3pJREz;c$lMtj^lx{($bc#qL-65dTEg~&~ zNOyG+6@7v$H*1oQD_St8hwf~!X=^S&6@xD*o_x;>&Dg!Mq?F-DhcH_%b zfxSG79RUd(mM6jz_5)CILDz~Js{T~bD5%}r$r`exx$6cR7e#w^Oc|>H>v3)Dj!>N2 
zpiC3z?W?nUjo{A-I30Me)k1Psd)9j2F8Vjw6jCRNG+aoG<((zM8m{u0EeF2+_!-AM zeB<#x?``SwvWFEs3q`BkmFnGv9op;m&0ii!-uf4Vy5zz)YIQCr%qGwX0n$H}w_d3L zkjjz}{n0sz_Q$<153j=Vg6r9JSS{7=?Of@I+>Nr%@s_7YU>5>GXGA7b7>P#VwEYlg zmF#?zE!NntiD=!%kq}<IUSjmD&bauiDga<_gsSu%w4P!pljSXE|nrTttGg z>FhTV+ocw&s?*>U3Zp=C*rLrsQ)*9Av6MwmLU!p2%k?g!`=l4tsRO5&1gJp9o0&&EWW>6lJvW5DU(+9bW}*Pba5NS0C(G>cl&HM?xBx7QR7 z&;SoT-bOBQaF$}01~$_~p`H?r&u}uiz`JD^D`ysI7$L9eq`F~)kj!kAPQGRWz+Pmh zIoy^{s;m>a4W1!g-O9>EQvPsS;}|9}Xfpkx-#!z&^~w_xhjxCTv@>zrwvw)YD!hS{ z7JfqqqzI9yH+K{!ss|lmNuwo>1ly-eUC3ePe&{gh%?3>nt6o*XDHvLYH^Gb6dp8^E z3f;x=K*ONdbNmDa?)u^AqzyCnw!vTJjhb|s7b5xpvL;ZYCF5u;H;2=E@u`00l{x=e zIq=K(6$<^c0Mi%8FCQNe)tOx&Oo=|F-~V}`j^eEmz>&oLa<2Ma$qPA3k+%KV+o4Jq zZ&w}RccZNH6t%dw2laNNd5$lQpzJADonM^cS;41s_dY7xlPbr5ma=WqHp0-Jr2icR z;O(8)UVFshs+B)~)0*N~%s$2Vy^O71H6cfHt4!#1KcnO^_Qa)8xmz?c2Ep*e{dFX} zcG0ptX8P40V|`p`rT3qRozZMdZh+aAtL5CQFuIB(t!DFZMqYhT=tsJh#po@&W) z@HL;MdU1OFNnpp@(^k3kl8*iBCC3y&d=}zh>|3)8n7=6kM5?SAxuN*{z7coB{&C~Q z9onhY2KQtf-T~V#iwQQjCi%cTK}lnW;DYag&Pp1XQ_#4eX(G@$29C;Tm9{hEYnpr_ z#daK~YV9BsCzg3DOP5iZ+}xA|noDvU$4!BJoaQep%7Frpba@rM3!?!a&NLoA+~m_v z)yj@r{w0tPsg$jLb|LS z6>rS|0BmdiT36X5I~d$9rWHm#Xq_t56V$D7z{4REh{Ysk-;J}d;28Syex|}v+-K;L zz%$9f)r@yG{deTvOpUL=o5!ycs@X_))Ls)zPB+3p(?ypJtie zKVhaf?DeX?xGt=Q)DKGB$L7B(K4Ol=v%9!?-%U%-?@ox zWc-Bh?I;8bvWtwI*KesL2`>`vwB94c>Q%peru}Oy`UT`JUJ$nD`gY>$XCiKCf?VZ` z+=8Kf0|g?>>Xhw=(bW?$+ycVw9~+kn3E5%P2F;D8azp9QC@Ujmmvb#Qx== zk!NTZ=REA&q*9I1!hBh<;s$q}dWYhjnGdc8LTpzgHB&|>Mjn^ zcicubAI zA6Bv5d68zcjau^J%71i+f3>B{-bWM+g7%~^1cZ)0jtttVh7fX@Rdw)8?UTv_%@;T& zQQ%FlT`^-Ll2QWASH61m5@E(R<}ilmfJ)Z3@t=C5gTOKG#Y0*91rObZ1ykms`p7kH zr`Cz4EcLveo&$A;(ENn_ z9d?Wi0=TXL=wyAWZwMK$-%PQ|3mDpgpJBbuslzINo*2uM2#DgUGv0^oORUzuL`0j6 z7EMPC?;RKQwS34cezGV=d|)!m$6=}XX$-x;OgPf zn33t3YjnfmLGUOLJiME!DwOx#7@+!0%t_u6MYFI+?|yaqbJt_Fs25{)@+~De5&8!} z7l+lv{c>Iv&eWv3`ZcOcm#xHV|2sOg7GuVyTATZXpfKKFZm8$KF8`slVjq6?L^X<> zKi@?wM1mo5vcXL=o>c?ez|w~|EmUY%pcfx)p2lgt@23}|UfB0^27Pe*cwDvff8>Zt z{8w`U8x14?7?Jyb1luDhCiW;j=p#W!8wO6X-~Ai~>Apn$HmAB$*o^@bT)Zx}dqrRD zH$_}*Z$mx^dg!^>+G7~MvMhYMP({cSLfPjo}%^}i7GKiL3jE{ zU^W65i%c%9nsR<#3lu--^0MZCmA>#~*4I5o-FHXuZ`lv|7`XyWLr3e~?CX55>jKT) zu=@f zO}A;OCrYFqy_>q0_48N)r9k>Y z-(>y>-6tqnL7pZXXN|yjY^wIb#4c$z-v@1de=w$kI83H3&(d^`++}J zzBrM^|3P~h(Y+s&!Y+UQF&vLt!5jg8dv%r#hMsB)xI2Ttl=JWj9()o4f(I5a3Nz{E z`c@ke=P2`6h6Z}|dbeU;se$H$+gtXf9-{JwPv7I6ZH#Iy#+SbwY;KoaeKw4CgThZ!ZrW0KCPCr~M0 z#bta;xrE6oAL#%;3z5k8M`4uoy7~Yts2EUDVDc%n(umdXks-S{jxBp9z!wSSv)}#g zMTPqjJ__6}X*;D>UHUj>Xl~blh%rU`Pjit@^ltsyPueNLUam6#RXUjvbs+!O=VHu0 z*{-4o=%nG&^j9695E!FbAOX^azduCJ!*vVx{J`FZTSIpE|8qn8e-p?-5Ln={>`Stp zEs3o29ZD8)y^0p``CDpl$*Q~!kXpQUxhC*ZVgf&Bn6t}A(q^_T34nDYJ&OKO6GWw$ zhf4S}h)VzJg7^>v_MV%tf@pEB6+Q{VAHyqqqk0g{E)@Izmw?#=RJsW|>IOv2*naE0 zC(j?DfKcQr%pre1cMNhflFNTe8gt>3^0k5H6j{%g!5a3dJ|y~A)T%PDlie7z?Eut2 zA>)i@ViZ{U4B_cCpqkh~vElW?jGZI+^I!iNAhf4gxZU50Us-PxewDO;%9QzyAFj3(1n)sa$tD?&A|M+vrrJZuG1&N|4xmq7w zQBUzVOawgi@qbEfnAm@{)c${5p&&)+`GS{SzDvkR!424yPX3cb4njsRB?VF?P}k3R zk^uyeDD2+|;1AlVNFHBbm~>e>NeO(HP2N>aDmyg+WI_>+f2YKU+=%N7vWY&!4j+Jf ze`Tg?Bb5yY64HSoPPZ|0qX1I_K(FwC~Vi_znL z*sZ2V&nZESW$Wz%PXVuU5Erc66GVYzaTNGI#@mW=uqV& zKy%UGe=ki?;J$_}%;n$>i6ca~z5+xepq{1F(#3LWj+P&ej4;68fyZ^7{yif2jL(GG z5kV1}B^R}HDgwAw;O3HHsg|yZJ+WO8g{Kg?$uRs ziKE2Y&!T5hNK41_YO^cGvzC_6e7*CzD57#cZpQ$v6V_T^LdxF^^}!4>fB= zE(+b7OFKXO``8fw-~N}hm%w|;dFeL_rZR#2C;ZT639pOYA7}TIxkoGZX&bc@>sMHf6<}S=p_b~RKpFn zXzah`?n8+0hRXO8*k6us`Shoixc0xBF1e#&*RF#mK((jYEXs5mydRDP7DuZn^Ksdah9bo)V&QJk6 zq<#%NPQhMP>9QjyuzB?P>}#J%msQ7qD0K!wwJ>SO!$q$Ya6uqQ$FG@T#_pl7Y=naS z=7EsXmj<#G^3&*D-{s8+@5hZ05w3`n9Ov26f$*;Xxa`nWp 
zJb4~kZNb|%F0X^cZHJ%L6+RY!3ocbzh!Q9$s+rcxPwBUHkM zhC>xU*o&@G2AMu3)0V!Xxmrz31)P%t8PUY_U3M zivgiyrx$v76KkZ0$dADXgU6Rkjd=x{{sI53gB~X zZm&R2MyQ1AYOCl4xPPPF0P?e`YAfs0z4rtLj!*l;dF;#IP$TrWQZ=`EwLM}Z(k z2cOOey3eBE>C{5km+faVZn1bA#SfmAbP<%eUzfo^16~acf`fu{1K%K#ulRVA=_>Q3Uq>u~TfiO)&TkoaVGHmfa!Ob)0rk_e-mNb|JL)4OO2 zeJrMujC?%B&B^cHyErc(8i*ep^X+O-@x00N?x@nYkM~+!P=hfvzxeCD2x;G01l#Y? zu%br)H9XQ+s~9u+9BcQ*y`wA%XhP#bo9FH9P7tz8=689l-;Lkl^;GuA!%1!^Tc(~+7~nn)fI|$c*@8TsObPZ2Kdc|Qz3cHw zV1N6&*~{ z1;9}CJg0AFK52U}AhOez{!~1#a=EA=7OS)0?>2mQa10D&3Ry;bl^ZW*kA`}_^8ASjC zqILSU1xC>Y>jbxy94A0Hk5AM2*`^K{JWhLFUQLG8Z3|Z8k-+GuYB}WNM^VD*|BTji zXtcO(I>Rl9PH5=S7dabzci_pnfRQ(H_`?dnO!D!M1`qjD$+NT*(UcxSB#_&2=4m!>W5#S|61#x`h+uC2WBm=99$j^I8-{*w5(6t2DVg(;LyMA1J7rf2o4_pt6Ht?>)DlS z#Z~^P#;!wBaJE~l(|V=X8oc%9-lpWy`$rG*NEY3%5axzVL^+8=NLJ|CPu#Lo&(>KAe@ zaVR0#{1yzY00sHKE8u@CBfC8yZbJ{?fBArzANJpy4*r)u97^el#-S;i3{udY@_xP* z94J>z1QX&F#W~LJB$-~kl{bxi{LlmpZz=!V6Oe}HfHZUy%13j)ckLAJt|Be5z)Fb}iG~$@B;mLeXqIgdeva)5;K*(6<#b{cZ(eF(val{ynO=R&|)Axx_9W*UDv$N*PkYqHS~4xV{O2mE9vM&ix0-#-mBBR$~N1B z@Ar%bik@bO$NJ+ksL_rpD0zKC+d5d5!sdHtEgAiU;D;{(OJ2xFr<>UjvcYBgm}Sri zY)u6<3h&wwAjgigL#Y?|Ig(i+=b1o=@!m{*t@Ual5B4J?Y=1HV*hk`_5SR; z(;3Fe0;k17m4nzA_Q}-=}P*80grTCVEHQ^>hvv?abjBh|->PNQjQ4?j7~^GjZGw6v96YyT{?nYdX?j()O{-b2Vnu*)U6j@Rw@6A6Ta%SO6=vg4 zp5c#*`GseEJI9ZO?cgw(Q>TvGcfpHf7Y->Sf zx_4!tImlS*RtJ6ruxd|W)4i*^6E)Az%a%88Hd7}7lHg1)B=k3ywkeM_zF_Hg#0?L& z=_GqDJ$rO;iiprbx@=K9_`94@2co%Ro|2GuTX2*I;sIYHNLE^fYR6?mO|ZdX6FM~i z-;@zR7(J@8UdKA#nU4RpGbx=|Yn>*W0U!8${cRPrfvYoXAarDL9n5`OXNi0&M=cr8 z1pgugQy6`L?#6|9(k--PsaweK^w`&^mc#su)M~HPiOgf<7eTl|T0i_=84hG+9Vs)* zB~)3cH8{?scYJLBXgQoK)5$s?vLWb`6-=7ub}Ha3z4fM{Wm)RR1-6EGnQQT>7!CP3 z4f0I@E+65e*Kv`@UE&gQnJM@;Mr#EjDyX=OiF{&)EV}6*=UH35x8#H@#b5YRz*Fr` z6(_QrP6-+gW;_tKP8I15+Va65!Cur4PnXCR4VsALl=v>f!=OG}Y>MQEwb~ ztVZ&A0+lnp5VJ&rvXLxY1{EKNCl6;~lz856kwx+giyHN1TeMGmo2u%M7N}>?%25vl zn@(oi`-RJYPVDZCF4VNwFd)(sWDeeL^ALDL7kscikgsnr{mvEbSHL(+)?oOffNbNF zke41yhpWB)-}ki~uQ!kk9of zlv%G!R5wRH6IGcR3H~fpySA&%*7^o^qQaIWszBB60GD>l()pplq9JJ4t6lA6Q(ZQErLgCM`gCdHAL? zk*2U3{gtcaKYJ9Y&HUo^dk5PQ{w<3ihYYu`pJ1=I1zGkIq5wyBYX}2|GM@A+tg>G zDr^FDmIURW3>Z@^;D0TcAM&}jAd?Dbjnx?v^bTSpcB%Q$?CjVmr!N^!MZ3)S<_*6O z>r4pebHe`qdsx&d*tAJmoWJgbTLiY>x7^(!AuxgqPWrKbPn?v;o85Fqob2TMB{4p1 z$$b0gz(H2Zd|M^7lZCD=1K+aYJ`!!~qp+uHj%W7bp@I8TdPS}`GcISuC9(EKeMGN~ z#iiaAy_rwh4QQTii&L8`o9~%Lsz0$}J;;k?StBr>_8ooH?j6pd;rbfSHDGkSa^0Mi zMO~^p*xWB6fsoHbhnNP2trd$>57kew)sdu)eXZ@9xIEsIyY5=x3|+^dnV)mSA;J&A zwv0Z8vFrF_-cz@I{A4~CY8Ub#wkwfxW8rmM=zxl-hivb_hq? zlBR39Uf4KyMsOrMgC z+goW1xdu^#!{pSm&QHwyEv~Vtcp{&jfLO#e{il>0asls7xr3k=PZHMD<6y@Vn;wva5y<7ty`WH3q2cK~9ha@X?FMgrU6Q@}1{dh@|EG7^( zjr7!U%49E8zv8U3c#H|1_wHOX5(c6kQL9Hh4(a`$>>BIvC?cv^Gef5qc6ogde)^M! zO}FW~DqR5;XoDLMFT${`9TA^H8<2Y=9O{PFS)3u$X`~TG-OIyDpmXdUF@*~^<}iMa zAcdX9Li1X3X>X85x;mIxMz%m9hR>YTuQSSJwP&9h&AuRcUDS*3sX(M+xJX)AD? 
zGC9Pfe=f#Jfp3M6Dpx$#6vL-MDtTlcx$2Qvd)JUuE<;L+qfjl2-el_B2Ti8e{<^iQ zWc8<%6Fy=uDTfDEF)U`=+Y}Kd$M$|iyq=^-#AF7x$n0y&?JQCL# zlKa=$vl=EV%=osKwHXw?QXq^JN@0vWvp;CZJoDQP{wXsOTcV%UXf_jmLON3$!>rpS zi8fJ=i6%|T--(GKWMyT-jWTx;5$izIAB zj}WzN4c=SXJka}#i_b7qO7GsU{xUB^XrQbb zsBZ7XH)x%jo){r9F;mj5JR#}?%{4R?`od8xS?|EuDl;u3ek7*voNnhwv^B2VgnW7i zFlltZ9@b|qemMghw{^lzb4pSuTDkZICjhDt@>1QUy7}SKg1;+X8x$8Uo}ZX_(X`tZ zLV_dAPYx^V^X&}=%=Vf9FX4YQL;`*9bm((tm|~$2kZ96BV`=2-{e(H)#+G!*qTlnf zd~;~I+PTV^ElB`Y4a4fycle8weZIw5!O?>RJv?+t^w3KH^5|nw51DK5WW%DMp?{l0 zHD|H8l5*54elO5lzlg+n*75sh267bL;rSwe5I%pT<-UGTA`g8i>N6yllpbSi@48;f zaHo!U@5u_aZFi<#pB><&NJjDT5L1*NPmPy8Dz~{nFO-@dmuU=6A?C6ga8MUWMP~@C zspdMz*H&$`MN7Oqq%xf>i8UI^dvr2{9F-&faV>aza_Q7!H?X%Sq^}wC5%QbU!#Og9 zYcvA(aLE|j2hMeEr%9DIhtv&?A9r1rI~5}*x_rfFs^0U>HpWXZtU1+@1~i{6 z$X=d3|K#r*WIz%^3#xUzE2;xDtdm9WNk*Mtx2--wVNfpMcizp?h#0|qS*KCW~wYANMQUw61+!&;dR;2@%Z)Ool5t5l|^!!{jZ&IhFK&c z`C%hADxS_Izs2>TWHjuv0{l^uX*0G8bG; z*V+ygB=Rhs!@B=WphYpp7F4Q*r-HX!BvxP67G6n7N3HHs%zi90MDv<>^?)dg3V~-s z4c;-4`p+RB>kXDoi z>5}dgB%~QY1cn9yrMn~~2MOu!?v##w^L(H8+57#iwbyUG`yUptW)XAU_jRAwaURF# z|2>=!4i*gC(Rx=X5oHWwAUs7H21`7io1 z*ql*^->vv8UZb@CcvH%f=`%~!@_T{&R4GFuNNJfB$}*4>QcBcY2=SyklV_{3yh&ThAZ%=)IMX%UYA4#;2I}x{52`YT+UJ9# zVx4NIpTsXcJMnb+cegjY|0OBQ)=*5*l(uI*8c;A#EhUejSZTd_?3N2i#!2bz5TgiC z75|J+9zLX?t>%Rg`4qkkCEqytpV4s2=3%UKRU^6%ogv#wFLregjaowb>)!reK&5tV zKOu4e1l$XY?+=O%7EHq7Fcy%q7~XpUy(8}}9};@HU$xt8e~BZ%;6_K!wU{s|ll>(d zvmAz@9Pmc#H4la~^ONxYp)f#4F?dt-Micy+>0hdo0aB66f2d74kNQ$PQ(F@s5Xwe28U6m-OTHSE9V~jGc_e2-lj31(g)2R~I zjM)uEt}CbXV&;8%-_zi?#wuXF5bZ?7-nw`}8bxx-m){jFqwTzreLLgMP3%X@lFOCR zLM{J;G>UBaQZwHYFwi;a$)WbDIYqln*7}OhZ9F2+O>OR%Ivv<+DbQ(uTi{R#2mi?M zZWIYxW&OJi%SG>syz?i6z)o%MTJWk&+8L)?Ot5wmZ_ z*tH8T??rl#ULI5xQN3sm`!n$Spm=A|nq`|fL0jqgRNjjLwAV{%^oZz>KLfs6ivJBCZHnr*d#?m0^<7a*-8GJS& z#5;Lkcy*nqj)OeMZKpa>5YZHzTAYURWI`oL&Y-fJ#Wr8h0_)K_7U=Uey4e;#iZdMX z=X5q22C?0VScIc@WI&Sx#aPz4Si6#Satn0>vuoI^lz+ASdG@9AhC@|9X*7xwtNcfo ztzq>Ma}G1nDx3ZhoB#CaglTD|k6D`C+trb;+W9!vr65CJ?FiY&~5REbHzyOgNY>6M93xa^<=CC?tPktNQJX+FI{Nr-NUf!br6z1Oo+G z;0;D8XOu>30c!<*VsQ1zdY^e{!RS-%2-s*26yN!|HLKUHzdiO7JY4$r8q|E&QID=W zKp#J;K7?6=ERZS6E~%3Ccy{kvv{2}1EwhB1B{8o2R+Q`S&o-x%w@1C@qKUf05AIi2 z_FspLPzZe(M%-QaR>|rUvFNcY{%Awj;>Dey;9AvAdWscF_z0J7+j>EZm|->eg7Xxn`&$ zAdGnM;KxJ+MmJ3Do>CtI&PVk~*ZE6T2P&SKk%Y9t>Tw}YNMRucjqZEPk2bbYl|Tba zlOSqbXb>$qHhyg64*mp_qUY@UZe$J=&v|bDJ8IxYU%JBYFYI5cM#Bn1PFk0Pap&iMaPMZ6r@H8kY=8( zE0fW`mn0JHL#3q>aZxKt3c)4AdEI>pv`qF{&-VW&k8Ao=v06;G9Qo<*= z31|fl_%5J#p?3X`VP}NQAG#qvsPrfF#9%m2)=~@v!&@lDu1I3i%x zHb7|R5G5FUI=nUjFhdj;N4@>{t>(XX_lJ_Cd0WlrUi34FOgN%rx$k8a=E!SC$>Xw! 
z$3`oUYLjs>QAdT_Ewx>eL3M<34B+KHw?e7-(TWLLNhC(N_4-As`*V>KI$549Gs zd-W^?FUecM=++AyAI~GJ!oreAv49pE8LUv)OUUq@xbX7K!SsM<;CaA-JpaM5uro6o z*~_^1qI?s!l+)*Ae)jK6vJk<74*agcDFSr{PH#k7x!v}2S)M<6r@c)`Cqfd}ne1$u zUrC-=?$Tu&O>j}mpt3gs?k(Fb_lT?x}U(cL6+?7L4gO!rr zt2KW?>7Y2!m+&q@>FA;`$cwa9I~xTaye)pW5K_9N{95nXP}k>?_1x$wRspH+yffv~ zrwX`!n6%cFWBxk#_9n+A<rKu)votgxPTj=_`G|umDi?>n&aYM~r5-&>Us1 zlEF&A9=7_q&C7m?nT5D-N63;nPB_dqu#HF^5=I-s<93{j{eBgi)ApQ8tF}5% zkXu>x&vIKqbx-$NfE6Rj>=ZIoaRQD1M3*+OeAV+FBpQfwzcs5}^BARv4;i8P@)1&t&<^iU%O~&`L z!j+TthmR0Vw$Fu= zXcmU^2>j&sv0-SKPjlY}mI->U?{(U_)p*fX)%z(8rdXetFYE?ALo1rZRqkILX)1B< znssVEbWiER6?%6iyCT262A?i?T>mtRy*U#-aMViT+l4uN${9dgCI1DYE_wl6AkBe% z6xLcyYopaf5LdD6SOT-V{c9?2{(0GW#>lmnh!k=bs@4r#h9m-QmA^o-M+-ehTeP2P zNPKb|RbUE{&SZh%?Y#1szklfD-n@dCbb$w#Jtl2JIY^xa77GbQD}c@`1<4tRZ;&4E zGjKAuv9<|Kk(RtgMuS7Ki;#WO?3s_o6F1ux(OZD>Q3MNRBD^w}!q>vy&A!hvEH$JXwZb z<`r*$SD=94bY@iejR=-XaY|6VQ6(>J=IA8jrF zMuOn>&gfE3n9Y_2p&^|rA?(S8@Ap!k){DCTqGvT|CVyi_oE8f@4lu~ThH<|ANy?Ef zq1o#4^GmWuWxpZb3E0m+yE_dID1{VsCKtjoP&;Cqy5KVP(qavH%6Pq_6x7u_!|u zXu#==Frgb?w7fjMKxX}kHy1*f?N2?@rt~qoyN|o2Bq4p{_ovC!!rB$9&ogo?U)G2> zx+z|%1*{foGL9^@V=>H`yl{Iib=Bv*F*sa>jPD@UDR-jQZ*nlmw&k@L_=S(hGfC~8 z{7q%81x@gE&4+8&D2fxZ14%jkM&Aw=x*+f$*aXC-$B0zK=lR@E=On*7D{4YcoycDF zguFWP^2i#$IZ_eFWpvl2u!>?`LC2InmQTzZDb?Gs*D5`Z-i^ioF>-J{Si=O2n-9@* zWYCax`PG28ci!WK;$t+-&Nne_@Pp%X9`a`Ol^Q@BKE;FL(XF79x5Wh#FA_yVzphB$ z7ZG)Cg#F&2zWfLw10it9#bjyWhHKT&vh8=AMmfM2FyGtu+S`khequbqguBDp#FQPH zjQ7xdE%LuQRqwD%5d*-lEj{tt^tW$Pu7*SjY0%|_97|z(Gvljg7}IB$5pf^@0cN*5 zk7D>w(oadk$@8rZ!?2zJ!Z17xZY-(pjh7Ly5I+<7nA~Drp%G}4pKO|u&xu4)>8-1zZyrYM?M zt|C4MEeT7O|6(62i)LPSdJz_BbA0be{^{0O|!E1g8}Q>YC-8^HP*O^^stL ziATAgl#$V|7?V$s4(l-(*;`w1k#si$6Qs3|VZ7-ID=#87xx*LhD$M)4=0~lf z>@VVYAVozzFz0jO1Qh&B2r9~pcu2ARkMC-lHrU7lUH83 zVF>35P@?Wil|?+okNd3vAvI>o4>M>b80duTxjiWU#>*Sj^%7*_AXEI^S&jgYlkZ~* zU#U*F`E6LYvOztdpPEPKbRxuYEwFBir}ThQ7-zf>`kdY6E~@ib(427jbe1?YrydPC z!D;wy=+^)F6a6f6SDh9Y;faz)IpI1+zQyx9JJD4y`(Nr^1Joo|&G|PX;dc$Ly4EUm z!(R8%`tK)gr;h|Mka$b&N6&0Bd%|Fp9hgX3@Q1jHp($@de1dqU==d<2fK_$6P-l^$ zo%c`Nqf;b&w-a))ijnd|?V)&z;;+aBvp;)$Nq{lQ3Rhdrq#GfEv#xLPCBW%BN$dt4 zELAO88NxUy8Yy=o%R~x>$$wb-1=l5K=}_S6-cDyl$xq8)sFgIagtB*A{ci&=)?+wD z<@-&^;sDdWb67oD1;!ZcL(gF`6O{Qakwa~S^sx{aZfG_6b%awWM^$pajezzuF12Tm zY7j9ZEr-f8Eo5|jZ_05$NQFv~^?iV8?%>^Bsvnp#RU;@Qie6k$@g4Q>>d`_B122hH z;!nqQTwMB=ww9lR|tcr|+YuTTJ3+X?PCd zA%nY<&1rN$u5oVUt7=4JZ>~2DNsjjuHkV@;U)W%_z9i?OwtC?g@)#3I?m2gxqYC*Y z_{`Pso`Qrd`^yl@?KO8MmrhZgtoeN#p279`UgCjN0;h%$4s6}zNi2d>+;!`wSE7#EubGyJMM%%DR;vyo7;4s9Doz)*F29|f>(lMrF7v+zN zqQj!-QF1bZ27^So!R>%XO^iR*2WatQA{SsOABw163I>?x)~Nh0y9pZ5H$5jbAZoAQ zLyjlf%cLf(Uy4}OZ6{M*>%L7(x7j6T1Es(SuymXQgKf4KfE%9O_Ed+e<%%sf+ zzA~;v&lsdlRp(X^^c%K3~`n{TPQOmrtc18X#k{`W4#FPZnCI0 zvd1SjXf6LSCp|Ry|LxO46B5Hfw&lxF!ZUv)x>&kNIf{PJayxwpk?3t$&J**b8xlyj zOyR=ao%;**^?tC5dPCX^P^h;puP4h;a>P+&9Jw@Fg(Zm6GgL37Db%EfbDXxTfxP9f z6Gz?o9S-1TE=v=wah7d0LDi4w&7{{Vn$$ZNB-lv`s@UjhV|fqvNJ|;_JVfCR8)ti) zMVpUVw9%yG7wQP^VnMLUK56|;!GdbsTF^1Sj$}_jcK~uNL_UP{Xf*?*qUjZm4I8X_ z6A-KL)ziayUmX!|JhW+jjdmeWMv^F4q|yPM_k>-+LOeqgM#Qd;Flv>xdp~-@@hU)*3Slw7o+%RCLq>) znL2M+HS1}WNqOKle1_SK6w{iDn7?-FqffH8_zGS|Z4XNGwjX>i9c3EzkZ?PsZr_)G zUGQ1~Gy}A*<3)mi<^o^tvKuBckL^vuqYtBzA<@J($*od~U8I}{k*Fnil(9t#DoLLX zHNU1&eYa>>NjJev7&$78Q>%m=>MsQLI=H|-C7SrsJ{k9AMsp<0^sQ?dW;i%DG9fbY zhgM<%bDv>!i11dU{Z;P`eyo_+=p#o{7Ni=)G<;Nc*R~^=%c{2nd-)xRj%A%*>}yst z-8Om6QrPNJyWe`tPtj8eGF#(1Ihk-INYV!?6LQO1=Fw3`gIaU4+P|@kc&1i^$n8!S z^?vd5N(ZWdy9I8cFgZWV|4*c$8885Nj`O3zjVJx1Mx<&j$8@9^uL4Hs0}OW|rbTp#Ne2X1H*8)-Sr`sRm% ztkVpAo{v8K@?2TlAX~;@qT~#0-On%#T zlooi8!!-8J$B3T(z4XkWeiv#Hh6^Ov%V;DuoiQuIngB6tVPQKoIQ1!w<4;GED 
z-c}2tQ)%+Rrq}05SMoZ$dW~?T05r*X0<)ACPwGY*!F6Oj{2v%d+gbj4M^AEgN5Av1 zW6zhJZ?vclSnboUcnnK#BS0!af(4EEjXZ!XucNFBsKi+>;02cf^$-#wIX}t#GH?~h z^IC*fqQM2p7DXP>t9~(&KIKd=L`Wa|8qD}l|G>(kbdLWf%}p-vkZUHdW1kC#dPY24 zhFbV5Jw^$Zm#dW$P7O=wUSShLb$Du`#MC6N=>)|QjMT|#^N@pl*=KaLo`1g+qJAp> z(&n+jQxMt}_BF)Z;c-uEHQZ4@6GV@;3*fb)uJ<@y>j~oYs&#M+mB8J`dTb!G-^UGH zwooIBSVUCD1PHza0p^ifq9DUKyGWldPLt5TEz%3l8OK>70j&5!?6$2_OU%6NK-fzSFFo?){L>^O#~ zNVYFrc^G>6IMpfJqs0~|NR)io+?&rkECQx7P+5Ky4G*h>$(lrirBNG@1Q?!A`Ia!h3s?M`24Z-;+;;$oErC>4FrH zlU#zWfDzv=`JrR`-;T-K$mz6RMW5<3#r1_X|Lhz}A`%wh3$S&Jrlk~LgeoQrMvW@T zS2^^&V1%tRFMB>?bhh59cnYH6r6q~Gk0gl!@&j~!hOIYj!so^!qlSJYxxMcIm8 zM?oSr@^VMFhLuFRM9GoPd&h(--TecPT&)#EY-F98V2Cg$B34+f{cP^ql= zjfAP?vbY7TN&^xbVY`)21|Mvo)kbx6bFRgo3e9;uumxWrOdaZpnE6@jJQlAZ@m$$YHSS1&B!i3UunPPDSjJ($K z%X>Sxzsw9~8V$v#`qLorg8@!7$c{3Ff^4iC;In46Fv))>OHU_$c5J4^F#K?ChEF$` z7dO2AoJBuuegN3c(hPQfyS=NHilVMBhV<_*v?O`)<_`3#SC_f>En4&oh?SJgd28r# zjx7T^9M0C@AQX8~pRe3_rg6MX8>Ew4<9B>&e^sW0KQ15#{0m)9h!}C7xu4y!M1Lc= z4s~1^gWinlmAq*z0ScowU;kEWlH@!TcpM{X^=+&OX?4*?fAGolPLm*uViK3(@}O#t z3>N;MnFHTe6iGHfC)$2SmrV*3py)w!U%@|sckHZhtHQ0V8jtC505D_nXHFLnIq9!n zOF~ju7wKaoV&J76WrnLHlv1-K^b@Nl=nhyO&^bqh4FBx)H-{=$DJktC!crDk$m8`? zz$QFB^{=G$DCJ-RN_5J!2#n|-udu;R*$=wwP{MAf2aooHkN3aF|MLkR>LNOLh7wK7 zX8DWehNtt$3L_ z3WqOp9R+p;iGaCLG7mpV0--%wD(dCk7x|f@&g)ggF^80+MS*W6X(yK*#Ao7nkx_T$ zmQwyKn%|xe+K%e+jWk8UFk$lzp3mL@PD43}7nytrl)M@{sOP5Md6G>WhYVxdL8m6y zWq()uWiP)hmK0Nf1+gW$NLvrefv4=7*1C}9F6VHO(>b^_ZzPpP-4|w;UpJ5OeK$Xu zKyxuqAp?K-3G!dbH;58X>^nD96%=8AxI3(a_O883=2eIiB<73l0{>mw2Ua>HrH+@8 zIC3~|@Ci8uv4$jY@FU(RH0fqYIeXeZVBu4|HS$iI``K?45bq51})&|pt2$2%DO;oi{qocf!-Qh*o|{&_ZqR05;7aS zlGtd%Z^;mL)MyqJm$A(ql;1uY!y!Vd<_^d&T-O{d>hyMF(4);V!V=UL&m~kmALZ-; z4wbx+lc*mPoA!dt>wvlB3DPHdP(UKa(l+;-yxsd;(k|~qkzp$)3!~jxPKV-^v{XSr zoMB*@x?f5eP#4xfJwSs&i#M-*@Vma2=r_<#uuB==e8NV@>yDyQrOmxx4co{UluhUR zFBX7p0f$;PF;vBH{oG*rT>e=>*g>=38n2=5QK4+G6FEcrFV9CI3&2&i8jD->krsF? zKbgRi?RY#K)*w%R{_Ig?n5ir?9WR!%C$MibDB2G0&+$TOOfc@c7$fh4oJpm@$~{+q>a6&X9o!LVG1JXgDXowAlEullCkk{vFFxY;+-4WFv~X^+-P)+giy%(- zk);r6rr{=!or~70)e_**E7hx%w&tB4V#-undu(=bW15}eN5Y7w=yp1jI`Zw8hvfEo zxlgeJu6KIIuBDmc^L6I)nJTxHFWjJ5AR7)SWV|JSi$DGTeP4iSk-kOw$Y9rBF@NVS zN-;Eqai|0&FRP0i!8=p^L51R;==H*+IvvSSMZa#%m#RSre*NAx{6)P__53|rhXDr! z7M6#x#!eZRhM{hw>3pw8OLPrs-Q1nl(?3f3C~=U;zb)3HBc^xXxh&u9_VOL+!bL4r zkC@WL!G<609-XC9Y*pix^3|2l4e|txNa9j5+qnBskvk++v_G?^rMuPhBH3NZEA zU@gI!ExztPo*mAa30$5I6g!>qy*hVP5>HLZpT` zD%{hq-1t`HN-zmL1biHd3Q@ZP$Aj|F-b>Cv7LWAA8UT~3Lrb{hW03Mju2}l>;#|)mhWs1S? zb$Ev)H&L_|vyON`v{nSi_&_t3yhc9jXZt_L*9yR=98*4LExKUy$LTO7)8lCQ!KPe6 z*Z%GQ3_Kf^hMxQ+EQ=O+F>at`OH7QlJ7X-|O9Hi@J^okvc~I%U<1?ZFd-}u47TCVV zY)1KKT?F|&Qu^^*67Qm55?-s^92<|!Q8b7TGQ_V#sa{h34)T!m!*8A5PW){(n6B@$ z62{5>m<4IX{&H1XMbL>FgA7c5;T_6@qMkTWs{f4XsaW4Sms-Ghss3*kM6FU4DfNPh zU<0)>YtzQb`tRGCZ*(7Gq^HE9e?ywj+ir!!<@3&}rkJyN7ki z&@VWRZ-Jor^ou%Klm0v7$_;~ss36VEzwBoYp-TmLWP`Eq_h#!WpSFb_K(F47M$0rD zlM2}2d__kWab8bo!dm&&0(iPK0ki;pv5}?IB`>VYgCN*@=L+{@0ZPWv(~cysjtl+N z16CkI{CCz!B@#G!)}swDS6fdwf{;Cdv9xlmD)|d~;24|9vdDy3l%Naun2qy3@-fYH zo5RlKOAqY)CNbx2zyp*S;v+BUX1Fv8cfdo)mqurK|Bv(%S{yMQm==|W!PxWqgio|vX$oj4|N)R^4d#nFMUPAIv7Y;%4WF3+NsExVUV9#l7G4h}Dzg z*U&Ig4nCXMn}Iiw4raZJX){Jsg9rs}qhjEMchxaE4mjYR1V+CvF9~}T8ed5l59A-? 
zM*h1{=kcv$c3X)6*+t;h0?-)>5+B65e-wWpLCE^&A>qdVe=w=2h0*9L`=iOC{$WoL zKCl$p_eqh_qvIsPaxcKt-29MfPn9&x!Mg|gUPBFFb?yzaHuj4R=0PEdEg10*5?>?Z z|L_vnQuF0-Hjv!}knWYr??Tp?xy2sv6> z6qj0u+>;AOlLEWOi2{NAZkcf2+~^-j0(|ayseHg_z}L|tRnQ(nMoW91NkfbJO(jhh zIwggI_LB3=4T`%ctk;ec5~yW2t@!B_Kmryrd;4pB*9ab*Tc}Dj23tqfZ1g-P?s;j` zj0cw?t88Aset>j{L*qbjV~6e=bY^f6?K&+~<~3gShqC71X`(T?sj~bpT2UgmxhLC6 zm?4e*x3!s+)fb?&ax2rb>A4cdT0}(I{DAh=6{rcY{kXze?|XB_M@qX zW$2e?)UC4bd?@nMKMbMZXHwev2EGb~!6uM_^kdzr5z6$n%c(}PE!FZ?g+7+n*kPQ) zuR;|<{sw2H_M-(XQL7RtO!I(!%R?HCml}gnY#AgA!h6Ai;JOY^VrQ``UaJ4;a`b+j z?aOHV^e+dmlW(~2GrajptsJLcXz&SM$RL2oBEOd;kHa-WWe)VR1FiLzpg)d~z(C70yfG9ai9y+qquSxlUVF9}4h&1kQzgR?+w#?STo zSj?f(#EN#)HTm+0alxu@)WZC4P6d8EToZ4&d8I0Lm<>waKK~byqllWMGCk0A(7C1M zqiJrGd z%k<+(Kc5v4Q>Ej*X#qsD;z9sOg7Eu)A(;3O&|9qPG+dq#@#j*ZQ_JPrGK6$odMvcw zEhsdUWDoo;BZkqAy~|_$2u@XcyeN0YzS@t9kiR||eD+j$rofSpRDAobN}FrMyw8TX zf=T!#7E4mC^R~z?)DV^tj%AT6;mR-DP4lZ*x?LWQid(pnB51=>s7!IdbIm74O;XrM zAVlL6mXgc&JB=x3Gl|yh@n^O9D&wGB85wknz^41v=?-8ZkwxrGN;HKitrar1 zx6vm!{HZ?JWc+-=uYXI&M#DzM?)EaDitdE?_Mq53&^4u3pgFb+k9(335JyShap+bj zhM8LOtk-hU_V* zN}p(TSazAncC=mtJdBQ3p)dC5-_Ps-%hr{Gk;kJ}I;}4wPheX1a(eS$iz#38r7)gx z*ms~}xr8Rzea3BXZm;Gq5StiZeSp76{=dLqfWlVEdG4Et>kjs=3v-L#oKA|MYX#s< zBE#Rk7oS_xQHRwcK1r84|L!bQjT?V1kQmRT+7=S@{;6_ggI2+_;1o-sx%6pNz%N4D z39}^QGwJUOJYr{3A*%t`!4fBm0zzr4AHshton_L1eAzK9jX5{|!*ErwPYe)KyrHCw zZAxqt$$sY0RIq|Uhn>m0?Jq*xM}Qrv^QW`U&q7dzD8d8?#N^2m3T=d~KI@JdCM`j} zxlasPHj7T3x%2hj3O#84wk=@81M^FixL&2p_79u)NtLeHJQY$-;a1u9skR(Tk#=&C zv9_`F2p$On`T!w6@0~w`0VVlKw~aUCCuCco#&SyD8nJ{6YY}(pQ3bE+TFO==<3J@% zbRLjI!~Yi~(H$mSJq3jK-B@6BOD=8YpC!6AhunvZWy{69(Cda2o$E~9(*~MOE~*)H zXtr0H^WvB-i2bE-0>Y57;h*evrJXKLotkF;6^48;{`^nU+W|xb^h8|Rio+T!Vwvk7 zvR=xeXVWU5R0G&tKZfiib?5pIKQR#%^`vuar@erT9sj(yvrdpp>bMl4ND?=2;D4Ak zYTAXypPc}(yk3<$7SBUtC`WGUD+>Xe-s}8%w&xi%dxo|4zb#@(MZ_mw8q^s{?9DVn zv(dxL^Bc!r7{u!3XxW{Wlv)-rZHt9uVBN!N>q0DT&Zq}d1js0aePBkHIiWGR%{;@w zsuqX3=_5`;iv^|uo31m5>Qp1*klpEDd5fo9WV;&WT`4mm{9L@sg zC{tP295*YakgkAPK=<#4h>#zWN_n1i;f{VC2~EOgR)Xz>rcAWNP$~dyOW+W2iGDS7 zfPWyFIU{7Rxa~SBshY;g(YcgTb$&1(Kao*Jz-FqHF7n2qcsll)d}e}E{~Ol%L1k&X z>{;V*+l1~4>fN1w`Ojh#)CTsk!Q0(rd<^AnsTcpd^VrZxVr6${C%W8KsH{QrxkJrtRrF(K<%`1?w)d8Cd%9r=fiN0AC8o-S?-q~&QLx)@f zTo#yiKK8kz8gA%R+9I7e!8CHKdJc~J{ial2L0Im({1$8TI2Z?!EkQbGh6Ojh(V3?W zW|7X8tG;Ar5vZDnHGtY01J?Hc2en1-_f1$@ znRGnwIk|MRmAY*?8Kc)dQ zM&TI+O5jJST>c=PpVq;n3MB?Pg6*5x(Mu_JTO2Th29G$D-G|hf_$CAl@X>aJ8;BcF zOKAQI3=pq$83#|}3q z6|0bQfHi==_g$-0y5OKD+_eYJ!1l(RIfj2MU*(oXYJnh5%znTY6WLuTUo4WquFb1c zY0ZGPL-(A|{FTD*w16Dw+Y=iA{<>s&^ev0x@jn#658%JXX$!1s0n5RlM>2c#9T7wT z=)fH^CX~n`XMeVKEj);lKJ>kw{{4gT*=IMUu-^*9#!J>{0K=0El`OtNA>Q; zCy@_3R&9@_!8=S9H1T3pZx)+ta146!Dw%cYm<2Ev=UksDJ@+2L+hcJaAV2kuiRt7) z&5+|cutCrsFDF;NPN#8M^KxZ&9L}Z0mj)8KV!-tz@OkrpS7|4J<8rXFm@ClKbiA!W zs0uR&U|?wny3-^IRDhN)*`#mK!DO142kqrFLBqIyA@zgFT8CnAek)$ZWY%R(9Z^gI zZE6xhYRFE=c&rAAK=x230V0I!M2AZxXOfhACGxzLm@vZi z=YafosBK3eBR4lf)c?zf+dTs&2C*(cH>CBY3n#%Pn6ns6qsgUGK|ok`lr=0!s`Cyi z4PZEU#E6uf8hgDK#u<`#hENU;1XB}MX&a=*et}Yi5B|>K{A^ zykfu{tei-+Qf+55;VgTM_YewwXo(U*0xu>Gu+*|N7?}Obv-#I1)E}JwfK3*WTM_-+ zKLe%r=ow=sU##X+D^WsMtU%0~zu!3F8l@ zf1V)!Z$J!IEoWld(Z%VH&yorYP}=l1!Kg`2?xjmKV?wn}A+X+P)~_Rij#oYjbUkr= zZyowQM%BbT99kgbt3Ao_-0lxDmxF4kBWd;G(bS~l`^t7DC&lhHoJGi|3EBph|G+H& zfhHNEBkS(_(uMWMD;#27_{CDGSqaT_$yd7gy05BjJUDz}CK?3_)s|?F-LQ;8R@f1y z_r_RiVyQ+H`ju{JMzPzQqS=1(FJN~OZEK$#P1fpvUruI2W28MN6^Kct=)|sf=PFb0 z_XxTtu1fizt3?h=_Qm^aUjW0*e9`-fq%{L6fB0F64P@UYJAi z-CJ&eaCbrizCQp!p`|1eA9pD3jZ(R10De%jnL_>b`1bQu)7}8=JMtvF8ke7c#^{GE)#oWco0=)W$xU5G;S1TF{tVek zpEB=;>;lNDiX-q#{1z0Xk7F5(|+G6jv;M@`bgn{L<5p@sVV~iIPT! 
z!Y=?Q*kKT?AMjUtiTF~U13&ZhBN?v7R*D@2j~{>tE`7A2SnRW{1i))^SWRufR6n+9 z|3zxo{o$w_7qq_Ktc>foAbaWK2=+-J9%+SK4t|P}F2t4Jye>)rrh=75k@g8@j4Ki0Z2xH@X7P$dI6ORoQ#l#((zCZ`C?VX2k#3n7lRGz8-o=C0 z&4>8Uj;7;uPg`Lb)zMF3x2C#;Xo{y5JYvyF)Xf2_{3OGgL%ew`m{2W*up@k&Q|A?i z;`4mKyp&Im6ql?6OlK0lS88~ef6U3oaqDWoV#Ekd9)6a9J3X(i}@u1UbIy^9%y}% zULNWL&`AGX7#f7ZVtv;8NRY3dIo%z(k=3BGVaoWZ%ASAR75>*?3atnF?*3kuwii|O zR?Gra=?m>FA35$$^6OpG@K>UBrR;_cp8#elO*FzGbNBKnzm1wCL@mKr`=^ueLTW|D zh^+-0RJ8Osu=20KcoVNh#!D7j{!>tEWQ9sV zw?jt|-x&}m>s%koS|J+_%~=S$Apss(>T`|6xj6uGMZZ_aY(pT&Ip|pr5KI$xZpHN^ znQDGa$+SP$7cy0xV|U#iF;*bL z2zoqT-*z-NAPIa4(E461j@VY_;6n^rOH4ZJu?D37jP{maC{j5z1w{P`*Nhr7`TQ%s z0Z2*s6S~bE7QIvzZ7rnmoy36O%HacZ8zU0Q(kV7NOH$GAzIlOS(_)0kBHWt&jW94cFK~Y#SVzHi-EKr@3j~ zo~aeap6Vwtbg@+89x!~5Xbec#9hc;VTDA%R7Lf6uYv{lKz$2w1g*#JC@F*&5HGlV0 zY-ElfKnzuU%c+;(5|X$*nmwGz=J>kO0*e`-76K@CC2A5JK6Z`%KvUv0Bij)5UUpcG z2qq%-U+Mo57zi1PM`;?g!S8i8$j8H4yHdb4YS!I!d`3o5Xq5GGH2V!%yL>g2e0xe+iYJo?vF2bX*D?931UCkj_IAdd-n56?Tjl31+w1>AANCOe z=tF>WYKKja=0;0cG8JOMG`m9k9se}OR!)0TVY3ATiN86qB!f}sdxNyV+{j^8&k5su zmQLXh2oI$ePx&rhDezs+kL`@i12@CgCuq!7qSCQmmf)U2d_uqbU4zWZ-Gx4FJ}~9t zjpFgwvH6)nIZRlWr_5RI-GaYTz0=0Gbq*b}5VFLHH{9PT7kB9*Ui2l|(rl2B(*3Of z9tI7lRY5td0s26PX|>sRwTFgO@(iDKkf6R_w~F_aOq~o6VB?f+1d@DGAjV=p$B$fD z^(G$)<{ULJvyuhBAlClrG-r+y8f32H!RUTc6E3$}aW>F`-IaG}{Zf+`_!c#68;4Hf^7 zs&vKFk6s}OEIJlzq65?Ru{>v@0)U{SU>D_uymsHsm9^Ca^q2gmO}(EEJZYk?;W{n1 zjJW$J44qL#86s|bueK^pzvW!x|1nhwo;&WpS04)CeHs^urI-6Ny0b9xgR@R^C6lW9 z%l%TfO2&jQBXi!?1Jw0^i-x=t{(EFbe7{}fF{V=`{;b158v+qdSKi-;Bt`%z4g8{y zhqv>T&>&3Q$&0#8$tYOJ2@*JcZ|+VxKTj2aDkJ|$IrZzfS{}@c>tXp(On~}&x`uQT z1Z--IRwB{pTaLy>;a61M{wX68cVd>Ax!U$4XKy|mooV6t%)j~%MYNCNUldVj7fvN^ z`k_|j3b*)T@^HAcaFIlb6LwG)Vs5ns9m$=5PjW-2U+0l7(tP)gc)(#0)~903OcZp@ z()tS^brF*jedEv-9058I1?NP*z`EH|!=}KYbv~Ck;o=x^F?3LH()G+W_^4=-p-Ft> zq5Z)1(Rm9{HQ%dgBSzHT8ixi!;qoNiVBD-hL$QIa`H|c{{nK|g%-8=g)Unfz_=*zh%dl!>~bdF$JSc6`4?n$U5>5B z6A)Iv+Bn>wrc6(JY*|^` z6PA)^ce+#-mk7@)+i`O1TmQ2Gv_Ij?UMpC+=kuww6kD+JOj_#j@W`BaKo&Eq$lP9!8t_yfYA{E(KE0$qwWo6_EG6zg#z4m_ONXOllN8@%i0sf9mLd z@Y8$0PpFcwazkh>_u1-@_defm;|ZR{i=^RiCRitPs%!68)Y3d=hupdx%b>^UE9aFH z?dpQpOC%MhLnlrh3QUSIe!eH+Qn3gQ)nVylck__GUzaC9t8I$64-(1~k6jr?2$N{* zg9&;wnrQEs{Whrl{Vba23CV9A3pWH>EVvacg_{5S%5`VAD2<}eDdl>%F=I|CnGfFX z#QR~F`D2jF{LMyQPV&+oofI&@p2FN5?wYXkp=k=TxQuVxA32zCG8l(-(4zX`zWeo* zV^3Ki7Swc_dBR$KMO9S#>e zFSSce+FH{}Y*t>27Uu>b2tH(=`gir2Tj{=ZQ0U}Tz@%DWj2UHo6qO%yH>H0zO35907J!H_^Kb6icc1@NMDB2ri!=)~ zEqZ>~u@W1%?$A1K` zrbI_}dO65fa1u!jJc2v_fW;zoQ*-?T#p5zk?#hza)H-+d#nePx_ufy;e+07hi+&k!=))Qhuq`YNmnyFZSL7D(bge8y*}&#UYdsi9rcLQb0OI2@w!c z5J{z_LApUg>F!WKP`bM$2auL-hVEv__a6NFf1dMv&pGe;)_T{wzHhCwX5kt}X6E<1 zWAAHU``Y_HIeMg0_%tu zv#hEeZ^O)yViUHOq|MAwVzwF{g7*Nzt#zyF=`F6NCvk9=ozwZi@2ixIn^uZ?SUP84 z4!$<|^GTs)E3|JtqFJxmX=T79i*sMSPh_lYOT1EY7eg;;Nh_k1t%I8yCCU-`0gD}r z*?Dd*3iiR2@p#QOxvyYd;ZZL7V=yE4Tnx8xd%s&7W!^p#iC%)WaPKydJiOfP2(G~# z?wlT_>7p<(yjgOMi`rqad)g)~lHqmZsp(^Poen$AGOe0%j&yIb-JbHDeLXKX$`>3p z+?n^XUgdWzd*a8&Wm`+EmQ)_$$qC<=dCkVI%6GdzN3kW#u3x=mqxV;0)PYEZwV_d` z6rK!;m}Lf5AbOPuQ|IB6$A@I?c!5PE7M!(5Ej8c}9*=UX6E-HGjnerjgnog5mK6P; z?Os8g9>}qK*J&39=$DeTyf;_Lf}U8-b{H8)GG2R#3q5*WWJ)JPB5bxV!QJi_(=}an z)XJ6cp_9N8Bn5;8zJ(mwR@Mu+9_nnx1TP6RcdY9y^EQa&KetYWzKW}%Km5Sh^dzsvT= z7MPTV?tX+|8>_1NAhVHuLCnv`#ckd3+g79wn2gz7Q8D94FOfSW*Adcj)9S9r+Y4? 
zC}PXwqC27-b*L7-VQXV1g=~ihqf|7+8@1;(W7>KyKU7XfC(p%3N?6vr=Aej0sEbyP zu&H7t>A#Aa1zpL_kbrptbLmgde^ClkFFbs<+7{Dp$@ zYb{R2LXE0M56A_K!S1%j=&9`3W zvbmE8a2eY?g;C3h^h8$e{gKK;=h57^owQQH=8Br--NEB&ewF*EkTteLR^&Kv>N{lf% z#;m^9KdIYjI#B{=cyaUzg4lF5;Xd}HDdP6zBHWx<(UV8kPP$|o*E*B(t@%$zR3(8c zpqOsgP<_YcG=f(#3RY^W{N$NRX89}exkG%=k*f?7v9-H1ElDpmFmi*|NT?SnIq$EN zP>P9ojlas|k8#8*;|yMxy#h=IoxbobGdMR$Xcsh65upFLA4AmBhd|%;+P|!a6WW?f zn26yKIx;pxs!+`aN(tVHQc_3-ij(-^#F`-{>l7zk0r+srkCh zr)1IH@BlHqprM8R$hsu++Z5k5XqsM(nEbfIZm_xy*=kq!C~gYkNokgv zs?*32qxMJ5kXtQ!L%Q|07A0x89WAyJW|6(&kWu1cgWKdmPaC+w^c*HAp zVB7q$(r#53H!As9z60mb88WZjzIrFn5w~4s(yUES zp+}yhIqRsPdiXhZW+An)Hnz>%rl=$u0@})NPlGAsMc^*LxMd50T}&g3e)@Z!J&?vi z-9c??Wb&@s?fRGqKCbN^yW>1y_c?uma|rp02N5gi9Q`CrBh{Gz1y-?m`tBAn8l{w}w~S7IqiDial|2UNO#Gh5^6JvFX1&#Wb=g>u92 zTxBN7qAviNjbYtELJqyvi2{ecd-jgw{DqWFu9$msrC-_Djnmar9g3HG^A#m4J$|ma zRyM6FjMpuD>E0vU9Lak_LsmL@MitR!^8JqJwlem25ktMYE?1kxaX(U~5~tIRTwSl> zyGH_j^8AOv4Zy!>6lk4U*+}s0r>SkX$#`T7Y^ob$^$vHu*3ee6vtR8Y^B5kZ6tKQ0 zt@A(;TOju32k)2ylocl6(;}{f&Mqx91U(JTDsJJViz>l;ARPt=oi#d?DG0%Cy&PJI zcJi@9oN$=gbK~TC@yFfF+H5Txq45VSla74I(|kerkuQ!|0ST`xB?P@Xh@B;Z;{|X4 zj5KPHpJ4(KMK~M2j#AZ3D&`KhfVc1dU!4;)dv#~I8D|gxDj}?g$uc2BwK@yJ!Fp`_ZjoYuMgO=bQV+=!?__IN|3}alea>nU28_EVthwv7C{vML1QC1u|Wt)mAaP_Ush^ zZ&^Ih^>?wtYRO$%eUoHdc6lnMr?uNtrw0fI$IMW_12On87O4@f5X%^Yqz9)Zi6hu?v1%Y`=f9*x;$X%msw&M8^b1Z@Zkhs-*ML z8yJKQ|J4RIHM)eDbUw(=dQ~&F4&&9DqOKHt;I{b#t7~}frsLkadI2-d^P_<{qmI=% z*zB(yJVGS*vL4`;04R%D3;RtVnPO&7+Z+x!im;ZvN9b%FCfwE*=V91hfme zCICMXYKv9|iJVDsI~O22qEF?PqvzZ(*jC6zecYcwoXv7n)b(ac?t7Nx`IzK%VZ@Ph z85oSD?(1r7?EL%$a1Fk5kUWoBBwNj4hsWF0^2hB;B$nXZU%k7tj=ER|n08l1HI*{n zjGTbO`6Bm14i4^JqNO3cS+&_jsu|63m`_>eR`0gh&Sf$%*2(Lwc(As&()QHsWnWi4 z@n~xgC+|9x$*Fe$W)vp|4mV&2FhhKod+Xqe_9HtZjx)P4~nt}tMIn1X-DVHep zh3O5_5a?}&d-Vbj@u>nll>Jd?r)QJC2^*M7E%2*kEI$Mj!EeTN}yzOdjm2H?Y;uZtA~zL^HL`ZW{y6>M2B@Bj8PaKtS00YkGJ0!q9dD`h#U%L&Xr zCF6KsM)FBN1Z~4v$iys7O%A1S1*;LSXlZf7?A$Sz!0>f|w=sVs3UKFXEc`i@|A3L>btHy#;$ipZ&u5{paTGzxB01 zc?bjF%DC9@Ki?XkEqPCL8^o_FM6_$i8?~e-zcQ8C?3lJ!z>a4)%v4B`KpJ`ChQ5VH z%;1ZhPy=-u>Jx0yz3Q7{x*S1qYp%4kUyxmLi0H%sI6Sui zuO3TyX1X1F<^}LM)bFrr!GWwTmI%72)>xj&(7sMyMel2r8k@x@Z-=jfxHTJh0<6|- zn8?RmQ3xnoo)qg8*bp3cIbPEy7Z7cy#S}D(b!HNyIq^G@O=S@{CzX!0Z?7`mr*;9AeE{)&T(z6uwF^Em~GDg zt|EAlY?ARZQz>i3LcxC2?Nnam^ac1|ddCe;xt!|h$RUgDEYj<=l?x}MD}(JHpv}@d zBrLiEtM*t2X=AQ@LL_C91*#fKO2L8#-J<2k>y_Gm3k{?J3+xa>QjvlnyTgL5K6#NQ z7y8E%-MKk_G$*ay;Vye`)tykk-mnCU2((g;6>e(2D1y}~H#Vf@CWHfZN(;}VllJMm z2Sx+e3hr%Z2HDXzVPoI>>|e1sy_}o1J*=M;0gxH%?Lgd{;$FTYQE_aH+{sv6pQ)M- z7_X9VH>W)m*T=>`WSwh>f~DTeNr4YO#7}LS!!!3+L-Y8!!Xulk4B6o!H z0a0M!A7lu2{X)D|;QRG4zLNX9l+V&huWHGh6Y#e$I@X~GN=-y5x;rcC#xl4Ad1o$z z#%(cVTk(G^;V%h$4`k_tgUqc5?}+kkuGR)Fz@so`^IlAFLqZ*Q7sY}p^i)%6JdjeB z5hz^`B!}+DU3xJ~+VX-LH5IY#I(H~Cr*gfmH>H~gSn(yvfp_Bghs^3Ht-{+)QSjkk zNo(!7coDNlSItcxABPa)U|!=A%~*A@X7WJcIMn&f_72KDnfQ_JRa6$Pc#X?V^XFsy zMkx2p(LykfbiTVg_U2Agg82*$%gd*9uP%j7S7DBW6FK;dTdo?=ZZ2?C&@*zE6&15y zQX6><+%BE;Gd`Ner7`E-9!U<}HX@b$ZR<{{fycD_XrsII2UZcdA#*4N5J~4J`T}8$ zIS+PIHmQkh>xh)>MHA2sJYQLJ%>Q8Jg-Hcs9KN`x_Elx8c%Ga_-5A)mBS~(dRSq z6voDI84t1V@0o-AMFBW4ZB9?S@2hO@S6OloWc1Y=v@Jxevm74CYxmz$T?fCcyGFeP zi|+j*L5#4ogYgy^u`LR0eb++0pZB<^Adzlx)*AaENVsK;T0~=7b`L2=yaZ(N9uYgL z8y!E0${Uci1W}ApEGqx-Zl{)nZ@Hag>DFTz9OrET*E?iawp;An*x&5-J z$XLJBbpI8JZoe*_VVG3*XX!2YkRMH(Z7QBjDSI z0Wj;q$A9oof4ryw2wP;^j{m{eph;w-QI>iG;3$qvmJ~gZER02ST~|U&vw9+H#Dc5n zytZBCg5x!l)RL~#8tFP6ngM9SV?0C+L`_8>$8APUbbTmoy03Iu%!>zcp+cMwe_aW+ zenSI%O7zQcmdz~9+L|-!A2IlDaYVo)^Knc_&aO3}9X>UfRfp7XP|@J5osutIU}6^_ z|L@!?2+@qB0RFiB8-IlG(Wn?DJqi}IT~~|YxBecm*cw8k3-Xcty?lVxEoMi@=kP*1 z;9I|_o5;qd=|@S^-3DUDLogsS}!BknH=0=AJDQ0zsi 
z1>WPcXEtT%G_4IJHG4(5I5EtrmFuV^Ve6+}iw4 zoT1kH&1mwEWn7cB{MCBRv0Qnct1Dl=Jp9{A_W)*XLC;wG-JfxHIgEYd%?CDgMorZ{ zgAQLy6I~XyER`D_AtnKb;awM)TjWWW%Y9w^j+s2kc|T^&5zQOC>(3x;_co%~Oi?g- z8qF@I*Okcf7x8Mo;oP`*q`YI24?dZ2+>oAa4$vM<-X!k-{*q~b&kpay zY<`t--5D5*dkOs3q16XucLyiTWme0S(8zt<>I}sEx<4>xQ|pI= zXgKc?^PNmxDQIa`u9fUl?A0vDeG@jpRuS6)MrA7$U$Y5A1<%#lh;m-P)Z0gTgG#NhN{Lm`C))! z{+G8nQZA-vmvBQR*ZKHO$2(U1FST}wsN2;D9<{#bTR6d;BgKI~QRg6nNY%8sA^RNn zLc@;t!t{!b^qp>Pm=|N%rgtT=$sXKdX*t_}Z#eo*TTr~}UFJ)ztt$Pw0?u=48gEz_ zXP2NNKY)z_EQ`^fSp#rGMb1yGMi1OlL|CP40-)F7znU9A-nbH~9ThwCkI0EU{H~M> z&4Qa&AuYqD{Xua zHQq9D%yse{;X9B$cL#cEWBjX)@1t$imiNhAgtE?gVQ08!kciy!1K-*37HYQ>y6q`w-5q9&+#tene3Px8rE8ZN-@KBLqSdn3$H#MIg{eLrD zuv?yy+$Y3848gdj=ClLoAG@!Zugt*KS#{bX3`!l63WJlOJ_)N(h*&bRW1F&pB88CXMUvq*FIbe?F~K8jE-!0gtGM&GdoQkL+{oD;t+ zjXm%F`;_iCer6%Vj`kx;@Y<)U^x^{)HSENcUS{8^JzkmPc)O}mG8>8gGurzHk)7`J zo$`~c!nvIQc)LmIIDLvlz`dJ^k$bo~1X0xG%rBW*9RCj6*3eKJ^-oZHJp_1w11Xzd z!GJ*X5Qeb+18NuGy8&Xi{Wrn)J5SHRxAQwYqZUfH@AjMCbpvPTx)>>|S$K^HeU+bd z;c)1f@DP$9*K)Ps{O)(V_R{%-l*g0j3nID*Cbsr?zQVK|j1<|9f)XR8+{iV4TW*KI z2r3~*X>CqV+V$-lDDB`W`Jao8UBc&2CtepyP`Mg=J8hmF_of*s+k*|9`kgGHq;wc+ zcCf#e7B*a2{nfqQ%RAS9-l$_Q$Yh9`EoKm$IN9pr1HYFI^4k5FwWSeEn`tT!9kOEr zgjed9w&%AaEKe+25cw@sOknd&} z#IRP>>C!;850_cb!wCng0H-306s`yDE4E9$zpA*SOgeh0BN*#c4wus7bbrSF+Ky3a zPVrJ%FRkpsaXhKk*Rr%uk>r7n3&e@x76UK!D%O=OxT<&c^TBN`mGO#RD^XX&o)PDm zRS%jvHl?ylNpagndZ?@4R5MFfODA0{o5}Qt=2F!o(TT=Q6=e( zN3MawuC`<7fgs?nCm~N=krqo;&ZPJxkM)F5Wl#Ce)%cLzG{~V@dGpty+?)?*_eT_! zay94*7g*Rm+4vn*5Ez}lLbjtSu`w(*XZr^NKa~VnI9xgQx9s)LCz!1clhs@sSUBDu zjQ`}2d+>I&t;&3H%H^y2(bGg%i;pISoFaaXRZhc397~(xI<;FkZNm$R8$}dv^eFa+ zjZ!J&xm|nwdP~$D*-&2lu`)40-_9Pf|^c-t#ZzzniB?rjcK?_xT2XxVq#jYw5Y?fknUI z`FxyfdkQC2NbRIe=E+%>mCmk)c1rBf_nKO*vqH=HrSvvm!pb&JeBqk{>%wTh`?brz zP~U1J{HpkabS3Ce{oiHKH!(u*TQ0AaPDUzgI=@uPeoaRXgLp|OT#96qFpsfbX~3p1 zg-o$-iBe*cd#$p4n*3&@rMHyUi$LmKXNfp5PgthqHMeDYgsg;EZ$CTanAF>hQb5HM^pBFS%NHUND2(%L@b0bvu`ZPVK=YtS zl6)@D^KSGn#)&XPe4#te$NRS5;g?u)YWMH0JZgM*JL-Jzmf^4O)|(2egK(F_Tz|@H zpDfK%In$~+?%@Ve?5OG>4_^?LJm=dgOOX{iO%dEaWH!`yOAj*dKsM<1{J^s8?`c6W zrk@WO%73O7PKqOjNd%3(6b5ZGsNzsXyIiU)FS2jd3gfR%XqS|h?RJqO_Vr#yRc=&I zj!Ek2lD4=BH~J=cw;petE9R81OORD|)#M*MYV&U!DK*ie2^Wnb_gIWmoX`gLlBmipon@j0k#;o?=s_n4JQ&Q8 zQ*SD|-2zo0sH@dqcMm`&U=A993E7IgRHNL*tgrss^?e6KbIS+kI#Q8HXsgm4=GIqM z?*j|RYP=2kp)lhS$p8LPHD30UbSz!7LW}v?9qK73)b`x=$6?zFASKzyw+SX0s=I$S z!>b4-x5$Cb`0zrjsPQbIZO|DI&ZGhgQ*7AWT|o-B4STyZxO=EW$80bsp$)2h*?-cy zIKS8fsoNfj2$7^>d|CR9Uh&K>AEH(f{`28BJIf>nvV7~v2-zuMS{kh1i_}Jj_jQ_M zTfLf_vDJ@%r(7yJYJt#IZcR)%XnSY$Wtn&9V~b9k{laCfhP-Kk_Qz3b^ppZNmL3L+ z+8TCeeA~N<@*|FmV>@i&4R|8Ymx;Q)=l#x&WHgHd7eOw3Y}x+TgF%_vGiZDpeNoli zS8^nFnEqsR32I6lkF(H+E!aEY)|_7(iOBJ$CqA-2%vDzpFSh{tIj3SDyR!K461SP3 z_Y>!7piDz7rJsz9G&z!$t}k3fKgLpcQPJqP)t3%g4{` zSew`Ey2fo05w7Kpoo6jm2|G{YlyY58e_VU9ZT7j-;dss3a;eERwA?B&gj=tBflNP< z4oCLzF`e?)O{D9=cx(1i0< zGzE_Y*b)YwkZ0`4jHa6a)HZGUPlp}|*BM?H+a4>o(p~PKiNNdT0fqk0#Z=}lSmJ3? 
z@dWjeAeG?g^W*1JoTkW4ia8ppO}fwc8_PkK#UV?|L0`1zZNoxIJdhn~grlr=%F@AE z*7=>7Tn?-O5kV4tfV0&}DF1_FyQ$0GLwftCK*kcIxm${h-SMCqW?-jI9-BI}dEfHh zlh=7>2LPiZVTCxXkt|dX-i814W8k)&BiG}uS6QE7$63UF#Hy1lX5ZW(`Pyj+vYBf} zVa%#EPCWH+fp%!=rz_?paZ|;AfP=>n+kI48XMd7R`(YHsjJh=5V z90wjvFAn0+)1L)BuUx9%G)RQZd?IE!Gt~3%;z`CYbRx(dv0iDlcKI^5OhtB)s@<;pR1SoqLo^(f%xXZZ14onZxGp;l1L%tTe z3PCsqQfXT=z896J?jO{nuJ>cB(ds>SW6^Fo-G0Owc4o$*v&c?9IW4fp41216osNk3 zlonofUeggL&@xY2(rhqVl7Mlea4za@A}x>-jz&&rnDE7eQbQ2){j9b=NU&`U2@5RY zUWvtUm!z~(lXiJrj&JVjY^tPW-#I299T8}!X*SeQEmxgEu9jC2UkS|x0gsh{U3iY}8Z+o!l~8hQRh{(uaiA8pb&whN zZ8E=PSj)veXn#uHMvbliMzja504vlN(bc=MpYQ3hHGF?wSVGk;;^mkA+?d*seeGRE zZj+-&8n24FnT<~KR6Zh|frydQ1kU1Y$fl7G^W4xA+|Fa>I!$oiAC^RCbw?)y;xv9s z+JrB%`Yt+fNEFb?gXGPxX~(9WOx*YdDny0T;bvy#ky&Q6oC@N-*iG>ioR-I9Qz_H3 zpTNdbzl!GeK(Yz&cU5#CjdSRhD}1YpVGA7!k6CpId8mXART9(4Vlbu*b>I7-p95oK zCZzRonDM3N>(@5IMr%ZhlV<=cRm@3+gU3ZZ?9ZC>`j?5fMuuEPAOs|s>_$DL1s_4{ zTz7kX^iabGtKx&5*eBMUO!xY^+K(j_2(0@vbF1z!DKXzTY5wd(X{a^Asn`Ayf6a0o z@`**YgaaD!orIZzid2|8n4D8BkIgvUq%>@zxn7`4ly5i6=NcKk^YKAx@a?t_9Y9r_ ztN!Bv^%!^cJNfY~hJY9;vue~=uXq|DB24HAwNelRxq2r$q0vuw7*_bavPfMA8j#4b64~$4FY5 zjK+OcPv(5ogQW~@oj{D@RyJgxspIM!%@j8s!SVVS|1`0DV+Ym3qf7fSPK)lxKRb}; zty7dftq?0$63=rFKX)?r6VHVEr(=ayYg5zJEo)5VB8S~skb;gyH5m!AV1EokaKW`_ zdl5g`uHCC+Skn#^s9z^M*>H6=DSmSInw8$#!5&yJX&oyRiAc$Da( zjMBop`!~zecUOm1zYqGBylr;{j*MBo;*o~hiAOQ#FmkZU-fGnucS_`4hX!};moc2y zTzS}Tg7)@G1HMR}qbb}F`)(QL&|v-)aDi_83qDmc03Xlkqq zWWkZmBP^X6&@JvkklZ zLUGm@UVTCRe2{5UU9n>Eeq4-K|0$xpY#S3SV&ZVtYOeHC)hlT>~-(L(7@Pv;!5i9wnXFW z&$|6cjW_r>Uc^_*Mvmj`dWe<+(uY4iDUKLmEE zGG_1&mHW7w#m^I$d5f>ub|RcLVlLCg1eGxkI`hh3WltoMjb6aC?)I@7V`0?7aEw1& zz;YUxwsCcFb3uVzQJsbs&ra;aK&v^@953ucrbmMvlMhcU$2HxtRV!*Qk>}mO{P^qNdbphdA2^?^i*?apMkXIkd0G6U zE&Y{H;zZPi{k;;}aMH?uM`|Dx6F*3{BV)g;UQ)7AQhTmJ0%+w7HVM=uQqnG#5Xkr! ze+a^6d(wbbHFjCgk9>4fM+?szC`LxaEQ7CSVQ>DxWqXybxv3Gayel+CU}+)uc>`lX zXKEeHBK-wA}Ku02v~@4aR--P)|Lkl{Kc`PB7Uj6Rw(@iP-heW_NcRpS8Mt-{gBGpv{3oUyccCE`nL#H9 zObZsKS}juU5Pg*Klw+E>3JQyy0iL##xm%3t++y|-y6R~eiyD8}ZcGp~HR5y&*_O*P z>?XTguRxkS7D?PaAYl6U@*wnA7_Xp+mAs; zGX~WubfgR2$yU-Zh-Q2_{Ex7feiDa9l0-R)QGP7pXd`)?VPz3Cb8F0%!KfSefICnD zKAqcrM8Pkv*q@oUL#s>`z9;NLzL($!Yb3KKz^-Y6seY1tRyxDeHgbexShw@wNUVGr z@o=Uj&5r0T)WW?MavB6?Q}D^?<&o3NF&xMk3MkCF7tI`0 zM6+l?!Yt0GHR?6POA6i#I-zS%vhOVm72S>xQOlA6!Dg#+ks_;$&Mu3Vrz0)tg);{- z4E38$N!hvAx||Mwsh@)Kgf}ial%MOxeJ2HLqz-mR}idv_R{m}Rolnew0DbUW;e9lNeF$p&)=0QmG{^=lr*WG0Dw)Ujk9}iVLQj*s1$A zA&;nHKb&>5(Fc}_WfCqU2np?;DB*0MYJvIUKt3@UKlOke3T`sH*Wa{&I*HoZShxp( z1cob7;#^*v*jR9QQF0d4B^)agSC-rfD`T+d99%*p4M+rbDv-W~AC=YP)cu8Xx`hF3 ziA@gjKO*QVweb8)#Zp0(ipUKt!8pSvt>up@8JKJE(H#Hf3O3%(qlC}(G*WnUVhezn z?x#U(!jbHn7Qh?q`Cc;LigWpr^4P!X;r_$-^8M!*^SdezO@#o|xJ`2dpvJw6@|jqmDH6I^ox?!&D&J2g zRuBpVr|q|YW6O>`;2L`eUMs?3iTIaS8@$2T=mH%q3XBPm<>M9C5nklm|51!b06-@A zc|C2m*+a{Oj<6fbVJwjxs`4&?AZUP!CJoA^N0&Qi!%>@G{<4cdz5$0edQu^3U%9Uge_9f)Z_vUs@e^kAe%uu(IDC( zQJj3epRscIw}pE>xbaZSp{2scT2ok5t)SB~ai!CtuMZn}MxR3G45oj@BSF9l+ z-&gyG#>RD=6NVYE+ZUNrCZ~JN>I@9k;5_olz#?c7Ts>N1J;l~E6hl2yCF7x{%HamZ zVLBVcQT!c}g5Xww8+c$e|6*W^*>@j7XBPGr3blG&WH2315>;~L0B1MDfZXoPg9_+j zKRLFK=<|kKM-@?1N$0ui^Y*7Rz%8J?9wpZgWX6Jt$Tx|T&wD^tvrIaFj)uiyDu&3} zx<4HdV>geMS?7S62*wi}1>eqNqu(TWUHZ@<_fSL)5oHkLZ(trUYl>scF}1* zOK(sfDzlg^vmD|rLlX;HQ2W{KF?Vjb>ux3~DNZ=7|2FZOvgEisH>tg!^cQWu@@LfXS$gpN`APZp=&L9H3Ly@2R4Euju4>G61lwcn)g7TY?Hva1a! 
zOxbYfSYDRg|15d^q;)e>@>^lsJ>gc>sG=A5BlNFGN#H=F#38kTieM$qJ)t zBL%!U9V2%QXw(#k7lqBnXt+g){~prKN;D$PmWj3&L!v1kYtdXJ$ics|v}pbUb#rV& ze*uO*Wt+EOAq%6SMF37R{}GjWbEEZgSu@4~Hn;%~em>L9H_$+nT>-VDi0&JMmFVw* z4D`s`E-7CuVhjRI$s8FWU%STYK@UeT5ug=_uv!~`B98jS;W|ubH`phV5 z*&fN_`Ida;7g!xy&@S@~5w$u-%%^SD8@KQe<+>*-+!PHy*-WMh14A!Jo?^CqNe-}0 z@Ga4rnc)Gf_&Zc~-j+*UbWPanJcl0Ot?PW@oTNfvp;_}gd9hA$UEKCz$mfrlnv@n6 zOB=ICVC4^sQa-G8uqegVqC5}=33i7=kXw)SgAe=2kNMPwy6nwZ5}wG^IwU{ZVmet~ z)UP}Y0h<}$Lz zYS>@UpK#J@xSg3o)u?4gdM=F3_hVzZxAc)<=oC8N4tKFuhO>)VCgy-Ji^S)59vE^< z7SjVcBe>u5ky?s%fa7pb(sLnYdhbm`@w68`CPECs;6nFb`I|av$zY0QfQR0>)6`*m zAfMe3r2zlpLvEt~1wXhH)2LYBs6YI1))zuqh#DYLgwZY$?U&G(=E@C3qBpE1g0ql$ z3F3k;EF>)RS$})N0yzL6HZe4vx$JcpvM0VM@aj-wyj*49|FqXAMEu^ zUDUjnaaTR_2HX+)CV$`POKwnMT-fzwW1@O(+e~-u_@6WJ3gF0_BSx!*&Io5fFh3&j zKwfA3t33KYb=Hy>nG}9A=fjXcy4`;nDgZ7lK@meQjVvrN2wio_N{vRV1UCa_KM^xw zKFiVXO6&zHqRLVUB65DL=@e-3uL@UE$!?-`(#32r=@*Un82~Pfg~w8*7vk0@}XH z06t(2%_uPy09NvUHz9u%*}s`kbYGAQ2#*80<^6%7K%>o9R%FcxDz>Lki)gCz(o71# z_5-mMHV`cqbU{`FAU}Z(MK2$z6`)j`1J$^sqH{Ooc*Ov$(#IeW#=Qh$CSdLVf~g9_ zq8wUVy=gLY1FB}kZb>!H$8AgLNv4l)OAYA$St$REmjA3f02aMrn(2c>3j0D>^X2K?|cF0Zk?oK(xga zn25ApDMe37Q0e%4Z2|M2;8WhICP0`6GUsBJ(c)lxknnzgN{vbYvW_^@3dH)_M2aLn z-{F*Jnz#?1$K0W(XfOqX!pW1rXsj39!T!WBQ1E9XR*~OwwBdU02wJ|z5vPH!?T{Hz zzJvgB%QeyX>iv;+{`~g{0&v5h{uP%e_Apr*8ja-|oGWF$x(&Jjg}Os=o-}yFtZ8ut zJy}l5vjRaDs7daQRHA@w4rQHR^jq8_P{pwXU^AGPAE1E?n8x313aI#aL-ui|3oo}9 z9UcXRa>4^j|8#r-LmA~+Xwg^3C2M}hsyMSb&wIbeV;{LDFvKJAL2f<7kV2_7iIuml}+{#&aYiZGCw zzN}hhm!Hh?!}tfN9F;;h(FU_i7q|v$j|@KFiLbBsq8nxhalI&=(A^6Wx( zs{xySuG=i&PgyWb6>?h+oJTV>sw>7Pxxd#y(q-egi`&iBsa-5XRaBZ@-mcr$5x;IOu@H)#q zTqz0u{E7xIQVj!~DGG~5LXz)Mm$z!6V9YN6OQ`sQORAZ=VyQ2^-3a7lOTGA(Qp1el zPg)A{IKf|s1(B64(xlXSoEeTy(NM*3gQ5`Q#N;j{M+^upqYU6K$lr09gyk!VFNcK< znY$tQAcDjP#lry2iF^8dqTac!=KR?$tfTq6TW%C8B6Y~^1}0@Dy=-haS=Ymf-4GIZ z0BAyFcuU$FV7cCpU{5Zjpvz&5FyX-Ddq;7G7N7l_ZD|Y+T5ryp#lj^SQn6n2o4H-9 z-4aMbo8tV#()m>hx2dQm-e6^2G05N(0mCZ(x?2)kRO0oe;A_c#WJ|AR1mQ2R+0_g6 z0;j`v@SelEkOBOoX)%BP=|!fa|fd8(qAoKG)b_gAaa`(7{;W09lM~ zGT6?kWP3kAC7g+MJ#{>1Cohdr%yW>W?Y)g=v_;8V$N-6u;kwD}gJEJJ{#;KvC98q- z1BQ+TbXj9{Ww=1My^MZUXyFz<+;Hnn${jI|ne*k$+!5VR7L)5$JDpf;ifT&p_1I^? 
zhUN2@9awBstX=`;OXfB3Gk~!e1Eo1=Ckn97gajZ;nHo6JaNr!&xJJWO&^tY}PFgG& zO^WeD>~1~;h^|w05kt}xc!!I43E;k;lAfe#(;#hiw7_q*_NXogZ5H0({k6+IbW+Lj zLu4YE^Mef{D zff4&>-U)5qpGZ{R0NsZ|FS<{#t^l|OM&yr|Ervpj&;{-k^I)`qp}NN;h4-S)(isA< zPcmL(16{I0&G5JS;&P2?+F5_(vxL+ocLy>QE-%IIgZtukLwRp(zA{pz9a8MggXmai z`iGYdHa*4nXiBf)b~hc1sC4hXV9@StJ48o_-HNjC1N#(&fDDRf=~Xu!xIKNH-3l}F zP9tFGum!mcx~X%n2)w$J1UtcA4+8sdAZN21fEY4PJ^yOgWU@=Z%fkzgx+lK4d?Zh1 z)bp%lo=qeoHy{N*HxzgtfV9!rH*BZ}Bb#utXEr zX^#hK7DoXq`HDQqd>%Rl3@EUC{^){dw4Fs0PtJA!fp}7w8Iuu5vpxD~K`G7Ex?jrP zpo0rYZ2|}J?AS>%C*krCy0Z^o2i*m-e`ym;k%6(mKSJ-OduMiOmuPhGi7OfA3rLT3g@NT!b4ay2ay_zy!?P!Y| z1f)48VybGv*F;GVMv^J$n`6(p-*+P3G~*QQ$0pagUiTyx_u^0R<~oJAn@a4kAWLDOV^RJ9lRtlI3i z{9<$~$j{;*MEpkr$Tz#aeaJ|0ltCC5=HcC^+i(56*TYBreU;iamKV%M4gOqxcAADo z*to98r(m69`PP=G8w~?shGD$0{IM_X8t(yT)G0DE{pA}z`6BfX!qtNF&Nofe@C!o6!pa4?%2Z zAH~heXecJgzVpE9+q~M|)0~6*2A8$Q)9`i(gSLzEf!7yUxha#7lEh_+l*$~9=d)Vf z&NNS4FKB1&E!3A6JMGC*;Y=?G)EDa)VnqRR0))a}9{Xhrv)I`@B(pN4c>{GQ0WyN?WL7^PoaBHtzf@@p8qe`<52A}pzzL*@k_0;rw zI9aI>1G`;}A*80SeC#X9O1{`N6;o+4TC&fk3BR(2J%W8LUNq_I9Lqa5dEA=&Q43B6 zpUTRRCeMwjDGe6X_S|w%+om!oOd&vz9FVs~_)VsC7uDVUSU?f3-#@o_z6x-r%}cz+ z>F3Sn)tRs{CIj+CWj5COR!$Ix*YdCP_4}|M$b1dZPjXgTuiOwO7Yk1S2U8l$l*XM3 z_DP75hHudou`p;GZe`MZWbH~@^t)}g1U`TnSn;yS(5V{-Kq~Z0p+@brRd`p*15O4~ zU>3wnk~wd0m0u6`Z^om%6Lqzw8&8yM&k&)S=PnY}*6IT@aK{Ug!P=mq5;ji${BUmd z3nTv{177uecJM8*ir)JM^RVI*Na1?;LAe;YyD>!PR<|Kw#WAG#`uKeX?}57UmZ3vv zFwKjQf zC~r&Qj5zjt0pDg^hx+IeEpv|L@a4>+2hdSd;7riAv;dhWuL*+6bOjj(AuA9Bt95jc zU3e)Bc?`uD)Y6}%&Sq#olHL)&SXL%@J_T4SH#k{W z1gncRGBZ;|pbr~622_ZHivFTx)jNMp<6+2~yDTo&v^{g>L|T;gfWWeNvo)5_chZF=ZH()?C+Fy@VOL z&7mOt6AJZ{fdW2@C8~rKfN>e~sTxV~*-OPB2+nyhjp>ufO8H3eL-Lj_20oa#ogkbS zQ9dttExTFEu2o&+ML;|~eflFU8$^tt&bpW?+TA@(MSU5*yYTFwtZp!MsIIpOqRkq} z4L1}^BEY*%>Q+fdzA}&f3gJW!0})g|InD~c5VANEVVgDo3VD&0-@ zBZ$bmFL$9>omW#+nM zqvcE+dX5=)j7~ScO}57|EHOGlS(gdV7-jk{M~&t_)Eqlzb-s^Y-2q85_~CB^MH>hp!cUrn_{hYw^hqRHv-W0Z-vOA3737X?j>EFM5pFJ~)lJHxkH zi`+4|hcwCm7ja)1SJk%myXXc{5s*|sQMz*xqJV;cx}{U3q&pX>fYOL`iiFZ#QlfNs zcZY;@-Z7WDb$iZvfA_ri!~L+fY+|mt#vJ1r&p)1L=+wBQ*sb`5(zI9y8_{(PY`H!^ zl~RQy#9j$j1w6#lLEr9cZ!UpH>$2~T0Hr8eR6DfGrV&-V*GLkxY{?y_>aakW&ky-% zwwp(xUO?Q+n`}P#C!Z#a9ziG;eOZ_W>KKBk-Z$I`?RKd67p~}gf2AU70G8N(>nkh^oDAjH(C;_eXBvsjUmA)l&sw$0~&IU zC(jY>ksP|-%$gqu3Evq37l<|m`a7(4<(gndIp-S2<5X~gFplGFnX3JK6zPTO!TKHd z7Z13ZH9BZnvNyrjqDOVG@)m20WGAvEG2_-WF>r|KZhKh7le^R?LE<}?^_|rk(%u)W zYkojGG(tnbdK0iqgYY!67<#f{HA0%tQ zG4_M=_|a4hDkGPA2! 
z>V>51YOX`19gM7(CNhtj4_&N7uS|}HH~7*%i~2N5@bleEEp_!sIkhD-1V9PdPyXXT z^Z5Lxj&Q%$+Nb52th;fUi5Sy%0)T)_9inF+ClAos(b>uMDAq70X~RE`gK81I$r_Ij zmA;|uH5d5VP~OdWyBOW*O(mjWTmfAYGH%r>!S)|8y4++!AOkXE@&-w{3BwdJPXjdG z=3PK6=_>C7%BTMom|LI=BR~Q?b3prsvXJCi<{OZ@`gi3%L`ESe9l*46b%4Z0Lq#@Z z1KbqIUXnn~|BqZ7%KdIqNQ3mDNZom@=n`|RP4`_+kdqc=f~=2~2K*xQ2bcI!p96lC z=L*E}boIkRy~w~-88Y)0t{*E!^7vN?xMR_7fkI$5pek9R4x=}D41dW1dD1U|_E+iu zKky=0oJbw7BsS#F=X)6}o|M$TDW{_^zmd_LU_3xs(;o_KP%~y|7pWE~(0M^AwEnR` z;~Km5nbvo1SprC|AU;CsGsG&H+@~eykQ!mPo|lc{F;={LE7#8F`Ch2Y}K2{H=$flaQdGh!>GDjL5Ip>j}{C^<5e+II@AN`+CD2sre zZr7nd8Z@O~Rx57(8WB(#x;Jdh#^%iP^WJ|a{h*kzk)d1dxcz{F=h?d#uPo+4!sCK$ zGW#99AkCQt00hUU1G0hvEQfH3?~NsEQnLKGh9JSZOu1!J3lIqxvHek)EY4%<#F!T6 zwS2gpN35=P*azYU91QWzKWEdgx%-En-_Va8I_2*M8VSus8N`PGiMMIC>2(S?OaTBW zV+=j2&Lp$3@)VI9#}}(m7smk5|HO84=RPtogRI&`cc$5X_5%1ZP!OO+R_;Md4!LT4 zAPVK*i||1JlVu|f34Z5^m~&nb9{|=Y46Nb15v&^MEg$dXB|XOO06qm}h}Qp1{NH`B zPOS>NtW33%c2-@u!-lreVD1&5#i)ADT6F?i*6%0H`IE7PBqq~I>L#7j!;L%OSydc@ zhfycohIDFK1VNDqsgxkwQG|p+jt>w#>GwvhgI+ZK;O~bScK(ey_CFXp{$*8wLC2UP zS2X(cjc5@<{u`E zP%wpBzSOur2Tp{+HJD9iiwLkg)Cz!(!mLMl7$0N4_ert%gOzt{`QNbe{~K!vFlBHS ziva)_A7BU$d{|qy73|kb0P9-=8Z%%P`TpR$`~T*a|8UWONI3^Mr<5N%5s1QKkgfAF zcsT!#Zu(c3${!K?=Scn?T0mHPN$dD^w2aes`7eqGCLX*--Skj{J@6Z3;6eF(kIzt? zg^Yy%%8#9g;-GX-C_OL_0!K#51@bB8zm+rLA3m=?JgAUs``@^X{+N{iLBD{+PKd5M z+@MQq$^nzCMe3qkaTO;ZszETGI8X0$5Mm2V>w(`fF3ZNGCfM~G2QW$Lzjz1f2_ySPY;+kOLJ26NIZ);9E396vJ~ZUE ztbY9d;g=ggtQLpc5Whn8046Y%u#b1pgb;@|jxzy#Pe)5F0c%h(NWS;LwkT#k# zVv^0w!t~>2g<-NqM>*t8OOyhFNbGPkz!p5z79xrMeOYRJ#njoB8;`3x&Bff#?Kq9X z7SIlLSl3=7DHF_{B$Lw7J!EG%`nyEEI;#YT1Fna3d=sPh4TnMhrJ zv1Y>EpbU)B64BMzgJ!ku5T@Y)U;fE)DK}=o>vZ%;38(w8Vx$LcMxa**g8@(xpH0nEI;br@H3PB9D zr1wShs}5YvMDtH~HkfbBp*bbW*)L&TVPQR8y{TX5h1!(ft1fsu-CyBGIsHwjGaI_y z5{w>cf5N49Tp32OV4-Qe0KiEI5k>lB*JRn^6tc4c{G7TV?0W}2PpP@p>5KJf*e0bm zz5oX%p4-dxd(CuD6&6b4w0H$C6jX9+4LL5vtyNzB;?2^jcRW*9-;pIn-jr*8Aniv) zVWIifq;#9AQ$N|i47C_^Q?Pg?h}~XZx~8sLD0byVGR>NaI*T@)sZ*lXayM(DsLxRw z#s;uTNbbyNT9&ONJt#0nbTS!<@wZw_-D#gIgC3j!7QqSWjt~n1?X|_D2e055)qW#g zwv7*S%(WhJ z=#zJ^v|ohcRfny2AogSyUd@gUR^y&3e4);d=x zMKffp&uyYDz_m?t&Tdf0$8-2h_6|@M(lO)X6hxqA9985qjkbzh(^G;M27L%>F7(o@ zjg$l>-P&Y;Y!;Z^)3_c}$G4qF0NNgO0ei-I>3aayMml#9#(-)m3RCu&qih)KN^`~d|~-c$)tSmwDut4E!!+-XV>pg zJ#l6BCKxg%;vc%mZ;lchL+nsL+q;i|8I0N!FIN7C-lXo5fPMyc`cC2-7wOH%KdOYY zYTR_Uf;;bqQNx`EebO0uJxe28ZA~3?h&JnBbfnk4kpV{BO1b%8a)D+vdM&q;9n?rj zU1|XZJ{p~?SX#8?)y@lX_V)l*oa=ogAAtW9z=tiA>r{VdX)p?Ksr!@PAZI5`7(lKa z_Z-mowHK{p;2)_4XjHn2HhCUo{2|w>1)-*DG*v*1L;8C1+m(>!8IVg`4bFA4-vi#e zdMG39m8ye;?fiB~*#poS(D^m?Q|Py_!@X-WM;LhvasQnwyZ2KR z`AN_}(4-|V@XYzRbq^9%Ff7QV$xZwe`d!CF4Co^;LQyJVs9wRwEqFldzLM91X(jnP zXxVTCn(%M#;otUA2vyO%j{fe#{iUOv04)2{WB{bnmRj;(E`tFq7sxRLdBs2>ST6wZ z8uefpeqmE-CTUGbkOU?4c5ckDd$1?`)43(|yK`Tvovc&1)~iu?i7(;I8ILpc zb3lLixGXIc6c}lqn@)+@*lHDGFd4`BZBoE`Uq-|j14>UNfWHKLdOSx<^oV?1!eOURX^j`3fPy!S z%6>GDo@!8+Yy=b=o*vopVg)8)bU&Z?4E$)A4!7`*MQrMfwMR~CE}eos!5)1{Oelav zctglnH$yRNRY0G90*HQALwXT+!D(FR6H;ogCf0!8JK#d$IACjOX_A%_#l2Qv!E>})Qtn3VEl~ns2U*@++ z6srJDsMLfElS5ori@dG!M$9VibSD?8TUgXq$w+$=fV5KpID(Lq3!0Bs9#6%d2T<@0 zvkd@wVx79VYR{SVWWyqDvvg?#~L3snJVX(5?`PS#|%b9~_4=KhVd8)&olg1C7c9QdDq z>q)7d0i}lSfFN`c(?*)i86}XNTYjI5Mi@=JPuATc;_QZ-+EKldF$4MEmAIXq z+6AeDg4njM2{8)i8WgO?&LoKxsJPu(+0_wrnYx6EX7aYm$NI_kVwauU1n^umLXBoX ztJ%WUvW0l7PH`-ibR72pW0a)LBPmMD+qD^2;YX_lRv;_8*AWk2`vBj+zkNlap{~mY zXt~WIN2|I=7pl)r)>(4l-&(m?3^8VJi^WnSwjUf-f{I^N$khQ%VR860BBP%)_$!kz zVs~IUooE~6)fg-aZhg9s?R2skQxVdd;u-G@;9rNKV-ZE67g2W#7R4<&2~z#&q9J*RlqvkNqPZgS+lEzVD5N!2)-MJS0$f0*^OH9i02q0d_r;5iU^6N6xP8 
zUd2As$|o(U_5wvU{j7_05XiW=!%l#~qN^tpj#z<5ryD+^jg296YxoJ;#hYg*6K6w1<&jCaShfv; zR4g?gftP1Zc6KIENX~BD(I5&?#>Z>sZUqG@9B3p39vr%RKmjX8{lrVz220}`vw2Z$S8N8kGDQS>R!?f{V#9b{G&d?`k53ngWZm2Xpq^z%@q-Y zSWEAZ$6GB{Ur3Y&C)$87FW0vNbd@vrSiO-J`vk=Sv~XMVSNl9yb@a6$m!cpXC0oe; ztBKAsOUdXHgELK=t~+*^_CWf@i++Q_j#w)i7%u&U5;XM1{f+x!#%WuUP}8G_XlHU#ByzK4>#XpAcHfEVw7esZWt$H*91Fo7Xx! zR5C^z5I)_X;e7HLJ40xa%k>C!f5=;E*xMlAVzQRxT$Y->;fi;3jmBl8_R=jai&_j` zd2r$}GiX*67cjvD%5_06>)e2m9JiB|K|*pRcfXG^o?s7~y-oIyU9pDdg@Hh!UKC1( z3-mK86d8J;TU8&Sq2PR^E@0QW(rR>pnIa6?@rehjME=pp{6|q8 za(_gUt#Kzq^GW&ik(fD+{MIt%l;LQBImb=2WY-xTJad@3#ik^xZ2c=0JM%$SnW9QW}wL~_BiT$EI-Ijcr#6yBe$hY1BtTO723;A#ogLlSuL7LCwTFtSvd<#{aRQ1 zuX2s__nR+1I-X;5yTR#RvIvZ~tek|#BiG)_Mh@lO!R)>iHB`8(>A2N&-H)|$yZfyV zwNrXyD?9701oWIosj_(k6K-cGyUvYL$Kg>7>NbZLH022{Y~)|}1Dl@G z)9`dwe)A+Rm1pJHo1869q*un%g<4nh-vUErQe%zN5d|=LScZ7C-hmd4pTcP;o48gR zZ0su9m><4j@j8ogZepYR_S=cm-I86$^SkFP0#K^%=<7R#pw#iYHp-*9n4Tt{61k4x zV={6k7y7ljw%(DZ*uqjJ%hh~3S<UZE@1Ey3(w1ka~(?P7mlZH{~Z;eZIq#6;v9APD{xFHP6;3!;J>Nxybyb!{n(w zxW(^Y4v?0?#l+&G2;HgX+5z*QA4v!^pf?CR_b6&8_9I0_%7(+h32`|s>vMi zk|8lKi1m;!vd=FifkS8))E1Vsd`oiCa+rGYq$W#y{wn~hYqB?HI*Vo4BsgZ=3lQAQ zwgE%!nCippNy5=sHbNDoTu<^0)fLTqQQb4;ke@+39idyj^NIfiVfc{Kw>CagH<}S=?fwFf zVnsG)seri}FH=!!=T(l3GaVl){h5~xWdyM$T<^II=#yKgDQzkAFVh8f;>tXukTx~N zw0%yFtsQYmid^jDxWi09D0x$)kqK=@A7P6Fo!MfBjwGf2Cl;0b+xV@yEy@yTy&9ba zEvmXXYo)dP$4@%I=`mb5y#C)lf1L^-pJT1%Ve!R^zmYd#H#%A8jZ(UM`WgQeszr4f z6jSZguP}T@n~+tK8;ckPv)Hx%^sA+W3UB8nZ7mxYZwA$_Gf65xeq9FaFr!{aiq`mu z$oDAs^P~h7Rm9?gv|&Y+m`sl+$*GGwu^JR%$}doDF%+MO9X}xbbZvP6u;7CoxjQJ_ z-tf%G2hufMSc;Fu;4k=fqwW>r>=SI~Y>eavaVLoea=&%`(oEqiit2hnjXaT1LY0G# ze~=KidHZtW3zS#8*1)8ark@y~ae`ePuzS&YT(`(S9p!TJSIMJSkC``)a$kttbQOvW z!Pn^06sP%&<>&~{V~~3q@ooqoJztf!4YPExkhVji zK8d^=N6nCQ{o^;e+J3xR|EIGQ(|+Z+Wc9elQB9* zr>?)CVbGBT=dQ#b-vUn4niRC)u=-{xC~4Dy@ckU+71qLlLRk24mZXLH@e!z0e4=b~({g<~b=xGD%%B77@9t1wzHP4K}9G#HI7~&UkWYCpH&xt>vpZ2VnW=(&5nDSXe z9py{FC+t_RC{S^+(@RlNJ!#F9_1=hzsm##?mrs0KG@gjxbNR~2H5U5%(fT7E`}Wp2 zNBO;sBb(}@6X7va8fQK~WkK?f+}AqUG6urS2B=lH#*5|=8(W|{g4s0ZBTDUXCN5(U zmKZ_s{=4h&_oI=VS&O;(eI9md6?PLx<0XYliJ5lMrlymH<)4xa4UYS$?=xM!cy87E zQjU=c<&~50?SXLP3za$WUbmtAB+azr8PboF7#Qx)T1*lWD+lCrn+(FVh#Clb`Uwx^ z`J^ASz)Vh>bko*Q?aFWz&rqMsXJ}*^Cd<#uybKunR73FaP<^eM6ExKmk#lTH=Io;B z^y?pXF)dmz=V+bc3uB5K^_`n%U+eYL+!?c@`;x>5fSWRNt^rp!HOQ(thcy*6eQ z-%4VYc}2&hbYdY7@|l z;Dp9plfh`pL97|$Nn>JTgbs-3pAfalVT2ysAfcCUp^oVAgw^De=b^T0f0$&QT;hVE z=ib29GWZKNKzq~Kla`iuutFwSYd4PGBqp>E3%^%ETLphRbk2q^#*~6qgat!&*Eltx@&%K0xc^<)@Rdq*JQn?On>j7nq0jFwZTzwN3SZl}GWLD7k#LT!}O}cw~JdUU9n? 
z5$y&d-=gN~$<~rZB!o;zd~ri}7=|^Ex{OymZ1|MAyeXkzAaDtmbf$!^D;Gsdi{EW`S+h0vN zcrwSF@3nrZ@12^XZZh7{y-dlfZ|?VAd}TQ3HFHl*zB3X3Ql#Jn&l`JD(5>PqKAjHU z&|9;hNob7v^=$*U_6=vp`?W#&QeE6J8Op<7>WAfh9wGFKNKfhU@mp4SrX4jQp1P^4)`zZTYipuNyV4lP5T-tMv63VQfnU@|zcD zS&t+*6UXJ;ceHWGX=Dp#?~QPaPkZ5LQ_6~;bSe(5+k0Nc>M0H>hwkJ3^(Zo;3uc`Z<7HLU!l8?Y(1x-)J} z7?i(!CquN<$+u0}Fr;l=jx(nnUF*!ArLVAiXK%dl;&!b~vh>i(*x7-+O{Ag}qiLXB zT~fPTh5)5uOrb!ZR;*$w z8<(Rh+ag3G%QqMXpMS%^vi;yt@~Brn*mh<6C=N$4;$)$yq5JbZcfrx5kK4WvQEE&O zVd21dg1!Qcto!cd3pN*>A*JZ5-NFcilfA;q+js5PI^32bT$dQB4%rbAuFBP4JA;-) zmy$=fv$o!{MHTzv_FHiW|NcT7bmKkfkYynbNWwxf0;y{fIsZvKfx*=~J^)a;A(!HgweW$p=+auxlF zCoB^bA%SIso$*H1PK2gucIgX|egr20Re1|!A*4?EJ=XrYdGAWgtCEx+%?KhKJ2*YU<^zLPFO4RI?+HhSTZwG(?0iA(AaWk8BTQA1m0Bx>#kqgu z9$D0mhASRN7!?oHatp0YnxD>_;~EZNVV}pVnOJw-6N=QQBMHu2i_;e-sS#>xWbv@{ zSTIstt?f8&?Fc)>d}`wSS1`UTUhKfSVlXqm$eOu|_6->7xoSXGupwZgDCm*lmV~q_ z3Ne>pUX!o9&hY6)_iKqng3Uw(U|$kQC`mK1(#33wc9oL5C3IJclZmQ+I?L`i8h# zCmNw>h90T^K0??F2CFzm#b|a^EkD*CmZ*4F@ySJ(A9nU2$muT++-7ghBg!KD-2<}| zXn~yxCVns)*(u{=qpMlMrdZbToK-xu#%eMvw5%h@ZH!QItx4SlD!RjMl+RYla1_ck z5@y0w@+JnaCo(;&6R6BPJoOmELq@#TMndj)O~dO9(uS(%^yy%VM``9LboK$*H#y@9$C?e>f zRaLZEQ_AZ$B0(dm#=d9owr4Lxk%Q=4D19&GmVEYsHVTnAVlKE{RJ3kE7kMgIo;Ear z6+=6;H;NCsD(^_RrcoDm+RHMTUp&{LkJ02RL-kN` zZmy~im$z|>E|^%stc+H89$&*sWacYa3zat3f(yoB8t4h6l^0HSiQ4f;VQP&XvHRz8 zU~JygWG*UP=eSBmXCfQm+;of86rmit-I$W4mKBfLqDNjp^8DnYRZmlKIzP9DiOW@XJcRcW?^C5>P|~3BZ#R; z`2gt#brV_kL}sPo9zI#Z2k&)lhx&Bsg}!A83`U}IeA&yjnW!?|@^#tpICc$+H(~54 zH(86quxQ0f71L1Z&NayJAYO{}3%C)i7OuBy1a`RLc++rEkyPnswPe%52rJyH5pxVI z>qcd=rvdO#t>bCZoNEfoyMGnVCLNmIFqT0KC)?wmi$J)t;Fua2w>ZhU>!i@+&c0{3 ze?74@n$}=*WU#R0o!+}OKjGSizKesbGG0y{40HKQn6dsU1Xm z(&?G&v;cA2-ZMe(eeVr^tz&!{d({s^`R{kXgvqyf%N1nbtJY)Fah*qZY_`hS?F^HY zaX6%|9=1x`^()@XpOavDS$=z;?QXQ=h8r_+woCQbl`6BH=huakn0cPPS87oj zF(@d;xsb_tC{q9r7<$CId!y(bBTK5b%23@SkvBfuJ)SHrL2?DPiWf2kp3xaTE&Hn_ zT-B_%J9*3a?i$X;u-&JqFg^0WWb}JpitL_ylf9JcmA{_8(r3@&_3W;VyiZD%Qk)UyiHo(5aYK zQ}$e>y+-js(G&7#LBmxAIb;2D&#-nL3|mNlf{PB9L?h03BUe%Gzj&A}jkA(cK z!y>Bc`Z0%fP67rF{dZEbOjX3(T6|?@Y$X1=VW3AxaER|CEIV1sT(ZW;LaG^29Ij>b zxHx=+_v@qj3xk^XgR7Y3v#WW-c`lK3Y&jpA8TBW8wOV9)WBxV7B|UnxXvJp0_^F1q zYoHWZDCy1E@Is#9UF!Q%pUn@Rx*Q6ug~*aRS{w2e2UrJmN{pnM4iB)snJ4C$1e;a( zat<4C+N+GLkcC8mZxQqQQ|+w4b%Vfeqa!hq$Tn`Lj!>VH=L>iw<>mLG1o+6ONarr0yVKV}Qq}#hp@JWYB#I_)@tIqQP^roy#uuZYVf&)#Qm`UiXzuM+7CH9F#h`pVB zRi)fXzKB>Y%$d57u>biyK3XV7dRml1hEdZjD!KSmu`srW=I)XD9$BeFjZ8JKJFeB> z;9_UWx42|ee~WAgs|gXD=Ua^tZbN5vUN&=n-NEgb^{#935+f{?AaW>H|51)VLn?!q zOs-S()MM1i9;-|YU)~ zpHC9(62!76tyqA*>Jh$IxsFJAi$X-rCwp`WZF@KLVrhatGwGyI+hrycKAKdf4I!2i z4VW)&ECuT7^?T-e(o$z7ZmIbpq8`r9bUn4ysjg|`CZDJ)v|k>(*CxJnLLr2~wuIYm zu~rkXGjgqa)YsoM!I8%4CNF?>?*?898(LVzNOQF&@&Td zI@4F^G@dr@z6UV%@IkJNYRcQub17O6H4o)$l#kZf6+0*Hk6g-3jcS^!=Rz>JXLl%E z-|y^sICm!k*Kw&$%fgx?J(Fr%U~Y9jgTnk{p8M7oo!aSC0^`IXg_`f&dQ zw$|wcsdb)?e>`p3o8a<0ujBcx0QgTU=;GKky5reRSYVnt>78T>vMc3>lqwXdYilp* zjAe$TgI$!-Af!cjP1=m5g>`dvH_L(dYW6rqHQcqkISTj^dt$U#z~$D}GBg`jNAeIt!UESMgZd@brE@iryY1>xWlu3lu%}9fCR-g= zo&9rbb?gi@gL+KemsFRUTGUM}BHze&jJ>jMk}($?eg0_=rc+!g|$Ial@;8ds$!*{BL*weon8>7f2%os;a-JDoX zg$3^+`vxsf1f_hm| zeqwfS{)v|ed{S4=ajEaGG0U}bGpE2|!m&`^**g`bau_wOuut?b zS+qR=Av#qWLkwQ*JFFP}R@Yimm)hn4?uyUih3{+L7s?;;(6oKpp)ezhF2-Oqoi2on z8|kK1PWT}lyFg>mwOH=U9gzV9k#Zw+neMiH1Eo`vn%mafIs+!FD>>CwE^eyRjy$Fl z3d8ma;g}Evac zB07xL(^9fpTDN>ewv#Y4TM7yfE7ud;tw8E05uFN;XfZJ<2psx0Zs1aH5cY;Wm4)=) z;o>ykPQj`no3KC6Hcj*RJ!-S7j}lrjC*~~1Ru*=2j;NcJCd#v_El1Z|zhIh1T$K9C zLMlRiU!4a0q+dd)80_AdEQK8KB?Z1hXX{rP)l2uyZ<^jPHIFolk1d7`-kT?Oda4e; z2fvx!{tQrXL%YN5a$NI^cctq4WH=~Kx_o-LW|nyvS@lL`*!6M1?DH?jM 
zmxf6gE}}JonAiQ0hmq0n9$82rG4J;4WhHs43%w8coIH2Br&Naqt~?~P&b?xBgNnAj zN_j|B}7aSPbcsMca;{ z;3sd6*xf>d8;VPv)!X~5)_GSHYt9x?R?d_hiEkHjdXRtW5vkF;gSqzl4NYdGww-w# z4-<{qbY3hM4Mi(DLDuwSYPSJ=MP*JO7vV!fo!K~frhgJ0GEcdI{)f)r=pF@3A_*rGhc%Y(3Fun;2wP<#MdGNK|xke|iua zpC)>;l#+#MXNrd6BziQ|!K(0>D(ywomuc0>%$W3I0soaW_&YrGk|HsUTO*}9!v^Js zSiR8)%rvfAhpqc|e+8TD=4RmzCKlPKX?zaH7>4n2RBpY=Q9l`9KhZ=vXhxC7>;Y+s zjl>%eV_E`Z=1d+FtEN*-a?@tYjtM8a?i287MA>Gqp-qh`&)ORu9S&4i8MY8cR<|VK z8+|?-+*1&FLU?p|=xWc(Te$S(1zdZqXr(K;fsw*yjog0mI-E1x)xoxvBb$(3ZK>gk z-0b{oQ^JAF-SRY7U%HG8Yu}6(>yAjP+oRkwjd}B%XrC#ZrW@&+@3`f#t-YtAGucRa zH&sU>6`0bc8Q>cen0qBVJK-b4Ok-K0Z3gK=tWhUF3~oPf#c6vU{sJ%om$^x*8+52c zNCQ8#S6qj2eDcP;KeWr=3W&B(-cDZ^yKL^tSaeFH4P7AP6eAh@YI#mz%w!us9q?o_@`^ z^&^fE>s*~xbV<;A#wUf~qM>MS5P~TBnW&S?2{8!1ZX>S=#idTx)c2%ao47MbJF{8@ zR?yl$70FhmgtUFdK=W_Q3fE51c8V;^h-G2Q>Oj*h8q;4^wVL3+Ovba)A3Bgtyg{1V zSA|xoU*`FySO#XEHJA&r!uUMFtM+ZQcD_FD6WZ?)P9BN}dGfg``SPw?uUpK?Ta*!A zK_=nbd-4U=G@HIsVVeZ$W|y*}Du-QEJHb(dciLzb*;?CE14IbkRhp?*>cmJOau;&R zCtMTY3bG@o^?X%%izX`=yYA+fz4mM!mHCXMlgXR3H!j3)JkNgtsssZt+BOtL67ynQ z;i4NJM9ebkLVHxahEG%TB?OsuO(pYFvcwUcXAF;~`3?yi@n< zW?tOl4uz`i3oUQ6^lAbwnA)P;UbpIebg-MbA*Q>NLwc#T)|;ceICr6*(o!-=Q<)u$ zqF#r4=_OrIr-WK^JctcfT{GMs$`@50QPXwjnMtT+XxJ-{)qEfsp*8Mj8kk`rVBUuRHnv*#`$Q>EJ^c8uz{*ZV@35Fq9zHlUDob%8x#Az0!y?Llt$cY);S7T$!$Lm$3RD8PZJSg3p3 zr$lBUj<(FmZCEAeLhUX#1)vRX0A}htD|L=pE_jrioFH_qz&JeA{j-V>1y1!QMRgwf zmjzTg@ke<|4XhEn3nV zND|pgGt4!>{KWZOjLJ>^b#K~@i@o`5xwJfk=?MErzIGjv1FPyD#_Z?C23VaECUkf+ z&d^>A6LV>?R1&2}$5pf?5uiQP`W8J9Rv1OFc5WnZFL$Wd>}hAvqCE2^zWe2M7;E>9 zN;AgdAg)f7=fg78vPIRx!I_mH&b8EV7@A_{3d;G>oNdyu<&rZ`*h%KwygFGVT52CK zAS&E-7@eV3n#}De^|+gG&}tyU<^Gg!8+AnAYee5Su(`{sdkcf!xlzbe+~`OMtx5lZ zaY5LMpfskdn1izflX>SLzTVvQJflw*=O}(nDtXUdSqDOKw+pi#Be@Th-CmP&N`It( zbKT091h9Uu-h=A2AER`hUJAZV0RHzHa?rwm2nur*GQt%IruYEgjJ9imsZ(lgzr--Wd_Tt5nu`A+%+E;7D5_k0CFL*_24b*jR#P zw^o-S;@21X;6od)wX*XnqCg2I`x<4HWcBP&j_b|XV#XL+6w_ZkD@rkQ*7`oErLJ|Q zGaoXtZ}N4=)Z#10+{cqMvnG=t9vL@B?GIP9+oy8PQK{F*jb2){PocmLS3H2JvD{fl z?DWYRE}{6aFl1%%trS1o1bb@*>|p|$UL_kX!N=xrb=14Lao5oI?@wUa6{YB7_e^+= zYx9jU_YibOZ#sk1Cc}lh<)WG@_ExQBi4J!v0ZZ-{K#=O#g!)cqPd_G7~T>U04qg40u46gG=OFW^pfjHXQ5X$4ZZSCyJCH7==O zK4?WN#WdT4!*t!V3|6^FDYUC5^)S918FEH^OkbIl>L6nlx5`BMIX$3%f%1A?I#5sO zY9RP%&p><3G2|Cc2+F=r@WsQJf%4r zY-q=%mpRMO-1$%ts(H$`mawCeV7m=EQcFGjeh<)n{Cok*6D$v)4@&`ElLr*&?|dgj z7P|ZMWIttqexZTj^txutLiGOg;q-TdDe>%|qk&%F&jjDrjbPBPP_npTHjka1?y<>y z^ziyc(*py3PmfCb?>s$px4&Ma+tR_Kg%+U3;QL5XO4~g;SwA~nk2ndKN$Wsh(8siy z1WlB+N?(d~J^hy8X7(h=u_3-|;xKE%X*=QHe;Vjhf!jr15gLCE2J5e)7u zczd8ueoSfrL=K=7ETn1%h^fW_lFjWJ*?b6e^-SG=gqEKZ>HELGO=ReiP-2DF{&jSL z0?Wi&9avQ}oX5Y#7oap13p=bkPgLCRZW;fy%aaX4%ZJZX)}FKJ;H!UmHH1ztA7WPd zkeuy8Gj>k$bD#6f5SSrR&y{dOy&&=#y!f`2ANoYl3S9b%y${A6h%>~zKnq#ECLL%C zW?~PBSUsOq{`K;y;4M>M6#RVFGh$Z9pNDeh^RI9C0>&zEE)EQlj*DsiDT$Ig@mXFS z(?DzkLxg&MUCFf6!V_>+<_H1LSt6H7G^vq`>h@!{89|5XL9bqlOuZ>qh;af{A_*HW z3nrkM14efTNtp)&c4FOEdE2+t$y`_ngwh`CcJRsWS_RFfH-u8cysk8unmdm~Hc@G^ zya$InP~y3={;waw?<<2$3wRr%k*ZJ(opj)h(bLCZL>`0oo?uE#X+m^9xVOF#;=Dx= zr#mcoBZQhjwdRv1@}>uYu_50l1u>x*_yeX!KgEIH@6NOr$sel~B;5~! zh%I-<9qhoTzcvL%kyff*I?xK-4FNp7cPapiT@ARLU?PIkr~ZvWwhk( z?{WI~?|q*qU{!cSLt_8k7{C_~60J8&0?_74FF+@-45Y&v?ZyzUP@)!oV0)dok?&vlqZh6O;!gFGHVO#tIs!e5xrJ7`pT&FsHt623E`? 
[GIT binary patch data for docs/imgs/HomeStore_Disk_Layout2.png omitted]

diff --git a/src/lib/device/README.md b/src/lib/device/README.md
new file mode 100644
index 000000000..99f83ecc0
--- /dev/null
+++ b/src/lib/device/README.md
@@ -0,0 +1,7 @@
+# Device Manager
+
+## HomeStore 4.x Disk Layout
+1. max_num_chunks is decided by device size and min_chunk_size which is configurable by HomeStore consumer
+2. Super SuperBlk (SSB) is the first meta blk to load Meta Service.
   All other System Meta Blks are chained together by loading the SSB
+
+![HomeStore_Disk_Layout](../../../docs/imgs/HomeStore_Disk_Layout2.png)

From b4b937e74ce45b1b493c70eeb79a9ad853eafdde Mon Sep 17 00:00:00 2001
From: Ravi Nagarjun Akella
Date: Mon, 24 Mar 2025 16:04:08 -0700
Subject: [PATCH 081/170] Use an atomic in crash simulator to wait for the crash only if the flip is set and triggered

---
 conanfile.py | 2 +-
 src/lib/common/crash_simulator.hpp | 4 ++++
 src/lib/index/wb_cache.cpp | 20 +++++++++++++++++--
 .../test_common/homestore_test_common.hpp | 13 ++++++++++--
 src/tests/test_index_crash_recovery.cpp | 2 ++
 5 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/conanfile.py b/conanfile.py
index 79e643b58..20003792c 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@ class HomestoreConan(ConanFile):
     name = "homestore"
-    version = "6.7.4"
+    version = "6.7.5"
     homepage = "https://github.com/eBay/Homestore"
     description = "HomeStore Storage Engine"

diff --git a/src/lib/common/crash_simulator.hpp b/src/lib/common/crash_simulator.hpp
index 98c22fe17..2afc5d30a 100644
--- a/src/lib/common/crash_simulator.hpp
+++ b/src/lib/common/crash_simulator.hpp
@@ -38,8 +38,12 @@ class CrashSimulator {
         }
     }

+    bool will_crash() const { return m_will_crash.load(); }
+    void set_will_crash(bool crash) { m_will_crash.store(crash); }
+
 private:
     std::function< void(void) > m_restart_cb{nullptr};
+    std::atomic m_will_crash{false};
     sisl::urcu_scoped_ptr< bool > m_crashed;
 };
 } // namespace homestore

diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp
index caf00b3d1..fdd635589 100644
--- a/src/lib/index/wb_cache.cpp
+++ b/src/lib/index/wb_cache.cpp
@@ -218,39 +218,55 @@ static void set_crash_flips(IndexBufferPtr const& parent_buf, IndexBufferPtr con
                             IndexBufferPtrList const& new_node_bufs, IndexBufferPtrList const& freed_node_bufs) {
     // TODO: Need an API from flip to quickly check if flip is enabled, so this method doesn't check flip_enabled a
     // bunch of times.
+    // TODO: Need an API to check if a flip is triggered easily to avoid the use of several atomics.
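+    // A condensed sketch of the pattern every branch below follows (arm_crash_if_flip is a
+    // hypothetical helper name; it uses only calls that already appear in this function):
+    //     static bool arm_crash_if_flip(IndexBufferPtr const& buf, const char* flip) {
+    //         if (!iomgr_flip::instance()->test_flip(flip)) { return false; }
+    //         buf->set_crash_flag();                        // this buffer simulates a crash on flush
+    //         hs()->crash_simulator().set_will_crash(true); // record that a crash is now armed
+    //         return true;
+    //     }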
if (parent_buf && parent_buf->is_meta_buf()) { // Split or merge happening on root if (iomgr_flip::instance()->test_flip("crash_flush_on_meta")) { parent_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_root")) { child_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } } else if ((new_node_bufs.size() == 1) && freed_node_bufs.empty()) { // Its a split node situation if (iomgr_flip::instance()->test_flip("crash_flush_on_split_at_parent")) { parent_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_split_at_left_child")) { child_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_split_at_right_child")) { new_node_bufs[0]->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } } else if (!freed_node_bufs.empty() && (new_node_bufs.size() != freed_node_bufs.size())) { // Its a merge nodes sitation if (iomgr_flip::instance()->test_flip("crash_flush_on_merge_at_parent")) { parent_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_merge_at_left_child")) { child_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_merge_at_right_child")) { - if (!new_node_bufs.empty()) { new_node_bufs[0]->set_crash_flag(); } + if (!new_node_bufs.empty()) { + new_node_bufs[0]->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); + } } } else if (!freed_node_bufs.empty() && (new_node_bufs.size() == freed_node_bufs.size())) { // Its a rebalance node situation if (iomgr_flip::instance()->test_flip("crash_flush_on_rebalance_at_parent")) { parent_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_rebalance_at_left_child")) { child_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_rebalance_at_right_child")) { - if (!new_node_bufs.empty()) { new_node_bufs[0]->set_crash_flag(); } + if (!new_node_bufs.empty()) { + new_node_bufs[0]->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); + } } } } diff --git a/src/tests/test_common/homestore_test_common.hpp b/src/tests/test_common/homestore_test_common.hpp index af1b38f0e..1d397f7f0 100644 --- a/src/tests/test_common/homestore_test_common.hpp +++ b/src/tests/test_common/homestore_test_common.hpp @@ -32,6 +32,10 @@ #include #include +#ifdef _PRERELEASE +#include "common/crash_simulator.hpp" +#endif + const std::string SPDK_ENV_VAR_STRING{"USER_WANT_SPDK"}; const std::string HTTP_SVC_ENV_VAR_STRING{"USER_WANT_HTTP_OFF"}; const std::string CP_WATCHDOG_TIMER_SEC{"USER_SET_CP_WD_TMR_SEC"}; // used in nightly test; @@ -213,8 +217,13 @@ class HSTestHelper { #ifdef _PRERELEASE void wait_for_crash_recovery() { - m_crash_recovered.getFuture().get(); - m_crash_recovered = folly::Promise< folly::Unit >(); + if(homestore::HomeStore::instance()->crash_simulator().will_crash()) { + LOGDEBUG("Waiting for m_crash_recovered future"); + m_crash_recovered.getFuture().get(); + m_crash_recovered = folly::Promise< folly::Unit >(); + homestore::HomeStore::instance()->crash_simulator().set_will_crash(false); + } + } #endif diff --git a/src/tests/test_index_crash_recovery.cpp 
b/src/tests/test_index_crash_recovery.cpp index c474db233..4d5048cfb 100644 --- a/src/tests/test_index_crash_recovery.cpp +++ b/src/tests/test_index_crash_recovery.cpp @@ -708,6 +708,8 @@ TYPED_TEST(IndexCrashTest, long_running_put_crash) { this->get_all(); } } else { + // remove the flips so that they do not get triggered erroneously + this->remove_flip(flip); this->crash_and_recover(operations, fmt::format("long_tree_{}", round)); } if (elapsed_time - last_progress_time > 30) { From 9808c9e1a418fc1bfa6a44ceaba6d860380edd32 Mon Sep 17 00:00:00 2001 From: Ravi Nagarjun Akella Date: Tue, 25 Mar 2025 13:58:31 -0700 Subject: [PATCH 082/170] retain the default behaviour of the method wait_for_crash_recovery to avoid wrong usage of the will_crash in crash simulator --- src/tests/test_common/homestore_test_common.hpp | 14 +++++++------- src/tests/test_index_crash_recovery.cpp | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/tests/test_common/homestore_test_common.hpp b/src/tests/test_common/homestore_test_common.hpp index 1d397f7f0..c4979e203 100644 --- a/src/tests/test_common/homestore_test_common.hpp +++ b/src/tests/test_common/homestore_test_common.hpp @@ -216,14 +216,14 @@ class HSTestHelper { test_params& params(uint32_t svc) { return m_token.svc_params_[svc]; } #ifdef _PRERELEASE - void wait_for_crash_recovery() { - if(homestore::HomeStore::instance()->crash_simulator().will_crash()) { - LOGDEBUG("Waiting for m_crash_recovered future"); - m_crash_recovered.getFuture().get(); - m_crash_recovered = folly::Promise< folly::Unit >(); - homestore::HomeStore::instance()->crash_simulator().set_will_crash(false); + void wait_for_crash_recovery(bool check_will_crash = false) { + if(check_will_crash && !homestore::HomeStore::instance()->crash_simulator().will_crash()) { + return; } - + LOGDEBUG("Waiting for m_crash_recovered future"); + m_crash_recovered.getFuture().get(); + m_crash_recovered = folly::Promise< folly::Unit >(); + homestore::HomeStore::instance()->crash_simulator().set_will_crash(false); } #endif diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp index 4d5048cfb..e13c886c9 100644 --- a/src/tests/test_index_crash_recovery.cpp +++ b/src/tests/test_index_crash_recovery.cpp @@ -404,7 +404,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT void crash_and_recover(uint32_t s_key, uint32_t e_key) { // this->print_keys("Btree prior to CP and susbsequent simulated crash: "); trigger_cp(false); - this->wait_for_crash_recovery(); + this->wait_for_crash_recovery(true); // this->visualize_keys("tree_after_crash_" + std::to_string(s_key) + "_" + std::to_string(e_key) + ".dot"); // this->print_keys("Post crash and recovery, btree structure: "); @@ -454,7 +454,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT trigger_cp(false); LOGINFO("waiting for crash to recover"); - this->wait_for_crash_recovery(); + this->wait_for_crash_recovery(true); if (!filename.empty()) { std::string rec_filename = filename + "_after_recovery.dot"; @@ -503,7 +503,7 @@ TYPED_TEST(IndexCrashTest, CrashBeforeFirstCp) { // Trigger a cp, which should induce the crash and wait for hs to recover test_common::HSTestHelper::trigger_cp(false); - this->wait_for_crash_recovery(); + this->wait_for_crash_recovery(true); // Post crash, load the shadow_map into a new instance and compute the diff. 
Redo the operation
     this->reapply_after_crash();

From 2a9d38dc567ce7b3ddc7316fe0cb3829133858c1 Mon Sep 17 00:00:00 2001
From: Yaming Kuang <1477567+yamingk@users.noreply.github.com>
Date: Tue, 25 Mar 2025 15:37:27 -0700
Subject: [PATCH 083/170] issue: 669 update physical superblk's magic and product_name to distinguish homestore 4.x from 1.3, which is massively written in prod (#673)

---
 conanfile.py | 2 +-
 src/lib/device/hs_super_blk.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/conanfile.py b/conanfile.py
index 79e643b58..20003792c 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@ class HomestoreConan(ConanFile):
     name = "homestore"
-    version = "6.7.4"
+    version = "6.7.5"
     homepage = "https://github.com/eBay/Homestore"
     description = "HomeStore Storage Engine"

diff --git a/src/lib/device/hs_super_blk.h b/src/lib/device/hs_super_blk.h
index a539c1e56..9d0a3140d 100644
--- a/src/lib/device/hs_super_blk.h
+++ b/src/lib/device/hs_super_blk.h
@@ -75,7 +75,7 @@ struct disk_attr {
 };

 struct first_block_header {
-    static constexpr const char* PRODUCT_NAME{"OmStore"};
+    static constexpr const char* PRODUCT_NAME{"HomeStore4x"};
     static constexpr size_t s_product_name_size{64};

     static constexpr uint32_t CURRENT_SUPERBLOCK_VERSION{4};
@@ -128,7 +128,7 @@ struct first_block {
     static constexpr uint32_t s_atomic_fb_size{512}; // increase 512 to actual size if in the future first_block
                                                      // can be larger;
     static constexpr uint32_t s_io_fb_size{4096};    // This is the size we do IO on, with padding
-    static constexpr uint32_t HOMESTORE_MAGIC{0xCEEDDEEB}; // Magic written as first bytes on each device
+    static constexpr uint32_t HOMESTORE_MAGIC{0xABBECDCD}; // Magic written as first bytes on each device

 public:
     uint64_t magic{0}; // Header magic expected to be at the top of block

From 20ec4072d8460e4c6f5c20d2aa08a2e96688f6b5 Mon Sep 17 00:00:00 2001
From: Xiaoxi Chen
Date: Fri, 21 Mar 2025 12:56:01 +0800
Subject: [PATCH 084/170] Async IO metrics for physical dev.
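Each async read/write path now captures a start timestamp and chains a folly
continuation on the future returned by the drive interface, so the latency
histogram and async op counter are observed at completion rather than at
submission time. Condensed shape of the change (taken from the write path in
the diff below; the read paths mirror it):

    auto const start_time = get_current_time();
    return m_drive_iface->async_write(m_iodev.get(), data, size, offset, part_of_batch)
        .thenValue([this, start_time, size](std::error_code ec) {
            HISTOGRAM_OBSERVE(m_metrics, drive_write_latency, get_elapsed_time_us(start_time));
            COUNTER_INCREMENT(m_metrics, drive_async_write_count, 1);
            return ec; // propagate the drive's error code unchanged
        });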
Signed-off-by: Xiaoxi Chen --- src/lib/device/physical_dev.cpp | 68 ++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 19 deletions(-) diff --git a/src/lib/device/physical_dev.cpp b/src/lib/device/physical_dev.cpp index 1b6914cf5..ba52ba2f2 100644 --- a/src/lib/device/physical_dev.cpp +++ b/src/lib/device/physical_dev.cpp @@ -35,6 +35,8 @@ namespace homestore { static std::mutex s_cached_dev_mtx; static std::unordered_map< std::string, iomgr::io_device_ptr > s_cached_opened_devs; +__attribute__((no_sanitize_address)) static auto get_current_time() { return Clock::now(); } + iomgr::io_device_ptr open_and_cache_dev(const std::string& devname, int oflags) { std::unique_lock lg(s_cached_dev_mtx); @@ -136,26 +138,50 @@ void PhysicalDev::close_device() { close_and_uncache_dev(m_devname, m_iodev); } folly::Future< std::error_code > PhysicalDev::async_write(const char* data, uint32_t size, uint64_t offset, bool part_of_batch) { - HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); - return m_drive_iface->async_write(m_iodev.get(), data, size, offset, part_of_batch); + auto const start_time = get_current_time(); + return m_drive_iface->async_write(m_iodev.get(), data, size, offset, part_of_batch) + .thenValue([this, start_time, size](std::error_code ec) { + HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); + HISTOGRAM_OBSERVE(m_metrics, drive_write_latency, get_elapsed_time_us(start_time)); + COUNTER_INCREMENT(m_metrics, drive_async_write_count, 1); + return ec; + }); } folly::Future< std::error_code > PhysicalDev::async_writev(const iovec* iov, int iovcnt, uint32_t size, uint64_t offset, bool part_of_batch) { - HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); - return m_drive_iface->async_writev(m_iodev.get(), iov, iovcnt, size, offset, part_of_batch); + auto const start_time = get_current_time(); + return m_drive_iface->async_writev(m_iodev.get(), iov, iovcnt, size, offset, part_of_batch) + .thenValue([this, start_time, size](std::error_code ec) { + HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); + HISTOGRAM_OBSERVE(m_metrics, drive_write_latency, get_elapsed_time_us(start_time)); + COUNTER_INCREMENT(m_metrics, drive_async_write_count, 1); + return ec; + }); } folly::Future< std::error_code > PhysicalDev::async_read(char* data, uint32_t size, uint64_t offset, bool part_of_batch) { - HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); - return m_drive_iface->async_read(m_iodev.get(), data, size, offset, part_of_batch); + auto const start_time = get_current_time(); + return m_drive_iface->async_read(m_iodev.get(), data, size, offset, part_of_batch) + .thenValue([this, start_time, size](std::error_code ec) { + HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); + HISTOGRAM_OBSERVE(m_metrics, drive_read_latency, get_elapsed_time_us(start_time)); + COUNTER_INCREMENT(m_metrics, drive_async_read_count, 1); + return ec; + }); } folly::Future< std::error_code > PhysicalDev::async_readv(iovec* iov, int iovcnt, uint32_t size, uint64_t offset, bool part_of_batch) { - HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); - return m_drive_iface->async_readv(m_iodev.get(), iov, iovcnt, size, offset, part_of_batch); + auto const start_time = get_current_time(); + return m_drive_iface->async_readv(m_iodev.get(), iov, iovcnt, size, offset, part_of_batch) + .thenValue([this, start_time, size](std::error_code ec) { + HISTOGRAM_OBSERVE(m_metrics, 
read_io_sizes, (((size - 1) / 1024) + 1));
+            HISTOGRAM_OBSERVE(m_metrics, drive_read_latency, get_elapsed_time_us(start_time));
+            COUNTER_INCREMENT(m_metrics, drive_async_read_count, 1);
+            return ec;
+        });
 }

 folly::Future< std::error_code > PhysicalDev::async_write_zero(uint64_t size, uint64_t offset) {
@@ -174,46 +200,50 @@ folly::Future< std::error_code > PhysicalDev::async_write_zero(uint64_t size, ui

 folly::Future< std::error_code > PhysicalDev::queue_fsync() { return m_drive_iface->queue_fsync(m_iodev.get()); }

-__attribute__((no_sanitize_address)) static auto get_current_time() { return Clock::now(); }
-
 std::error_code PhysicalDev::sync_write(const char* data, uint32_t size, uint64_t offset) {
-    HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1));
-    COUNTER_INCREMENT(m_metrics, drive_sync_write_count, 1);
     auto const start_time = get_current_time();
     auto const ret = m_drive_iface->sync_write(m_iodev.get(), data, size, offset);
     HISTOGRAM_OBSERVE(m_metrics, drive_write_latency, get_elapsed_time_us(start_time));
+    HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1));
+    COUNTER_INCREMENT(m_metrics, drive_sync_write_count, 1);
     return ret;
 }

 std::error_code PhysicalDev::sync_writev(const iovec* iov, int iovcnt, uint32_t size, uint64_t offset) {
-    HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1));
-    COUNTER_INCREMENT(m_metrics, drive_sync_write_count, 1);
     auto const start_time = Clock::now();
     auto const ret = m_drive_iface->sync_writev(m_iodev.get(), iov, iovcnt, size, offset);
     HISTOGRAM_OBSERVE(m_metrics, drive_write_latency, get_elapsed_time_us(start_time));
+    HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1));
+    COUNTER_INCREMENT(m_metrics, drive_sync_write_count, 1);
+
     return ret;
 }

 std::error_code PhysicalDev::sync_read(char* data, uint32_t size, uint64_t offset) {
-    HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1));
-    COUNTER_INCREMENT(m_metrics, drive_sync_read_count, 1);
     auto const start_time = Clock::now();
     auto const ret = m_drive_iface->sync_read(m_iodev.get(), data, size, offset);
     HISTOGRAM_OBSERVE(m_metrics, drive_read_latency, get_elapsed_time_us(start_time));
+    HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1));
+    COUNTER_INCREMENT(m_metrics, drive_sync_read_count, 1);
     return ret;
 }

 std::error_code PhysicalDev::sync_readv(iovec* iov, int iovcnt, uint32_t size, uint64_t offset) {
-    HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1));
-    COUNTER_INCREMENT(m_metrics, drive_sync_read_count, 1);
     auto const start_time = Clock::now();
     auto const ret = m_drive_iface->sync_readv(m_iodev.get(), iov, iovcnt, size, offset);
     HISTOGRAM_OBSERVE(m_metrics, drive_read_latency, get_elapsed_time_us(start_time));
+    HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1));
+    COUNTER_INCREMENT(m_metrics, drive_sync_read_count, 1);
     return ret;
 }

 std::error_code PhysicalDev::sync_write_zero(uint64_t size, uint64_t offset) {
-    return m_drive_iface->sync_write_zero(m_iodev.get(), size, offset);
+    auto const start_time = Clock::now();
+    auto const ret = m_drive_iface->sync_write_zero(m_iodev.get(), size, offset);
+    HISTOGRAM_OBSERVE(m_metrics, drive_write_latency, get_elapsed_time_us(start_time));
+    HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1));
+    COUNTER_INCREMENT(m_metrics, drive_sync_write_count, 1);
+    return ret;
 }

 void PhysicalDev::submit_batch() { m_drive_iface->submit_batch(); }

From da788e11a7393d8089bec12d10ad2eaf2ea4d4ef Mon Sep 17 00:00:00 2001
From: Xiaoxi Chen
Date: Wed, 26 Mar 2025 15:46:04 +0800
Subject: [PATCH 085/170] update conan. The conan change was lost during merging.

Signed-off-by: Xiaoxi Chen
---
 conanfile.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conanfile.py b/conanfile.py
index 20003792c..c446bca94 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@ class HomestoreConan(ConanFile):
     name = "homestore"
-    version = "6.7.5"
+    version = "6.7.6"
     homepage = "https://github.com/eBay/Homestore"
     description = "HomeStore Storage Engine"

From 924204611d29b8aea679e59a4da767fffeb51ad0 Mon Sep 17 00:00:00 2001
From: yawzhang
Date: Wed, 26 Mar 2025 18:08:35 +0800
Subject: [PATCH 086/170] Fix baseline resync corner cases.

1. Leader side: Deny snapshot read if there are uncommitted logs in the
snapshot. This prevents the following scenario: If a crash occurs during
snapshot creation, the snapshot might be persisted while the rd sb is not.
This means the durable_commit_lsn is less than the snapshot's log_idx.
Upon restart, the changes in uncommitted logs may or may not be included
in the snapshot data sent by the leader, depending on the race condition
between commit and snapshot read, leading to data inconsistency.

2. Follower side: Skip replay and commit when BR is in progress; purging
logs is no longer supported. Purging logs can cause issues such as the
commit thread being unable to access logs if they are purged. This change
removes the purge logic and adds last_snapshot_lsn in sb to help determine
if processing should be skipped. Replay/commit will be skipped for logs
included in BR to avoid log ops accessing unavailable resources after the
PG is destroyed by BR.
---
 conanfile.py | 2 +-
 .../log_store/home_raft_log_store.h | 2 +-
 .../replication/repl_dev/raft_repl_dev.cpp | 16 ++++++++++++-
 src/lib/replication/repl_dev/raft_repl_dev.h | 19 ++++++++++++---
 .../repl_dev/raft_state_machine.cpp | 23 ++++++++++++++++++-
 5 files changed, 55 insertions(+), 7 deletions(-)

diff --git a/conanfile.py b/conanfile.py
index c446bca94..b8d65e757 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@ class HomestoreConan(ConanFile):
     name = "homestore"
-    version = "6.7.6"
+    version = "6.7.7"
     homepage = "https://github.com/eBay/Homestore"
     description = "HomeStore Storage Engine"

diff --git a/src/lib/replication/log_store/home_raft_log_store.h b/src/lib/replication/log_store/home_raft_log_store.h
index 7fb96a5d4..846b1de3c 100644
--- a/src/lib/replication/log_store/home_raft_log_store.h
+++ b/src/lib/replication/log_store/home_raft_log_store.h
@@ -217,7 +217,7 @@ class HomeRaftLogStore : public nuraft::log_store {
     /**
      * Purge all logs in the log store
-     * It is a dangerous operation and is only used in baseline resync now (purge all logs and restore by snapshot).
+     * It is a dangerous operation and is not used currently.
      */
     void purge_all_logs();

diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp
index 9bca6d63d..a8d4f0dde 100644
--- a/src/lib/replication/repl_dev/raft_repl_dev.cpp
+++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp
@@ -69,6 +69,7 @@ RaftReplDev::RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk
         m_rd_sb->logstore_id = m_data_journal->logstore_id();
         m_rd_sb->last_applied_dsn = 0;
         m_rd_sb->destroy_pending = 0x0;
+        m_rd_sb->last_snapshot_lsn = 0;
         m_rd_sb->group_ordinal = s_next_group_ordinal.fetch_add(1);
         m_rdev_name = fmt::format("rdev{}", m_rd_sb->group_ordinal);

@@ -1541,6 +1542,11 @@ void RaftReplDev::set_log_store_last_durable_lsn(store_lsn_t lsn) { m_data_journ

 void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx) {
     auto repl_lsn = to_repl_lsn(lsn);
+    if (need_skip_processing(repl_lsn)) {
+        RD_LOGI("Raft Channel: Log {} is outdated and will be handled by baseline resync. Ignoring replay.", lsn);
+        return;
+    }
+
     // apply the log entry if the lsn is between checkpoint lsn and durable commit lsn
     if (repl_lsn <= m_rd_sb->checkpoint_lsn) { return; }

@@ -1635,7 +1641,7 @@ void RaftReplDev::create_snp_resync_data(raft_buf_ptr_t& data_out) {
     std::memcpy(data_out->data_begin(), &msg, msg_size);
 }

-bool RaftReplDev::save_snp_resync_data(nuraft::buffer& data) {
+bool RaftReplDev::save_snp_resync_data(nuraft::buffer& data, nuraft::snapshot& s) {
     auto msg = r_cast< snp_repl_dev_data* >(data.data_begin());
     if (msg->magic_num != HOMESTORE_RESYNC_DATA_MAGIC ||
         msg->protocol_version != HOMESTORE_RESYNC_DATA_PROTOCOL_VERSION_V1) {
@@ -1652,6 +1658,14 @@ bool RaftReplDev::save_snp_resync_data(nuraft::buffer& data) {
         RD_LOGE("Snapshot resync data crc mismatch, received_crc={}, computed_crc={}", received_crc, computed_crc);
         return false;
     }
+    {
+        // Save last_snapshot_lsn, so that we can skip the replay/commit operation for logs included in baseline resync.
+        // The reason is that baseline resync will clear existing resources on the upper layer; skipping replay/commit
+        // operations avoids accessing unavailable resources.
+        std::unique_lock lg{m_sb_mtx};
+        m_rd_sb->last_snapshot_lsn = s_cast< repl_lsn_t >(s.get_last_log_idx());
+        m_rd_sb.write();
+    }
     if (msg->dsn > m_next_dsn) {
         m_next_dsn = msg->dsn;
         RD_LOGD("Update next_dsn from {} to {}", m_next_dsn.load(), msg->dsn);

diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h
index bb0a72815..cc9a30822 100644
--- a/src/lib/replication/repl_dev/raft_repl_dev.h
+++ b/src/lib/replication/repl_dev/raft_repl_dev.h
@@ -25,6 +25,7 @@ struct raft_repl_dev_superblk : public repl_dev_superblk {
     uint8_t is_timeline_consistent; // Flag to indicate whether the recovery of followers need to be timeline consistent
     uint64_t last_applied_dsn;      // Last applied data sequence number
     uint8_t destroy_pending;        // Flag to indicate whether the group is in destroy pending state
+    repl_lsn_t last_snapshot_lsn;   // Last snapshot LSN follower received from leader

     uint32_t get_raft_sb_version() const { return raft_sb_version; }
 };
@@ -231,9 +232,9 @@ class RaftReplDev : public ReplDev,
         if (!ready) { RD_LOGD("Not yet ready for traffic, committed to {} but gate is {}", committed_lsn, gate); }
         return ready;
     }

+    // purge all resources (e.g., logs in logstore) is a very dangerous operation, it is not supported yet.
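+    // Worked example of the skip window (values illustrative): if the leader's snapshot covered
+    // logs up to LSN 100, save_snp_resync_data() persists m_rd_sb->last_snapshot_lsn = 100 on the
+    // follower, and any replay or commit of a log with lsn <= 100 is then dropped via
+    // need_skip_processing() below, because the state those logs touch is replaced wholesale by
+    // the baseline resync.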
     void purge() override {
-        // clean up existing logs in log store
-        m_data_journal->purge_all_logs();
+        RD_REL_ASSERT(false, "NOT SUPPORTED YET");
     }

     std::shared_ptr< snapshot_context > deserialize_snapshot_context(sisl::io_blob_safe& snp_ctx) override {
@@ -325,6 +326,18 @@
      */
     void force_leave() { leave(); }

+    /**
+     * \brief This method is called to check if the given LSN is within the last snapshot LSN received from the leader.
+     * All logs with LSN less than or equal to the last snapshot LSN are considered part of the baseline resync and
+     * need no further operations (e.g., replay, commit).
+     *
+     * \param lsn The LSN to be checked.
+     * \return true if the LSN is within the last snapshot LSN, false otherwise.
+     */
+    bool need_skip_processing(const repl_lsn_t lsn) {
+        return lsn <= m_rd_sb->last_snapshot_lsn;
+    }
+
 protected:
     //////////////// All nuraft::state_mgr overrides ///////////////////////
     nuraft::ptr< nuraft::cluster_config > load_config() override;
@@ -366,7 +379,7 @@
     void replace_member(repl_req_ptr_t rreq);
     void reset_quorum_size(uint32_t commit_quorum);
     void create_snp_resync_data(raft_buf_ptr_t& data_out);
-    bool save_snp_resync_data(nuraft::buffer& data);
+    bool save_snp_resync_data(nuraft::buffer& data, nuraft::snapshot& s);
 };

 } // namespace homestore

diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp
index 47912898e..12619b204 100644
--- a/src/lib/replication/repl_dev/raft_state_machine.cpp
+++ b/src/lib/replication/repl_dev/raft_state_machine.cpp
@@ -188,6 +188,10 @@ raft_buf_ptr_t RaftStateMachine::pre_commit_ext(nuraft::state_machine::ext_op_pa

 raft_buf_ptr_t RaftStateMachine::commit_ext(nuraft::state_machine::ext_op_params const& params) {
     int64_t lsn = s_cast< int64_t >(params.log_idx);
+    if (m_rd.need_skip_processing(lsn)) {
+        RD_LOGI("Raft Channel: Log {} is expected to be handled by snapshot. Skipping commit.", lsn);
+        return m_success_ptr;
+    }
     RD_LOGD("Raft channel: Received Commit message lsn {} store {} logdev {} size {}", lsn,
             m_rd.m_data_journal->logstore_id(), m_rd.m_data_journal->logdev_id(), params.data->size());
     repl_req_ptr_t rreq = lsn_to_req(lsn);
@@ -206,6 +210,10 @@ raft_buf_ptr_t RaftStateMachine::commit_ext(nuraft::state_machine::ext_op_params
 void RaftStateMachine::commit_config(const ulong log_idx, raft_cluster_config_ptr_t& new_conf) {
     // when reaching here, the config change log has already been committed, and the new config has been applied to the
     // cluster
+    if (m_rd.need_skip_processing(s_cast< repl_lsn_t >(log_idx))) {
+        RD_LOGI("Raft Channel: Config {} is expected to be handled by snapshot. Skipping commit.", log_idx);
+        return;
+    }
     RD_LOGD("Raft channel: Commit new cluster conf , log_idx = {}", log_idx);

@@ -322,6 +330,19 @@ void RaftStateMachine::create_snapshot(nuraft::snapshot& s, nuraft::async_result

 int RaftStateMachine::read_logical_snp_obj(nuraft::snapshot& s, void*& user_ctx, ulong obj_id, raft_buf_ptr_t& data_out,
                                            bool& is_last_obj) {
+
+    // Ensure all logs included in the snapshot are committed, to prevent the following scenario:
+    // If a crash occurs during snapshot creation, the snapshot might be persisted while the rd's sb is not.
+    // This means the durable_commit_lsn is less than the snapshot's log_idx.
+    // Upon restart, the changes in uncommitted logs may or may not be included in the snapshot data
+    // sent by the leader, depending on the race between commit and snapshot read, leading to data inconsistency.
+    if (s_cast< repl_lsn_t >(s.get_last_log_idx()) > m_rd.get_last_commit_lsn()) {
+        RD_LOG(WARN,
+               "not ready to read because there are some uncommitted logs in snapshot, "
+               "let nuraft retry later. snapshot log_idx={}, last_commit_lsn={}",
+               s.get_last_log_idx(), m_rd.get_last_commit_lsn());
+        return -1;
+    }
+
     // For Nuraft baseline resync, we separate the process into two layers: HomeStore layer and Application layer.
     // We use the highest bit of the obj_id to indicate the message type: 0 is for HS, 1 is for Application.
     if (is_hs_snp_obj(obj_id)) {
@@ -354,7 +375,7 @@ void RaftStateMachine::save_logical_snp_obj(nuraft::snapshot& s, ulong& obj_id,
                                             bool is_last_obj) {
     if (is_hs_snp_obj(obj_id)) {
         // Homestore preserved msg
-        if (m_rd.save_snp_resync_data(data)) {
+        if (m_rd.save_snp_resync_data(data, s)) {
             obj_id = snp_obj_id_type_app;
             LOGDEBUG("save_snp_resync_data success, next obj_id={}", obj_id);
         }

From af78632940b82dfc24c2591df83c5213632ecd7e Mon Sep 17 00:00:00 2001
From: yuwmao
Date: Wed, 2 Apr 2025 14:27:13 +0800
Subject: [PATCH 087/170] Support blk reservation

Application has an option to reserve some blks in a chunk during allocation.
---
 conanfile.py | 2 +-
 src/include/homestore/blk.h | 1 +
 src/lib/blkalloc/append_blk_allocator.cpp | 8 ++++++--
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/conanfile.py b/conanfile.py
index b8d65e757..4f17bf904 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@ class HomestoreConan(ConanFile):
     name = "homestore"
-    version = "6.7.7"
+    version = "6.7.8"
     homepage = "https://github.com/eBay/Homestore"
     description = "HomeStore Storage Engine"

diff --git a/src/include/homestore/blk.h b/src/include/homestore/blk.h
index 1ceab0b8a..d85185b97 100644
--- a/src/include/homestore/blk.h
+++ b/src/include/homestore/blk.h
@@ -249,6 +249,7 @@ VENUM(BlkAllocStatus, uint32_t,
 struct blk_alloc_hints {
     blk_temp_t desired_temp{0};                 // Temperature hint for the device
+    std::optional< uint32_t > reserved_blks;    // Reserved blks in a chunk
     std::optional< uint32_t > pdev_id_hint;     // which physical device to pick (hint if any) -1 for don't care
     std::optional< chunk_num_t > chunk_id_hint; // any specific chunk id to pick for this allocation
     std::optional committed_blk_id; // blk id indicates the blk was already allocated and committed, don't allocate and commit again

diff --git a/src/lib/blkalloc/append_blk_allocator.cpp b/src/lib/blkalloc/append_blk_allocator.cpp
index eca445381..141d09279 100644
--- a/src/lib/blkalloc/append_blk_allocator.cpp
+++ b/src/lib/blkalloc/append_blk_allocator.cpp
@@ -67,9 +67,13 @@ BlkAllocStatus AppendBlkAllocator::alloc_contiguous(BlkId& bid) { return alloc(1
 // If we want to change above design, we can open this api for vector allocation;
 //
 BlkAllocStatus AppendBlkAllocator::alloc(blk_count_t nblks, const blk_alloc_hints& hint, BlkId& out_bid) {
-    if (available_blks() < nblks) {
+    auto avail_blks = available_blks();
+    if (hint.reserved_blks) {
+        avail_blks = avail_blks > hint.reserved_blks.value() ? avail_blks - hint.reserved_blks.value() : 0;
+    }
+    if (avail_blks < nblks) {
         // COUNTER_INCREMENT(m_metrics, num_alloc_failure, 1);
-        LOGERROR("No space left to serve request nblks: {}, available_blks: {}", nblks, available_blks());
+        LOGERROR("No space left to serve request nblks: {}, available_blks: {}, actual available_blks (excluding reserved blks): {}",
+                 nblks, available_blks(), avail_blks);
         return BlkAllocStatus::SPACE_FULL;
     } else if (nblks > max_blks_per_blkid()) {
         // consumer(vdev) already handles this case.

From dc53c44b990087f86d1304f0e5f8f91ec80af541 Mon Sep 17 00:00:00 2001
From: Xiaoxi Chen
Date: Tue, 1 Apr 2025 14:35:30 +0800
Subject: [PATCH 088/170] Adding group_id to RD_LOG

The rdev name (e.g. rdev1) is human friendly but not visible outside of
HomeStore; only the group_id is.

Signed-off-by: Xiaoxi Chen
---
 src/include/homestore/replication/repl_dev.h | 10 +++--
 .../replication/log_store/repl_log_store.cpp | 1 +
 .../replication/log_store/repl_log_store.h | 1 +
 src/lib/replication/repl_dev/common.cpp | 4 +-
 .../replication/repl_dev/raft_repl_dev.cpp | 13 ++++--
 src/lib/replication/repl_dev/raft_repl_dev.h | 10 +++--
 .../repl_dev/raft_state_machine.cpp | 2 +-
 .../replication/repl_dev/raft_state_machine.h | 40 +++++++------------
 .../replication/repl_dev/solo_repl_dev.cpp | 9 ++---
 src/lib/replication/repl_dev/solo_repl_dev.h | 6 +--
 10 files changed, 47 insertions(+), 49 deletions(-)

diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h
index dfa241f7a..7ef61ee47 100644
--- a/src/include/homestore/replication/repl_dev.h
+++ b/src/include/homestore/replication/repl_dev.h
@@ -29,6 +29,7 @@ struct repl_req_ctx;
 using raft_buf_ptr_t = nuraft::ptr< nuraft::buffer >;
 using raft_cluster_config_ptr_t = nuraft::ptr< nuraft::cluster_config >;
 using repl_req_ptr_t = boost::intrusive_ptr< repl_req_ctx >;
+using trace_id_t = u_int64_t;

 VENUM(repl_req_state_t, uint32_t,
       INIT = 0, // Initial state
@@ -385,7 +386,7 @@ class ReplDevListener {
     }

     /// @brief when restart, after all the logs are replayed and before joining raft group, notify the upper layer
-    virtual void on_log_replay_done(const group_id_t& group_id){};
+    virtual void on_log_replay_done(const group_id_t& group_id) {};

 private:
     std::weak_ptr< ReplDev > m_repl_dev;
@@ -416,7 +417,7 @@ class ReplDev {
     /// @param ctx - User supplied context which will be passed to listener
     /// callbacks
     virtual void async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value,
-                                   repl_req_ptr_t ctx) = 0;
+                                   repl_req_ptr_t ctx, trace_id_t tid = 0) = 0;

     /// @brief Reads the data and returns a future to continue on
     /// @param bid Block id to read
     /// @param sgs Scatter gather buffer list to which blkids are to be read into
     /// @param size Total size of the data read
     /// @param part_of_batch Is read is part of a batch. If part of the batch, then submit_batch needs to be called at
     /// the end read is not guaranteed to finish if submit_batch is not called
     /// @return A Future with std::error_code to notify if it has successfully read the data or any error code in case
     /// of failure
     virtual folly::Future< std::error_code > async_read(MultiBlkId const& blkid, sisl::sg_list& sgs, uint32_t size,
-                                                        bool part_of_batch = false) = 0;
+                                                        bool part_of_batch = false, trace_id_t tid = 0) = 0;

     /// @brief After data is replicated and on_commit to the listener is called. the blkids can be freed.
     ///
     /// @param lsn - LSN of the old blkids that is being freed
     /// @param blkids - blkids to be freed.
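    /// A hypothetical call site passing an explicit trace id (the id value is caller-chosen and
    /// purely illustrative; the default tid = 0 preserves the old, untraced behaviour):
    ///     repl_dev->async_free_blks(lsn, blkid, 42 /* tid */);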
- virtual folly::Future< std::error_code > async_free_blks(int64_t lsn, MultiBlkId const& blkid) = 0; + virtual folly::Future< std::error_code > async_free_blks(int64_t lsn, MultiBlkId const& blkid, + trace_id_t tid = 0) = 0; /// @brief Try to switch the current replica where this method called to become a leader. /// @return True if it is successful, false otherwise. diff --git a/src/lib/replication/log_store/repl_log_store.cpp b/src/lib/replication/log_store/repl_log_store.cpp index 072d06b99..8fa5c0f18 100644 --- a/src/lib/replication/log_store/repl_log_store.cpp +++ b/src/lib/replication/log_store/repl_log_store.cpp @@ -107,6 +107,7 @@ void ReplLogStore::end_of_append_batch(ulong start_lsn, ulong count) { } std::string ReplLogStore::rdev_name() const { return m_rd.rdev_name(); } +std::string ReplLogStore::identify_str() const { return m_rd.identify_str(); } bool ReplLogStore::compact(ulong compact_upto_lsn) { RD_LOG(DEBUG, "Raft Channel: compact_to_lsn={}", compact_upto_lsn); diff --git a/src/lib/replication/log_store/repl_log_store.h b/src/lib/replication/log_store/repl_log_store.h index a386d397b..bb19df119 100644 --- a/src/lib/replication/log_store/repl_log_store.h +++ b/src/lib/replication/log_store/repl_log_store.h @@ -30,6 +30,7 @@ class ReplLogStore : public HomeRaftLogStore { private: std::string rdev_name() const; + std::string identify_str() const; }; } // namespace homestore diff --git a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp index b2ba6bce4..b733c19c0 100644 --- a/src/lib/replication/repl_dev/common.cpp +++ b/src/lib/replication/repl_dev/common.cpp @@ -31,9 +31,7 @@ ReplServiceError repl_req_ctx::init(repl_key rkey, journal_type_t op_code, bool std::unique_lock< std::mutex > lg(m_state_mtx); if (has_linked_data() && !has_state(repl_req_state_t::BLK_ALLOCATED)) { auto alloc_status = alloc_local_blks(listener, data_size); - if (alloc_status != ReplServiceError::OK) { - LOGERROR("Allocate blk for rreq failed error={}", alloc_status); - } + if (alloc_status != ReplServiceError::OK) { LOGERROR("Allocate blk for rreq failed error={}", alloc_status); } return alloc_status; } return ReplServiceError::OK; diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index a8d4f0dde..27561d632 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -80,6 +80,7 @@ RaftReplDev::RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk m_rd_sb.write(); bind_data_service(); } + m_identify_str = m_rdev_name + ":" + group_id_str(); RD_LOG(INFO, "Started {} RaftReplDev group_id={}, replica_id={}, raft_server_id={} commited_lsn={}, " @@ -300,7 +301,7 @@ void RaftReplDev::on_create_snapshot(nuraft::snapshot& s, nuraft::async_result< // we do not have shutdown for async_alloc_write according to the two points above. void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& data, - repl_req_ptr_t rreq) { + repl_req_ptr_t rreq, trace_id_t tid) { if (!rreq) { auto rreq = repl_req_ptr_t(new repl_req_ctx{}); } { @@ -319,12 +320,16 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& data.size ? 
journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true /* is_proposer */, header, key, data.size, m_listener); + RD_LOGD("traceID [{}], repl_key [{}], header size [{}] bytes, user_key size [{}] bytes, data size " + "[{}] bytes", + tid, rreq->rkey(), header.size(), key.size(), data.size); + // Add the request to the repl_dev_rreq map, it will be accessed throughout the life cycle of this request auto const [it, happened] = m_repl_key_req_map.emplace(rreq->rkey(), rreq); RD_DBG_ASSERT(happened, "Duplicate repl_key={} found in the map", rreq->rkey().to_string()); if (status != ReplServiceError::OK) { - RD_LOGD("Initializing rreq failed error={}, failing this req", status); + RD_LOGD("traceID [{}], Initializing rreq failed error={}, failing this req", tid, status); handle_error(rreq, status); return; } @@ -1071,7 +1076,7 @@ repl_req_ptr_t RaftReplDev::repl_key_to_req(repl_key const& rkey) const { // async_read and async_free_blks graceful shutdown will be handled by data_service. folly::Future< std::error_code > RaftReplDev::async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, - bool part_of_batch) { + bool part_of_batch, trace_id_t tid) { if (is_stopping()) { LOGINFO("repl dev is being shutdown!"); return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); @@ -1079,7 +1084,7 @@ folly::Future< std::error_code > RaftReplDev::async_read(MultiBlkId const& bid, return data_service().async_read(bid, sgs, size, part_of_batch); } -folly::Future< std::error_code > RaftReplDev::async_free_blks(int64_t, MultiBlkId const& bid) { +folly::Future< std::error_code > RaftReplDev::async_free_blks(int64_t, MultiBlkId const& bid, trace_id_t tid) { // TODO: For timeline consistency required, we should retain the blkid that is changed and write that to another // journal. 
if (is_stopping()) { diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index cc9a30822..62a0c635c 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -156,6 +156,7 @@ class RaftReplDev : public ReplDev, nuraft_mesg::Manager& m_msg_mgr; group_id_t m_group_id; // Replication Group id std::string m_rdev_name; // Short name for the group for easy debugging + std::string m_identify_str; // combination of rdev_name:group_id replica_id_t m_my_repl_id; // This replica's uuid int32_t m_raft_server_id; // Server ID used by raft (unique within raft group) shared< ReplLogStore > m_data_journal; @@ -205,10 +206,10 @@ class RaftReplDev : public ReplDev, //////////////// All ReplDev overrides/implementation /////////////////////// void async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, - repl_req_ptr_t ctx) override; + repl_req_ptr_t ctx, trace_id_t tid = 0) override; folly::Future< std::error_code > async_read(MultiBlkId const& blkid, sisl::sg_list& sgs, uint32_t size, - bool part_of_batch = false) override; - folly::Future< std::error_code > async_free_blks(int64_t lsn, MultiBlkId const& blkid) override; + bool part_of_batch = false, trace_id_t tid = 0) override; + folly::Future< std::error_code > async_free_blks(int64_t lsn, MultiBlkId const& blkid, trace_id_t tid = 0) override; AsyncReplResult<> become_leader() override; bool is_leader() const override; replica_id_t get_leader_id() const override; @@ -216,7 +217,8 @@ class RaftReplDev : public ReplDev, std::set< replica_id_t > get_active_peers() const; group_id_t group_id() const override { return m_group_id; } std::string group_id_str() const { return boost::uuids::to_string(m_group_id); } - std::string rdev_name() const { return m_rdev_name; } + std::string rdev_name() const { return m_rdev_name; }; + std::string identify_str() const { return m_identify_str; }; std::string my_replica_id_str() const { return boost::uuids::to_string(m_my_repl_id); } uint32_t get_blk_size() const override; repl_lsn_t get_last_commit_lsn() const override { return m_commit_upto_lsn.load(); } diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index 12619b204..ceb1f4525 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -429,6 +429,6 @@ nuraft::ptr< nuraft::snapshot > RaftStateMachine::last_snapshot() { void RaftStateMachine::free_user_snp_ctx(void*& user_snp_ctx) { m_rd.m_listener->free_user_snp_ctx(user_snp_ctx); } -std::string RaftStateMachine::rdev_name() const { return m_rd.rdev_name(); } +std::string RaftStateMachine::identify_str() const { return m_rd.identify_str(); } } // namespace homestore diff --git a/src/lib/replication/repl_dev/raft_state_machine.h b/src/lib/replication/repl_dev/raft_state_machine.h index 2b50fea7b..97de4ec3b 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.h +++ b/src/lib/replication/repl_dev/raft_state_machine.h @@ -24,43 +24,33 @@ namespace homestore { class ReplicaSetImpl; class StateMachineStore; -#define RD_LOG(level, msg, ...) \ - LOG##level##MOD_FMT(replication, ([&](fmt::memory_buffer& buf, const char* msgcb, auto&&... 
args) -> bool { \ - fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}:{}] "}, \ - fmt::make_format_args(file_name(__FILE__), __LINE__)); \ - fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}={}] "}, \ - fmt::make_format_args("rd", rdev_name())); \ - fmt::vformat_to(fmt::appender{buf}, fmt::string_view{msgcb}, \ - fmt::make_format_args(std::forward< decltype(args) >(args)...)); \ - return true; \ - }), \ - msg, ##__VA_ARGS__); +#define RD_LOG(level, msg, ...) LOG##level##MOD(replication, "[{}] " msg, identify_str(), ##__VA_ARGS__) #define RD_ASSERT_CMP(assert_type, val1, cmp, val2, ...) \ { \ assert_type##_ASSERT_CMP( \ val1, cmp, val2, \ [&](fmt::memory_buffer& buf, const char* const msgcb, auto&&... args) -> bool { \ - fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}:{}] "}, \ - fmt::make_format_args(file_name(__FILE__), __LINE__)); \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}:{}:{}] "}, \ + fmt::make_format_args(file_name(__FILE__), __LINE__, __FUNCTION__)); \ sisl::logging::default_cmp_assert_formatter(buf, msgcb, std::forward< decltype(args) >(args)...); \ - fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}={}] "}, \ - fmt::make_format_args("rd", rdev_name())); \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}] "}, fmt::make_format_args(identify_str())); \ return true; \ }, \ ##__VA_ARGS__); \ } #define RD_ASSERT(assert_type, cond, ...) \ { \ - assert_type##_ASSERT_FMT(cond, \ - ([&](fmt::memory_buffer& buf, const char* const msgcb, auto&&... args) -> bool { \ - fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}={}] "}, \ - fmt::make_format_args("rd", rdev_name())); \ - fmt::vformat_to(fmt::appender{buf}, fmt::string_view{msgcb}, \ - fmt::make_format_args(std::forward< decltype(args) >(args)...)); \ - return true; \ - }), \ - ##__VA_ARGS__); \ + assert_type##_ASSERT_FMT( \ + cond, ([&](fmt::memory_buffer& buf, const char* const msgcb, auto&&... args) -> bool { \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}:{}:{}] "}, \ + fmt::make_format_args(file_name(__FILE__), __LINE__, __FUNCTION__)); \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}] "}, fmt::make_format_args(identify_str())); \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{msgcb}, \ + fmt::make_format_args(std::forward< decltype(args) >(args)...)); \ + return true; \ + }), \ + ##__VA_ARGS__); \ } #define RD_DBG_ASSERT(cond, ...) RD_ASSERT(DEBUG, cond, ##__VA_ARGS__) @@ -139,7 +129,7 @@ class RaftStateMachine : public nuraft::state_machine { void iterate_repl_reqs(std::function< void(int64_t, repl_req_ptr_t rreq) > const& cb); - std::string rdev_name() const; + std::string identify_str() const; int64_t reset_next_batch_size_hint(int64_t new_hint); int64_t inc_next_batch_size_hint(); diff --git a/src/lib/replication/repl_dev/solo_repl_dev.cpp b/src/lib/replication/repl_dev/solo_repl_dev.cpp index 72d4fda48..b06347cb9 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.cpp +++ b/src/lib/replication/repl_dev/solo_repl_dev.cpp @@ -28,7 +28,7 @@ SoloReplDev::SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existi } void SoloReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, - repl_req_ptr_t rreq) { + repl_req_ptr_t rreq, trace_id_t tid) { if (!rreq) { auto rreq = repl_req_ptr_t(new repl_req_ctx{}); } auto status = rreq->init(repl_key{.server_id = 0, .term = 1, .dsn = 1}, value.size ? 
                                 journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true,
@@ -94,11 +94,11 @@ void SoloReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx
 }
 
 folly::Future< std::error_code > SoloReplDev::async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size,
-                                                         bool part_of_batch) {
+                                                         bool part_of_batch, trace_id_t tid) {
     return data_service().async_read(bid, sgs, size, part_of_batch);
 }
 
-folly::Future< std::error_code > SoloReplDev::async_free_blks(int64_t, MultiBlkId const& bid) {
+folly::Future< std::error_code > SoloReplDev::async_free_blks(int64_t, MultiBlkId const& bid, trace_id_t tid) {
     return data_service().async_free_blk(bid);
 }
 
@@ -111,7 +111,6 @@ void SoloReplDev::cp_flush(CP*) {
     m_rd_sb.write();
 }
 
-void SoloReplDev::cp_cleanup(CP*) { /* m_data_journal->truncate(m_rd_sb->checkpoint_lsn); */
-}
+void SoloReplDev::cp_cleanup(CP*) { /* m_data_journal->truncate(m_rd_sb->checkpoint_lsn); */ }
 
 } // namespace homestore

diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h
index 88d6174c7..78cace9f8 100644
--- a/src/lib/replication/repl_dev/solo_repl_dev.h
+++ b/src/lib/replication/repl_dev/solo_repl_dev.h
@@ -42,12 +42,12 @@ class SoloReplDev : public ReplDev {
 
     // TODO: implement graceful shutdown for solo repl dev
     void async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value,
-                           repl_req_ptr_t ctx) override;
+                           repl_req_ptr_t ctx, trace_id_t tid = 0) override;
 
     folly::Future< std::error_code > async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size,
-                                                bool part_of_batch = false) override;
+                                                bool part_of_batch = false, trace_id_t tid = 0) override;
 
-    folly::Future< std::error_code > async_free_blks(int64_t lsn, MultiBlkId const& blkid) override;
+    folly::Future< std::error_code > async_free_blks(int64_t lsn, MultiBlkId const& blkid, trace_id_t tid = 0) override;
 
     AsyncReplResult<> become_leader() override { return make_async_error(ReplServiceError::OK); }
     bool is_leader() const override { return true; }

From cbba03b59c4ce855b00c704072083ebfe1853d16 Mon Sep 17 00:00:00 2001
From: Xiaoxi Chen
Date: Wed, 2 Apr 2025 11:29:21 +0800
Subject: [PATCH 089/170] Add traceID into repl_key

The traceID replicates to followers through the log and the data
channel (push_data). The goal is that we can use a single traceID to
get logs across replicas for a request.

Signed-off-by: Xiaoxi Chen
---
 src/include/homestore/replication/repl_dev.h        |  5 ++-
 src/lib/replication/push_data_rpc.fbs               |  1 +
 src/lib/replication/repl_dev/common.cpp             |  1 +
 src/lib/replication/repl_dev/common.h               |  1 +
 src/lib/replication/repl_dev/raft_repl_dev.cpp      | 35 ++++++++++++------
 src/lib/replication/repl_dev/raft_state_machine.cpp |  6 ++--
 src/lib/replication/repl_dev/solo_repl_dev.cpp      |  2 +-
 src/tests/test_common/raft_repl_test_base.hpp       |  2 +-
 8 files changed, 36 insertions(+), 17 deletions(-)

diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h
index 7ef61ee47..d020c9794 100644
--- a/src/include/homestore/replication/repl_dev.h
+++ b/src/include/homestore/replication/repl_dev.h
@@ -57,6 +57,7 @@ struct repl_key {
     int32_t server_id{0}; // Server Id which this req is originated from
     uint64_t term;        // RAFT term number
     uint64_t dsn{0};      // Data sequence number to tie the data with the raft journal entry
+    uint64_t traceID{0};  // tracing ID provided by application that connects logs.
 
     struct Hasher {
         size_t operator()(repl_key const& rk) const {
@@ -67,7 +68,8 @@ struct repl_key {
     bool operator==(repl_key const& other) const = default;
 
     std::string to_string() const {
-        return fmt::format("server={}, term={}, dsn={}, hash={}", server_id, term, dsn, Hasher()(*this));
+        return fmt::format("server={}, term={}, dsn={}, hash={}, traceID={}", server_id, term, dsn, Hasher()(*this),
+                           traceID);
     }
 };
@@ -121,6 +123,7 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost::
     repl_key const& rkey() const { return m_rkey; }
     uint64_t dsn() const { return m_rkey.dsn; }
     uint64_t term() const { return m_rkey.term; }
+    uint64_t traceID() const { return m_rkey.traceID; }
     int64_t lsn() const { return m_lsn; }
     bool is_proposer() const { return m_is_proposer; }
     journal_type_t op_code() const { return m_op_code; }

diff --git a/src/lib/replication/push_data_rpc.fbs b/src/lib/replication/push_data_rpc.fbs
index 1f6d20546..279fefcb5 100644
--- a/src/lib/replication/push_data_rpc.fbs
+++ b/src/lib/replication/push_data_rpc.fbs
@@ -2,6 +2,7 @@ native_include "boost/uuid/uuid.hpp";
 namespace homestore;
 
 table PushDataRequest {
+    traceID: uint64;            // traceID for the REQ
     issuer_replica_id : int32;  // Replica id of the issuer
     raft_term : uint64;         // Raft term number
     dsn : uint64;               // Data Sequence number

diff --git a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp
index b733c19c0..388d95015 100644
--- a/src/lib/replication/repl_dev/common.cpp
+++ b/src/lib/replication/repl_dev/common.cpp
@@ -54,6 +54,7 @@ void repl_req_ctx::create_journal_entry(bool is_raft_buf, int32_t server_id) {
     }
 
     m_journal_entry->code = m_op_code;
+    m_journal_entry->traceID = m_rkey.traceID;
     m_journal_entry->server_id = server_id;
     m_journal_entry->dsn = m_rkey.dsn;
     m_journal_entry->user_header_size = m_header.size();

diff --git a/src/lib/replication/repl_dev/common.h b/src/lib/replication/repl_dev/common.h
index cb8a57931..880a8d30f 100644
--- a/src/lib/replication/repl_dev/common.h
+++ b/src/lib/replication/repl_dev/common.h
@@ -35,6 +35,7 @@ struct repl_journal_entry {
     uint16_t minor_version{JOURNAL_ENTRY_MINOR};
 
     journal_type_t code;
+    uint64_t traceID;  // traceID provided by application, mostly for consolidating logs.
     int32_t server_id; // Server id from where journal entry is originated
     uint64_t dsn;      // Data seq number
     uint32_t user_header_size;

diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp
index 27561d632..1c65f2550 100644
--- a/src/lib/replication/repl_dev/raft_repl_dev.cpp
+++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp
@@ -196,9 +196,11 @@ AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_
     members.replica_in = member_in;
 
     sisl::blob header(r_cast< uint8_t* >(&members), sizeof(replace_members_ctx));
-    rreq->init(
-        repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)},
-        journal_type_t::HS_CTRL_REPLACE, true, header, sisl::blob{}, 0, m_listener);
+    rreq->init(repl_key{.server_id = server_id(),
+                        .term = raft_server()->get_term(),
+                        .dsn = m_next_dsn.fetch_add(1),
+                        .traceID = 0},
+               journal_type_t::HS_CTRL_REPLACE, true, header, sisl::blob{}, 0, m_listener);
 
     auto err = m_state_machine->propose_to_raft(std::move(rreq));
     if (err != ReplServiceError::OK) {
@@ -267,7 +269,10 @@ folly::SemiFuture< ReplServiceError > RaftReplDev::destroy_group() {
 
     // here, we set the dsn to a new one , which is definitely unique in the follower, so that the new rreq will not
     // have a conflict with the old rreq.
-    rreq->init(repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)},
+    rreq->init(repl_key{.server_id = server_id(),
+                        .term = raft_server()->get_term(),
+                        .dsn = m_next_dsn.fetch_add(1),
+                        .traceID = std::numeric_limits< uint64_t >::max()},
                journal_type_t::HS_CTRL_DESTROY, true, sisl::blob{}, sisl::blob{}, 0, m_listener);
 
     auto err = m_state_machine->propose_to_raft(std::move(rreq));
@@ -315,14 +320,16 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const&
         }
     }
 
-    auto status = rreq->init(
-        repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)},
-        data.size ? journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true /* is_proposer */, header,
-        key, data.size, m_listener);
+    auto status = rreq->init(repl_key{.server_id = server_id(),
+                                      .term = raft_server()->get_term(),
+                                      .dsn = m_next_dsn.fetch_add(1),
+                                      .traceID = tid},
+                             data.size ? journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED,
+                             true /* is_proposer */, header, key, data.size, m_listener);
 
     RD_LOGD("traceID [{}], repl_key [{}], header size [{}] bytes, user_key size [{}] bytes, data size "
             "[{}] bytes",
-            tid, rreq->rkey(), header.size(), key.size(), data.size);
+            tid, rreq->rkey(), header.size(), key.size(), data.size);
 
     // Add the request to the repl_dev_rreq map, it will be accessed throughout the life cycle of this request
     auto const [it, happened] = m_repl_key_req_map.emplace(rreq->rkey(), rreq);
@@ -391,7 +398,7 @@ void RaftReplDev::push_data_to_all_followers(repl_req_ptr_t rreq, sisl::sg_list
 
     // Prepare the rpc request packet with all repl_reqs details
     builder.FinishSizePrefixed(CreatePushDataRequest(
-        builder, server_id(), rreq->term(), rreq->dsn(),
+        builder, rreq->traceID(), server_id(), rreq->term(), rreq->dsn(),
         builder.CreateVector(rreq->header().cbytes(), rreq->header().size()),
         builder.CreateVector(rreq->key().cbytes(), rreq->key().size()), data.size, get_time_since_epoch_ms()));
@@ -448,7 +455,10 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d
     }
     sisl::blob header = sisl::blob{push_req->user_header()->Data(), push_req->user_header()->size()};
     sisl::blob key = sisl::blob{push_req->user_key()->Data(), push_req->user_key()->size()};
-    repl_key rkey{.server_id = push_req->issuer_replica_id(), .term = push_req->raft_term(), .dsn = push_req->dsn()};
+    repl_key rkey{.server_id = push_req->issuer_replica_id(),
+                  .term = push_req->raft_term(),
+                  .dsn = push_req->dsn(),
+                  .traceID = push_req->traceID()};
     auto const req_orig_time_ms = push_req->time_ms();
 
     RD_LOGD("Data Channel: PushData received: time diff={} ms.", get_elapsed_time_ms(req_orig_time_ms));
@@ -1583,7 +1593,8 @@ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx
                                       jentry->value_size};
     };
 
-    repl_key const rkey{.server_id = jentry->server_id, .term = lentry->get_term(), .dsn = jentry->dsn};
+    repl_key const rkey{
+        .server_id = jentry->server_id, .term = lentry->get_term(), .dsn = jentry->dsn, .traceID = jentry->traceID};
 
     auto const [it, happened] = m_repl_key_req_map.try_emplace(rkey, repl_req_ptr_t(new repl_req_ctx()));
     RD_DBG_ASSERT((it != m_repl_key_req_map.end()), "Unexpected error in map_repl_key_to_req");

diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp
index ceb1f4525..21dd04886 100644
--- a/src/lib/replication/repl_dev/raft_state_machine.cpp
+++ b/src/lib/replication/repl_dev/raft_state_machine.cpp
@@ -74,7 +74,8 @@ repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entr
                                       jentry->value_size};
     };
 
-    repl_key const rkey{.server_id = jentry->server_id, .term = lentry.get_term(), .dsn = jentry->dsn};
+    repl_key const rkey{
+        .server_id = jentry->server_id, .term = lentry.get_term(), .dsn = jentry->dsn, .traceID = jentry->traceID};
 
     // Create a new rreq (or) Pull rreq from the map given the repl_key, header and key. Any new rreq will
     // allocate the blks (in case of large data). We will use the new blkid and transform the current journal entry's
@@ -150,7 +151,8 @@ repl_req_ptr_t RaftStateMachine::localize_journal_entry_finish(nuraft::log_entry
     RELEASE_ASSERT_EQ(jentry->major_version, repl_journal_entry::JOURNAL_ENTRY_MAJOR,
                       "Mismatched version of journal entry received from RAFT peer");
 
-    repl_key rkey{.server_id = jentry->server_id, .term = lentry.get_term(), .dsn = jentry->dsn};
+    repl_key rkey{
+        .server_id = jentry->server_id, .term = lentry.get_term(), .dsn = jentry->dsn, .traceID = jentry->traceID};
 
     auto rreq = m_rd.repl_key_to_req(rkey);
     if ((rreq == nullptr) || (rreq->is_localize_pending())) {

diff --git a/src/lib/replication/repl_dev/solo_repl_dev.cpp b/src/lib/replication/repl_dev/solo_repl_dev.cpp
index b06347cb9..bc278303a 100644
--- a/src/lib/replication/repl_dev/solo_repl_dev.cpp
+++ b/src/lib/replication/repl_dev/solo_repl_dev.cpp
@@ -30,7 +30,7 @@ SoloReplDev::SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existi
 void SoloReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value,
                                     repl_req_ptr_t rreq, trace_id_t tid) {
     if (!rreq) { auto rreq = repl_req_ptr_t(new repl_req_ctx{}); }
-    auto status = rreq->init(repl_key{.server_id = 0, .term = 1, .dsn = 1},
+    auto status = rreq->init(repl_key{.server_id = 0, .term = 1, .dsn = 1, .traceID = tid},
                              value.size ? journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true,
                              header, key, value.size, m_listener);
     HS_REL_ASSERT_EQ(status, ReplServiceError::OK, "Error in allocating local blks");

diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp
index 11c6d6bc2..8fe72ac1d 100644
--- a/src/tests/test_common/raft_repl_test_base.hpp
+++ b/src/tests/test_common/raft_repl_test_base.hpp
@@ -357,7 +357,7 @@ class TestReplicatedDB : public homestore::ReplDevListener {
             test_common::HSTestHelper::create_sgs(data_size, max_size_per_iov, req->jheader.data_pattern);
         }
 
-        repl_dev()->async_alloc_write(req->header_blob(), req->key_blob(), req->write_sgs, req);
+        repl_dev()->async_alloc_write(req->header_blob(), req->key_blob(), req->write_sgs, req, s_uniq_num);
     }
 
     void validate_db_data() {

From d8a6d992ac28b824c4152f1a80b36676a242c4ce Mon Sep 17 00:00:00 2001
From: Xiaoxi Chen
Date: Wed, 2 Apr 2025 15:35:48 +0800
Subject: [PATCH 090/170] Adopt traceID for all RD_LOG

Also adjust some of the logging levels.
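As an illustration of the new call convention (a sketch only, not part of
the diff below; the macro changes themselves live in raft_state_machine.h
and rreq is assumed to be a repl_req_ptr_t in scope):

    // Request-scoped paths reuse the traceID carried in the repl_key.
    RD_LOGD(rreq->traceID(), "Data write completed for rreq=[{}]", rreq->to_compact_string());

    // Paths that have no request context pass NO_TRACE_ID explicitly.
    RD_LOGI(NO_TRACE_ID, "Starting data channel, group_id={}", group_id_str());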
Signed-off-by: Xiaoxi Chen
---
 src/include/homestore/replication/repl_dev.h         |  13 +-
 .../replication/log_store/repl_log_store.cpp         |  18 +-
 src/lib/replication/repl_dev/common.cpp              |  12 +-
 src/lib/replication/repl_dev/common.h                |   6 +-
 .../replication/repl_dev/raft_repl_dev.cpp           | 382 ++++++++++--------
 src/lib/replication/repl_dev/raft_repl_dev.h         |  10 +-
 .../repl_dev/raft_state_machine.cpp                  |  52 ++-
 .../replication/repl_dev/raft_state_machine.h        |  16 +-
 8 files changed, 269 insertions(+), 240 deletions(-)

diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h
index d020c9794..5b0395f44 100644
--- a/src/include/homestore/replication/repl_dev.h
+++ b/src/include/homestore/replication/repl_dev.h
@@ -54,10 +54,10 @@ static constexpr uint64_t HOMESTORE_RESYNC_DATA_MAGIC = 0xa65dbd27c213f327;
 static constexpr uint32_t HOMESTORE_RESYNC_DATA_PROTOCOL_VERSION_V1 = 0x01;
 
 struct repl_key {
-    int32_t server_id{0}; // Server Id which this req is originated from
-    uint64_t term;        // RAFT term number
-    uint64_t dsn{0};      // Data sequence number to tie the data with the raft journal entry
-    uint64_t traceID{0};  // tracing ID provided by application that connects logs.
+    int32_t server_id{0};  // Server Id which this req is originated from
+    uint64_t term;         // RAFT term number
+    uint64_t dsn{0};       // Data sequence number to tie the data with the raft journal entry
+    trace_id_t traceID{0}; // tracing ID provided by application that connects logs.
 
     struct Hasher {
         size_t operator()(repl_key const& rk) const {
@@ -68,8 +68,7 @@ struct repl_key {
     bool operator==(repl_key const& other) const = default;
 
     std::string to_string() const {
-        return fmt::format("server={}, term={}, dsn={}, hash={}, traceID={}", server_id, term, dsn, Hasher()(*this),
-                           traceID);
+        return fmt::format("server={}, term={}, dsn={}, hash={}", server_id, term, dsn, Hasher()(*this));
     }
 };
@@ -123,7 +122,7 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost::
     repl_key const& rkey() const { return m_rkey; }
     uint64_t dsn() const { return m_rkey.dsn; }
     uint64_t term() const { return m_rkey.term; }
-    uint64_t traceID() const { return m_rkey.traceID; }
+    trace_id_t traceID() const { return m_rkey.traceID; }
     int64_t lsn() const { return m_lsn; }
     bool is_proposer() const { return m_is_proposer; }
     journal_type_t op_code() const { return m_op_code; }

diff --git a/src/lib/replication/log_store/repl_log_store.cpp b/src/lib/replication/log_store/repl_log_store.cpp
index 8fa5c0f18..ca62c3197 100644
--- a/src/lib/replication/log_store/repl_log_store.cpp
+++ b/src/lib/replication/log_store/repl_log_store.cpp
@@ -10,7 +10,7 @@ uint64_t ReplLogStore::append(nuraft::ptr< nuraft::log_entry >& entry) {
     // We don't want to transform anything that is not an app log
     if (entry->get_val_type() != nuraft::log_val_type::app_log || entry->get_buf_ptr()->size() == 0) {
         ulong lsn = HomeRaftLogStore::append(entry);
-        RD_LOGD("append entry term={}, log_val_type={} lsn={} size={}", entry->get_term(),
+        RD_LOGD(NO_TRACE_ID, "Non-APP log: append entry term={}, log_val_type={} lsn={} size={}", entry->get_term(),
                 static_cast< uint32_t >(entry->get_val_type()), lsn, entry->get_buf().size());
         return lsn;
     }
@@ -19,7 +19,7 @@ uint64_t ReplLogStore::append(nuraft::ptr< nuraft::log_entry >& entry) {
     ulong lsn = HomeRaftLogStore::append(entry);
     m_sm.link_lsn_to_req(rreq, int64_cast(lsn));
 
-    RD_LOGD("Raft Channel: Received append log entry rreq=[{}]", rreq->to_compact_string());
+    RD_LOGT(rreq->traceID(), "Raft Channel: Received append log entry rreq=[{}]", rreq->to_compact_string());
     return lsn;
 }
@@ -33,7 +33,7 @@ void ReplLogStore::write_at(ulong index, nuraft::ptr< nuraft::log_entry >& entry
     repl_req_ptr_t rreq = m_sm.localize_journal_entry_finish(*entry);
     HomeRaftLogStore::write_at(index, entry);
     m_sm.link_lsn_to_req(rreq, int64_cast(index));
-    RD_LOGD("Raft Channel: Received write_at log entry rreq=[{}]", rreq->to_compact_string());
+    RD_LOGT(rreq->traceID(), "Raft Channel: Received write_at log entry rreq=[{}]", rreq->to_compact_string());
 }
 
 void ReplLogStore::end_of_append_batch(ulong start_lsn, ulong count) {
@@ -54,8 +54,8 @@ void ReplLogStore::end_of_append_batch(ulong start_lsn, ulong count) {
         }
     }
 
-    RD_LOGT("Raft Channel: end_of_append_batch start_lsn={} count={} num_data_to_be_written={} {}", start_lsn, count,
-            reqs->size(), proposer_reqs->size());
+    RD_LOGT(NO_TRACE_ID, "Raft Channel: end_of_append_batch start_lsn={} count={} num_data_to_be_written={} {}",
+            start_lsn, count, reqs->size(), proposer_reqs->size());
 
     if (!reqs->empty()) {
         // Check the map if data corresponding to all of these requsts have been received and written. If not, schedule
@@ -85,7 +85,9 @@ void ReplLogStore::end_of_append_batch(ulong start_lsn, ulong count) {
     // so skip waiting data written and mark reqs as flushed here.
     for (auto const& rreq : *proposer_reqs) {
         if (rreq) {
-            RD_LOGT("Raft Channel: end_of_append_batch, I am proposer for lsn {}, only flushed log for it", rreq->lsn());
+            RD_LOGT(rreq->traceID(),
+                    "Raft Channel: end_of_append_batch, I am proposer for lsn {}, only flushed log for it",
+                    rreq->lsn());
             rreq->add_state(repl_req_state_t::LOG_FLUSHED);
         }
     }
@@ -95,7 +97,7 @@ void ReplLogStore::end_of_append_batch(ulong start_lsn, ulong count) {
         auto rreq = m_sm.lsn_to_req(lsn);
         if (rreq != nullptr) {
             if (rreq->has_state(repl_req_state_t::ERRORED)) {
-                RD_LOGE("Raft Channel: rreq=[{}] met some errors before", rreq->to_compact_string());
+                RD_LOGE(rreq->traceID(), "Raft Channel: rreq=[{}] met some errors before", rreq->to_compact_string());
                 continue;
             }
             rreq->set_is_volatile(false);
@@ -110,7 +112,7 @@ std::string ReplLogStore::rdev_name() const { return m_rd.rdev_name(); }
 std::string ReplLogStore::identify_str() const { return m_rd.identify_str(); }
 
 bool ReplLogStore::compact(ulong compact_upto_lsn) {
-    RD_LOG(DEBUG, "Raft Channel: compact_to_lsn={}", compact_upto_lsn);
+    RD_LOGD(NO_TRACE_ID, "Raft Channel: compact_to_lsn={}", compact_upto_lsn);
     m_rd.on_compact(compact_upto_lsn);
     return HomeRaftLogStore::compact(compact_upto_lsn);
 }

diff --git a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp
index 388d95015..5d0f262f0 100644
--- a/src/lib/replication/repl_dev/common.cpp
+++ b/src/lib/replication/repl_dev/common.cpp
@@ -31,7 +31,10 @@ ReplServiceError repl_req_ctx::init(repl_key rkey, journal_type_t op_code, bool
     std::unique_lock< std::mutex > lg(m_state_mtx);
     if (has_linked_data() && !has_state(repl_req_state_t::BLK_ALLOCATED)) {
         auto alloc_status = alloc_local_blks(listener, data_size);
-        if (alloc_status != ReplServiceError::OK) { LOGERROR("Allocate blk for rreq failed error={}", alloc_status); }
+        if (alloc_status != ReplServiceError::OK) {
+            LOGERRORMOD(replication, "[traceID={}] Allocate blk for rreq failed error={}", m_rkey.traceID,
+                        alloc_status);
+        }
         return alloc_status;
     }
     return ReplServiceError::OK;
@@ -105,7 +108,7 @@ ReplServiceError repl_req_ctx::alloc_local_blks(cshared< ReplDevListener >& list
 
     if (hints_result.value().committed_blk_id.has_value()) {
         //if the committed_blk_id is already present, use it and skip allocation and commitment
-        LOGINFO("For Repl_key=[{}] data already exists, skip", rkey().to_string());
+        LOGINFOMOD(replication, "[traceID={}] For Repl_key=[{}] data already exists, skip", rkey().traceID, rkey().to_string());
         m_local_blkid = hints_result.value().committed_blk_id.value();
         add_state(repl_req_state_t::BLK_ALLOCATED);
         add_state(repl_req_state_t::DATA_RECEIVED);
@@ -119,6 +122,7 @@ ReplServiceError repl_req_ctx::alloc_local_blks(cshared< ReplDevListener >& list
     auto status = data_service().alloc_blks(sisl::round_up(uint32_cast(data_size), data_service().get_blk_size()),
                                             hints_result.value(), m_local_blkid);
     if (status != BlkAllocStatus::SUCCESS) {
+        LOGWARNMOD(replication, "[traceID={}] block allocation failure, repl_key=[{}], status=[{}]", rkey().traceID, rkey(), status);
        DEBUG_ASSERT_EQ(status, BlkAllocStatus::SUCCESS, "Unable to allocate blks");
         return ReplServiceError::NO_SPACE_LEFT;
     }
@@ -134,7 +138,7 @@ void repl_req_ctx::set_lsn(int64_t lsn) {
               "Changing lsn for request={} on the fly can cause race condition, not expected. lsn {}, m_lsn {}",
               to_string(), lsn, m_lsn);
     m_lsn = lsn;
-    LOGTRACEMOD(replication, "Setting lsn={} for request={}", lsn, to_string());
+    LOGTRACEMOD(replication, "[traceID={}] Setting lsn={} for request={}", rkey().traceID, lsn, to_string());
 }
 
 bool repl_req_ctx::save_pushed_data(intrusive< sisl::GenericRpcData > const& pushed_data, uint8_t const* data,
@@ -198,7 +202,7 @@ void repl_req_ctx::release_data() {
     // explicitly clear m_buf_for_unaligned_data as unaligned pushdata/fetchdata will be saved here
     m_buf_for_unaligned_data = sisl::io_blob_safe{};
     if (m_pushed_data) {
-        LOGTRACEMOD(replication, "m_pushed_data addr={}, m_rkey={}, m_lsn={}",
+        LOGTRACEMOD(replication, "[traceID={}] m_pushed_data addr={}, m_rkey={}, m_lsn={}", rkey().traceID,
                     static_cast< void* >(m_pushed_data.get()), m_rkey.to_string(), m_lsn);
         m_pushed_data->send_response();
         m_pushed_data = nullptr;

diff --git a/src/lib/replication/repl_dev/common.h b/src/lib/replication/repl_dev/common.h
index 880a8d30f..cf8f53759 100644
--- a/src/lib/replication/repl_dev/common.h
+++ b/src/lib/replication/repl_dev/common.h
@@ -35,9 +35,9 @@ struct repl_journal_entry {
     uint16_t minor_version{JOURNAL_ENTRY_MINOR};
 
     journal_type_t code;
-    uint64_t traceID;  // traceID provided by application, mostly for consolidating logs.
-    int32_t server_id; // Server id from where journal entry is originated
-    uint64_t dsn;      // Data seq number
+    trace_id_t traceID; // traceID provided by application, mostly for consolidating logs.
+    int32_t server_id;  // Server id from where journal entry is originated
+    uint64_t dsn;       // Data seq number
     uint32_t user_header_size;
     uint32_t key_size;
     uint32_t value_size;

diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp
index 1c65f2550..fcf49becc 100644
--- a/src/lib/replication/repl_dev/raft_repl_dev.cpp
+++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp
@@ -82,26 +82,26 @@ RaftReplDev::RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk
     }
     m_identify_str = m_rdev_name + ":" + group_id_str();
 
-    RD_LOG(INFO,
-           "Started {} RaftReplDev group_id={}, replica_id={}, raft_server_id={} commited_lsn={}, "
-           "compact_lsn={}, checkpoint_lsn:{}, next_dsn={} "
-           "log_dev={} log_store={}",
-           (load_existing ? "Existing" : "New"), group_id_str(), my_replica_id_str(), m_raft_server_id,
-           m_commit_upto_lsn.load(), m_compact_lsn.load(), m_rd_sb->checkpoint_lsn, m_next_dsn.load(),
-           m_rd_sb->logdev_id, m_rd_sb->logstore_id);
+    RD_LOGI(NO_TRACE_ID,
+            "Started {} RaftReplDev group_id={}, replica_id={}, raft_server_id={} committed_lsn={}, "
+            "compact_lsn={}, checkpoint_lsn:{}, next_dsn={} "
+            "log_dev={} log_store={}",
+            (load_existing ? "Existing" : "New"), group_id_str(), my_replica_id_str(), m_raft_server_id,
+            m_commit_upto_lsn.load(), m_compact_lsn.load(), m_rd_sb->checkpoint_lsn, m_next_dsn.load(),
+            m_rd_sb->logdev_id, m_rd_sb->logstore_id);
 }
 
 bool RaftReplDev::bind_data_service() {
-    RD_LOG(INFO, "Starting data channel, group_id={}, replica_id={}", group_id_str(), my_replica_id_str());
+    RD_LOGI(NO_TRACE_ID, "Starting data channel, group_id={}, replica_id={}", group_id_str(), my_replica_id_str());
     bool success = false;
 #ifdef _PRERELEASE
     success =
         m_msg_mgr.bind_data_service_request(PUSH_DATA, m_group_id, [this](intrusive< sisl::GenericRpcData >& rpc_data) {
             if (iomgr_flip::instance()->delay_flip("slow_down_data_channel", [this, rpc_data]() mutable {
-                    RD_LOGI("Resuming after slow down data channel flip");
+                    RD_LOGI(NO_TRACE_ID, "Resuming after slow down data channel flip");
                     on_push_data_received(rpc_data);
                 })) {
-                RD_LOGI("Slow down data channel flip is enabled, scheduling to call later");
+                RD_LOGI(NO_TRACE_ID, "Slow down data channel flip is enabled, scheduling to call later");
             } else {
                 on_push_data_received(rpc_data);
             }
@@ -111,13 +111,13 @@ bool RaftReplDev::bind_data_service() {
         m_msg_mgr.bind_data_service_request(PUSH_DATA, m_group_id, bind_this(RaftReplDev::on_push_data_received, 1));
 #endif
     if (!success) {
-        RD_LOGE("Failed to bind data service request for PUSH_DATA");
+        RD_LOGE(NO_TRACE_ID, "Failed to bind data service request for PUSH_DATA");
         return false;
     }
     success = m_msg_mgr.bind_data_service_request(FETCH_DATA, m_group_id,
                                                   bind_this(RaftReplDev::on_fetch_data_received, 1));
     if (!success) {
-        RD_LOGE("Failed to bind data service request for FETCH_DATA");
+        RD_LOGE(NO_TRACE_ID, "Failed to bind data service request for FETCH_DATA");
         return false;
     }
     return true;
@@ -137,6 +137,8 @@ bool RaftReplDev::join_group() {
 
 AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_out,
                                               const replica_member_info& member_in, uint32_t commit_quorum) {
+    // Fixme: traceID for replace member
+    uint64_t trace_id = 0;
     if (is_stopping()) {
         LOGINFO("repl dev is being shutdown!");
@@ -144,12 +146,12 @@ AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_
     }
     incr_pending_request_num();
 
-    LOGINFO("Replace member group_id={} member_out={} member_in={}", group_id_str(),
-            boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id));
+    RD_LOGI(trace_id, "Replace member, member_out={} member_in={}", boost::uuids::to_string(member_out.id),
+            boost::uuids::to_string(member_in.id));
 
     if (commit_quorum >= 1) {
         // Two members are down and leader cant form the quorum. Reduce the quorum size.
-        reset_quorum_size(commit_quorum);
+        reset_quorum_size(commit_quorum, trace_id);
     }
 
     // Step 1: Check if leader itself is requested to move out.
@@ -157,8 +159,8 @@ AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_
         // If leader is the member requested to move out, then give up leadership and return error.
         // Client will retry replace_member request to the new leader.
         raft_server()->yield_leadership(true /* immediate */, -1 /* successor */);
-        RD_LOGI("Replace member leader is the member_out so yield leadership");
-        reset_quorum_size(0);
+        RD_LOGI(trace_id, "Replace member leader is the member_out so yield leadership");
+        reset_quorum_size(0, trace_id);
         decr_pending_request_num();
         return make_async_error<>(ReplServiceError::NOT_LEADER);
     }
@@ -166,7 +168,7 @@ AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_
     // Step 2. Add the new member.
     return m_msg_mgr.add_member(m_group_id, member_in.id)
         .via(&folly::InlineExecutor::instance())
-        .thenValue([this, member_in, member_out, commit_quorum](auto&& e) -> AsyncReplResult<> {
+        .thenValue([this, member_in, member_out, commit_quorum, trace_id](auto&& e) -> AsyncReplResult<> {
             // TODO Currently we ignore the cancelled, fix nuraft_mesg to not timeout
             // when adding member. Member is added to cluster config until member syncs fully
             // with atleast stop gap. This will take a lot of time for block or
             // object storage.
@@ -177,16 +179,16 @@ AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_
            // can be resend and one of the add or remove can failed and has to retried.
                 if (e.error() == nuraft::cmd_result_code::CANCELLED ||
                     e.error() == nuraft::cmd_result_code::SERVER_ALREADY_EXISTS) {
-                    RD_LOGW("Ignoring error returned from nuraft add_member {}", e.error());
+                    RD_LOGI(trace_id, "Ignoring error returned from nuraft add_member {}", e.error());
                 } else {
-                    RD_LOGE("Replace member error in add member : {}", e.error());
-                    reset_quorum_size(0);
+                    RD_LOGE(trace_id, "Replace member error in add member : {}", e.error());
+                    reset_quorum_size(0, trace_id);
                     decr_pending_request_num();
                     return make_async_error<>(RaftReplService::to_repl_error(e.error()));
                 }
             }
 
-            RD_LOGI("Replace member added member={} to group_id={}", boost::uuids::to_string(member_in.id),
+            RD_LOGI(trace_id, "Replace member added member={} to group_id={}", boost::uuids::to_string(member_in.id),
                     group_id_str());
 
             // Step 3. Append log entry to mark the old member is out and new member is added.
@@ -199,53 +201,53 @@ AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_
             rreq->init(repl_key{.server_id = server_id(),
                                 .term = raft_server()->get_term(),
                                 .dsn = m_next_dsn.fetch_add(1),
-                                .traceID = 0},
+                                .traceID = trace_id},
                        journal_type_t::HS_CTRL_REPLACE, true, header, sisl::blob{}, 0, m_listener);
 
             auto err = m_state_machine->propose_to_raft(std::move(rreq));
             if (err != ReplServiceError::OK) {
-                LOGERROR("Replace member propose to raft failed {}", err);
-                reset_quorum_size(0);
+                RD_LOGE(trace_id, "Replace member propose to raft failed {}", err);
+                reset_quorum_size(0, trace_id);
                 decr_pending_request_num();
                 return make_async_error<>(std::move(err));
             }
 
-            RD_LOGI("Replace member proposed to raft group_id={}", group_id_str());
+            RD_LOGI(trace_id, "Replace member proposed to raft group_id={}", group_id_str());
 
             // Step 4. Remove the old member. Even if the old member is temporarily
             // down and recovers, nuraft mesg see member remove from cluster log
             // entry and call exit_group() and leave().
             return m_msg_mgr.rem_member(m_group_id, member_out.id)
                 .via(&folly::InlineExecutor::instance())
-                .thenValue([this, member_out, commit_quorum](auto&& e) -> AsyncReplResult<> {
+                .thenValue([this, member_out, commit_quorum, trace_id](auto&& e) -> AsyncReplResult<> {
                     if (e.hasError()) {
                         // Ignore the server not found as server removed from the cluster
                         // as requests are idempotent and can be resend.
                         if (e.error() == nuraft::cmd_result_code::SERVER_NOT_FOUND) {
-                            RD_LOGW("Remove member not found in group error, ignoring");
+                            RD_LOGW(trace_id, "Remove member not found in group error, ignoring");
                         } else {
                             // Its ok to retry this request as the request
                             // of replace member is idempotent.
-                            RD_LOGE("Replace member failed to remove member : {}", e.error());
-                            reset_quorum_size(0);
+                            RD_LOGE(trace_id, "Replace member failed to remove member : {}", e.error());
+                            reset_quorum_size(0, trace_id);
                             decr_pending_request_num();
                             return make_async_error<>(ReplServiceError::RETRY_REQUEST);
                         }
                     } else {
-                        RD_LOGI("Replace member removed member={} from group_id={}",
+                        RD_LOGI(trace_id, "Replace member removed member={} from group_id={}",
                                 boost::uuids::to_string(member_out.id), group_id_str());
                     }
 
                     // Revert the quorum size back to 0.
-                    reset_quorum_size(0);
+                    reset_quorum_size(0, trace_id);
                     decr_pending_request_num();
                     return make_async_success<>();
                 });
         });
 }
 
-void RaftReplDev::reset_quorum_size(uint32_t commit_quorum) {
-    RD_LOGI("Reset raft quorum size={}", commit_quorum);
+void RaftReplDev::reset_quorum_size(uint32_t commit_quorum, uint64_t trace_id) {
+    RD_LOGI(trace_id, "Reset raft quorum size={}", commit_quorum);
     nuraft::raft_params params = raft_server()->get_current_params();
     params.with_custom_commit_quorum_size(commit_quorum);
     params.with_custom_election_quorum_size(commit_quorum);
@@ -289,7 +291,7 @@ folly::SemiFuture< ReplServiceError > RaftReplDev::destroy_group() {
 void RaftReplDev::use_config(json_superblk raft_config_sb) { m_raft_config_sb = std::move(raft_config_sb); }
 
 void RaftReplDev::on_create_snapshot(nuraft::snapshot& s, nuraft::async_result< bool >::handler_type& when_done) {
-    RD_LOG(DEBUG, "create_snapshot last_idx={}/term={}", s.get_last_log_idx(), s.get_last_log_term());
+    RD_LOGD(NO_TRACE_ID, "create_snapshot last_idx={}/term={}", s.get_last_log_idx(), s.get_last_log_term());
     auto snp_ctx = std::make_shared< nuraft_snapshot_context >(s);
     auto result = m_listener->create_snapshot(snp_ctx).get();
     auto null_except = std::shared_ptr< std::exception >();
@@ -312,7 +314,7 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const&
     {
         auto const guard = m_stage.access();
         if (auto const stage = *guard.get(); stage != repl_dev_stage_t::ACTIVE) {
-            RD_LOGW("Raft channel: Not ready to accept writes, stage={}", enum_name(stage));
+            RD_LOGW(tid, "Raft channel: Not ready to accept writes, stage={}", enum_name(stage));
             handle_error(rreq,
                          (stage == repl_dev_stage_t::INIT) ? ReplServiceError::SERVER_IS_JOINING
                                                            : ReplServiceError::SERVER_IS_LEAVING);
@@ -327,16 +329,15 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const&
         }
     }
 
     auto status = rreq->init(repl_key{.server_id = server_id(),
                                       .term = raft_server()->get_term(),
                                       .dsn = m_next_dsn.fetch_add(1),
                                       .traceID = tid},
                              data.size ?
journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true /* is_proposer */, header, key, data.size, m_listener); - RD_LOGD("traceID [{}], repl_key [{}], header size [{}] bytes, user_key size [{}] bytes, data size " - "[{}] bytes", - tid, rreq->rkey(), header.size(), key.size(), data.size); + RD_LOGD(tid, "repl_key [{}], header size [{}] bytes, user_key size [{}] bytes, data size [{}] bytes", rreq->rkey(), + header.size(), key.size(), data.size); // Add the request to the repl_dev_rreq map, it will be accessed throughout the life cycle of this request auto const [it, happened] = m_repl_key_req_map.emplace(rreq->rkey(), rreq); RD_DBG_ASSERT(happened, "Duplicate repl_key={} found in the map", rreq->rkey().to_string()); if (status != ReplServiceError::OK) { - RD_LOGD("traceID [{}], Initializing rreq failed error={}, failing this req", tid, status); + RD_LOGI(tid, "Initializing rreq failed error={}, failing this req", status); handle_error(rreq, status); return; } @@ -344,14 +345,14 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& // If it is header only entry, directly propose to the raft if (rreq->has_linked_data()) { if (rreq->is_proposer() && rreq->has_state(repl_req_state_t::DATA_COMMITTED)) { - RD_LOGD("data blks has already been allocated and committed, failing this req"); + RD_LOGE(tid, "data blks has already been allocated and committed, failing this req"); handle_error(rreq, ReplServiceError::DATA_DUPLICATED); return; } #ifdef _PRERELEASE if (iomgr_flip::instance()->test_flip("disable_leader_push_data")) { - RD_LOGD("Simulating push data failure, so that all the follower will have to fetch data"); + RD_LOGD(tid, "Simulating push data failure, so that all the follower will have to fetch data"); } else push_data_to_all_followers(rreq, data); #else @@ -386,7 +387,7 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& } }); } else { - RD_LOGD("Skipping data channel send since value size is 0"); + RD_LOGT(tid, "Skipping data channel send since value size is 0"); rreq->add_state(repl_req_state_t::DATA_WRITTEN); auto raft_status = m_state_machine->propose_to_raft(rreq); if (raft_status != ReplServiceError::OK) { handle_error(rreq, raft_status); } @@ -412,7 +413,7 @@ void RaftReplDev::push_data_to_all_followers(repl_req_ptr_t rreq, sisl::sg_list auto peers = get_active_peers(); auto calls = std::vector< nuraft_mesg::NullAsyncResult >(); for (auto peer : peers) { - RD_LOGD("Data Channel: Pushing data to follower {}, rreq=[{}]", peer, rreq->to_string()); + RD_LOGD(rreq->traceID(), "Data Channel: Pushing data to follower {}, rreq=[{}]", peer, rreq->to_string()); calls.push_back(group_msg_service() ->data_service_request_unidirectional(peer, PUSH_DATA, rreq->m_pkts) .via(&folly::InlineExecutor::instance())); @@ -423,12 +424,12 @@ void RaftReplDev::push_data_to_all_followers(repl_req_ptr_t rreq, sisl::sg_list auto r = res.value(); if (r.hasError()) { // Just logging PushData error, no action is needed as follower can try by fetchData. 
- RD_LOGW("Data Channel: Error in pushing data to all followers: rreq=[{}] error={}", + RD_LOGI(rreq->traceID(), "Data Channel: Error in pushing data to all followers: rreq=[{}] error={}", rreq->to_string(), r.error()); } } } - RD_LOGD("Data Channel: Data push completed for rreq=[{}]", rreq->to_string()); + RD_LOGD(rreq->traceID(), "Data Channel: Data push completed for rreq=[{}]", rreq->to_compact_string()); // Release the buffer which holds the packets rreq->release_fb_builder(); rreq->m_pkts.clear(); @@ -439,7 +440,7 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d auto const push_data_rcv_time = Clock::now(); auto const& incoming_buf = rpc_data->request_blob(); if (!incoming_buf.cbytes()) { - RD_LOGW("Data Channel: PushData received with empty buffer, ignoring this call"); + RD_LOGW(NO_TRACE_ID, "Data Channel: PushData received with empty buffer, ignoring this call"); rpc_data->send_response(); return; } @@ -448,7 +449,8 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d flatbuffers::ReadScalar< flatbuffers::uoffset_t >(incoming_buf.cbytes()) + sizeof(flatbuffers::uoffset_t); auto push_req = GetSizePrefixedPushDataRequest(incoming_buf.cbytes()); if (fb_size + push_req->data_size() != incoming_buf.size()) { - RD_LOGW("Data Channel: PushData received with size mismatch, header size {}, data size {}, received size {}", + RD_LOGW(NO_TRACE_ID, + "Data Channel: PushData received with size mismatch, header size {}, data size {}, received size {}", fb_size, push_req->data_size(), incoming_buf.size()); rpc_data->send_response(); return; @@ -461,11 +463,12 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d .traceID = push_req->traceID()}; auto const req_orig_time_ms = push_req->time_ms(); - RD_LOGD("Data Channel: PushData received: time diff={} ms.", get_elapsed_time_ms(req_orig_time_ms)); + RD_LOGD(rkey.traceID, "Data Channel: PushData received: time diff={} ms.", get_elapsed_time_ms(req_orig_time_ms)); #ifdef _PRERELEASE if (iomgr_flip::instance()->test_flip("drop_push_data_request")) { - LOGINFO("Data Channel: Flip is enabled, skip on_push_data_received to simulate fetch remote data, " + RD_LOGI(rkey.traceID, + "Data Channel: Flip is enabled, skip on_push_data_received to simulate fetch remote data, " "server_id={}, term={}, dsn={}", push_req->issuer_replica_id(), push_req->raft_term(), push_req->dsn()); rpc_data->send_response(); @@ -476,16 +479,17 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d auto rreq = applier_create_req(rkey, journal_type_t::HS_DATA_LINKED, header, key, push_req->data_size(), true /* is_data_channel */); if (rreq == nullptr) { - RD_LOG(ERROR, - "Data Channel: Creating rreq on applier has failed, will ignore the push and let Raft channel send " - "trigger a fetch explicitly if needed. rkey={}", - rkey.to_string()); + RD_LOGE(rkey.traceID, + "Data Channel: Creating rreq on applier has failed, will ignore the push and let Raft channel send " + "trigger a fetch explicitly if needed. 
rkey={}", + rkey.to_string()); rpc_data->send_response(); return; } if (!rreq->save_pushed_data(rpc_data, incoming_buf.cbytes() + fb_size, push_req->data_size())) { - RD_LOGD("Data Channel: Data already received for rreq=[{}], ignoring this data", rreq->to_string()); + RD_LOGT(rkey.traceID, "Data Channel: Data already received for rreq=[{}], ignoring this data", + rreq->to_string()); rpc_data->send_response(); return; } @@ -508,10 +512,12 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d rreq->release_data(); rreq->add_state(repl_req_state_t::DATA_WRITTEN); rreq->m_data_written_promise.setValue(); + // if rreq create time is earlier than push_data receive time, that means the rreq was created by raft + // channel log. Otherwise set to zero as rreq is created by data channel. const auto data_log_diff_us = push_data_rcv_time.time_since_epoch().count() > rreq->created_time().time_since_epoch().count() ? get_elapsed_time_us(rreq->created_time(), push_data_rcv_time) - : get_elapsed_time_us(push_data_rcv_time, rreq->created_time()); + : 0; auto const data_write_latency = get_elapsed_time_us(push_data_rcv_time); auto const total_data_write_latency = get_elapsed_time_us(rreq->created_time()); @@ -521,10 +527,11 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d HISTOGRAM_OBSERVE(m_metrics, rreq_push_data_latency_us, data_write_latency); HISTOGRAM_OBSERVE(m_metrics, rreq_total_data_write_latency_us, total_data_write_latency); - RD_LOGD("Data Channel: Data write completed for rreq=[{}], time_diff_data_log_us={}, " + RD_LOGD(rreq->traceID(), + "Data Channel: Data write completed for rreq=[{}], time_diff_data_log_us={}, " "data_write_latency_us={}, total_data_write_latency_us(rreq creation to write complete)={}, " "local_blkid.num_pieces={}", - rreq->to_string(), data_log_diff_us, data_write_latency, total_data_write_latency, + rreq->to_compact_string(), data_log_diff_us, data_write_latency, total_data_write_latency, write_num_pieces); } }); @@ -547,7 +554,7 @@ repl_req_ptr_t RaftReplDev::applier_create_req(repl_key const& rkey, journal_typ // RD_REL_ASSERT(blob_equals(user_header, rreq->header), "User header mismatch for repl_key={}", // rkey.to_string()); // RD_REL_ASSERT(blob_equals(user_key, rreq->key), "User key mismatch for repl_key={}", rkey.to_string()); - RD_LOGD("Repl_key=[{}] already received ", rkey.to_string()); + RD_LOGT(rkey.traceID, "Repl_key=[{}] already received ", rkey.to_string()); return rreq; } } @@ -569,13 +576,15 @@ repl_req_ptr_t RaftReplDev::applier_create_req(repl_key const& rkey, journal_typ } #endif if (status != ReplServiceError::OK) { - RD_LOGD("For Repl_key=[{}] alloc hints returned error={}, failing this req", rkey.to_string(), status); + RD_LOGD(rkey.traceID, "For Repl_key=[{}] alloc hints returned error={}, failing this req", rkey.to_string(), + status); // Do not call handle_error here, because handle_error is for rreq which needs to be terminated. This one can be // retried. 
return nullptr; } - RD_LOGD("in follower_create_req: rreq={}, addr={}", rreq->to_string(), reinterpret_cast< uintptr_t >(rreq.get())); + RD_LOGD(rreq->traceID(), "in follower_create_req: rreq={}, addr=0x{:x}", rreq->to_string(), + reinterpret_cast< uintptr_t >(rreq.get())); return rreq; } @@ -589,7 +598,7 @@ folly::Future< folly::Unit > RaftReplDev::notify_after_data_written(std::vector< if (!rreq->has_linked_data()) { continue; } auto const status = uint32_cast(rreq->state()); if (status & uint32_cast(repl_req_state_t::DATA_WRITTEN)) { - RD_LOGD("Raft Channel: Data write completed and blkid mapped: rreq=[{}]", rreq->to_string()); + RD_LOGD(rreq->traceID(), "Data written and blkid mapped: rkey=[{}]", rreq->to_compact_string()); continue; } @@ -632,10 +641,10 @@ folly::Future< folly::Unit > RaftReplDev::notify_after_data_written(std::vector< HS_DBG_ASSERT(rreq->has_state(repl_req_state_t::DATA_WRITTEN), "Data written promise raised without updating DATA_WRITTEN state for rkey={}", rreq->rkey().to_string()); - RD_LOGD("Raft Channel: Data write completed and blkid mapped: rreq=[{}]", rreq->to_string()); + RD_LOGD(rreq->traceID(), "Data write completed and blkid mapped: rreq=[{}]", rreq->to_compact_string()); } #endif - RD_LOGT("Data Channel: {} pending reqs's data are written", rreqs->size()); + RD_LOGT(NO_TRACE_ID, "{} pending reqs's data are written", rreqs->size()); return folly::makeFuture< folly::Unit >(folly::Unit{}); }); } @@ -662,9 +671,9 @@ bool RaftReplDev::wait_for_data_receive(std::vector< repl_req_ptr_t > const& rre // sometime before do an explicit fetch. This is so that, it is possible raft channel has come ahead of data // channel and waiting for sometime avoid expensive fetch. On steady state, after a little bit of wait data // would be reached automatically. - RD_LOG(DEBUG, - "We haven't received data for {} out {} in reqs batch, will fetch and wait for {} ms, in_resync_mode()={} ", - only_wait_reqs.size(), rreqs.size(), timeout_ms, is_resync_mode()); + RD_LOGD(NO_TRACE_ID, + "We haven't received data for {} out {} in reqs batch, will fetch and wait for {} ms, in_resync_mode()={} ", + only_wait_reqs.size(), rreqs.size(), timeout_ms, is_resync_mode()); // We are yet to support reactive fetch from remote. 
if (is_resync_mode()) { @@ -694,12 +703,12 @@ void RaftReplDev::check_and_fetch_remote_data(std::vector< repl_req_ptr_t > rreq for (auto const& rreq : rreqs) { auto const cur_state = uint32_cast(rreq->state()); if (cur_state == uint32_cast(repl_req_state_t::ERRORED)) { - // We already received the data before, just ignore this data - RD_LOGD("Raft Channel: rreq=[{}] already errored out, ignoring the fetch", rreq->to_string()); + RD_LOGD(rreq->traceID(), "rreq=[{}] already errored out, ignoring the fetch", rreq->to_compact_string()); continue; } else if (cur_state == uint32_cast(repl_req_state_t::DATA_RECEIVED)) { // We already received the data before, just ignore this data - RD_LOGD("Raft Channel: Data already received for rreq=[{}], ignoring the fetch", rreq->to_string()); + RD_LOGD(rreq->traceID(), "Data already received for rreq=[{}], ignoring the fetch", + rreq->to_compact_string()); continue; } @@ -727,7 +736,8 @@ void RaftReplDev::fetch_data_from_remote(std::vector< repl_req_ptr_t > rreqs) { entries.reserve(rreqs.size()); shared< flatbuffers::FlatBufferBuilder > builder = std::make_shared< flatbuffers::FlatBufferBuilder >(); - RD_LOGD("Data Channel : FetchData from remote: rreq.size={}, my server_id={}", rreqs.size(), server_id()); + RD_LOGD(NO_TRACE_ID, "Data Channel : FetchData from remote: rreq.size={}, my server_id={}", rreqs.size(), + server_id()); auto const& originator = rreqs.front()->remote_blkid().server_id; for (auto const& rreq : rreqs) { @@ -743,7 +753,8 @@ void RaftReplDev::fetch_data_from_remote(std::vector< repl_req_ptr_t > rreqs) { RD_DBG_ASSERT_EQ(rreq->remote_blkid().server_id, originator, "Unexpected originator for rreq={}", rreq->to_string()); - RD_LOGT("Fetching data from originator={}, remote: rreq=[{}], remote_blkid={}, my server_id={}", originator, + RD_LOGT(rreq->traceID(), + "Fetching data from originator={}, remote: rreq=[{}], remote_blkid={}, my server_id={}", originator, rreq->to_string(), rreq->remote_blkid().blkid.to_string(), server_id()); } @@ -768,15 +779,15 @@ void RaftReplDev::fetch_data_from_remote(std::vector< repl_req_ptr_t > rreqs) { auto const fetch_latency_us = get_elapsed_time_us(fetch_start_time); HISTOGRAM_OBSERVE(m_metrics, rreq_data_fetch_latency_us, fetch_latency_us); - RD_LOGD("Data Channel: FetchData from remote completed, time taken={} us", fetch_latency_us); + RD_LOGT(NO_TRACE_ID, "Data Channel: FetchData from remote completed, time taken={} us", fetch_latency_us); if (!response) { // if we are here, it means the original who sent the log entries are down. // we need to handle error and when the other member becomes leader, it will resend the log entries; - RD_LOG(ERROR, - "Not able to fetching data from originator={}, error={}, probably originator is down. Will " - "retry when new leader start appending log entries", - rreqs.front()->remote_blkid().server_id, response.error()); + RD_LOGE(NO_TRACE_ID, + "Unable to fetch data from originator={}, error={}, probably originator is down. Will " + "retry when the new leader starts appending log entries", + rreqs.front()->remote_blkid().server_id, response.error()); for (auto const& rreq : rreqs) { // TODO: Set the data_received promise with error, so that waiting threads can be unblocked and // reject the request. Without that, it will timeout and then reject it.
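The hunks above encode a wait-then-fetch policy: the raft channel can land before the data channel, so a follower first gives PushData a short grace period and only then pays for an explicit FetchData RPC to the originator. A minimal self-contained sketch of that policy, with std::shared_future and fetch_from_remote as hypothetical stand-ins for the per-request data-received promises and the FetchData call:

    #include <chrono>
    #include <functional>
    #include <future>
    #include <vector>

    // Sketch only: data_futs stands in for each request's data-received future,
    // fetch_from_remote for the explicit FetchData RPC; neither name is real.
    bool wait_then_fetch(std::vector< std::shared_future< void > >& data_futs,
                         std::chrono::milliseconds grace,
                         std::function< void() > const& fetch_from_remote) {
        bool all_ready = true;
        for (auto& f : data_futs) {
            // On steady state the data arrives by itself within the grace period
            // and the expensive fetch is avoided entirely.
            if (f.wait_for(grace) != std::future_status::ready) { all_ready = false; }
        }
        if (all_ready) { return true; }
        fetch_from_remote(); // raft channel got ahead; pull the data explicitly
        for (auto& f : data_futs) {
            f.wait(); // the real code instead times out and rejects the request
        }
        return true;
    }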
@@ -804,13 +815,14 @@ void RaftReplDev::fetch_data_from_remote(std::vector< repl_req_ptr_t > rreqs) { void RaftReplDev::on_fetch_data_received(intrusive< sisl::GenericRpcData >& rpc_data) { auto const& incoming_buf = rpc_data->request_blob(); if (!incoming_buf.cbytes()) { - RD_LOGW("Data Channel: PushData received with empty buffer, ignoring this call"); + RD_LOGW(NO_TRACE_ID, "Data Channel: FetchData received with empty buffer, ignoring this call"); rpc_data->send_response(); return; } auto fetch_req = GetSizePrefixedFetchData(incoming_buf.cbytes()); - RD_LOGD("Data Channel: FetchData received: fetch_req.size={}", fetch_req->request()->entries()->size()); + RD_LOGT(NO_TRACE_ID, "Data Channel: FetchData received: fetch_req.size={}", + fetch_req->request()->entries()->size()); std::vector< sisl::sg_list > sgs_vec; std::vector< folly::Future< bool > > futs; @@ -834,15 +846,15 @@ void RaftReplDev::on_fetch_data_received(intrusive< sisl::GenericRpcData >& rpc_ sgs_vec.push_back(sgs); if (originator != server_id()) { - RD_LOGD("non-originator FetchData received: dsn={} lsn={} originator={}, my_server_id={}", req->dsn(), lsn, - originator, server_id()); + RD_LOGD(NO_TRACE_ID, "non-originator FetchData received: dsn={} lsn={} originator={}, my_server_id={}", + req->dsn(), lsn, originator, server_id()); } else { - RD_LOGD("Data Channel: FetchData received: dsn={} lsn={}", req->dsn(), lsn); + RD_LOGT(NO_TRACE_ID, "Data Channel: FetchData received: dsn={} lsn={}", req->dsn(), lsn); } auto const& header = req->user_header(); sisl::blob user_header = sisl::blob{header->Data(), header->size()}; - RD_LOGD("Data Channel: FetchData handled, my_blkid={}", local_blkid.to_string()); + RD_LOGT(NO_TRACE_ID, "Data Channel: FetchData handled, my_blkid={}", local_blkid.to_string()); futs.emplace_back(std::move(m_listener->on_fetch_data(lsn, user_header, local_blkid, sgs))); } @@ -858,7 +870,7 @@ void RaftReplDev::on_fetch_data_received(intrusive< sisl::GenericRpcData >& rpc_ } } - RD_LOGD("Data Channel: FetchData data read completed for {} buffers", sgs_vec.size()); + RD_LOGT(NO_TRACE_ID, "Data Channel: FetchData data read completed for {} buffers", sgs_vec.size()); // now prepare the io_blob_list to response back to requester; nuraft_mesg::io_blob_list_t pkts = sisl::io_blob_list_t{}; @@ -890,7 +902,7 @@ void RaftReplDev::handle_fetch_data_response(sisl::GenericClientResponse respons RD_DBG_ASSERT_GT(total_size, 0, "Empty response from remote"); RD_DBG_ASSERT(raw_data, "Empty response from remote"); - RD_LOGD("Data Channel: FetchData completed for {} requests", rreqs.size()); + RD_LOGD(NO_TRACE_ID, "Data Channel: FetchData completed for {} requests", rreqs.size()); for (auto const& rreq : rreqs) { auto const data_size = rreq->remote_blkid().blkid.blk_count() * get_blk_size(); @@ -901,8 +913,9 @@ void RaftReplDev::handle_fetch_data_response(sisl::GenericClientResponse respons RD_DBG_ASSERT_EQ(data_size, local_size, "Data size mismatch for rreq={} remote size: {}, local size: {}", rreq->to_string(), data_size, local_size); - RD_LOGD("Data Channel: Data already received for rreq=[{}], skip and move on to next rreq.", - rreq->to_string()); + RD_LOGT(rreq->traceID(), + "Data Channel: Data already received for rreq=[{}], skip and move on to next rreq.", + rreq->to_compact_string()); } else { auto const data_write_start_time = Clock::now(); COUNTER_INCREMENT(m_metrics, total_write_cnt, 1); @@ -926,13 +939,15 @@ void RaftReplDev::handle_fetch_data_response(sisl::GenericClientResponse respons
rreq->add_state(repl_req_state_t::DATA_WRITTEN); rreq->m_data_written_promise.setValue(); - RD_LOGD("Data Channel: Data Write completed rreq=[{}], data_write_latency_us={}, " + RD_LOGD(rreq->traceID(), + "Data Channel: Data Write completed rreq=[{}], data_write_latency_us={}, " "total_write_latency_us={}, write_num_pieces={}", - rreq->to_string(), data_write_latency, total_data_write_latency, write_num_pieces); + rreq->to_compact_string(), data_write_latency, total_data_write_latency, write_num_pieces); }); - RD_LOGD("Data Channel: Data fetched from remote: rreq=[{}], data_size: {}, total_size: {}, local_blkid: {}", - rreq->to_string(), data_size, total_size, rreq->local_blkid().to_string()); + RD_LOGT(rreq->traceID(), + "Data Channel: Data fetched from remote: rreq=[{}], data_size: {}, total_size: {}, local_blkid: {}", + rreq->to_compact_string(), data_size, total_size, rreq->local_blkid().to_string()); } raw_data += data_size; total_size -= data_size; @@ -954,8 +969,8 @@ void RaftReplDev::commit_blk(repl_req_ptr_t rreq) { void RaftReplDev::handle_rollback(repl_req_ptr_t rreq) { // 1. call the listener to rollback + RD_LOGD(rreq->traceID(), "Rolling back rreq: {}", rreq->to_compact_string()); m_listener->on_rollback(rreq->lsn(), rreq->header(), rreq->key(), rreq); - // 2. remove the request from maps m_state_machine->unlink_lsn_to_req(rreq->lsn(), rreq); m_repl_key_req_map.erase(rreq->rkey()); @@ -963,9 +978,9 @@ void RaftReplDev::handle_rollback(repl_req_ptr_t rreq) { // 3. free the allocated blocks if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { auto blkid = rreq->local_blkid(); - data_service().async_free_blk(blkid).thenValue([this, blkid](auto&& err) { + data_service().async_free_blk(blkid).thenValue([this, blkid, rreq](auto&& err) { HS_LOG_ASSERT(!err, "freeing blkid={} upon error failed, potential to cause blk leak", blkid.to_string()); - RD_LOGD("Rollback rreq: Releasing blkid={} freed successfully", blkid.to_string()); + RD_LOGD(rreq->traceID(), "Releasing blkid={} freed successfully", blkid.to_string()); }); } } @@ -983,7 +998,7 @@ void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { m_next_dsn.compare_exchange_strong(cur_dsn, rreq->dsn() + 1); } - RD_LOGD("Raft channel: Commit rreq=[{}]", rreq->to_string()); + RD_LOGD(rreq->traceID(), "Raft channel: Commit rreq=[{}]", rreq->to_compact_string()); if (rreq->op_code() == journal_type_t::HS_CTRL_DESTROY) { leave(); } else if (rreq->op_code() == journal_type_t::HS_CTRL_REPLACE) { @@ -1004,21 +1019,21 @@ void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { void RaftReplDev::handle_config_commit(const repl_lsn_t lsn, raft_cluster_config_ptr_t& new_conf) { // when reaching here, the new config has already been applied to the cluster. // since we didn't create repl req for config change, we just need to update m_commit_upto_lsn here. 
- + RD_LOGD(NO_TRACE_ID, "config commit on lsn {}", lsn); // keep this variable in case it is needed later (void)new_conf; auto prev_lsn = m_commit_upto_lsn.load(std::memory_order_relaxed); if (prev_lsn >= lsn || !m_commit_upto_lsn.compare_exchange_strong(prev_lsn, lsn)) { - RD_LOGE("Raft Channel: unexpected log {} commited before config {} committed", prev_lsn, lsn); + RD_LOGE(NO_TRACE_ID, "Raft Channel: unexpected log {} committed before config {} committed", prev_lsn, lsn); } } void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) { if (err == ReplServiceError::OK) { return; } - RD_LOGE("Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); + RD_LOGE(rreq->traceID(), "Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); if (!rreq->add_state_if_not_already(repl_req_state_t::ERRORED)) { - RD_LOGE("Raft Channel: Error has been added for rreq=[{}] error={}", rreq->to_string(), err); + RD_LOGE(rreq->traceID(), "Raft Channel: Error already set for rreq=[{}] error={}", rreq->to_string(), err); return; } @@ -1032,7 +1047,7 @@ void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) exist_rreq->to_string()); } if (err == ReplServiceError::DATA_DUPLICATED) { - RD_LOGE("Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); + RD_LOGE(rreq->traceID(), "Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); m_listener->on_error(err, rreq->header(), rreq->key(), rreq); rreq->clear(); return; } @@ -1066,7 +1081,7 @@ void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) void RaftReplDev::replace_member(repl_req_ptr_t rreq) { auto members = r_cast< const replace_members_ctx* >(rreq->header().cbytes()); - RD_LOGI("Raft repl replace_member commit member_out={} member_in={}", + RD_LOGI(rreq->traceID(), "Raft repl replace_member commit member_out={} member_in={}", boost::uuids::to_string(members->replica_out.id), boost::uuids::to_string(members->replica_in.id)); m_listener->on_replace_member(members->replica_out, members->replica_in); @@ -1113,7 +1128,7 @@ AsyncReplResult<> RaftReplDev::become_leader() { return m_msg_mgr.become_leader(m_group_id).via(&folly::InlineExecutor::instance()).thenValue([this](auto&& e) { if (e.hasError()) { - RD_LOGE("Error in becoming leader: {}", e.error()); + RD_LOGE(NO_TRACE_ID, "Error in becoming leader: {}", e.error()); decr_pending_request_num(); return make_async_error<>(RaftReplService::to_repl_error(e.error())); } @@ -1157,7 +1172,7 @@ std::set< replica_id_t > RaftReplDev::get_active_peers() const { if (p.replication_idx_ >= least_active_repl_idx) { res.insert(p.id_); } else { - RD_LOGW("Excluding peer {} from active_peers, lag {}, my lsn {}, peer lsn {}, least_active_repl_idx {}", + RD_LOGW(NO_TRACE_ID, + "Excluding peer {} from active_peers, lag {}, my lsn {}, peer lsn {}, least_active_repl_idx {}", p.id_, my_committed_idx - p.replication_idx_, my_committed_idx, p.replication_idx_, least_active_repl_idx); } @@ -1249,7 +1265,7 @@ void RaftReplDev::save_config(const nuraft::cluster_config& config) { std::unique_lock lg{m_config_mtx}; (*m_raft_config_sb)["config"] = serialize_cluster_config(config); m_raft_config_sb.write(); - RD_LOGI("Saved config {}", (*m_raft_config_sb)["config"].dump()); + RD_LOGI(NO_TRACE_ID, "Saved config {}", (*m_raft_config_sb)["config"].dump()); } void RaftReplDev::save_state(const nuraft::srv_state& state) { @@ -1259,7 +1275,7 @@ void RaftReplDev::save_state(const
nuraft::srv_state& state) { {"election_timer_allowed", state.is_election_timer_allowed()}, {"catching_up", state.is_catching_up()}}; m_raft_config_sb.write(); - RD_LOGI("Saved state {}", (*m_raft_config_sb)["state"].dump()); + RD_LOGI(NO_TRACE_ID, "Saved state {}", (*m_raft_config_sb)["state"].dump()); } nuraft::ptr< nuraft::srv_state > RaftReplDev::read_state() { @@ -1301,7 +1317,7 @@ uint32_t RaftReplDev::get_logstore_id() const { return m_data_journal->logstore_ std::shared_ptr< nuraft::state_machine > RaftReplDev::get_state_machine() { return m_state_machine; } void RaftReplDev::permanent_destroy() { - RD_LOGI("Permanent destroy for raft repl dev group_id={}", group_id_str()); + RD_LOGI(NO_TRACE_ID, "Permanent destroy for raft repl dev group_id={}", group_id_str()); // let the listener know at first, so that they can cleanup persistent structures before raft repl dev is destroyed m_listener->on_destroy(group_id()); m_raft_config_sb.destroy(); @@ -1336,7 +1352,7 @@ void RaftReplDev::leave() { m_rd_sb->destroy_pending = 0x1; m_rd_sb.write(); - RD_LOGI("RaftReplDev leave group_id={}", group_id_str()); + RD_LOGI(NO_TRACE_ID, "RaftReplDev leave group_id={}", group_id_str()); m_destroy_promise.setValue(ReplServiceError::OK); // In case proposer is waiting for the destroy to complete } @@ -1349,71 +1365,72 @@ nuraft::cb_func::ReturnCode RaftReplDev::raft_event(nuraft::cb_func::Type type, auto const& entries = raft_req->log_entries(); auto start_lsn = raft_req->get_last_log_idx() + 1; - RD_LOGD("Raft channel: Received {} append entries on follower from leader, term {}, lsn {} ~ {} , my commited " - "lsn {} , leader commmited lsn {}", + if (entries.size() == 0) { + RD_LOGT(NO_TRACE_ID, "Raft channel: Received no entry, leader committed lsn {}", + raft_req->get_commit_idx()); + return ret; + } + RD_LOGT(NO_TRACE_ID, + "Raft channel: Received {} append entries on follower from leader, term {}, lsn {} ~ {} , my " + "committed lsn {} , leader committed lsn {}", entries.size(), raft_req->get_last_log_term(), start_lsn, start_lsn + entries.size() - 1, m_commit_upto_lsn.load(), raft_req->get_commit_idx()); - if (!entries.empty()) { - RD_LOGT("Raft channel: Received {} append entries on follower from leader, localizing them", - entries.size()); - - auto reqs = sisl::VectorPool< repl_req_ptr_t >::alloc(); - auto last_commit_lsn = uint64_cast(get_last_commit_lsn()); - for (unsigned long i = 0; i < entries.size(); i++) { - auto& entry = entries[i]; - auto lsn = start_lsn + i; - auto term = entry->get_term(); - if (entry->get_val_type() != nuraft::log_val_type::app_log) { continue; } - if (entry->get_buf_ptr()->size() == 0) { continue; } - // skipping localize for already committed log(dup), they anyway will be discard - // by nuraft before append_log. - if (lsn <= last_commit_lsn) { - RD_LOGT("Raft channel: term {}, lsn {}, skipping dup, last_commit_lsn {}", term, lsn, - last_commit_lsn); - continue; - } - // Those LSNs already in logstore but not yet committed, will be dedup here, - // applier_create_req will return same req as previous one - auto req = m_state_machine->localize_journal_entry_prepare(*entry); - if (req == nullptr) { - sisl::VectorPool< repl_req_ptr_t >::free(reqs); - // The hint set here will be used by the next after next appendEntry, the next one - // always go with -1 from NuRraft code. - // - // We are rejecting this log entry, meaning we can accept previous log entries. 
- // If there is nothing we can accept(i==0), that maens we are waiting for commit - of previous lsn, set it to 1 in this case. - m_state_machine->reset_next_batch_size_hint(std::max(1ul, i)); - return nuraft::cb_func::ReturnCode::ReturnNull; - } - reqs->emplace_back(std::move(req)); + auto reqs = sisl::VectorPool< repl_req_ptr_t >::alloc(); + auto last_commit_lsn = uint64_cast(get_last_commit_lsn()); + for (unsigned long i = 0; i < entries.size(); i++) { + auto& entry = entries[i]; + auto lsn = start_lsn + i; + auto term = entry->get_term(); + if (entry->get_val_type() != nuraft::log_val_type::app_log) { continue; } + if (entry->get_buf_ptr()->size() == 0) { continue; } + // skipping localize for already committed log(dup), they anyway will be discard + // by nuraft before append_log. + if (lsn <= last_commit_lsn) { + RD_LOGT(NO_TRACE_ID, "Raft channel: term {}, lsn {}, skipping dup, last_commit_lsn {}", term, lsn, + last_commit_lsn); + continue; + } + // Those LSNs already in logstore but not yet committed, will be dedup here, + // applier_create_req will return same req as previous one + auto req = m_state_machine->localize_journal_entry_prepare(*entry); + if (req == nullptr) { + sisl::VectorPool< repl_req_ptr_t >::free(reqs); + // The hint set here will be used by the next after next appendEntry, the next one + // always go with -1 from NuRaft code. + // + // We are rejecting this log entry, meaning we can accept previous log entries. + // If there is nothing we can accept(i==0), that means we are waiting for commit + // of previous lsn, set it to 1 in this case. + m_state_machine->reset_next_batch_size_hint(std::max(1ul, i)); + return nuraft::cb_func::ReturnCode::ReturnNull; } + reqs->emplace_back(std::move(req)); + } - // Wait till we receive the data from its originator for all the requests - std::vector< repl_req_ptr_t > timeout_rreqs; - if (!wait_for_data_receive(*reqs, HS_DYNAMIC_CONFIG(consensus.data_receive_timeout_ms), &timeout_rreqs)) { - for (auto const& rreq : timeout_rreqs) { - handle_error(rreq, ReplServiceError::TIMEOUT); - } - ret = nuraft::cb_func::ReturnCode::ReturnNull; + // Wait till we receive the data from its originator for all the requests + std::vector< repl_req_ptr_t > timeout_rreqs; + if (!wait_for_data_receive(*reqs, HS_DYNAMIC_CONFIG(consensus.data_receive_timeout_ms), &timeout_rreqs)) { + for (auto const& rreq : timeout_rreqs) { + handle_error(rreq, ReplServiceError::TIMEOUT); } - sisl::VectorPool< repl_req_ptr_t >::free(reqs); + ret = nuraft::cb_func::ReturnCode::ReturnNull; } + sisl::VectorPool< repl_req_ptr_t >::free(reqs); if (ret == nuraft::cb_func::ReturnCode::Ok) { m_state_machine->inc_next_batch_size_hint(); } return ret; } case nuraft::cb_func::Type::JoinedCluster: - RD_LOGD("Raft channel: Received JoinedCluster, implies become_follower"); + RD_LOGD(NO_TRACE_ID, "Raft channel: Received JoinedCluster, implies become_follower"); become_follower_cb(); return nuraft::cb_func::ReturnCode::Ok; case nuraft::cb_func::Type::BecomeFollower: { - RD_LOGD("Raft channel: Received BecomeFollower"); + RD_LOGD(NO_TRACE_ID, "Raft channel: Received BecomeFollower"); become_follower_cb(); return nuraft::cb_func::ReturnCode::Ok; } case nuraft::cb_func::Type::BecomeLeader: { - RD_LOGD("Raft channel: Received BecomeLeader"); + RD_LOGD(NO_TRACE_ID, "Raft channel: Received BecomeLeader"); become_leader_cb(); return nuraft::cb_func::ReturnCode::Ok; } @@ -1429,11 +1446,12 @@ nuraft::cb_func::ReturnCode RaftReplDev::raft_event(nuraft::cb_func::Type type, void
RaftReplDev::flush_durable_commit_lsn() { if (is_destroyed()) { - RD_LOGI("Raft repl dev is destroyed, ignore flush durable commmit lsn"); + RD_LOGI(NO_TRACE_ID, "Raft repl dev is destroyed, ignore flush durable commit lsn"); return; } auto const lsn = m_commit_upto_lsn.load(); + RD_LOGT(NO_TRACE_ID, "Flushing durable commit lsn to {}", lsn); std::unique_lock lg{m_sb_mtx}; m_rd_sb->durable_commit_lsn = lsn; m_rd_sb.write(); @@ -1442,7 +1460,7 @@ void RaftReplDev::flush_durable_commit_lsn() { /////////////////////////////////// Private metohds //////////////////////////////////// void RaftReplDev::cp_flush(CP* cp, cshared< ReplDevCPContext > ctx) { if (is_destroyed()) { - RD_LOGI("Raft repl dev is destroyed, ignore cp flush"); + RD_LOGI(NO_TRACE_ID, "Raft repl dev is destroyed, ignore cp flush"); return; } @@ -1464,8 +1482,8 @@ void RaftReplDev::cp_flush(CP* cp, cshared< ReplDevCPContext > ctx) { m_rd_sb->last_applied_dsn = dsn; m_rd_sb.write(); m_last_flushed_commit_lsn = lsn; - RD_LOGD("cp flush in raft repl dev, lsn={}, clsn={}, next_dsn={}, cp string:{}", lsn, clsn, m_next_dsn.load(), - cp->to_string()); + RD_LOGD(NO_TRACE_ID, "cp flush in raft repl dev, lsn={}, clsn={}, next_dsn={}, cp string:{}", lsn, clsn, + m_next_dsn.load(), cp->to_string()); } cshared< ReplDevCPContext > RaftReplDev::get_cp_ctx(CP* cp) { @@ -1473,8 +1491,8 @@ cshared< ReplDevCPContext > RaftReplDev::get_cp_ctx(CP* cp) { auto const clsn = m_compact_lsn.load(); auto const dsn = m_next_dsn.load(); - RD_LOGD("getting cp_ctx for raft repl dev {}, cp_lsn={}, clsn={}, next_dsn={}, cp string:{}", (void*)this, cp_lsn, - clsn, dsn, cp->to_string()); + RD_LOGD(NO_TRACE_ID, "getting cp_ctx for raft repl dev {}, cp_lsn={}, clsn={}, next_dsn={}, cp string:{}", + (void*)this, cp_lsn, clsn, dsn, cp->to_string()); auto dev_ctx = std::make_shared< ReplDevCPContext >(); dev_ctx->cp_lsn = cp_lsn; dev_ctx->compacted_to_lsn = clsn; @@ -1495,7 +1513,7 @@ void RaftReplDev::gc_repl_reqs() { std::vector< repl_req_ptr_t > expired_rreqs; auto req_map_size = m_repl_key_req_map.size(); - RD_LOGI("m_repl_key_req_map size is {};", req_map_size); + RD_LOGI(NO_TRACE_ID, "m_repl_key_req_map size is {};", req_map_size); for (auto [key, rreq] : m_repl_key_req_map) { // FIXME: Skipping proposer for now, the DSN in proposer increased in proposing stage, not when commit(). // Need other mechanism. @@ -1505,7 +1523,8 @@ void RaftReplDev::gc_repl_reqs() { } if (rreq->dsn() < cur_dsn && rreq->is_expired()) { // The DSN can be out of order, wait till rreq expired. 
- RD_LOGD("legacy req with commited DSN, rreq=[{}] , dsn = {}, next_dsn = {}, gap= {}, elapsed_time_sec {}", + RD_LOGD(rreq->traceID(), + "legacy req with commited DSN, rreq=[{}] , dsn = {}, next_dsn = {}, gap= {}, elapsed_time_sec {}", rreq->to_string(), rreq->dsn(), cur_dsn, cur_dsn - rreq->dsn(), get_elapsed_time_sec(rreq->created_time())); expired_rreqs.push_back(rreq); @@ -1523,27 +1542,28 @@ void RaftReplDev::gc_repl_reqs() { return; } if (rreq->is_expired()) { - RD_LOGD("StateMachine: rreq=[{}] is expired, elapsed_time_sec{};", rreq->to_string(), + RD_LOGD(rreq->traceID(), "StateMachine: rreq=[{}] is expired, elapsed_time_sec{};", rreq->to_string(), get_elapsed_time_sec(rreq->created_time())); } }); - RD_LOGI("state_machine req map size is {};", sm_req_cnt); + RD_LOGT(NO_TRACE_ID, "state_machine req map size is {};", sm_req_cnt); for (auto removing_rreq : expired_rreqs) { // once log flushed, the commit progress controlled by raft if (removing_rreq->has_state(repl_req_state_t::LOG_FLUSHED)) { - RD_LOGI("Skipping GC rreq [{}] because it is in state machine", removing_rreq->to_string()); + RD_LOGT(removing_rreq->traceID(), "Skipping GC rreq [{}] because it is in state machine", + removing_rreq->to_string()); continue; } // do garbage collection // 1. free the allocated blocks - RD_LOGI("Removing rreq [{}]", removing_rreq->to_string()); + RD_LOGD(removing_rreq->traceID(), "Removing rreq [{}]", removing_rreq->to_string()); if (removing_rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { auto blkid = removing_rreq->local_blkid(); - data_service().async_free_blk(blkid).thenValue([this, blkid](auto&& err) { + data_service().async_free_blk(blkid).thenValue([this, blkid, removing_rreq](auto&& err) { HS_LOG_ASSERT(!err, "freeing blkid={} upon error failed, potential to cause blk leak", blkid.to_string()); - RD_LOGD("GC rreq: Releasing blkid={} freed successfully", blkid.to_string()); + RD_LOGD(removing_rreq->traceID(), "GC rreq: Releasing blkid={} freed successfully", blkid.to_string()); }); } // 2. remove from the m_repl_key_req_map @@ -1558,7 +1578,7 @@ void RaftReplDev::set_log_store_last_durable_lsn(store_lsn_t lsn) { m_data_journ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx) { auto repl_lsn = to_repl_lsn(lsn); if (need_skip_processing(repl_lsn)) { - RD_LOGI("Raft Channel: Log {} is outdated and will be handled by baseline resync. Ignoring replay.", lsn); + RD_LOGI(NO_TRACE_ID, "Raft Channel: Log {} is outdated and will be handled by baseline resync. 
Ignoring replay.", lsn); return; } @@ -1575,7 +1595,8 @@ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx RELEASE_ASSERT_EQ(jentry->major_version, repl_journal_entry::JOURNAL_ENTRY_MAJOR, "Mismatched version of journal entry received from RAFT peer"); - RD_LOGT("Raft Channel: Applying Raft log_entry upon recovery: server_id={}, term={}, lsn={}, journal_entry=[{}] ", + RD_LOGT(jentry->traceID, + "Raft Channel: Applying Raft log_entry upon recovery: server_id={}, term={}, lsn={}, journal_entry=[{}] ", jentry->server_id, lentry->get_term(), repl_lsn, jentry->to_string()); auto entry_to_hdr = [](repl_journal_entry* jentry) { @@ -1619,14 +1640,14 @@ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx auto status = rreq->init(rkey, jentry->code, false /* is_proposer */, entry_to_hdr(jentry), entry_to_key(jentry), data_size, m_listener); if (status != ReplServiceError::OK) { - RD_LOGE("Initializing rreq failed, rreq=[{}], error={}", rreq->to_string(), status); + RD_LOGE(jentry->traceID, "Initializing rreq failed, rreq=[{}], error={}", rreq->to_string(), status); } // we load the log from log device, implies log flushed. We only flush log after data is written to data device. rreq->add_state(repl_req_state_t::DATA_WRITTEN); rreq->add_state(repl_req_state_t::LOG_RECEIVED); rreq->add_state(repl_req_state_t::LOG_FLUSHED); - RD_LOGD("Replay log on restart, rreq=[{}]", rreq->to_string()); + RD_LOGD(rreq->traceID(), "Replay log on restart, rreq=[{}]", rreq->to_string()); // 2. Pre-commit the log entry as in nuraft pre-commit was called once log appended to logstore. m_listener->on_pre_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq); @@ -1651,7 +1672,7 @@ void RaftReplDev::create_snp_resync_data(raft_buf_ptr_t& data_out) { auto msg_size = sizeof(snp_repl_dev_data); msg.dsn = m_next_dsn; auto crc = crc32_ieee(init_crc32, reinterpret_cast< const unsigned char* >(&msg), msg_size); - RD_LOGD("create snapshot resync msg, dsn={}, crc={}", msg.dsn, crc); + RD_LOGD(NO_TRACE_ID, "create snapshot resync msg, dsn={}, crc={}", msg.dsn, crc); msg.crc = crc; data_out = nuraft::buffer::alloc(msg_size); std::memcpy(data_out->data_begin(), &msg, msg_size); @@ -1661,17 +1682,20 @@ bool RaftReplDev::save_snp_resync_data(nuraft::buffer& data, nuraft::snapshot& s auto msg = r_cast< snp_repl_dev_data* >(data.data_begin()); if (msg->magic_num != HOMESTORE_RESYNC_DATA_MAGIC || msg->protocol_version != HOMESTORE_RESYNC_DATA_PROTOCOL_VERSION_V1) { - RD_LOGE("Snapshot resync data validation failed, magic={}, version={}", msg->magic_num, msg->protocol_version); + RD_LOGE(NO_TRACE_ID, "Snapshot resync data validation failed, magic={}, version={}", msg->magic_num, + msg->protocol_version); return false; } auto received_crc = msg->crc; - RD_LOGD("received snapshot resync msg, dsn={}, crc={}, received crc={}", msg->dsn, msg->crc, received_crc); + RD_LOGD(NO_TRACE_ID, "received snapshot resync msg, dsn={}, crc={}, received crc={}", msg->dsn, msg->crc, + received_crc); // Clear the crc field before verification, because the crc value computed by leader doesn't contain it. 
msg->crc = 0; auto computed_crc = crc32_ieee(init_crc32, reinterpret_cast< const unsigned char* >(msg), sizeof(snp_repl_dev_data)); if (received_crc != computed_crc) { - RD_LOGE("Snapshot resync data crc mismatch, received_crc={}, computed_crc={}", received_crc, computed_crc); + RD_LOGE(NO_TRACE_ID, "Snapshot resync data crc mismatch, received_crc={}, computed_crc={}", received_crc, + computed_crc); return false; } { @@ -1684,7 +1708,7 @@ bool RaftReplDev::save_snp_resync_data(nuraft::buffer& data, nuraft::snapshot& s } if (msg->dsn > m_next_dsn) { m_next_dsn = msg->dsn; - RD_LOGD("Update next_dsn from {} to {}", m_next_dsn.load(), msg->dsn); + RD_LOGD(NO_TRACE_ID, "Update next_dsn from {} to {}", m_next_dsn.load(), msg->dsn); return true; } return true; @@ -1698,8 +1722,8 @@ bool RaftReplDev::is_resync_mode() { auto diff = leader_commited_lsn - my_log_idx; bool resync_mode = (diff > HS_DYNAMIC_CONFIG(consensus.resync_log_idx_threshold)); if (resync_mode) { - RD_LOGD("Raft Channel: Resync mode, leader_commited_lsn={}, my_log_idx={}, diff={}", leader_commited_lsn, - my_log_idx, diff); + RD_LOGD(NO_TRACE_ID, "Raft Channel: Resync mode, leader_commited_lsn={}, my_log_idx={}, diff={}", + leader_commited_lsn, my_log_idx, diff); } return resync_mode; } diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 62a0c635c..8d8d83315 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -231,7 +231,9 @@ class RaftReplDev : public ReplDev, auto committed_lsn = m_commit_upto_lsn.load(); auto gate = m_traffic_ready_lsn.load(); bool ready = committed_lsn >= gate; - if (!ready) { RD_LOGD("Not yet ready for traffic, committed to {} but gate is {}", committed_lsn, gate); } + if (!ready) { + RD_LOGD(NO_TRACE_ID, "Not yet ready for traffic, committed to {} but gate is {}", committed_lsn, gate); + } return ready; } // purge all resources (e.g., logs in logstore) is a very dangerous operation, it is not supported yet. @@ -270,12 +272,12 @@ class RaftReplDev : public ReplDev, // was a follower, m_traffic_ready_lsn should be zero on follower. RD_REL_ASSERT(existing_gate == 0, "existing gate should be zero"); } - RD_LOGD("become_leader_cb: setting traffic_ready_lsn from {} to {}", existing_gate, new_gate); + RD_LOGD(NO_TRACE_ID, "become_leader_cb: setting traffic_ready_lsn from {} to {}", existing_gate, new_gate); }; void become_follower_cb() { // m_traffic_ready_lsn should be zero on follower. 
m_traffic_ready_lsn.store(0); - RD_LOGD("become_follower_cb setting traffic_ready_lsn to 0"); + RD_LOGD(NO_TRACE_ID, "become_follower_cb setting traffic_ready_lsn to 0"); } /// @brief This method is called when the data journal is compacted @@ -379,7 +381,7 @@ class RaftReplDev : public ReplDev, void set_log_store_last_durable_lsn(store_lsn_t lsn); void commit_blk(repl_req_ptr_t rreq); void replace_member(repl_req_ptr_t rreq); - void reset_quorum_size(uint32_t commit_quorum); + void reset_quorum_size(uint32_t commit_quorum, uint64_t trace_id); void create_snp_resync_data(raft_buf_ptr_t& data_out); bool save_snp_resync_data(nuraft::buffer& data, nuraft::snapshot& s); }; diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index 21dd04886..5ebfa2ec6 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -34,7 +34,7 @@ static std::pair< sisl::blob, sisl::blob > header_only_extract(nuraft::buffer& b ReplServiceError RaftStateMachine::propose_to_raft(repl_req_ptr_t rreq) { rreq->create_journal_entry(true /* raft_buf */, m_rd.server_id()); - RD_LOGT("Raft Channel: propose journal_entry=[{}] ", rreq->journal_entry()->to_string()); + RD_LOGT(rreq->traceID(), "Raft Channel: propose journal_entry=[{}] ", rreq->journal_entry()->to_string()); auto* vec = sisl::VectorPool< raft_buf_ptr_t >::alloc(); vec->push_back(rreq->raft_journal_buf()); @@ -43,7 +43,7 @@ ReplServiceError RaftStateMachine::propose_to_raft(repl_req_ptr_t rreq) { sisl::VectorPool< raft_buf_ptr_t >::free(vec); if (append_status && !append_status->get_accepted()) { - RD_LOGE("Raft Channel: Failed to propose rreq=[{}] result_code={}", rreq->to_compact_string(), + RD_LOGE(rreq->traceID(), "Raft Channel: Failed to propose rreq=[{}] result_code={}", rreq->to_compact_string(), append_status->get_result_code()); return RaftReplService::to_repl_error(append_status->get_result_code()); } @@ -56,8 +56,8 @@ repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entr RELEASE_ASSERT_EQ(jentry->major_version, repl_journal_entry::JOURNAL_ENTRY_MAJOR, "Mismatched version of journal entry received from RAFT peer"); - RD_LOGT("Raft Channel: Localizing Raft log_entry: server_id={}, term={}, journal_entry=[{}] ", jentry->server_id, - lentry.get_term(), jentry->to_string()); + RD_LOGT(jentry->traceID, "Raft Channel: Localizing Raft log_entry: server_id={}, term={}, journal_entry=[{}] ", + jentry->server_id, lentry.get_term(), jentry->to_string()); auto entry_to_hdr = [](repl_journal_entry* jentry) { return sisl::blob{uintptr_cast(jentry) + sizeof(repl_journal_entry), jentry->user_header_size}; @@ -121,9 +121,9 @@ repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entr out: if (rreq == nullptr) { - RD_LOG(ERROR, - "Failed to localize journal entry rkey={} jentry=[{}], we return error and let Raft resend this req", - rkey.to_string(), jentry->to_string()); + RD_LOGE(rreq->traceID(), + "Failed to localize journal entry rkey={} jentry=[{}], we return error and let Raft resend this req", + rkey.to_string(), jentry->to_string()); } return rreq; } @@ -182,7 +182,7 @@ raft_buf_ptr_t RaftStateMachine::pre_commit_ext(nuraft::state_machine::ext_op_pa int64_t lsn = s_cast< int64_t >(params.log_idx); repl_req_ptr_t rreq = lsn_to_req(lsn); - RD_LOGD("Raft channel: Precommit rreq=[{}]", rreq->to_compact_string()); + RD_LOGT(rreq->traceID(), "Precommit rreq=[{}]", rreq->to_compact_string()); 
m_rd.m_listener->on_pre_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq); return m_success_ptr; @@ -190,22 +190,18 @@ raft_buf_ptr_t RaftStateMachine::pre_commit_ext(nuraft::state_machine::ext_op_pa raft_buf_ptr_t RaftStateMachine::commit_ext(nuraft::state_machine::ext_op_params const& params) { int64_t lsn = s_cast< int64_t >(params.log_idx); + repl_req_ptr_t rreq = lsn_to_req(lsn); if (m_rd.need_skip_processing(lsn)) { - RD_LOGI("Raft Channel: Log {} is expected to be handled by snapshot. Skipping commit.", lsn); + RD_LOGI(rreq->traceID(), "Raft Channel: Log {} is expected to be handled by snapshot. Skipping commit.", lsn); return m_success_ptr; } - RD_LOGD("Raft channel: Received Commit message lsn {} store {} logdev {} size {}", lsn, - m_rd.m_data_journal->logstore_id(), m_rd.m_data_journal->logdev_id(), params.data->size()); - repl_req_ptr_t rreq = lsn_to_req(lsn); RD_DBG_ASSERT(rreq != nullptr, "Raft channel got null rreq for lsn={}", lsn); - RD_LOGD("Raft channel: Received Commit message rreq=[{}]", rreq->to_string()); + RD_LOGT(rreq->traceID(), "Raft channel: Received Commit message rreq=[{}]", rreq->to_string()); if (rreq->is_proposer()) { // This is the time to ensure flushing of journal happens in the proposer rreq->add_state(repl_req_state_t::LOG_FLUSHED); } - m_rd.handle_commit(rreq); - return m_success_ptr; } @@ -213,11 +209,11 @@ void RaftStateMachine::commit_config(const ulong log_idx, raft_cluster_config_pt // when reaching here, the config change log has already been committed, and the new config has been applied to the // cluster if (m_rd.need_skip_processing(s_cast< repl_lsn_t >(log_idx))) { - RD_LOGI("Raft Channel: Config {} is expected to be handled by snapshot. Skipping commit.", log_idx); + RD_LOGI(NO_TRACE_ID, "Raft Channel: Config {} is expected to be handled by snapshot. 
Skipping commit.", log_idx); return; } - RD_LOGD("Raft channel: Commit new cluster conf , log_idx = {}", log_idx); + RD_LOGD(NO_TRACE_ID, "Raft channel: Commit new cluster conf , log_idx = {}", log_idx); #ifdef _PRERELEASE auto& servers_in_new_conf = new_conf->get_servers(); @@ -237,15 +233,15 @@ void RaftStateMachine::commit_config(const ulong log_idx, raft_cluster_config_pt oss << "," << *it; } - RD_LOG(INFO, "Raft channel: server ids in new cluster conf : {}, my_id {}, group_id {}", oss.str(), my_id, - m_rd.group_id_str()); + RD_LOGI(NO_TRACE_ID, "Raft channel: server ids in new cluster conf : {}, my_id {}, group_id {}", oss.str(), my_id, + m_rd.group_id_str()); #endif m_rd.handle_config_commit(s_cast< repl_lsn_t >(log_idx), new_conf); } void RaftStateMachine::rollback_config(const ulong log_idx, raft_cluster_config_ptr_t& conf) { - RD_LOGD("Raft channel: Rollback cluster conf , log_idx = {}", log_idx); + RD_LOGD(NO_TRACE_ID, "Raft channel: Rollback cluster conf , log_idx = {}", log_idx); // TODO:add more logic here if necessary } @@ -253,11 +249,11 @@ void RaftStateMachine::rollback_ext(const nuraft::state_machine::ext_op_params& int64_t lsn = s_cast< int64_t >(params.log_idx); repl_req_ptr_t rreq = lsn_to_req(lsn); if (rreq == nullptr) { - RD_LOG(ERROR, "Raft channel: Rollback lsn {} rreq not found", lsn); + RD_LOGE(NO_TRACE_ID, "Raft channel: Rollback lsn {} rreq not found", lsn); return; } - RD_LOGD("Raft channel: Rollback lsn {}, rreq=[{}]", lsn, rreq->to_string()); + RD_LOGD(rreq->traceID(), "Raft channel: Rollback lsn {}, rreq=[{}]", lsn, rreq->to_string()); m_rd.handle_rollback(rreq); } @@ -287,7 +283,7 @@ void RaftStateMachine::iterate_repl_reqs(std::function< void(int64_t, repl_req_p } uint64_t RaftStateMachine::last_commit_index() { - RD_LOG(DEBUG, "Raft channel: last_commit_index {}", uint64_cast(m_rd.get_last_commit_lsn())); + RD_LOGD(NO_TRACE_ID, "Raft channel: last_commit_index {}", uint64_cast(m_rd.get_last_commit_lsn())); return uint64_cast(m_rd.get_last_commit_lsn()); } @@ -297,7 +293,7 @@ void RaftStateMachine::unlink_lsn_to_req(int64_t lsn, repl_req_ptr_t rreq) { // it is possible a LSN mapped to different rreq in history // due to log overwritten. Verify the rreq before removing auto deleted = m_lsn_req_map.erase_if_equal(lsn, rreq); - if (deleted) { RD_LOG(DEBUG, "Raft channel: erase lsn {}, rreq {}", lsn, rreq->to_string()); } + if (deleted) { RD_LOGT(rreq->traceID(), "Raft channel: erase lsn {}, rreq {}", lsn, rreq->to_string()); } } void RaftStateMachine::link_lsn_to_req(repl_req_ptr_t rreq, int64_t lsn) { @@ -307,8 +303,8 @@ void RaftStateMachine::link_lsn_to_req(repl_req_ptr_t rreq, int64_t lsn) { rreq->set_created_time(); auto r = m_lsn_req_map.insert(lsn, std::move(rreq)); if (!r.second) { - RD_LOG(ERROR, "lsn={} already in precommit list, exist_term={}, is_volatile={}", - lsn, r.first->second->term(), r.first->second->is_volatile()); + RD_LOGE(rreq->traceID(), "lsn={} already in precommit list, exist_term={}, is_volatile={}", lsn, + r.first->second->term(), r.first->second->is_volatile()); // TODO: we need to think about the case where volatile is in the map already, is it safe to overwrite it? } } @@ -339,7 +335,7 @@ int RaftStateMachine::read_logical_snp_obj(nuraft::snapshot& s, void*& user_ctx, // uncommitted logs may or may not included in the snapshot data sent by leader, // depending on the racing of commit vs snapshot read, leading to data inconsistency. 
if (s_cast< repl_lsn_t >(s.get_last_log_idx()) > m_rd.get_last_commit_lsn()) { - RD_LOG(WARN, "not ready to read because there are some uncommitted logs in snapshot, " + RD_LOGW(NO_TRACE_ID, "not ready to read because there are some uncommitted logs in snapshot, " "let nuraft retry later. snapshot log_idx={}, last_commit_lsn={}", s.get_last_log_idx(), m_rd.get_last_commit_lsn()); return -1; @@ -390,7 +386,7 @@ void RaftStateMachine::save_logical_snp_obj(nuraft::snapshot& s, ulong& obj_id, snp_data->is_last_obj = is_last_obj; // We are doing a copy here. - sisl::io_blob_safe blob{static_cast<uint32_t>(data.size())}; + sisl::io_blob_safe blob{static_cast< uint32_t >(data.size())}; std::memcpy(blob.bytes(), data.data_begin(), data.size()); snp_data->blob = std::move(blob); diff --git a/src/lib/replication/repl_dev/raft_state_machine.h b/src/lib/replication/repl_dev/raft_state_machine.h index 97de4ec3b..7da37d5c5 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.h +++ b/src/lib/replication/repl_dev/raft_state_machine.h @@ -24,7 +24,9 @@ namespace homestore { class ReplicaSetImpl; class StateMachineStore; -#define RD_LOG(level, msg, ...) LOG##level##MOD(replication, "[{}] " msg, identify_str(), ##__VA_ARGS__) +#define NO_TRACE_ID "n/a" +#define RD_LOG(level, traceID, msg, ...) \ LOG##level##MOD(replication, "[traceID={}] [{}] " msg, traceID, identify_str(), ##__VA_ARGS__) #define RD_ASSERT_CMP(assert_type, val1, cmp, val2, ...) \ { \ @@ -69,12 +71,12 @@ class StateMachineStore; #define RD_REL_ASSERT_GT(val1, val2, ...) RD_ASSERT_CMP(RELEASE, val1, >, val2, ##__VA_ARGS__) #define RD_REL_ASSERT_GE(val1, val2, ...) RD_ASSERT_CMP(RELEASE, val1, >=, val2, ##__VA_ARGS__) -#define RD_LOGT(...) RD_LOG(TRACE, ##__VA_ARGS__) -#define RD_LOGD(...) RD_LOG(DEBUG, ##__VA_ARGS__) -#define RD_LOGI(...) RD_LOG(INFO, ##__VA_ARGS__) -#define RD_LOGW(...) RD_LOG(WARN, ##__VA_ARGS__) -#define RD_LOGE(...) RD_LOG(ERROR, ##__VA_ARGS__) -#define RD_LOGC(...) RD_LOG(CRITICAL, ##__VA_ARGS__) +#define RD_LOGT(traceID, ...) RD_LOG(TRACE, traceID, ##__VA_ARGS__) +#define RD_LOGD(traceID, ...) RD_LOG(DEBUG, traceID, ##__VA_ARGS__) +#define RD_LOGI(traceID, ...) RD_LOG(INFO, traceID, ##__VA_ARGS__) +#define RD_LOGW(traceID, ...) RD_LOG(WARN, traceID, ##__VA_ARGS__) +#define RD_LOGE(traceID, ...) RD_LOG(ERROR, traceID, ##__VA_ARGS__) +#define RD_LOGC(traceID, ...) RD_LOG(CRITICAL, traceID, ##__VA_ARGS__) // For the logic snapshot obj_id, we use the highest bit to indicate the type of the snapshot message. // 0 is for HS, 1 is for Application.
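For reference, the reworked macros above thread a trace id through every replication log line. A short usage sketch (illustrative only; identify_str() and the LOG##level##MOD sinks come from the surrounding codebase):

    // Every call site now passes a trace id first; NO_TRACE_ID ("n/a") is used
    // on control paths that have no request in hand.
    RD_LOGD(rreq->traceID(), "Commit rreq=[{}]", rreq->to_compact_string());
    RD_LOGI(NO_TRACE_ID, "Saved config {}", (*m_raft_config_sb)["config"].dump());

    // The first call expands roughly to:
    LOGDEBUGMOD(replication, "[traceID={}] [{}] Commit rreq=[{}]",
                rreq->traceID(), identify_str(), rreq->to_compact_string());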
From 199bfc18f285787da4c29e65aefe793bb715e1a4 Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Wed, 2 Apr 2025 02:15:32 -0700 Subject: [PATCH 091/170] bump version Signed-off-by: Xiaoxi Chen --- conanfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index 4f17bf904..eb0674ef3 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.7.8" + version = "6.8.0" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" From 65fb0b97530362c06dd441742563611265c57676 Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Wed, 2 Apr 2025 23:50:38 +0800 Subject: [PATCH 092/170] Fix NPE Signed-off-by: Xiaoxi Chen --- src/lib/replication/repl_dev/raft_state_machine.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index 5ebfa2ec6..0b6b03a31 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -121,7 +121,7 @@ repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entr out: if (rreq == nullptr) { - RD_LOGE(rreq->traceID(), + RD_LOGE(rkey.traceID, "Failed to localize journal entry rkey={} jentry=[{}], we return error and let Raft resend this req", rkey.to_string(), jentry->to_string()); } From aefa0bba3bb6c309d625325d0b837e7064040e56 Mon Sep 17 00:00:00 2001 From: yuwmao Date: Mon, 7 Apr 2025 16:05:45 +0800 Subject: [PATCH 093/170] Support custom rdev name - Add a set_custom_rdev_name function so users can assign a more meaningful name for debugging - Add repl_req_ctx into get_blk_alloc_hints --- conanfile.py | 2 +- src/include/homestore/replication/repl_dev.h | 5 ++++- src/lib/replication/repl_dev/common.cpp | 2 +- src/lib/replication/repl_dev/common.h | 6 ++++++ src/lib/replication/repl_dev/raft_repl_dev.cpp | 5 +++-- src/lib/replication/repl_dev/raft_repl_dev.h | 8 +++++++- src/lib/replication/repl_dev/solo_repl_dev.h | 5 +++++ src/tests/test_common/raft_repl_test_base.hpp | 2 +- src/tests/test_solo_repl_dev.cpp | 2 +- 9 files changed, 29 insertions(+), 8 deletions(-) diff --git a/conanfile.py b/conanfile.py index eb0674ef3..c8a8d479f 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.8.0" + version = "6.9.0" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index 5b0395f44..945bf9133 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -336,7 +336,7 @@ class ReplDevListener { /// @return Expected to return blk_alloc_hints for this write. If the hints are not available, then return the /// error. It is to be noted this method should return error only in very abnornal cases as in some code flow, an /// error would result in a crash or stall of the entire commit thread. - virtual ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size) = 0; + virtual ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size, cintrusive< homestore::repl_req_ctx >& hs_ctx) = 0; /// @brief Called when the repl_dev is being destroyed. The consumer is expected to clean up any related resources.
/// However, it is expected that this call be idempotent. It is possible in rare scenarios that this can be called @@ -458,6 +458,9 @@ class ReplDev { /// @return group_id virtual group_id_t group_id() const = 0; + /// @brief Sets a custom name for the repldev. Users can assign a meaningful name to the repldev for easy debugging. + virtual void set_custom_rdev_name(std::string const& name) = 0; + /// @brief Gets the block size with which IO will happen on this device /// @return Block size virtual uint32_t get_blk_size() const = 0; diff --git a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp index 5d0f262f0..8cea3cc5a 100644 --- a/src/lib/replication/repl_dev/common.cpp +++ b/src/lib/replication/repl_dev/common.cpp @@ -103,7 +103,7 @@ void repl_req_ctx::change_raft_journal_buf(raft_buf_ptr_t new_buf, bool adjust_h ReplServiceError repl_req_ctx::alloc_local_blks(cshared< ReplDevListener >& listener, uint32_t data_size) { DEBUG_ASSERT(has_linked_data(), "Trying to allocate a block for non-inlined block"); - auto const hints_result = listener->get_blk_alloc_hints(m_header, data_size); + auto const hints_result = listener->get_blk_alloc_hints(m_header, data_size, repl_req_ptr_t(this)); if (hints_result.hasError()) { return hints_result.error(); } if (hints_result.value().committed_blk_id.has_value()) { diff --git a/src/lib/replication/repl_dev/common.h b/src/lib/replication/repl_dev/common.h index cf8f53759..43bbb7cbf 100644 --- a/src/lib/replication/repl_dev/common.h +++ b/src/lib/replication/repl_dev/common.h @@ -58,6 +58,7 @@ struct repl_journal_entry { struct repl_dev_superblk { static constexpr uint64_t REPL_DEV_SB_MAGIC = 0xABCDF00D; static constexpr uint32_t REPL_DEV_SB_VERSION = 1; + static constexpr size_t max_name_len = 64; uint64_t magic{REPL_DEV_SB_MAGIC}; uint32_t version{REPL_DEV_SB_VERSION}; @@ -68,9 +69,14 @@ struct repl_dev_superblk { repl_lsn_t checkpoint_lsn; // LSN upto which this replica have checkpointed the Data repl_lsn_t compact_lsn; // maximum LSN that can be compacted to uint64_t group_ordinal; // Ordinal number which will be used to indicate the rdevXYZ for debugging + char rdev_name[max_name_len]; // Short name for the group for easy debugging uint64_t get_magic() const { return magic; } uint32_t get_version() const { return version; } + void set_rdev_name(std::string const& name) { + std::strncpy(rdev_name, name.c_str(), max_name_len - 1); + rdev_name[max_name_len - 1] = '\0'; + } }; #pragma pack() diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index fcf49becc..1d3a7de11 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -48,8 +48,7 @@ RaftReplDev::RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk m_last_flushed_commit_lsn = m_commit_upto_lsn; m_compact_lsn = m_rd_sb->compact_lsn; - m_rdev_name = fmt::format("rdev{}", m_rd_sb->group_ordinal); - + m_rdev_name = m_rd_sb->rdev_name; // Its ok not to do compare exchange, because loading is always single threaded as of now if (m_rd_sb->group_ordinal >= s_next_group_ordinal.load()) { s_next_group_ordinal.store(m_rd_sb->group_ordinal + 1); @@ -72,6 +71,7 @@ RaftReplDev::RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk m_rd_sb->last_snapshot_lsn = 0; m_rd_sb->group_ordinal = s_next_group_ordinal.fetch_add(1); m_rdev_name = fmt::format("rdev{}", m_rd_sb->group_ordinal); + m_rd_sb->set_rdev_name(m_rdev_name); if 
(m_rd_sb->is_timeline_consistent) { m_free_blks_journal = logstore_service().create_new_log_store(m_rd_sb->logdev_id, false /* append_mode */); @@ -80,6 +80,7 @@ RaftReplDev::RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk m_rd_sb.write(); bind_data_service(); } + m_identify_str = m_rdev_name + ":" + group_id_str(); RD_LOGI(NO_TRACE_ID, diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 8d8d83315..ba0bc2f27 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -216,8 +216,14 @@ class RaftReplDev : public ReplDev, std::vector< peer_info > get_replication_status() const override; std::set< replica_id_t > get_active_peers() const; group_id_t group_id() const override { return m_group_id; } + void set_custom_rdev_name(std::string const& name) override { + RD_LOGI(NO_TRACE_ID, "Resetting repl dev name from {} to {}", m_rdev_name, name); + m_rdev_name = name; + m_identify_str = name + ":" + group_id_str(); + m_rd_sb->set_rdev_name(m_rdev_name); + } std::string group_id_str() const { return boost::uuids::to_string(m_group_id); } - std::string rdev_name() const { return m_rdev_name; }; + std::string rdev_name() const { return m_rd_sb->rdev_name; }; std::string identify_str() const { return m_identify_str; }; std::string my_replica_id_str() const { return boost::uuids::to_string(m_my_repl_id); } uint32_t get_blk_size() const override; diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index 78cace9f8..397f461da 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -64,6 +64,11 @@ class SoloReplDev : public ReplDev { uuid_t group_id() const override { return m_group_id; } + void set_custom_rdev_name(std::string const& name) override { + std::strncpy(m_rd_sb->rdev_name, name.c_str(), m_rd_sb->max_name_len - 1); + m_rd_sb->rdev_name[m_rd_sb->max_name_len - 1] = '\0'; + } + repl_lsn_t get_last_commit_lsn() const override { return 0; } uint32_t get_blk_size() const override; diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 8fe72ac1d..6b8fb4c35 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -318,7 +318,7 @@ class TestReplicatedDB : public homestore::ReplDevListener { void free_user_snp_ctx(void*& user_snp_ctx) override {} - ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size) override { + ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size, cintrusive< homestore::repl_req_ctx >& hs_ctx) override { auto jheader = r_cast(header.cbytes()); Key k{.id_ = jheader->key_id}; auto iter = inmem_db_.find(k); diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index e525ff494..ec45ef5b4 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -126,7 +126,7 @@ class SoloReplDevTest : public testing::Test { void on_rollback(int64_t lsn, const sisl::blob& header, const sisl::blob& key, cintrusive< repl_req_ctx >& ctx) override {} - ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size) override { + ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size, cintrusive< homestore::repl_req_ctx >& hs_ctx) override { return blk_alloc_hints{}; } From
47410ee898d0994cade007c8c6cd09dfc41bec52 Mon Sep 17 00:00:00 2001 From: yawzhang Date: Tue, 8 Apr 2025 11:41:37 +0800 Subject: [PATCH 094/170] add grpc message size as config --- conanfile.py | 4 ++-- src/include/homestore/homestore_decl.hpp | 2 ++ src/lib/common/homestore_config.fbs | 3 +++ src/lib/homestore.cpp | 9 +++++++++ .../replication/service/raft_repl_service.cpp | 4 +++- src/tests/test_common/raft_repl_test_base.hpp | 8 +++++--- src/tests/test_raft_repl_dev.cpp | 16 ++++++++++++++++ 7 files changed, 40 insertions(+), 6 deletions(-) diff --git a/conanfile.py b/conanfile.py index c8a8d479f..b234ca953 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.9.0" + version = "6.9.1" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" @@ -54,7 +54,7 @@ def build_requirements(self): def requirements(self): self.requires("iomgr/[^11.3]@oss/master", transitive_headers=True) self.requires("sisl/[^12.2]@oss/master", transitive_headers=True) - self.requires("nuraft_mesg/[^3.7.3]@oss/main", transitive_headers=True) + self.requires("nuraft_mesg/[>=3.7.5]@oss/main", transitive_headers=True) self.requires("farmhash/cci.20190513@", transitive_headers=True) if self.settings.arch in ['x86', 'x86_64']: diff --git a/src/include/homestore/homestore_decl.hpp b/src/include/homestore/homestore_decl.hpp index 96c26ac09..b36317ea9 100644 --- a/src/include/homestore/homestore_decl.hpp +++ b/src/include/homestore/homestore_decl.hpp @@ -169,6 +169,8 @@ struct hs_input_params { uint64_t app_mem_size{static_cast< uint64_t >(1024) * static_cast< uint64_t >(1024) * static_cast< uint64_t >(1024)}; // memory available for the app (including cache) uint64_t hugepage_size{0}; // memory available for the hugepage + int max_data_size{0}; // max data size in bytes on the data plane + int max_snapshot_batch_size{0}; // max snapshot batch size in bytes for the raft state machine bool is_read_only{false}; // Is read only bool auto_recovery{true}; // Recovery of data is automatic or controlled by the caller diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs index ef27b3a5c..20821fb96 100644 --- a/src/lib/common/homestore_config.fbs +++ b/src/lib/common/homestore_config.fbs @@ -235,6 +235,9 @@ table Consensus { // Max append batch size max_append_batch_size: int32 = 64; + // Max grpc message size + max_grpc_message_size: int32 = 67108864; + // Threshold of log gap from leader to consider a replica as stale stale_log_gap_hi_threshold: int32 = 200; diff --git a/src/lib/homestore.cpp b/src/lib/homestore.cpp index 793bc90d8..eb276349e 100644 --- a/src/lib/homestore.cpp +++ b/src/lib/homestore.cpp @@ -136,6 +136,15 @@ bool HomeStore::start(const hs_input_params& input, hs_before_services_starting_ HomeStoreDynamicConfig::init_settings_default(); + // Check if the max_grpc_message_size is large enough to hold the data and snapshot batch size + if (HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size) < input.max_data_size || + HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size) < input.max_snapshot_batch_size) { + LOGERROR("max_grpc_message_size {} is too small to hold max_data_size {} and max_snapshot_batch_size {}", + HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size), input.max_data_size, + input.max_snapshot_batch_size); + throw std::invalid_argument("max_grpc_message_size is insufficient for the configured data or snapshot sizes"); + } + #ifdef _PRERELEASE // Start a default crash simulator
which raises SIGKILL, in case user has not provided with_crash_simulator() // callback diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index ef0aaecc5..280aa0032 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -79,7 +79,9 @@ void RaftReplService::start() { .ssl_key_ = ioenvironment.get_ssl_key(), .ssl_cert_ = ioenvironment.get_ssl_cert(), .token_verifier_ = std::dynamic_pointer_cast< sisl::GrpcTokenVerifier >(ioenvironment.get_token_verifier()), - .token_client_ = std::dynamic_pointer_cast< sisl::GrpcTokenClient >(ioenvironment.get_token_client())}; + .token_client_ = std::dynamic_pointer_cast< sisl::GrpcTokenClient >(ioenvironment.get_token_client()), + .max_receive_message_size_ = HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size), + .max_send_message_size_ = HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size)}; m_msg_mgr = nuraft_mesg::init_messaging(params, weak_from_this(), true /* with_data_channel */); LOGINFO("Starting RaftReplService with server_uuid={} port={}", boost::uuids::to_string(params.server_uuid_), diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 6b8fb4c35..47778d9a8 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -566,7 +566,8 @@ class RaftReplDevTestBase : public testing::Test { } while (true); } - void write_on_leader(uint32_t num_entries, bool wait_for_commit = true, shared< TestReplicatedDB > db = nullptr) { + void write_on_leader(uint32_t num_entries, bool wait_for_commit = true, shared< TestReplicatedDB > db = nullptr, + uint64_t* data_size = nullptr) { if (dbs_[0]->repl_dev() == nullptr) return; do { @@ -587,9 +588,10 @@ class RaftReplDevTestBase : public testing::Test { g_helper->runner().set_num_tasks(num_entries); LOGINFO("Run on worker threads to schedule append on repldev for {} Bytes.", block_size); - g_helper->runner().set_task([this, block_size, db]() { + g_helper->runner().set_task([this, block_size, db, data_size]() { static std::normal_distribution<> num_blks_gen{3.0, 2.0}; - this->generate_writes(std::abs(std::lround(num_blks_gen(g_re))) * block_size, block_size, db); + uint64_t size = data_size == nullptr ? std::abs(std::lround(num_blks_gen(g_re))) * block_size : *data_size; + this->generate_writes(size, block_size, db); }); if (wait_for_commit) { g_helper->runner().execute().get(); } break; diff --git a/src/tests/test_raft_repl_dev.cpp b/src/tests/test_raft_repl_dev.cpp index cdcfa9b1e..7f7345e10 100644 --- a/src/tests/test_raft_repl_dev.cpp +++ b/src/tests/test_raft_repl_dev.cpp @@ -451,6 +451,22 @@ TEST_F(RaftReplDevTest, BaselineTest) { LOGINFO("BaselineTest done"); } +TEST_F(RaftReplDevTest, LargeDataWrite) { + LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); + g_helper->sync_for_test_start(); + + // TODO: Increase the data size (e.g., to 16MB) for testing. + // For now, use 4MB to ensure the test passes since there are issues with larger IO sizes on the uring drive. 
+    uint64_t entries_per_attempt = SISL_OPTIONS["num_io"].as< uint64_t >();
+    uint64_t data_size = 4 * 1024 * 1024;
+    this->write_on_leader(entries_per_attempt, true /* wait_for_commit */, nullptr, &data_size);
+
+    g_helper->sync_for_verify_start();
+    LOGINFO("Validate all data written so far by reading them");
+    this->validate_data();
+    g_helper->sync_for_cleanup_start();
+}
+
 int main(int argc, char* argv[]) {
     int parsed_argc = argc;
     char** orig_argv = argv;

From 1f043bfcdaf611fdfb514a4cda4b53db716fa291 Mon Sep 17 00:00:00 2001
From: yawzhang
Date: Wed, 9 Apr 2025 14:51:43 +0800
Subject: [PATCH 095/170] add metric for blk usage

---
 conanfile.py                                   |  2 +-
 src/lib/device/chunk.cpp                       |  4 ++++
 src/lib/device/chunk.h                         |  3 +++
 src/lib/replication/repl_dev/raft_repl_dev.cpp | 15 +++++++++++++++
 src/lib/replication/repl_dev/raft_repl_dev.h   |  9 +++++++++
 5 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/conanfile.py b/conanfile.py
index b234ca953..e4da8a5db 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@ class HomestoreConan(ConanFile):
     name = "homestore"
-    version = "6.9.1"
+    version = "6.9.2"
     homepage = "https://github.com/eBay/Homestore"
     description = "HomeStore Storage Engine"

diff --git a/src/lib/device/chunk.cpp b/src/lib/device/chunk.cpp
index 9eb8563de..4962be386 100644
--- a/src/lib/device/chunk.cpp
+++ b/src/lib/device/chunk.cpp
@@ -29,6 +29,10 @@ std::string Chunk::to_string() const {
                        vdev_ordinal(), stream_id());
 }

+float Chunk::get_blk_usage() const {
+    return s_cast< float >(m_blk_allocator->get_used_blks()) / s_cast< float >(m_blk_allocator->get_total_blks());
+}
+
 void Chunk::set_user_private(const sisl::blob& data) {
     std::unique_lock lg{m_mgmt_mutex};
     m_chunk_info.set_user_private(data);

diff --git a/src/lib/device/chunk.h b/src/lib/device/chunk.h
index 77b275e4b..b9d84abdb 100644
--- a/src/lib/device/chunk.h
+++ b/src/lib/device/chunk.h
@@ -27,6 +27,7 @@ class Chunk {
     const uint32_t m_stream_id;
     uint32_t m_vdev_ordinal{0};
     shared< BlkAllocator > m_blk_allocator;
+    float blk_usage_report_threshold{0.9};

 public:
     static constexpr auto MAX_CHUNK_SIZE = std::numeric_limits< uint32_t >::max();
@@ -66,6 +67,8 @@ class Chunk {
     nlohmann::json get_status([[maybe_unused]] int log_level) const;
     const BlkAllocator* blk_allocator() const { return m_blk_allocator.get(); }
     BlkAllocator* blk_allocator_mutable() { return m_blk_allocator.get(); }
+    float get_blk_usage_report_threshold() const { return blk_usage_report_threshold; }
+    float get_blk_usage() const;

     ////////////// Setters /////////////////////
     void set_user_private(const sisl::blob& data);

diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp
index 1d3a7de11..d28e13a47 100644
--- a/src/lib/replication/repl_dev/raft_repl_dev.cpp
+++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp
@@ -18,6 +18,7 @@
 // #include "common/homestore_flip.hpp"
 #include "replication/service/raft_repl_service.h"
 #include "replication/repl_dev/raft_repl_dev.h"
+#include "device/chunk.h"
 #include "device/device.h"
 #include "push_data_rpc_generated.h"
 #include "fetch_data_rpc_generated.h"
@@ -1406,6 +1407,7 @@ nuraft::cb_func::ReturnCode RaftReplDev::raft_event(nuraft::cb_func::Type type,
             m_state_machine->reset_next_batch_size_hint(std::max(1ul, i));
             return nuraft::cb_func::ReturnCode::ReturnNull;
         }
+        report_blk_metrics_if_needed(req);
         reqs->emplace_back(std::move(req));
     }
@@ -1729,4 +1731,17 @@ bool RaftReplDev::is_resync_mode() {
     return resync_mode;
 }

+void 
RaftReplDev::report_blk_metrics_if_needed(repl_req_ptr_t rreq) {
+    auto chunk_id = rreq->local_blkid().chunk_num();
+    auto chunk = hs()->device_mgr()->get_chunk(chunk_id);
+    if (chunk->get_blk_usage() >= chunk->get_blk_usage_report_threshold()) {
+        auto local_blk_num = rreq->local_blkid().blk_num();
+        auto remote_blk_num = rreq->remote_blkid().blkid.blk_num();
+        // Focus only on cases where the locally allocated blocks exceed the proposer's allocated blocks,
+        // as this indicates that the member might encounter NO_SPACE_LEFT before the proposer.
+        auto blk_diff_with_remote = local_blk_num > remote_blk_num ? local_blk_num - remote_blk_num : 0;
+        HISTOGRAM_OBSERVE(m_metrics, blk_diff_with_proposer, blk_diff_with_remote);
+    }
+}
+
 } // namespace homestore

diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h
index ba0bc2f27..fb9d2aab7 100644
--- a/src/lib/replication/repl_dev/raft_repl_dev.h
+++ b/src/lib/replication/repl_dev/raft_repl_dev.h
@@ -90,6 +90,13 @@ class RaftReplDevMetrics : public sisl::MetricsGroup {
     REGISTER_HISTOGRAM(rreq_pieces_per_write, "Number of individual pieces per write",
                        HistogramBucketsType(LinearUpto64Buckets));

+    // In an identical-layout chunk, the blk num of the follower and the leader is expected to be the same.
+    // However, due to the concurrency between the data channel and the raft channel, there might be some
+    // allocation differences on the same lsn. When a leader switch occurs, these differences could become garbage.
+    // This metric can partially reflect the potential amount of garbage.
+    REGISTER_HISTOGRAM(blk_diff_with_proposer,
+                       "allocated blk num diff on the same lsn with proposer when chunk usage >= 0.9");
+
     // Raft channel metrics
     REGISTER_HISTOGRAM(raft_end_of_append_batch_latency_us, "Raft end_of_append_batch latency in us",
                        "raft_logstore_append_latency", {"op", "end_of_append_batch"});
@@ -390,6 +397,8 @@ class RaftReplDev : public ReplDev,
     void reset_quorum_size(uint32_t commit_quorum, uint64_t trace_id);
     void create_snp_resync_data(raft_buf_ptr_t& data_out);
     bool save_snp_resync_data(nuraft::buffer& data, nuraft::snapshot& s);
+
+    void report_blk_metrics_if_needed(repl_req_ptr_t rreq);
 };

 } // namespace homestore

From 9feabb30e6ac425050571e3236c2110f8772f868 Mon Sep 17 00:00:00 2001
From: Sanal
Date: Thu, 10 Apr 2025 02:41:21 +0530
Subject: [PATCH 096/170] Fix solo repl dev log flush and graceful shutdown.

Add a flush mode to logdev: nublocks uses timer-based flush, while nuobject
uses explicit log flush. The flush mode has to be stored in the superblk to
support recovery. Enable the solo repl dev UT and add graceful shutdown so
the UT can pass.
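The flush modes introduced by this patch are or'able bit flags, so a single logdev can allow more than one flush trigger: the solo repl dev opens its logdev with TIMER, while the raft log store uses EXPLICIT. A minimal sketch of how such flags combine and are tested (illustrative only; the actual code declares flush_mode_t with the VENUM macro and queries it through member helpers such as allow_timer_flush()):

    // Sketch under the stated assumptions; mirrors the flag values added in this patch.
    #include <cstdint>

    enum class flush_mode_t : uint32_t {
        INLINE = 1u << 0,   // flush inline with the append
        TIMER = 1u << 1,    // timer-based automatic flush
        EXPLICIT = 1u << 2, // user explicitly calls flush
    };

    constexpr flush_mode_t operator|(flush_mode_t a, flush_mode_t b) {
        return static_cast< flush_mode_t >(static_cast< uint32_t >(a) | static_cast< uint32_t >(b));
    }
    constexpr bool test_mode(flush_mode_t m, flush_mode_t f) {
        return (static_cast< uint32_t >(m) & static_cast< uint32_t >(f)) != 0;
    }

    // A logdev created with (TIMER | EXPLICIT) satisfies both checks below.
    constexpr bool allow_timer_flush(flush_mode_t m) { return test_mode(m, flush_mode_t::TIMER); }
    constexpr bool allow_explicit_flush(flush_mode_t m) { return test_mode(m, flush_mode_t::EXPLICIT); }

Because the chosen mode is persisted in the logdev superblk, a restarted node recovers the same flush behavior; open_logdev() still takes the mode for the case where the superblk has not been recovered yet.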
--- conanfile.py | 2 +- .../homestore/logstore/log_store_internal.hpp | 7 +++ src/include/homestore/logstore_service.hpp | 6 +-- src/lib/logstore/log_dev.cpp | 15 +++--- src/lib/logstore/log_dev.hpp | 10 ++-- src/lib/logstore/log_store_service.cpp | 15 +++--- .../log_store/home_raft_log_store.cpp | 8 ++-- .../replication/repl_dev/solo_repl_dev.cpp | 26 +++++++++-- src/lib/replication/repl_dev/solo_repl_dev.h | 2 +- .../replication/service/generic_repl_svc.cpp | 23 +++++++++- src/tests/CMakeLists.txt | 2 +- src/tests/log_store_benchmark.cpp | 2 +- src/tests/test_log_dev.cpp | 46 +++++++++---------- src/tests/test_log_store.cpp | 6 +-- src/tests/test_log_store_long_run.cpp | 6 +-- src/tests/test_solo_repl_dev.cpp | 20 ++++---- 16 files changed, 119 insertions(+), 77 deletions(-) diff --git a/conanfile.py b/conanfile.py index e4da8a5db..8b399c66c 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.9.2" + version = "6.9.3" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/logstore/log_store_internal.hpp b/src/include/homestore/logstore/log_store_internal.hpp index 9b7019cfb..7768086ee 100644 --- a/src/include/homestore/logstore/log_store_internal.hpp +++ b/src/include/homestore/logstore/log_store_internal.hpp @@ -52,6 +52,12 @@ typedef std::function< void(std::shared_ptr< HomeLogStore >, logstore_seq_num_t) typedef int64_t logid_t; +VENUM(flush_mode_t, uint32_t, // Various flush modes (can be or'ed together) + INLINE = 1 << 0, // Allow flush inline with the append + TIMER = 1 << 1, // Allow timer based automatic flush + EXPLICIT = 1 << 2, // Allow explcitly user calling flush +); + struct logdev_key { logid_t idx; off_t dev_offset; @@ -172,4 +178,5 @@ struct logstore_superblk { logstore_seq_num_t m_first_seq_num{0}; }; #pragma pack() + } // namespace homestore \ No newline at end of file diff --git a/src/include/homestore/logstore_service.hpp b/src/include/homestore/logstore_service.hpp index fe65c7c13..48183a56c 100644 --- a/src/include/homestore/logstore_service.hpp +++ b/src/include/homestore/logstore_service.hpp @@ -94,7 +94,7 @@ class LogStoreService { * chunks. Logdev can start with zero chunks and dynamically add chunks based on write request. * @return Newly created log dev id. */ - logdev_id_t create_new_logdev(); + logdev_id_t create_new_logdev(flush_mode_t flush_mode); /** * @brief Open a log dev. @@ -102,7 +102,7 @@ class LogStoreService { * @param logdev_id: Logdev ID * @return Newly created log dev id. */ - void open_logdev(logdev_id_t logdev_id); + void open_logdev(logdev_id_t logdev_id, flush_mode_t flush_mode); /** * @brief Destroy a log dev. 
@@ -178,7 +178,7 @@ class LogStoreService { void delete_unopened_logdevs(); private: - std::shared_ptr< LogDev > create_new_logdev_internal(logdev_id_t logdev_id); + std::shared_ptr< LogDev > create_new_logdev_internal(logdev_id_t logdev_id, flush_mode_t flush_mode); void on_meta_blk_found(const sisl::byte_view& buf, void* meta_cookie); logdev_id_t get_next_logdev_id(); void logdev_super_blk_found(const sisl::byte_view& buf, void* meta_cookie); diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp index 3716cb70e..a23b7c900 100644 --- a/src/lib/logstore/log_dev.cpp +++ b/src/lib/logstore/log_dev.cpp @@ -64,7 +64,7 @@ void LogDev::start(bool format, std::shared_ptr< JournalVirtualDev > vdev) { // First read the info block if (format) { HS_LOG_ASSERT(m_logdev_meta.is_empty(), "Expected meta to be not present"); - m_logdev_meta.create(m_logdev_id); + m_logdev_meta.create(m_logdev_id, m_flush_mode); m_vdev_jd->update_data_start_offset(0); } else { HS_LOG_ASSERT(!m_logdev_meta.is_empty(), "Expected meta data to be read already before loading"); @@ -108,7 +108,6 @@ LogDev::~LogDev() { HS_LOG_ASSERT((m_pending_flush_size.load() == 0), "LogDev stop attempted while writes to logdev are pending completion"); - if (allow_timer_flush()) stop_timer(); m_log_records.reset(nullptr); m_logdev_meta.reset(); m_log_idx.store(0); @@ -149,6 +148,7 @@ void LogDev::stop() { // after we call stop, we need to do any pending device truncations truncate(); m_id_logstore_map.clear(); + if (allow_timer_flush()) stop_timer(); } void LogDev::destroy() { @@ -521,7 +521,7 @@ void LogDev::on_flush_completion(LogGroup* lg) { // since we support out-of-order lsn write, so no need to guarantee the order of logstore write completion for (auto const& [idx, req] : req_map) { m_pending_callback++; - iomanager.run_on_forget(iomgr::reactor_regex::random_worker, iomgr::fiber_regex::syncio_only, + iomanager.run_on_forget(iomgr::reactor_regex::random_worker, /* iomgr::fiber_regex::syncio_only, */ [this, dev_offset, idx, req]() { auto ld_key = logdev_key{idx, dev_offset}; auto comp_cb = req->log_store->get_comp_cb(); @@ -561,11 +561,13 @@ uint64_t LogDev::truncate() { // Persist the logstore superblock to ensure correct start LSN during recovery. Avoid such scenario: // 1. Follower1 appends logs up to 100, then is stopped by a sigkill. // 2. Upon restart, a baseline resync is triggered using snapshot 2000. - // 3. Baseline resync completed with start_lsn=2001, but m_trunc_ld_key remains {0,0} since we cannot get a valid + // 3. Baseline resync completed with start_lsn=2001, but m_trunc_ld_key remains {0,0} since we cannot get a + // valid // device offset for LSN 2000 to update it. // 4. Follower1 appends logs from 2001 to 2500, making tail_lsn > 2000. // 5. Get m_trunc_ld_key={0,0}, goto here and return 0 without persist. - // 6. Follower1 is killed again, after restart, its start index remains 0, misinterpreting the range as [1,2500]. + // 6. Follower1 is killed again, after restart, its start index remains 0, misinterpreting the range as + // [1,2500]. 
m_logdev_meta.persist(); decr_pending_request_num(); return 0; @@ -781,7 +783,7 @@ nlohmann::json LogDev::get_status(int verbosity) const { /////////////////////////////// LogDevMetadata Section /////////////////////////////////////// LogDevMetadata::LogDevMetadata() : m_sb{logdev_sb_meta_name}, m_rollback_sb{logdev_rollback_sb_meta_name} {} -logdev_superblk* LogDevMetadata::create(logdev_id_t id) { +logdev_superblk* LogDevMetadata::create(logdev_id_t id, flush_mode_t flush_mode) { logdev_superblk* sb = m_sb.create(logdev_sb_size_needed(0)); rollback_superblk* rsb = m_rollback_sb.create(rollback_superblk::size_needed(1)); @@ -790,6 +792,7 @@ logdev_superblk* LogDevMetadata::create(logdev_id_t id) { m_id_reserver = std::make_unique< sisl::IDReserver >(); m_sb->logdev_id = id; + m_sb->flush_mode = flush_mode; m_sb.write(); m_rollback_sb->logdev_id = id; diff --git a/src/lib/logstore/log_dev.hpp b/src/lib/logstore/log_dev.hpp index 719a58861..d43dab219 100644 --- a/src/lib/logstore/log_dev.hpp +++ b/src/lib/logstore/log_dev.hpp @@ -404,6 +404,8 @@ struct logdev_superblk { uint32_t num_stores{0}; uint64_t start_dev_offset{0}; logid_t key_idx{0}; + flush_mode_t flush_mode; + // The meta data starts immediately after the super block // Equivalent of: // logstore_superblk meta[0]; @@ -481,7 +483,7 @@ class LogDevMetadata { LogDevMetadata& operator=(LogDevMetadata&&) noexcept = delete; ~LogDevMetadata() = default; - logdev_superblk* create(logdev_id_t id); + logdev_superblk* create(logdev_id_t id, flush_mode_t); void reset(); std::vector< std::pair< logstore_id_t, logstore_superblk > > load(); void persist(); @@ -572,12 +574,6 @@ struct logstore_info { static std::string const logdev_sb_meta_name{"Logdev_sb"}; static std::string const logdev_rollback_sb_meta_name{"Logdev_rollback_sb"}; -VENUM(flush_mode_t, uint32_t, // Various flush modes (can be or'ed together) - INLINE = 1 << 0, // Allow flush inline with the append - TIMER = 1 << 1, // Allow timer based automatic flush - EXPLICIT = 1 << 2, // Allow explcitly user calling flush -); - class LogDev : public std::enable_shared_from_this< LogDev > { friend class HomeLogStore; diff --git a/src/lib/logstore/log_store_service.cpp b/src/lib/logstore/log_store_service.cpp index 86f404e8c..542204386 100644 --- a/src/lib/logstore/log_store_service.cpp +++ b/src/lib/logstore/log_store_service.cpp @@ -142,12 +142,12 @@ logdev_id_t LogStoreService::get_next_logdev_id() { return id; } -logdev_id_t LogStoreService::create_new_logdev() { +logdev_id_t LogStoreService::create_new_logdev(flush_mode_t flush_mode) { if (is_stopping()) return 0; incr_pending_request_num(); folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); logdev_id_t logdev_id = get_next_logdev_id(); - auto logdev = create_new_logdev_internal(logdev_id); + auto logdev = create_new_logdev_internal(logdev_id, flush_mode); logdev->start(true /* format */, m_logdev_vdev); COUNTER_INCREMENT(m_metrics, logdevs_count, 1); HS_LOG(INFO, logstore, "Created log_dev={}", logdev_id); @@ -189,19 +189,19 @@ void LogStoreService::delete_unopened_logdevs() { m_unopened_logdev.clear(); } -std::shared_ptr< LogDev > LogStoreService::create_new_logdev_internal(logdev_id_t logdev_id) { - auto logdev = std::make_shared< LogDev >(logdev_id); +std::shared_ptr< LogDev > LogStoreService::create_new_logdev_internal(logdev_id_t logdev_id, flush_mode_t flush_mode) { + auto logdev = std::make_shared< LogDev >(logdev_id, flush_mode); const auto it = m_id_logdev_map.find(logdev_id); HS_REL_ASSERT((it == 
m_id_logdev_map.end()), "logdev id {} already exists", logdev_id); m_id_logdev_map.insert(std::make_pair<>(logdev_id, logdev)); return logdev; } -void LogStoreService::open_logdev(logdev_id_t logdev_id) { +void LogStoreService::open_logdev(logdev_id_t logdev_id, flush_mode_t flush_mode) { folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); const auto it = m_id_logdev_map.find(logdev_id); if (it == m_id_logdev_map.end()) { - auto logdev = std::make_shared< LogDev >(logdev_id); + auto logdev = std::make_shared< LogDev >(logdev_id, flush_mode); m_id_logdev_map.emplace(logdev_id, logdev); LOGDEBUGMOD(logstore, "log_dev={} does not exist, created!", logdev_id); } @@ -238,13 +238,14 @@ void LogStoreService::logdev_super_blk_found(const sisl::byte_view& buf, void* m folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); std::shared_ptr< LogDev > logdev; auto id = sb->logdev_id; + auto flush_mode = sb->flush_mode; const auto it = m_id_logdev_map.find(id); // We could update the logdev map either with logdev or rollback superblks found callbacks. if (it != m_id_logdev_map.end()) { logdev = it->second; HS_LOG(DEBUG, logstore, "Log dev superblk found log_dev={}", id); } else { - logdev = std::make_shared< LogDev >(id); + logdev = std::make_shared< LogDev >(id, flush_mode); m_id_logdev_map.emplace(id, logdev); // when recover logdev meta blk, we get all the logdevs from the superblk. we put them in m_unopened_logdev // too. after logdev meta blks are all recovered, when a client opens a logdev, we remove it from diff --git a/src/lib/replication/log_store/home_raft_log_store.cpp b/src/lib/replication/log_store/home_raft_log_store.cpp index be7039059..1dc9fb199 100644 --- a/src/lib/replication/log_store/home_raft_log_store.cpp +++ b/src/lib/replication/log_store/home_raft_log_store.cpp @@ -92,7 +92,7 @@ HomeRaftLogStore::HomeRaftLogStore(logdev_id_t logdev_id, logstore_id_t logstore m_dummy_log_entry = nuraft::cs_new< nuraft::log_entry >(0, nuraft::buffer::alloc(0), nuraft::log_val_type::app_log); if (logstore_id == UINT32_MAX) { - m_logdev_id = logstore_service().create_new_logdev(); + m_logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); m_log_store = logstore_service().create_new_log_store(m_logdev_id, true); if (!m_log_store) { throw std::runtime_error("Failed to create log store"); } m_logstore_id = m_log_store->get_store_id(); @@ -101,7 +101,7 @@ HomeRaftLogStore::HomeRaftLogStore(logdev_id_t logdev_id, logstore_id_t logstore m_logdev_id = logdev_id; m_logstore_id = logstore_id; LOGDEBUGMOD(replication, "Opening existing home log_dev={} log_store={}", m_logdev_id, logstore_id); - logstore_service().open_logdev(m_logdev_id); + logstore_service().open_logdev(m_logdev_id, flush_mode_t::EXPLICIT); m_log_store_future = logstore_service() .open_log_store(m_logdev_id, logstore_id, true, log_found_cb, log_replay_done_cb) .thenValue([this](auto log_store) { @@ -382,8 +382,8 @@ ulong HomeRaftLogStore::last_durable_index() { void HomeRaftLogStore::purge_all_logs() { auto last_lsn = m_log_store->get_contiguous_issued_seq_num(m_last_durable_lsn); - REPL_STORE_LOG(INFO, "Store={} LogDev={}: Purging all logs in the log store, last_lsn={}", - m_logstore_id, m_logdev_id, last_lsn); + REPL_STORE_LOG(INFO, "Store={} LogDev={}: Purging all logs in the log store, last_lsn={}", m_logstore_id, + m_logdev_id, last_lsn); m_log_store->truncate(last_lsn, false /* in_memory_truncate_only */); } diff --git a/src/lib/replication/repl_dev/solo_repl_dev.cpp 
b/src/lib/replication/repl_dev/solo_repl_dev.cpp index bc278303a..22f2446d0 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.cpp +++ b/src/lib/replication/repl_dev/solo_repl_dev.cpp @@ -10,7 +10,7 @@ namespace homestore { SoloReplDev::SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existing) : m_rd_sb{std::move(rd_sb)}, m_group_id{m_rd_sb->group_id} { if (load_existing) { - logstore_service().open_logdev(m_rd_sb->logdev_id); + logstore_service().open_logdev(m_rd_sb->logdev_id, flush_mode_t::TIMER); logstore_service() .open_log_store(m_rd_sb->logdev_id, m_rd_sb->logstore_id, true /* append_mode */) .thenValue([this](auto log_store) { @@ -19,7 +19,7 @@ SoloReplDev::SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existi m_data_journal->register_log_found_cb(bind_this(SoloReplDev::on_log_found, 3)); }); } else { - m_logdev_id = logstore_service().create_new_logdev(); + m_logdev_id = logstore_service().create_new_logdev(flush_mode_t::TIMER); m_data_journal = logstore_service().create_new_log_store(m_logdev_id, true /* append_mode */); m_rd_sb->logstore_id = m_data_journal->get_store_id(); m_rd_sb->logdev_id = m_logdev_id; @@ -30,6 +30,8 @@ SoloReplDev::SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existi void SoloReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, repl_req_ptr_t rreq, trace_id_t tid) { if (!rreq) { auto rreq = repl_req_ptr_t(new repl_req_ctx{}); } + + incr_pending_request_num(); auto status = rreq->init(repl_key{.server_id = 0, .term = 1, .dsn = 1, .traceID = tid}, value.size ? journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true, header, key, value.size, m_listener); @@ -60,6 +62,7 @@ void SoloReplDev::write_journal(repl_req_ptr_t rreq) { data_service().commit_blk(rreq->local_blkid()); m_listener->on_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq->local_blkid(), rreq); + decr_pending_request_num(); }); } @@ -68,7 +71,6 @@ void SoloReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx uint32_t remain_size = buf.size() - sizeof(repl_journal_entry); HS_REL_ASSERT_EQ(entry->major_version, repl_journal_entry::JOURNAL_ENTRY_MAJOR, "Mismatched version of journal entry found"); - HS_REL_ASSERT_EQ(entry->code, journal_type_t::HS_DATA_LINKED, "Found a journal entry which is not data"); uint8_t const* raw_ptr = r_cast< uint8_t const* >(entry) + sizeof(repl_journal_entry); sisl::blob header{raw_ptr, entry->user_header_size}; @@ -95,11 +97,25 @@ void SoloReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx folly::Future< std::error_code > SoloReplDev::async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, bool part_of_batch, trace_id_t tid) { - return data_service().async_read(bid, sgs, size, part_of_batch); + if (is_stopping()) { + LOGINFO("repl dev is being shutdown!"); + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); + } + incr_pending_request_num(); + auto result = data_service().async_read(bid, sgs, size, part_of_batch); + decr_pending_request_num(); + return result; } folly::Future< std::error_code > SoloReplDev::async_free_blks(int64_t, MultiBlkId const& bid, trace_id_t tid) { - return data_service().async_free_blk(bid); + if (is_stopping()) { + LOGINFO("repl dev is being shutdown!"); + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); + } + incr_pending_request_num(); + auto result = 
data_service().async_free_blk(bid); + decr_pending_request_num(); + return result; } uint32_t SoloReplDev::get_blk_size() const { return data_service().get_blk_size(); } diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index 397f461da..579506db1 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -58,7 +58,7 @@ class SoloReplDev : public ReplDev { bool is_ready_for_traffic() const override { return true; } void purge() override {} - std::shared_ptr deserialize_snapshot_context(sisl::io_blob_safe &snp_ctx) override { + std::shared_ptr< snapshot_context > deserialize_snapshot_context(sisl::io_blob_safe& snp_ctx) override { return nullptr; } diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index f357cb819..20a5f8436 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -78,7 +78,7 @@ hs_stats GenericReplService::get_cap_stats() const { ///////////////////// SoloReplService specializations and CP Callbacks ///////////////////////////// SoloReplService::SoloReplService(cshared< ReplApplication >& repl_app) : GenericReplService{repl_app} {} -SoloReplService::~SoloReplService(){}; +SoloReplService::~SoloReplService() {}; void SoloReplService::start() { for (auto const& [buf, mblk] : m_sb_bufs) { @@ -97,7 +97,23 @@ void SoloReplService::start() { } void SoloReplService::stop() { - // TODO: Implement graceful shutdown for soloReplService + start_stopping(); + while (true) { + auto pending_request_num = get_pending_request_num(); + if (!pending_request_num) break; + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + + // stop all repl_devs + { + std::unique_lock lg(m_rd_map_mtx); + for (auto it = m_rd_map.begin(); it != m_rd_map.end(); ++it) { + auto rdev = std::dynamic_pointer_cast< SoloReplDev >(it->second); + rdev->stop(); + } + } + hs()->logstore_service().stop(); + hs()->data_service().stop(); } AsyncReplResult< shared< ReplDev > > SoloReplService::create_repl_dev(group_id_t group_id, @@ -110,6 +126,7 @@ AsyncReplResult< shared< ReplDev > > SoloReplService::create_repl_dev(group_id_t auto listener = m_repl_app->create_repl_dev_listener(group_id); listener->set_repl_dev(rdev); rdev->attach_listener(std::move(listener)); + incr_pending_request_num(); { std::unique_lock lg(m_rd_map_mtx); @@ -117,10 +134,12 @@ AsyncReplResult< shared< ReplDev > > SoloReplService::create_repl_dev(group_id_t if (!happened) { // We should never reach here, as we have failed to emplace in map, but couldn't find entry DEBUG_ASSERT(false, "Unable to put the repl_dev in rd map"); + decr_pending_request_num(); return make_async_error< shared< ReplDev > >(ReplServiceError::SERVER_ALREADY_EXISTS); } } + decr_pending_request_num(); return make_async_success< shared< ReplDev > >(rdev); } diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index 940d2e891..cba159954 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -131,7 +131,7 @@ if (${io_tests}) add_test(NAME DataService-Epoll COMMAND test_data_service) add_test(NAME RaftReplDev-Epoll COMMAND test_raft_repl_dev) # add_test(NAME RaftReplDevDynamic-Epoll COMMAND test_raft_repl_dev_dynamic) - # add_test(NAME SoloReplDev-Epoll COMMAND test_solo_repl_dev) + add_test(NAME SoloReplDev-Epoll COMMAND test_solo_repl_dev) endif() can_build_spdk_io_tests(spdk_tests) diff --git 
a/src/tests/log_store_benchmark.cpp b/src/tests/log_store_benchmark.cpp
index c4e37fa25..986ab1cc7 100644
--- a/src/tests/log_store_benchmark.cpp
+++ b/src/tests/log_store_benchmark.cpp
@@ -55,7 +55,7 @@ class BenchLogStore {
 public:
     friend class SampleDB;
     BenchLogStore() {
-        m_logdev_id = logstore_service().create_new_logdev();
+        m_logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT);
         m_log_store = logstore_service().create_new_log_store(m_logdev_id, true /* append_mode */);
         m_log_store->register_log_found_cb(bind_this(BenchLogStore::on_log_found, 3));
         m_nth_entry.store(0);

diff --git a/src/tests/test_log_dev.cpp b/src/tests/test_log_dev.cpp
index 7bde7bc12..45ecee96f 100644
--- a/src/tests/test_log_dev.cpp
+++ b/src/tests/test_log_dev.cpp
@@ -158,9 +158,10 @@ class LogDevTest : public ::testing::Test {
         }
     }

-    void insert_batch_sync(std::shared_ptr< HomeLogStore > log_store, logstore_seq_num_t& lsn, int64_t batch, uint32_t fixed_size = 0) {
+    void insert_batch_sync(std::shared_ptr< HomeLogStore > log_store, logstore_seq_num_t& lsn, int64_t batch,
+                           uint32_t fixed_size = 0) {
         bool io_memory{false};
-        std::vector<test_log_data*> data_vector;
+        std::vector< test_log_data* > data_vector;

         for (int64_t i = 0; i < batch; ++i) {
             auto* d = prepare_data(lsn + i, io_memory, fixed_size);
@@ -246,20 +247,16 @@ class LogDevTest : public ::testing::Test {
     logid_t get_last_truncate_idx(logdev_id_t logdev_id) {
         auto status = logstore_service().get_logdev(logdev_id)->get_status(0);
-        if (status.contains("last_truncate_log_idx")) {
-            return s_cast<logid_t>(status["last_truncate_log_idx"]);
-        }
+        if (status.contains("last_truncate_log_idx")) { return s_cast< logid_t >(status["last_truncate_log_idx"]); }
         LOGERROR("Failed to get last_truncate_log_idx from logdev status for logdev_id {}", logdev_id);
-        return static_cast<logid_t>(-1);
+        return static_cast< logid_t >(-1);
     }

     logid_t get_current_log_idx(logdev_id_t logdev_id) {
         auto status = logstore_service().get_logdev(logdev_id)->get_status(0);
-        if (status.contains("current_log_idx")) {
-            return s_cast<logid_t>(status["current_log_idx"]);
-        }
+        if (status.contains("current_log_idx")) { return s_cast< logid_t >(status["current_log_idx"]); }
         LOGERROR("Failed to get current_log_idx from logdev status for logdev_id {}", logdev_id);
-        return static_cast<logid_t>(-1);
+        return static_cast< logid_t >(-1);
     }
 };

@@ -268,7 +265,7 @@ TEST_F(LogDevTest, WriteSyncThenRead) {
     for (uint32_t iteration{0}; iteration < iterations; ++iteration) {
         LOGINFO("Iteration {}", iteration);
-        auto logdev_id = logstore_service().create_new_logdev();
+        auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT);
         s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple();
         auto log_store = logstore_service().create_new_log_store(logdev_id, false);
         const auto store_id = log_store->get_store_id();
@@ -288,7 +285,7 @@ TEST_F(LogDevTest, WriteSyncThenRead) {

 TEST_F(LogDevTest, Rollback) {
     LOGINFO("Step 1: Create a single logstore to start rollback test");
-    auto logdev_id = logstore_service().create_new_logdev();
+    auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT);
     s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple();
     auto log_store = logstore_service().create_new_log_store(logdev_id, false);
     auto store_id = log_store->get_store_id();
@@ -296,7 +293,7 @@ TEST_F(LogDevTest, Rollback) {
     auto restart = [&]() {
         std::promise< bool > p;
         auto starting_cb = [&]() {
logstore_service().open_logdev(logdev_id, flush_mode_t::EXPLICIT); logstore_service().open_log_store(logdev_id, store_id, false /* append_mode */).thenValue([&](auto store) { log_store = store; p.set_value(true); @@ -355,7 +352,7 @@ TEST_F(LogDevTest, Rollback) { TEST_F(LogDevTest, ReTruncate) { LOGINFO("Step 1: Create a single logstore to start re-truncate test"); - auto logdev_id = logstore_service().create_new_logdev(); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); auto log_store = logstore_service().create_new_log_store(logdev_id, false); @@ -382,7 +379,7 @@ TEST_F(LogDevTest, ReTruncate) { TEST_F(LogDevTest, TruncateWithExceedingLSN) { LOGINFO("Step 1: Create a single logstore to start truncate with exceeding LSN test"); - auto logdev_id = logstore_service().create_new_logdev(); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); auto log_store = logstore_service().create_new_log_store(logdev_id, false); @@ -426,7 +423,7 @@ TEST_F(LogDevTest, TruncateWithExceedingLSN) { TEST_F(LogDevTest, TruncateAfterRestart) { LOGINFO("Step 1: Create a single logstore to start truncate with overlapping LSN test"); - auto logdev_id = logstore_service().create_new_logdev(); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); auto log_store = logstore_service().create_new_log_store(logdev_id, false); auto store_id = log_store->get_store_id(); @@ -434,7 +431,7 @@ TEST_F(LogDevTest, TruncateAfterRestart) { auto restart = [&]() { std::promise< bool > p; auto starting_cb = [&]() { - logstore_service().open_logdev(logdev_id); + logstore_service().open_logdev(logdev_id, flush_mode_t::EXPLICIT); logstore_service().open_log_store(logdev_id, store_id, false /* append_mode */).thenValue([&](auto store) { log_store = store; p.set_value(true); @@ -477,13 +474,12 @@ TEST_F(LogDevTest, TruncateAfterRestart) { TEST_F(LogDevTest, TruncateAcrossMultipleStores) { LOGINFO("Step 1: Create 3 log stores to start truncate across multiple stores test"); - auto logdev_id = logstore_service().create_new_logdev(); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); auto store1 = logstore_service().create_new_log_store(logdev_id, false); auto store2 = logstore_service().create_new_log_store(logdev_id, false); auto store3 = logstore_service().create_new_log_store(logdev_id, false); - LOGINFO("Step 2: Insert 100 entries to store {}", store1->get_store_id()); logstore_seq_num_t cur_lsn = 0; kickstart_inserts(store1, cur_lsn, 100); @@ -644,15 +640,15 @@ TEST_F(LogDevTest, TruncateAcrossMultipleStores) { TEST_F(LogDevTest, TruncateLogsAfterFlushAndRestart) { LOGINFO("Step 1: Create a single logstore to start truncate-logs-after-flush-and-restart test"); - auto logdev_id = logstore_service().create_new_logdev(); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); auto log_store = logstore_service().create_new_log_store(logdev_id, false); auto store_id = log_store->get_store_id(); auto restart = [&]() { - std::promise < bool > p; + 
std::promise< bool > p; auto starting_cb = [&]() { - logstore_service().open_logdev(logdev_id); + logstore_service().open_logdev(logdev_id, flush_mode_t::EXPLICIT); logstore_service().open_log_store(logdev_id, store_id, false /* append_mode */).thenValue([&](auto store) { log_store = store; p.set_value(true); @@ -712,7 +708,7 @@ TEST_F(LogDevTest, CreateRemoveLogDev) { ASSERT_EQ(vdev->num_descriptors(), 0); for (uint32_t i{0}; i < num_logdev; ++i) { - auto id = logstore_service().create_new_logdev(); + auto id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); s_max_flush_multiple = logstore_service().get_logdev(id)->get_flush_size_multiple(); auto store = logstore_service().create_new_log_store(id, false); log_stores.push_back(store); @@ -760,7 +756,7 @@ TEST_F(LogDevTest, DeleteUnopenedLogDev) { // Test deletion of unopened logdev. std::set< logdev_id_t > id_set, unopened_id_set; for (uint32_t i{0}; i < num_logdev; ++i) { - auto id = logstore_service().create_new_logdev(); + auto id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); id_set.insert(id); if (i >= num_logdev / 2) { unopened_id_set.insert(id); } s_max_flush_multiple = logstore_service().get_logdev(id)->get_flush_size_multiple(); @@ -784,7 +780,7 @@ TEST_F(LogDevTest, DeleteUnopenedLogDev) { auto starting_cb = [&]() { auto it = id_set.begin(); for (uint32_t i{0}; i < id_set.size() / 2; i++, it++) { - logstore_service().open_logdev(*it); + logstore_service().open_logdev(*it, flush_mode_t::EXPLICIT); } }; start_homestore(true /* restart */, starting_cb); diff --git a/src/tests/test_log_store.cpp b/src/tests/test_log_store.cpp index 3b1b2c60b..8f18d71f2 100644 --- a/src/tests/test_log_store.cpp +++ b/src/tests/test_log_store.cpp @@ -455,7 +455,7 @@ class SampleDB { for (uint32_t i{0}; i < n_log_stores; ++i) { SampleLogStoreClient* client = m_log_store_clients[i].get(); - logstore_service().open_logdev(client->m_logdev_id); + logstore_service().open_logdev(client->m_logdev_id, flush_mode_t::EXPLICIT); logstore_service() .open_log_store(client->m_logdev_id, client->m_store_id, false /* append_mode */) .thenValue([i, this, client](auto log_store) { client->set_log_store(log_store); }); @@ -479,7 +479,7 @@ class SampleDB { std::vector< logdev_id_t > logdev_id_vec; for (uint32_t i{0}; i < n_log_devs; ++i) { - logdev_id_vec.push_back(logstore_service().create_new_logdev()); + logdev_id_vec.push_back(logstore_service().create_new_logdev(flush_mode_t::EXPLICIT)); } for (uint32_t i{0}; i < n_log_stores; ++i) { @@ -1225,7 +1225,7 @@ TEST_F(LogStoreTest, WriteSyncThenRead) { for (uint32_t iteration{0}; iteration < iterations; ++iteration) { LOGINFO("Iteration {}", iteration); - auto logdev_id = logstore_service().create_new_logdev(); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); auto tmp_log_store = logstore_service().create_new_log_store(logdev_id, false); const auto store_id = tmp_log_store->get_store_id(); LOGINFO("Created new log store -> id {}", store_id); diff --git a/src/tests/test_log_store_long_run.cpp b/src/tests/test_log_store_long_run.cpp index 5fd0ec21f..5a7437754 100644 --- a/src/tests/test_log_store_long_run.cpp +++ b/src/tests/test_log_store_long_run.cpp @@ -294,7 +294,7 @@ class LogStoreLongRun : public ::testing::Test { HS_SETTINGS_FACTORY().save(); for (uint32_t i{0}; i < n_log_stores; ++i) { SampleLogStoreClient* client = m_log_store_clients[i].get(); - logstore_service().open_logdev(client->m_logdev_id); + logstore_service().open_logdev(client->m_logdev_id, 
flush_mode_t::EXPLICIT); logstore_service() .open_log_store(client->m_logdev_id, client->m_store_id, false /* append_mode */) .thenValue([i, this, client](auto log_store) { client->set_log_store(log_store); }); @@ -318,7 +318,7 @@ class LogStoreLongRun : public ::testing::Test { std::vector< logdev_id_t > logdev_id_vec; for (uint32_t i{0}; i < n_log_devs; ++i) - logdev_id_vec.push_back(logstore_service().create_new_logdev()); + logdev_id_vec.push_back(logstore_service().create_new_logdev(flush_mode_t::EXPLICIT)); for (uint32_t i{0}; i < n_log_stores; ++i) m_log_store_clients.push_back(std::make_unique< SampleLogStoreClient >( @@ -466,7 +466,7 @@ class LogStoreLongRun : public ::testing::Test { validate_num_stores(); // Create a new logstore. - auto logdev_id = logstore_service().create_new_logdev(); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); m_log_store_clients.push_back(std::make_unique< SampleLogStoreClient >( logdev_id, bind_this(LogStoreLongRun::on_log_insert_completion, 3))); validate_num_stores(); diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index ec45ef5b4..aaec8851f 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -99,6 +99,7 @@ class SoloReplDevTest : public testing::Test { void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, MultiBlkId const& blkids, cintrusive< repl_req_ctx >& ctx) override { + LOGINFO("Received on_commit lsn={}", lsn); if (ctx == nullptr) { m_test.validate_replay(*repl_dev(), lsn, header, key, blkids); } else { @@ -232,8 +233,8 @@ class SoloReplDevTest : public testing::Test { uint32_t size = blkids.blk_count() * g_block_size; if (size) { auto read_sgs = HSTestHelper::create_sgs(size, size); - LOGDEBUG("[{}] Validating replay of lsn={} blkid = {}", boost::uuids::to_string(rdev.group_id()), lsn, - blkids.to_string()); + LOGINFO("[{}] Validating replay of lsn={} blkid = {}", boost::uuids::to_string(rdev.group_id()), lsn, + blkids.to_string()); rdev.async_read(blkids, read_sgs, size) .thenValue([this, hdr = *jhdr, read_sgs, lsn, blkids, &rdev](auto&& err) { RELEASE_ASSERT(!err, "Error during async_read"); @@ -243,8 +244,8 @@ class SoloReplDevTest : public testing::Test { HSTestHelper::validate_data_buf(uintptr_cast(iov.iov_base), iov.iov_len, hdr.data_pattern); iomanager.iobuf_free(uintptr_cast(iov.iov_base)); } - LOGDEBUG("[{}] Replay of lsn={} blkid={} validated successfully", - boost::uuids::to_string(rdev.group_id()), lsn, blkids.to_string()); + LOGINFO("[{}] Replay of lsn={} blkid={} validated successfully", + boost::uuids::to_string(rdev.group_id()), lsn, blkids.to_string()); m_task_waiter.one_complete(); }); } else { @@ -258,15 +259,15 @@ class SoloReplDevTest : public testing::Test { req->read_sgs = HSTestHelper::create_sgs(req->write_sgs.size, req->write_sgs.size); auto const cap = hs()->repl_service().get_cap_stats(); - LOGDEBUG("Write complete with cap stats: used={} total={}", cap.used_capacity, cap.total_capacity); + LOGINFO("Write complete with cap stats: used={} total={}", cap.used_capacity, cap.total_capacity); rdev.async_read(req->written_blkids, req->read_sgs, req->read_sgs.size) .thenValue([this, &rdev, req](auto&& err) { RELEASE_ASSERT(!err, "Error during async_read"); - LOGDEBUG("[{}] Write complete with lsn={} for size={} blkids={}", - boost::uuids::to_string(rdev.group_id()), req->lsn(), req->write_sgs.size, - req->written_blkids.to_string()); + LOGINFO("[{}] Write complete with lsn={} for size={} 
blkids={}", + boost::uuids::to_string(rdev.group_id()), req->lsn(), req->write_sgs.size, + req->written_blkids.to_string()); auto hdr = r_cast< test_repl_req::journal_header* >(req->header->bytes()); HS_REL_ASSERT_EQ(hdr->data_size, req->read_sgs.size, "journal hdr data size mismatch with actual size"); @@ -298,7 +299,9 @@ TEST_F(SoloReplDevTest, TestRandomSizedDataBlock) { uint32_t key_size = rand() % 512 + 8; this->write_io(key_size, nblks * g_block_size, g_block_size); }); + this->m_io_runner.execute().get(); + LOGINFO("Step 2: Restart homestore and validate replay data.", g_block_size); this->m_task_waiter.start([this]() { this->restart(); }).get(); } @@ -306,6 +309,7 @@ TEST_F(SoloReplDevTest, TestHeaderOnly) { LOGINFO("Step 1: run on worker threads to schedule write"); this->m_io_runner.set_task([this]() { this->write_io(0u, 0u, g_block_size); }); this->m_io_runner.execute().get(); + LOGINFO("Step 2: Restart homestore and validate replay data.", g_block_size); this->m_task_waiter.start([this]() { this->restart(); }).get(); } From 8b8bb95c8c0c31c8a21315f11cf9630b477a373d Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Thu, 17 Apr 2025 15:41:53 +0800 Subject: [PATCH 097/170] Support handling no_space_left error in raft channel (#682) --- conanfile.py | 2 +- .../homestore/replication/repl_decls.h | 1 + src/include/homestore/replication/repl_dev.h | 29 ++- src/lib/blkalloc/append_blk_allocator.cpp | 7 +- .../replication/log_store/repl_log_store.cpp | 6 +- src/lib/replication/push_data_rpc.fbs | 2 +- src/lib/replication/repl_dev/common.cpp | 36 ++- .../replication/repl_dev/raft_repl_dev.cpp | 233 ++++++++++++++---- src/lib/replication/repl_dev/raft_repl_dev.h | 84 ++++--- .../repl_dev/raft_state_machine.cpp | 23 +- .../replication/repl_dev/raft_state_machine.h | 2 +- src/lib/replication/repl_dev/solo_repl_dev.h | 11 + src/tests/test_raft_repl_dev.cpp | 33 ++- 13 files changed, 352 insertions(+), 117 deletions(-) diff --git a/conanfile.py b/conanfile.py index 8b399c66c..89e4c8616 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.9.3" + version = "6.9.5" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/replication/repl_decls.h b/src/include/homestore/replication/repl_decls.h index 83f806c40..90f2c67f7 100644 --- a/src/include/homestore/replication/repl_decls.h +++ b/src/include/homestore/replication/repl_decls.h @@ -36,6 +36,7 @@ VENUM(ReplServiceError, int32_t, NO_SPACE_LEFT = -20000, DRIVE_WRITE_ERROR = -20001, DATA_DUPLICATED = -20002, + QUIENCE_STATE = -20003, FAILED = -32768); // clang-format on diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index 945bf9133..b0d9a7358 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -142,7 +142,6 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: repl_journal_entry const* journal_entry() const { return m_journal_entry; } uint32_t journal_entry_size() const; bool is_localize_pending() const { return m_is_jentry_localize_pending; } - bool is_data_inlined() const { return (m_op_code == journal_type_t::HS_DATA_INLINED); } bool has_linked_data() const { return (m_op_code == journal_type_t::HS_DATA_LINKED); } raft_buf_ptr_t& raft_journal_buf(); @@ -336,7 +335,8 @@ class ReplDevListener { /// @return Expected to return blk_alloc_hints for this write. 
If the hints are not available, then return the /// error. It is to be noted this method should return error only in very abnornal cases as in some code flow, an /// error would result in a crash or stall of the entire commit thread. - virtual ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size, cintrusive< homestore::repl_req_ctx >& hs_ctx) = 0; + virtual ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size, + cintrusive< homestore::repl_req_ctx >& hs_ctx) = 0; /// @brief Called when the repl_dev is being destroyed. The consumer is expected to clean up any related resources. /// However, it is expected that this call be idempotent. It is possible in rare scenarios that this can be called @@ -383,12 +383,12 @@ class ReplDevListener { } /// @brief ask upper layer to handle no_space_left event - virtual folly::Future< std::error_code > on_no_space_left(uint32_t pdev_id, chunk_num_t chunk_id) { - return folly::makeFuture< std::error_code >(std::error_code{}); - } + // @param lsn - on which repl_lsn no_space_left happened + // @param chunk_id - on which chunk no_space_left happened + virtual void on_no_space_left(repl_lsn_t lsn, chunk_num_t chunk_id) { return; } /// @brief when restart, after all the logs are replayed and before joining raft group, notify the upper layer - virtual void on_log_replay_done(const group_id_t& group_id) {}; + virtual void on_log_replay_done(const group_id_t& group_id){}; private: std::weak_ptr< ReplDev > m_repl_dev; @@ -469,6 +469,10 @@ class ReplDev { /// @return last_commit_lsn virtual repl_lsn_t get_last_commit_lsn() const = 0; + /// @brief Gets the repl lsn of the last log in log store + /// @return last_append_repl_lsn + virtual repl_lsn_t get_last_append_lsn() = 0; + /// @brief if this replica is ready for accepting client IO. /// @return true if ready, false otherwise virtual bool is_ready_for_traffic() const = 0; @@ -500,6 +504,19 @@ class ReplDev { } } + // pause/resume statemachine(commiting thread) + virtual void pause_statemachine() = 0; + virtual void resume_statemachine() = 0; + + // complete all the requests that are in progress and start refusing new reqs + virtual void quiesce_reqs() = 0; + + // start accepting new reqs + virtual void resume_accepting_reqs() = 0; + + // clear reqs that has allocated blks on the given chunk. + virtual void clear_chunk_req(chunk_num_t chunk_id) = 0; + protected: shared< ReplDevListener > m_listener; diff --git a/src/lib/blkalloc/append_blk_allocator.cpp b/src/lib/blkalloc/append_blk_allocator.cpp index 141d09279..2f6cec25c 100644 --- a/src/lib/blkalloc/append_blk_allocator.cpp +++ b/src/lib/blkalloc/append_blk_allocator.cpp @@ -73,12 +73,17 @@ BlkAllocStatus AppendBlkAllocator::alloc(blk_count_t nblks, const blk_alloc_hint } if (avail_blks < nblks) { // COUNTER_INCREMENT(m_metrics, num_alloc_failure, 1); - LOGERROR("No space left to serve request nblks: {}, available_blks: {}, actual available_blks(exclude reserved blks): {}", nblks, available_blks(), avail_blks); + LOGERROR("No space left to serve request nblks: {}, available_blks: {}, actual available_blks(exclude reserved " + "blks): {}", + nblks, available_blks(), avail_blks); + // the caller can know in which chunk no_space_left happened; + out_bid = BlkId{0, 0, m_chunk_id}; return BlkAllocStatus::SPACE_FULL; } else if (nblks > max_blks_per_blkid()) { // consumer(vdev) already handles this case. 
// COUNTER_INCREMENT(m_metrics, num_alloc_failure, 1); LOGERROR("Can't serve request nblks: {} larger than max_blks_in_op: {}", nblks, max_blks_per_blkid()); + out_bid = BlkId{0, 0, m_chunk_id}; return BlkAllocStatus::FAILED; } diff --git a/src/lib/replication/log_store/repl_log_store.cpp b/src/lib/replication/log_store/repl_log_store.cpp index ca62c3197..f9b3d454e 100644 --- a/src/lib/replication/log_store/repl_log_store.cpp +++ b/src/lib/replication/log_store/repl_log_store.cpp @@ -16,9 +16,9 @@ uint64_t ReplLogStore::append(nuraft::ptr< nuraft::log_entry >& entry) { } repl_req_ptr_t rreq = m_sm.localize_journal_entry_finish(*entry); + RELEASE_ASSERT_NE(nullptr != rreq, "Failed to localize journal entry before appending log"); ulong lsn = HomeRaftLogStore::append(entry); m_sm.link_lsn_to_req(rreq, int64_cast(lsn)); - RD_LOGT(rreq->traceID(), "Raft Channel: Received append log entry rreq=[{}]", rreq->to_compact_string()); return lsn; } @@ -31,6 +31,7 @@ void ReplLogStore::write_at(ulong index, nuraft::ptr< nuraft::log_entry >& entry } repl_req_ptr_t rreq = m_sm.localize_journal_entry_finish(*entry); + RELEASE_ASSERT(nullptr != rreq, "Failed to localize journal entry before overwriting log at index {}", index); HomeRaftLogStore::write_at(index, entry); m_sm.link_lsn_to_req(rreq, int64_cast(index)); RD_LOGT(rreq->traceID(), "Raft Channel: Received write_at log entry rreq=[{}]", rreq->to_compact_string()); @@ -66,7 +67,8 @@ void ReplLogStore::end_of_append_batch(ulong start_lsn, ulong count) { // Wait for the fetch and write to be completed successfully. // It is essential to complete the data write before appending to the log. If the logs are flushed // before the data is written, a restart and subsequent log replay occurs, as the in-memory state is lost, - // it leaves us uncertain about whether the data was actually written, potentially leading to data inconsistency. + // it leaves us uncertain about whether the data was actually written, potentially leading to data + // inconsistency. 
std::move(fut).wait(); HISTOGRAM_OBSERVE(m_rd.metrics(), data_channel_wait_latency_us, get_elapsed_time_us(cur_time)); } diff --git a/src/lib/replication/push_data_rpc.fbs b/src/lib/replication/push_data_rpc.fbs index 279fefcb5..d9a981e7c 100644 --- a/src/lib/replication/push_data_rpc.fbs +++ b/src/lib/replication/push_data_rpc.fbs @@ -2,7 +2,7 @@ native_include "boost/uuid/uuid.hpp"; namespace homestore; table PushDataRequest { - traceID: uint64; // traceID for the REQ + trace_id: uint64; // traceID for the REQ issuer_replica_id : int32; // Replica id of the issuer raft_term : uint64; // Raft term number dsn : uint64; // Data Sequence number diff --git a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp index 8cea3cc5a..6a39256f9 100644 --- a/src/lib/replication/repl_dev/common.cpp +++ b/src/lib/replication/repl_dev/common.cpp @@ -10,8 +10,9 @@ namespace homestore { -ReplServiceError repl_req_ctx::init(repl_key rkey, journal_type_t op_code, bool is_proposer, sisl::blob const& user_header, - sisl::blob const& key, uint32_t data_size, cshared< ReplDevListener >& listener) { +ReplServiceError repl_req_ctx::init(repl_key rkey, journal_type_t op_code, bool is_proposer, + sisl::blob const& user_header, sisl::blob const& key, uint32_t data_size, + cshared< ReplDevListener >& listener) { m_rkey = std::move(rkey); #ifndef NDEBUG if (data_size > 0) { @@ -26,17 +27,34 @@ ReplServiceError repl_req_ctx::init(repl_key rkey, journal_type_t op_code, bool m_key = key; m_is_jentry_localize_pending = (!is_proposer && (data_size > 0)); // Pending on the applier and with linked data - // We need to allocate the block if the req has data linked, since entry doesn't exist or if it exist, two threads(data channel and raft channel) are trying to do the same - // thing. So take state mutex and allocate the blk + // We need to allocate the block if the req has data linked, since entry doesn't exist or if it exist, two + // threads(data channel and raft channel) are trying to do the same thing. So take state mutex and allocate the blk std::unique_lock< std::mutex > lg(m_state_mtx); if (has_linked_data() && !has_state(repl_req_state_t::BLK_ALLOCATED)) { - auto alloc_status = alloc_local_blks(listener, data_size); + ReplServiceError alloc_status; +#ifdef _PRERELEASE + if (iomgr_flip::instance()->test_flip("simulate_no_space_left") && !is_proposer) { + LOGERROR("Simulate no space left on follower for testing purposes"); + // TODO: support `simulate_no_space_left` for the leader, do not throw exception in on-error in the test + // framework, it will cause the leader to fail and exit. 
+ alloc_status = ReplServiceError::NO_SPACE_LEFT; + } else { + alloc_status = alloc_local_blks(listener, data_size); + if (alloc_status != ReplServiceError::OK) { + LOGERRORMOD(replication, "[traceID={}] Allocate blk for rreq failed error={}", m_rkey.traceID, + alloc_status); + } + } +#else + alloc_status = alloc_local_blks(listener, data_size); if (alloc_status != ReplServiceError::OK) { LOGERRORMOD(replication, "[traceID={}] Allocate blk for rreq failed error={}", m_rkey.traceID, alloc_status); } +#endif return alloc_status; } + return ReplServiceError::OK; } @@ -107,8 +125,9 @@ ReplServiceError repl_req_ctx::alloc_local_blks(cshared< ReplDevListener >& list if (hints_result.hasError()) { return hints_result.error(); } if (hints_result.value().committed_blk_id.has_value()) { - //if the committed_blk_id is already present, use it and skip allocation and commitment - LOGINFOMOD(replication, "[traceID={}] For Repl_key=[{}] data already exists, skip", rkey().traceID, rkey().to_string()); + // if the committed_blk_id is already present, use it and skip allocation and commitment + LOGINFOMOD(replication, "[traceID={}] For Repl_key=[{}] data already exists, skip", rkey().traceID, + rkey().to_string()); m_local_blkid = hints_result.value().committed_blk_id.value(); add_state(repl_req_state_t::BLK_ALLOCATED); add_state(repl_req_state_t::DATA_RECEIVED); @@ -122,7 +141,8 @@ ReplServiceError repl_req_ctx::alloc_local_blks(cshared< ReplDevListener >& list auto status = data_service().alloc_blks(sisl::round_up(uint32_cast(data_size), data_service().get_blk_size()), hints_result.value(), m_local_blkid); if (status != BlkAllocStatus::SUCCESS) { - LOGWARNMOD(replication, "[traceID={}] block allocation failure, repl_key=[{}], status=[{}]", rkey().traceID, rkey(), status); + LOGWARNMOD(replication, "[traceID={}] block allocation failure, repl_key=[{}], status=[{}]", rkey().traceID, + rkey(), status); DEBUG_ASSERT_EQ(status, BlkAllocStatus::SUCCESS, "Unable to allocate blks"); return ReplServiceError::NO_SPACE_LEFT; } diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index d28e13a47..1b1ff0e8c 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -15,7 +15,6 @@ #include "common/homestore_assert.hpp" #include "common/homestore_config.hpp" -// #include "common/homestore_flip.hpp" #include "replication/service/raft_repl_service.h" #include "replication/repl_dev/raft_repl_dev.h" #include "device/chunk.h" @@ -200,18 +199,27 @@ AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_ members.replica_in = member_in; sisl::blob header(r_cast< uint8_t* >(&members), sizeof(replace_members_ctx)); - rreq->init(repl_key{.server_id = server_id(), - .term = raft_server()->get_term(), - .dsn = m_next_dsn.fetch_add(1), - .traceID = trace_id}, - journal_type_t::HS_CTRL_REPLACE, true, header, sisl::blob{}, 0, m_listener); - - auto err = m_state_machine->propose_to_raft(std::move(rreq)); - if (err != ReplServiceError::OK) { - RD_LOGE(trace_id, "Replace member propose to raft failed {}", err); + auto status = init_req_ctx(rreq, + repl_key{.server_id = server_id(), + .term = raft_server()->get_term(), + .dsn = m_next_dsn.fetch_add(1), + .traceID = trace_id}, + journal_type_t::HS_CTRL_REPLACE, true, header, sisl::blob{}, 0, m_listener); + + if (status != ReplServiceError::OK) { + // Failed to initialize the repl_req_ctx for replace member. 
+ RD_LOGE(trace_id, "Failed to initialize repl_req_ctx for replace member, error={}", status); reset_quorum_size(0, trace_id); decr_pending_request_num(); - return make_async_error<>(std::move(err)); + return make_async_error<>(std::move(status)); + } + + status = m_state_machine->propose_to_raft(std::move(rreq)); + if (status != ReplServiceError::OK) { + RD_LOGE(trace_id, "Replace member propose to raft failed {}", status); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(std::move(status)); } RD_LOGI(trace_id, "Replace member proposed to raft group_id={}", group_id_str()); @@ -273,13 +281,20 @@ folly::SemiFuture< ReplServiceError > RaftReplDev::destroy_group() { // here, we set the dsn to a new one , which is definitely unique in the follower, so that the new rreq will not // have a conflict with the old rreq. - rreq->init(repl_key{.server_id = server_id(), - .term = raft_server()->get_term(), - .dsn = m_next_dsn.fetch_add(1), - .traceID = std::numeric_limits< uint64_t >::max()}, - journal_type_t::HS_CTRL_DESTROY, true, sisl::blob{}, sisl::blob{}, 0, m_listener); + auto err = init_req_ctx(rreq, + repl_key{.server_id = server_id(), + .term = raft_server()->get_term(), + .dsn = m_next_dsn.fetch_add(1), + .traceID = std::numeric_limits< uint64_t >::max()}, + journal_type_t::HS_CTRL_DESTROY, true, sisl::blob{}, sisl::blob{}, 0, m_listener); + + if (err != ReplServiceError::OK) { + // Failed to initialize the repl_req_ctx for replace member. + LOGERROR("Failed to initialize repl_req_ctx for destorying group, error={}", err); + return folly::makeSemiFuture< ReplServiceError >(std::move(err)); + } - auto err = m_state_machine->propose_to_raft(std::move(rreq)); + err = m_state_machine->propose_to_raft(std::move(rreq)); if (err != ReplServiceError::OK) { m_stage.update([](auto* stage) { *stage = repl_dev_stage_t::ACTIVE; }); return folly::makeSemiFuture< ReplServiceError >(std::move(err)); @@ -324,12 +339,16 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& } } - auto status = rreq->init(repl_key{.server_id = server_id(), - .term = raft_server()->get_term(), - .dsn = m_next_dsn.fetch_add(1), - .traceID = tid}, - data.size ? journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, - true /* is_proposer */, header, key, data.size, m_listener); + auto status = init_req_ctx( + rreq, repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)}, + data.size ? 
journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true /* is_proposer */, header, + key, data.size, m_listener); + + if (status != ReplServiceError::OK) { + RD_LOGI(tid, "Initializing rreq failed error={}, failing this req", status); + handle_error(rreq, status); + return; + } RD_LOGD(tid, "repl_key [{}], header size [{}] bytes, user_key size [{}] bytes, data size [{}] bytes", rreq->rkey(), header.size(), key.size(), data.size); @@ -338,12 +357,6 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& auto const [it, happened] = m_repl_key_req_map.emplace(rreq->rkey(), rreq); RD_DBG_ASSERT(happened, "Duplicate repl_key={} found in the map", rreq->rkey().to_string()); - if (status != ReplServiceError::OK) { - RD_LOGI(tid, "Initializing rreq failed error={}, failing this req", status); - handle_error(rreq, status); - return; - } - // If it is header only entry, directly propose to the raft if (rreq->has_linked_data()) { if (rreq->is_proposer() && rreq->has_state(repl_req_state_t::DATA_COMMITTED)) { @@ -462,7 +475,7 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d repl_key rkey{.server_id = push_req->issuer_replica_id(), .term = push_req->raft_term(), .dsn = push_req->dsn(), - .traceID = push_req->traceID()}; + .traceID = push_req->trace_id()}; auto const req_orig_time_ms = push_req->time_ms(); RD_LOGD(rkey.traceID, "Data Channel: PushData received: time diff={} ms.", get_elapsed_time_ms(req_orig_time_ms)); @@ -540,8 +553,10 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d } repl_req_ptr_t RaftReplDev::applier_create_req(repl_key const& rkey, journal_type_t code, sisl::blob const& user_header, - sisl::blob const& key, uint32_t data_size, - [[maybe_unused]] bool is_data_channel) { + sisl::blob const& key, uint32_t data_size, bool is_data_channel, + int64_t lsn) { + if (is_data_channel) RD_DBG_ASSERT(-1 == lsn, "lsn from data channel should always be -1 , got lsn {}", lsn); + auto const [it, happened] = m_repl_key_req_map.try_emplace(rkey, repl_req_ptr_t(new repl_req_ctx())); RD_DBG_ASSERT((it != m_repl_key_req_map.end()), "Unexpected error in map_repl_key_to_req"); auto rreq = it->second; @@ -562,30 +577,29 @@ repl_req_ptr_t RaftReplDev::applier_create_req(repl_key const& rkey, journal_typ } // rreq->init will allocate the block if it has linked data. 
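+    // Note: the try_emplace on m_repl_key_req_map above makes request creation
+    // idempotent across the raft and data channels: whichever path arrives first
+    // creates the rreq; a later arrival for the same repl_key receives the same
+    // object (the dedup behavior the raft append path relies on).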
-    auto status = rreq->init(rkey, code, false /* is_proposer */, user_header, key, data_size, m_listener);
-
     if (!rreq->has_linked_data()) { return rreq; }
-#ifdef _PRERELEASE
-    if (is_data_channel) {
-        if (iomgr_flip::instance()->test_flip("fake_reject_append_data_channel")) {
-            LOGINFO("Data Channel: Reject append_entries flip is triggered for rkey={}", rkey.to_string());
-            status = ReplServiceError::NO_SPACE_LEFT;
-        }
-    } else {
-        if (iomgr_flip::instance()->test_flip("fake_reject_append_raft_channel")) {
-            LOGINFO("Raft Channel: Reject append_entries flip is triggered for rkey={}", rkey.to_string());
-            status = ReplServiceError::NO_SPACE_LEFT;
-        }
-    }
-#endif
+    auto status = init_req_ctx(rreq, rkey, code, false /* is_proposer */, user_header, key, data_size, m_listener);
+
     if (status != ReplServiceError::OK) {
         RD_LOGD(rkey.traceID, "For Repl_key=[{}] alloc hints returned error={}, failing this req", rkey.to_string(),
                 status);
+        if (status == ReplServiceError::NO_SPACE_LEFT && !is_data_channel && !rreq->is_proposer()) {
+            const auto& chunk_id = rreq->local_blkid().chunk_num();
+            RD_LOGD(rkey.traceID,
+                    "For Repl_key=[{}] alloc hints returned error={} when trying to allocate blk on chunk={}",
+                    rkey.to_string(), status, chunk_id);
+            m_listener->on_no_space_left(lsn, chunk_id);
+        } else {
+            RD_LOGD(
+                rkey.traceID,
+                "For Repl_key=[{}] alloc hints returned error={}, failing this req, data_channel: {}, is_proposer: {}",
+                rkey.to_string(), status, is_data_channel, rreq->is_proposer());
+        }
         // Do not call handle_error here, because handle_error is for rreq which needs to be terminated. This one can be
         // retried.
         return nullptr;
     }
 
-    RD_LOGD(rreq->traceID(), "in follower_create_req: rreq={}, addr=0x{:x}", rreq->to_string(),
+    RD_LOGD(rkey.traceID, "in follower_create_req: rreq={}, addr=0x{:x}", rreq->to_string(),
             reinterpret_cast< uintptr_t >(rreq.get()));
     return rreq;
 }
@@ -1015,7 +1029,8 @@ void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) {
                  "Out of order commit of lsns, it is not expected in RaftReplDev.
cur_lsns={}, prev_lsns={}",
                  rreq->lsn(), prev_lsn);
     }
-    if (!rreq->is_proposer()) { rreq->clear(); }
+
+    if (!rreq->is_proposer()) rreq->clear();
 }
 
 void RaftReplDev::handle_config_commit(const repl_lsn_t lsn, raft_cluster_config_ptr_t& new_conf) {
@@ -1366,7 +1381,7 @@ nuraft::cb_func::ReturnCode RaftReplDev::raft_event(nuraft::cb_func::Type type,
         auto raft_req = r_cast< nuraft::req_msg* >(param->ctx);
         auto const& entries = raft_req->log_entries();
 
-        auto start_lsn = raft_req->get_last_log_idx() + 1;
+        auto start_lsn = to_repl_lsn(raft_req->get_last_log_idx() + 1);
         if (entries.size() == 0) {
             RD_LOGT(NO_TRACE_ID, "Raft channel: Received no entry, leader committed lsn {}",
                     raft_req->get_commit_idx());
@@ -1395,7 +1410,7 @@ nuraft::cb_func::ReturnCode RaftReplDev::raft_event(nuraft::cb_func::Type type,
             }
             // Those LSNs already in logstore but not yet committed, will be dedup here,
             // applier_create_req will return same req as previous one
-            auto req = m_state_machine->localize_journal_entry_prepare(*entry);
+            auto req = m_state_machine->localize_journal_entry_prepare(*entry, lsn);
             if (req == nullptr) {
                 sisl::VectorPool< repl_req_ptr_t >::free(reqs);
                 // The hint set here will be used by the next after next appendEntry, the next one
@@ -1581,7 +1596,8 @@ void RaftReplDev::set_log_store_last_durable_lsn(store_lsn_t lsn) { m_data_journ
 void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx) {
     auto repl_lsn = to_repl_lsn(lsn);
     if (need_skip_processing(repl_lsn)) {
-        RD_LOGI(NO_TRACE_ID, "Raft Channel: Log {} is outdated and will be handled by baseline resync. Ignoring replay.", lsn);
+        RD_LOGI(NO_TRACE_ID,
+                "Raft Channel: Log {} is outdated and will be handled by baseline resync. Ignoring replay.", lsn);
         return;
     }
@@ -1640,8 +1656,8 @@ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx
     rreq->set_lsn(repl_lsn);
     // keep lentry in scope for the life cycle of the rreq
     rreq->set_lentry(lentry);
-    auto status = rreq->init(rkey, jentry->code, false /* is_proposer */, entry_to_hdr(jentry), entry_to_key(jentry),
-                             data_size, m_listener);
+    auto status = init_req_ctx(rreq, rkey, jentry->code, false /* is_proposer */, entry_to_hdr(jentry),
+                               entry_to_key(jentry), data_size, m_listener);
     if (status != ReplServiceError::OK) {
         RD_LOGE(jentry->traceID, "Initializing rreq failed, rreq=[{}], error={}", rreq->to_string(), status);
     }
@@ -1744,4 +1760,113 @@ void RaftReplDev::report_blk_metrics_if_needed(repl_req_ptr_t rreq) {
     }
 }
 
+void RaftReplDev::pause_statemachine() {
+    if (!raft_server()->is_state_machine_execution_paused()) {
+        raft_server()->pause_state_machine_exeuction();
+        while (!raft_server()->wait_for_state_machine_pause(100)) {
+            RD_LOGD(NO_TRACE_ID, "wait for statemachine pause!");
+        }
+    }
+}
+
+void RaftReplDev::resume_statemachine() {
+    if (raft_server()->is_state_machine_execution_paused()) {
+        raft_server()->resume_state_machine_execution();
+        RD_LOGD(NO_TRACE_ID, "statemachine is resumed!");
+    }
+}
+
+void RaftReplDev::quiesce_reqs() {
+    // All block allocation happens in rreq->init(), so once every pending req has
+    // finished initializing we can be sure that:
+    // 1. all pending reqs have allocated their blocks, and
+    // 2. no new pending reqs will be initialized.
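+    // The expected caller-side sequence for handling a no_space_left event is
+    // therefore roughly (inferred from these APIs; the actual orchestration lives
+    // with the caller):
+    //     quiesce_reqs();               // stop admitting reqs, drain pending inits
+    //     pause_statemachine();         // halt the committing thread
+    //     clear_chunk_req(chunk_id);    // free blks staged on the full chunk
+    //     ... recover space on the chunk ...
+    //     resume_statemachine();
+    //     resume_accepting_reqs();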
+    m_in_quience.store(true, std::memory_order_release);
+    RD_LOGD(NO_TRACE_ID, "enter quience state, waiting for all pending reqs to be initialized");
+    while (true) {
+        uint64_t pending_req_num = get_pending_init_req_num();
+        if (pending_req_num) {
+            RD_LOGD(NO_TRACE_ID, "wait for {} pending create_req requests to be completed", pending_req_num);
+            std::this_thread::sleep_for(std::chrono::microseconds(1));
+        } else
+            break;
+    }
+}
+
+void RaftReplDev::resume_accepting_reqs() { m_in_quience.store(false, std::memory_order_release); }
+
+void RaftReplDev::clear_chunk_req(chunk_num_t chunk_id) {
+    RD_LOGD(NO_TRACE_ID,
+            "start cleaning all the in-memory rreqs which have allocated blks on the emergent chunk={} before handling "
+            "no_space_left error",
+            chunk_id);
+    std::vector< folly::Future< folly::Unit > > futs;
+    for (auto& [key, rreq] : m_repl_key_req_map) {
+        if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) {
+            auto blkid = rreq->local_blkid();
+            if (chunk_id == blkid.chunk_num()) {
+                // only clean the rreqs which have allocated blks on the emergent chunk;
+                // capture blkid and key by value, since the continuation may run after
+                // this loop iteration has ended
+                futs.emplace_back(
+                    std::move(data_service().async_free_blk(blkid).thenValue([this, blkid, key](auto&& err) {
+                        HS_LOG_ASSERT(!err, "freeing blkid={} upon error failed, potential to cause blk leak",
+                                      blkid.to_string());
+                        RD_LOGD(NO_TRACE_ID, "blkid={} freed successfully for handling no_space_left error",
+                                blkid.to_string());
+                        m_repl_key_req_map.erase(key); // remove from the req map after freeing the blk
+                    })));
+            }
+        }
+    }
+
+    folly::collectAllUnsafe(futs)
+        .thenValue([this](auto&& vf) {
+            // TODO:: handle the error in freeing blk if necessary in the future.
+            // for nuobject case, error for freeing blk in the emergent chunk can be ignored
+            RD_LOGD(
+                NO_TRACE_ID,
+                "all the necessary in-memory rreqs which have allocated blks on the emergent chunk have been cleaned "
+                "up successfully, continue to handle no_space_left error.");
+        })
+        // need to wait for the completion
+        .wait();
+}
+
+ReplServiceError RaftReplDev::init_req_ctx(repl_req_ptr_t rreq, repl_key rkey, journal_type_t op_code, bool is_proposer,
+                                           sisl::blob const& user_header, sisl::blob const& key, uint32_t data_size,
+                                           cshared< ReplDevListener >& listener) {
+    if (!rreq) {
+        RD_LOGD(rkey.traceID, "got nullptr when initializing req, rkey=[{}]", rkey.to_string());
+        return ReplServiceError::CANCELLED;
+    }
+
+    init_req_counter counter(m_pending_init_req_num);
+    if (is_in_quience()) {
+        // In quience state, reject any new requests.
+        RD_LOGD(rkey.traceID, "Rejecting new request in quience state, rkey=[{}]", rkey.to_string());
+        return ReplServiceError::QUIENCE_STATE;
+    }
+
+    return rreq->init(rkey, op_code, is_proposer, user_header, key, data_size, m_listener);
+}
+
+void RaftReplDev::become_leader_cb() {
+    auto new_gate = raft_server()->get_last_log_idx();
+    repl_lsn_t existing_gate = 0;
+    if (!m_traffic_ready_lsn.compare_exchange_strong(existing_gate, new_gate)) {
+        // was a follower, m_traffic_ready_lsn should be zero on follower.
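+        // compare_exchange_strong() fails only when the gate was already nonzero
+        // (existing_gate is then loaded with the observed value), so reaching this
+        // branch means the invariant was violated and the release assert below fires.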
+ RD_REL_ASSERT(!existing_gate, "existing gate should be zero"); + } + RD_LOGD(NO_TRACE_ID, "become_leader_cb: setting traffic_ready_lsn from {} to {}", existing_gate, new_gate); +} + +bool RaftReplDev::is_ready_for_traffic() const { + if (is_stopping()) return false; + auto committed_lsn = m_commit_upto_lsn.load(); + auto gate = m_traffic_ready_lsn.load(); + bool ready = committed_lsn >= gate; + if (!ready) { + RD_LOGD(NO_TRACE_ID, "Not yet ready for traffic, committed to {} but gate is {}", committed_lsn, gate); + } + return ready; +} } // namespace homestore diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index fb9d2aab7..e8aee7f34 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -156,16 +156,29 @@ class nuraft_snapshot_context : public snapshot_context { class RaftReplDev : public ReplDev, public nuraft_mesg::mesg_state_mgr, public std::enable_shared_from_this< RaftReplDev > { +private: + class init_req_counter { + public: + init_req_counter(std::atomic_uint64_t& counter) : my_counter(counter) { + my_counter.fetch_add(1, std::memory_order_acq_rel); + } + + ~init_req_counter() { my_counter.fetch_sub(1, std::memory_order_acq_rel); } + + private: + std::atomic_uint64_t& my_counter; + }; + private: shared< RaftStateMachine > m_state_machine; RaftReplService& m_repl_svc; folly::ConcurrentHashMap< repl_key, repl_req_ptr_t, repl_key::Hasher > m_repl_key_req_map; nuraft_mesg::Manager& m_msg_mgr; - group_id_t m_group_id; // Replication Group id - std::string m_rdev_name; // Short name for the group for easy debugging + group_id_t m_group_id; // Replication Group id + std::string m_rdev_name; // Short name for the group for easy debugging std::string m_identify_str; // combination of rdev_name:group_id - replica_id_t m_my_repl_id; // This replica's uuid - int32_t m_raft_server_id; // Server ID used by raft (unique within raft group) + replica_id_t m_my_repl_id; // This replica's uuid + int32_t m_raft_server_id; // Server ID used by raft (unique within raft group) shared< ReplLogStore > m_data_journal; shared< HomeLogStore > m_free_blks_journal; sisl::urcu_scoped_ptr< repl_dev_stage_t > m_stage; @@ -176,7 +189,7 @@ class RaftReplDev : public ReplDev, mutable folly::SharedMutexWritePriority m_sb_lock; // Lock to protect staged sb and persisting sb raft_repl_dev_superblk m_sb_in_mem; // Cached version which is used to read and for staging - std::atomic< repl_lsn_t > m_commit_upto_lsn{0}; // LSN which was lastly written, to track flushes + std::atomic< repl_lsn_t > m_commit_upto_lsn{0}; // LSN which was lastly committed, to track flushes std::atomic< repl_lsn_t > m_compact_lsn{0}; // LSN upto which it was compacted, it is used to track where to // The `traffic_ready_lsn` variable holds the Log Sequence Number (LSN) up to which // the state machine should committed to before accepting traffic. 
This threshold ensures that @@ -199,6 +212,10 @@ class RaftReplDev : public ReplDev, static std::atomic< uint64_t > s_next_group_ordinal; bool m_log_store_replay_done{false}; + // pending create requests, including both raft and data channel + std::atomic_uint64_t m_pending_init_req_num; + std::atomic< bool > m_in_quience; + public: friend class RaftStateMachine; @@ -236,23 +253,14 @@ class RaftReplDev : public ReplDev, uint32_t get_blk_size() const override; repl_lsn_t get_last_commit_lsn() const override { return m_commit_upto_lsn.load(); } void set_last_commit_lsn(repl_lsn_t lsn) { m_commit_upto_lsn.store(lsn); } + repl_lsn_t get_last_append_lsn() override { return raft_server()->get_last_log_idx() + 1; /*to_repl_lsn*/ } bool is_destroy_pending() const; bool is_destroyed() const; + Clock::time_point destroyed_time() const { return m_destroyed_time; } - bool is_ready_for_traffic() const override { - if (is_stopping()) return false; - auto committed_lsn = m_commit_upto_lsn.load(); - auto gate = m_traffic_ready_lsn.load(); - bool ready = committed_lsn >= gate; - if (!ready) { - RD_LOGD(NO_TRACE_ID, "Not yet ready for traffic, committed to {} but gate is {}", committed_lsn, gate); - } - return ready; - } + bool is_ready_for_traffic() const override; // purge all resources (e.g., logs in logstore) is a very dangerous operation, it is not supported yet. - void purge() override { - RD_REL_ASSERT(false, "NOT SUPPORTED YET"); - } + void purge() override { RD_REL_ASSERT(false, "NOT SUPPORTED YET"); } std::shared_ptr< snapshot_context > deserialize_snapshot_context(sisl::io_blob_safe& snp_ctx) override { return std::make_shared< nuraft_snapshot_context >(snp_ctx); @@ -271,22 +279,17 @@ class RaftReplDev : public ReplDev, void handle_rollback(repl_req_ptr_t rreq); repl_req_ptr_t repl_key_to_req(repl_key const& rkey) const; repl_req_ptr_t applier_create_req(repl_key const& rkey, journal_type_t code, sisl::blob const& user_header, - sisl::blob const& key, uint32_t data_size, bool is_data_channel); + sisl::blob const& key, uint32_t data_size, bool is_data_channel, + int64_t lsn = -1 /*init lsn*/); folly::Future< folly::Unit > notify_after_data_written(std::vector< repl_req_ptr_t >* rreqs); void check_and_fetch_remote_data(std::vector< repl_req_ptr_t > rreqs); void cp_flush(CP* cp, cshared< ReplDevCPContext > ctx); cshared< ReplDevCPContext > get_cp_ctx(CP* cp); void cp_cleanup(CP* cp); void become_ready(); - void become_leader_cb() { - auto new_gate = raft_server()->get_last_log_idx(); - repl_lsn_t existing_gate = 0; - if (!m_traffic_ready_lsn.compare_exchange_strong(existing_gate, new_gate)) { - // was a follower, m_traffic_ready_lsn should be zero on follower. - RD_REL_ASSERT(existing_gate == 0, "existing gate should be zero"); - } - RD_LOGD(NO_TRACE_ID, "become_leader_cb: setting traffic_ready_lsn from {} to {}", existing_gate, new_gate); - }; + + void become_leader_cb(); + void become_follower_cb() { // m_traffic_ready_lsn should be zero on follower. m_traffic_ready_lsn.store(0); @@ -345,15 +348,23 @@ class RaftReplDev : public ReplDev, /** * \brief This method is called to check if the given LSN is within the last snapshot LSN received from the leader. - * All logs with LSN less than or equal to the last snapshot LSN are considered as part of the baseline resync, which - * doesn't need any more operations (e.g., replay, commit). 
+ * All logs with LSN less than or equal to the last snapshot LSN are considered as part of the baseline resync, + * which doesn't need any more operations (e.g., replay, commit). * * \param lsn The LSN to be checked. * \return true if the LSN is within the last snapshot LSN, false otherwise. */ - bool need_skip_processing(const repl_lsn_t lsn) { - return lsn <= m_rd_sb->last_snapshot_lsn; - } + bool need_skip_processing(const repl_lsn_t lsn) { return lsn <= m_rd_sb->last_snapshot_lsn; } + + // pause/resume statemachine(commiting thread) + void pause_statemachine(); + void resume_statemachine(); + + void quiesce_reqs(); + void resume_accepting_reqs(); + + // clear reqs that has allocated blks on the given chunk. + void clear_chunk_req(chunk_num_t chunk_id); protected: //////////////// All nuraft::state_mgr overrides /////////////////////// @@ -399,6 +410,13 @@ class RaftReplDev : public ReplDev, bool save_snp_resync_data(nuraft::buffer& data, nuraft::snapshot& s); void report_blk_metrics_if_needed(repl_req_ptr_t rreq); + ReplServiceError init_req_ctx(repl_req_ptr_t rreq, repl_key rkey, journal_type_t op_code, bool is_proposer, + sisl::blob const& user_header, sisl::blob const& key, uint32_t data_size, + cshared< ReplDevListener >& listener); + + bool is_in_quience() { return m_in_quience.load(std::memory_order_acquire); } + + uint64_t get_pending_init_req_num() { return m_pending_init_req_num.load(std::memory_order_acquire); } }; } // namespace homestore diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index 0b6b03a31..022d718a6 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -50,7 +50,7 @@ ReplServiceError RaftStateMachine::propose_to_raft(repl_req_ptr_t rreq) { return ReplServiceError::OK; } -repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entry& lentry) { +repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entry& lentry, int64_t lsn /*repl_lsn*/) { // Validate the journal entry and see if it needs to be transformed repl_journal_entry* jentry = r_cast< repl_journal_entry* >(lentry.get_buf().data_begin()); RELEASE_ASSERT_EQ(jentry->major_version, repl_journal_entry::JOURNAL_ENTRY_MAJOR, @@ -85,8 +85,9 @@ repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entr MultiBlkId entry_blkid; entry_blkid.deserialize(entry_to_val(jentry), true /* copy */); - rreq = m_rd.applier_create_req(rkey, jentry->code, entry_to_hdr(jentry), entry_to_key(jentry), - (entry_blkid.blk_count() * m_rd.get_blk_size()), false /* is_data_channel */); + rreq = + m_rd.applier_create_req(rkey, jentry->code, entry_to_hdr(jentry), entry_to_key(jentry), + (entry_blkid.blk_count() * m_rd.get_blk_size()), false /* is_data_channel */, lsn); if (rreq == nullptr) { goto out; } rreq->set_remote_blkid(RemoteBlkId{jentry->server_id, entry_blkid}); @@ -111,7 +112,8 @@ repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entr std::memcpy(blkid_location, rreq->local_blkid().serialize().cbytes(), local_size); } else { rreq = m_rd.applier_create_req(rkey, jentry->code, entry_to_hdr(jentry), entry_to_key(jentry), - jentry->value_size, false /* is_data_channel */); + jentry->value_size, false /* is_data_channel */, lsn); + if (rreq == nullptr) goto out; } // We might have localized the journal entry with new blkid. 
We need to also update the header/key pointers pointing @@ -156,7 +158,8 @@ repl_req_ptr_t RaftStateMachine::localize_journal_entry_finish(nuraft::log_entry auto rreq = m_rd.repl_key_to_req(rkey); if ((rreq == nullptr) || (rreq->is_localize_pending())) { - rreq = localize_journal_entry_prepare(lentry); + rreq = localize_journal_entry_prepare(lentry, + -1 /* lsn=-1, since this is a finish call and we don't have lsn yet */); if (rreq == nullptr) { RELEASE_ASSERT(rreq != nullptr, "We get an linked data for rkey=[{}], jentry=[{}] not as part of Raft Append but " @@ -209,7 +212,8 @@ void RaftStateMachine::commit_config(const ulong log_idx, raft_cluster_config_pt // when reaching here, the config change log has already been committed, and the new config has been applied to the // cluster if (m_rd.need_skip_processing(s_cast< repl_lsn_t >(log_idx))) { - RD_LOGI(NO_TRACE_ID, "Raft Channel: Config {} is expected to be handled by snapshot. Skipping commit.", log_idx); + RD_LOGI(NO_TRACE_ID, "Raft Channel: Config {} is expected to be handled by snapshot. Skipping commit.", + log_idx); return; } @@ -335,9 +339,10 @@ int RaftStateMachine::read_logical_snp_obj(nuraft::snapshot& s, void*& user_ctx, // uncommitted logs may or may not included in the snapshot data sent by leader, // depending on the racing of commit vs snapshot read, leading to data inconsistency. if (s_cast< repl_lsn_t >(s.get_last_log_idx()) > m_rd.get_last_commit_lsn()) { - RD_LOGW(NO_TRACE_ID, "not ready to read because there are some uncommitted logs in snapshot, " - "let nuraft retry later. snapshot log_idx={}, last_commit_lsn={}", - s.get_last_log_idx(), m_rd.get_last_commit_lsn()); + RD_LOGW(NO_TRACE_ID, + "not ready to read because there are some uncommitted logs in snapshot, " + "let nuraft retry later. snapshot log_idx={}, last_commit_lsn={}", + s.get_last_log_idx(), m_rd.get_last_commit_lsn()); return -1; } diff --git a/src/lib/replication/repl_dev/raft_state_machine.h b/src/lib/replication/repl_dev/raft_state_machine.h index 7da37d5c5..0de9b2744 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.h +++ b/src/lib/replication/repl_dev/raft_state_machine.h @@ -122,7 +122,7 @@ class RaftStateMachine : public nuraft::state_machine { ////////// APIs outside of nuraft::state_machine requirements //////////////////// ReplServiceError propose_to_raft(repl_req_ptr_t rreq); - repl_req_ptr_t localize_journal_entry_prepare(nuraft::log_entry& lentry); + repl_req_ptr_t localize_journal_entry_prepare(nuraft::log_entry& lentry, int64_t lsn = -1); repl_req_ptr_t localize_journal_entry_finish(nuraft::log_entry& lentry); void link_lsn_to_req(repl_req_ptr_t rreq, int64_t lsn); void unlink_lsn_to_req(int64_t lsn, repl_req_ptr_t rreq); diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index 579506db1..c5781bb4b 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -70,9 +70,20 @@ class SoloReplDev : public ReplDev { } repl_lsn_t get_last_commit_lsn() const override { return 0; } + repl_lsn_t get_last_append_lsn() override { return 0; }; uint32_t get_blk_size() const override; + // pause/resume statemachine(commiting thread) + void pause_statemachine() override { return; } + void resume_statemachine() override { return; } + + void quiesce_reqs() override { return; } + void resume_accepting_reqs() override { return; } + + // clear reqs that has allocated blks on the given chunk. 
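+    // These overrides are deliberate no-ops: a solo repl dev runs without a raft
+    // state machine to pause and admits no remote-origin requests to quiesce.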
+ void clear_chunk_req(chunk_num_t chunk_id) override { return; } + void cp_flush(CP* cp); void cp_cleanup(CP* cp); diff --git a/src/tests/test_raft_repl_dev.cpp b/src/tests/test_raft_repl_dev.cpp index 7f7345e10..ab40e9ea5 100644 --- a/src/tests/test_raft_repl_dev.cpp +++ b/src/tests/test_raft_repl_dev.cpp @@ -111,7 +111,7 @@ TEST_F(RaftReplDevTest, Follower_Fetch_OnActive_ReplicaGroup) { } TEST_F(RaftReplDevTest, Write_With_Diabled_Leader_Push_Data) { - g_helper->set_basic_flip("disable_leader_push_data"); + g_helper->set_basic_flip("disable_leader_push_data", std::numeric_limits< int >::max(), 100); LOGINFO("Homestore replica={} setup completed, all the push_data from leader are disabled", g_helper->replica_num()); LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); @@ -125,6 +125,37 @@ TEST_F(RaftReplDevTest, Write_With_Diabled_Leader_Push_Data) { this->validate_data(); g_helper->sync_for_cleanup_start(); + g_helper->remove_flip("disable_leader_push_data"); +} + +TEST_F(RaftReplDevTest, Write_With_Handling_No_Space_Left) { + g_helper->set_basic_flip("simulate_no_space_left", std::numeric_limits< int >::max(), 50); + LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); + g_helper->sync_for_test_start(); + + // this test is slow, so use a smaller number of entries to write in each attempt + uint64_t entries_per_attempt = 50; + this->write_on_leader(entries_per_attempt, true /* wait_for_commit */); + + g_helper->sync_for_verify_start(); + LOGINFO("Validate all data written so far by reading them"); + this->validate_data(); + g_helper->sync_for_cleanup_start(); + + LOGINFO("Restart all the homestore replicas"); + g_helper->restart(); + g_helper->sync_for_test_start(); + + // Reassign the leader to replica 0, in case restart switched leaders + this->assign_leader(0); + + LOGINFO("Post restart write the data again on the leader"); + this->write_on_leader(entries_per_attempt, true /* wait_for_commit */); + + LOGINFO("Validate all data written (including pre-restart data) by reading them"); + this->validate_data(); + g_helper->sync_for_cleanup_start(); + g_helper->remove_flip("simulate_no_space_left"); } #endif From 63a8bbe37dde30f728b4ad27927b3bc7d59d1013 Mon Sep 17 00:00:00 2001 From: ywz <649521587@qq.com> Date: Fri, 18 Apr 2025 09:57:26 +0800 Subject: [PATCH 098/170] Adjust grpc message size according to fetch data limit as well (#691) Co-authored-by: yawzhang --- conanfile.py | 2 +- src/lib/common/homestore_config.fbs | 5 +++-- src/lib/homestore.cpp | 10 +++++++--- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/conanfile.py b/conanfile.py index 89e4c8616..95f4af9ba 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.9.5" + version = "6.9.6" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs index 20821fb96..ade32ba0e 100644 --- a/src/lib/common/homestore_config.fbs +++ b/src/lib/common/homestore_config.fbs @@ -235,8 +235,9 @@ table Consensus { // Max append batch size max_append_batch_size: int32 = 64; - // Max grpc message size - max_grpc_message_size: int32 = 67108864; + // Max grpc message size, use 64M (max data size on data channel) + 128M (max snasphot batch size) + 1M + // Please adjust it if data_fetch_max_size_kb is increased as well + max_grpc_message_size: int32 = 202375168; // Threshold of log gap from leader to consider a 
replica as stale stale_log_gap_hi_threshold: int32 = 200; diff --git a/src/lib/homestore.cpp b/src/lib/homestore.cpp index eb276349e..7296b8721 100644 --- a/src/lib/homestore.cpp +++ b/src/lib/homestore.cpp @@ -137,11 +137,15 @@ bool HomeStore::start(const hs_input_params& input, hs_before_services_starting_ HomeStoreDynamicConfig::init_settings_default(); // Check if the max_grpc_message_size is large enough to hold the data and snapshot batch size + auto data_fetch_max_size_in_byte = HS_DYNAMIC_CONFIG(consensus.data_fetch_max_size_kb) * 1024ull; + RELEASE_ASSERT(data_fetch_max_size_in_byte <= INT_MAX, "data fetch size is larger than the grpc limit"); if (HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size) < input.max_data_size || - HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size) < input.max_snapshot_batch_size) { - LOGERROR("max_grpc_message_size {} is too small to hold max_data_size {} and max_snapshot_batch_size {}", + HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size) < input.max_snapshot_batch_size || + HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size) < s_cast< int >(data_fetch_max_size_in_byte)) { + LOGERROR("max_grpc_message_size {} is too small to hold max_data_size {}, max_snapshot_batch_size {} and " + "data_fetch_max_size {}", HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size), input.max_data_size, - input.max_snapshot_batch_size); + input.max_snapshot_batch_size, data_fetch_max_size_in_byte); throw std::invalid_argument("max_grpc_message_size is insufficient for the configured data or snapshot sizes"); } From 395cda95fe5d68a5565317007d93b6ecff40a5dd Mon Sep 17 00:00:00 2001 From: Sanal Date: Mon, 21 Apr 2025 23:55:52 +0530 Subject: [PATCH 099/170] Add additional on_commit repldev listener api's. (#692) Add additional on_commit to support vector of blkids. --- conanfile.py | 2 +- src/include/homestore/replication/repl_dev.h | 21 ++++++++-- .../replication/repl_dev/raft_repl_dev.cpp | 2 +- src/lib/replication/repl_dev/raft_repl_dev.h | 2 +- .../replication/repl_dev/solo_repl_dev.cpp | 2 +- src/lib/replication/repl_dev/solo_repl_dev.h | 2 +- src/tests/test_common/raft_repl_test_base.hpp | 39 ++++++++++--------- src/tests/test_solo_repl_dev.cpp | 6 ++- 8 files changed, 49 insertions(+), 27 deletions(-) diff --git a/conanfile.py b/conanfile.py index 95f4af9ba..c13df147c 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.9.6" + version = "6.10.0" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index b0d9a7358..08c73ee09 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -270,6 +270,20 @@ class ReplDevListener { virtual void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, MultiBlkId const& blkids, cintrusive< repl_req_ctx >& ctx) = 0; + /// @brief Called when the log entry has been committed in the replica set. + /// + /// This function is called from a dedicated commit thread which is different from the original thread calling + /// replica_set::write(). There is only one commit thread, and lsn is guaranteed to be monotonically increasing. 
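+    /// Implementations should iterate blkids rather than assume a single entry:
+    /// each MultiBlkId may describe an independent extent of the written data.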
+    ///
+    /// @param lsn - The log sequence number
+    /// @param header - Header originally passed with replica_set::write() api
+    /// @param key - Key originally passed with replica_set::write() api
+    /// @param blkids - List of independent blkids where data is written to the storage engine.
+    /// @param ctx - Context passed as part of the replica_set::write() api
+    ///
+    virtual void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key,
+                           std::vector< MultiBlkId > const& blkids, cintrusive< repl_req_ctx >& ctx) = 0;
+
     /// @brief Called when the log entry has been received by the replica dev.
     ///
     /// On recovery, this is called from a random worker thread before the raft server is started. It is
@@ -416,10 +430,11 @@ class ReplDev {
     /// cases
     /// @param value - vector of io buffers that contain value for the key. It is an optional field and if the value
     /// list size is 0, then only key is written to replicadev without data.
-    /// @param ctx - User supplied context which will be passed to listener
-    /// callbacks
+    /// @param ctx - User supplied context which will be passed to listener callbacks
+    /// @param part_of_batch Whether this write is part of a batch. If it is, submit_batch() needs to be called at
+    /// the end of the batch
     virtual void async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value,
-                                   repl_req_ptr_t ctx, trace_id_t tid = 0) = 0;
+                                   repl_req_ptr_t ctx, bool part_of_batch = false, trace_id_t tid = 0) = 0;
 
     /// @brief Reads the data and returns a future to continue on
     /// @param bid Block id to read
diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp
index 1b1ff0e8c..f5e333d06 100644
--- a/src/lib/replication/repl_dev/raft_repl_dev.cpp
+++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp
@@ -325,7 +325,7 @@
 // we do not have shutdown for async_alloc_write according to the two points above.
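+// Callers that stream several writes should pass part_of_batch=true on each call
+// and finish with submit_batch(), per the interface documentation above; a single
+// write can keep the default part_of_batch=false.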
void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& data, - repl_req_ptr_t rreq, trace_id_t tid) { + repl_req_ptr_t rreq, bool part_of_batch, trace_id_t tid) { if (!rreq) { auto rreq = repl_req_ptr_t(new repl_req_ctx{}); } { diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index e8aee7f34..696e98737 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -230,7 +230,7 @@ class RaftReplDev : public ReplDev, //////////////// All ReplDev overrides/implementation /////////////////////// void async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, - repl_req_ptr_t ctx, trace_id_t tid = 0) override; + repl_req_ptr_t ctx, bool part_of_batch = false, trace_id_t tid = 0) override; folly::Future< std::error_code > async_read(MultiBlkId const& blkid, sisl::sg_list& sgs, uint32_t size, bool part_of_batch = false, trace_id_t tid = 0) override; folly::Future< std::error_code > async_free_blks(int64_t lsn, MultiBlkId const& blkid, trace_id_t tid = 0) override; diff --git a/src/lib/replication/repl_dev/solo_repl_dev.cpp b/src/lib/replication/repl_dev/solo_repl_dev.cpp index 22f2446d0..1db373150 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.cpp +++ b/src/lib/replication/repl_dev/solo_repl_dev.cpp @@ -28,7 +28,7 @@ SoloReplDev::SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existi } void SoloReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, - repl_req_ptr_t rreq, trace_id_t tid) { + repl_req_ptr_t rreq, bool part_of_batch, trace_id_t tid) { if (!rreq) { auto rreq = repl_req_ptr_t(new repl_req_ctx{}); } incr_pending_request_num(); diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index c5781bb4b..0fcbeb2aa 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -42,7 +42,7 @@ class SoloReplDev : public ReplDev { // TODO: implement graceful shutdown for solo repl dev void async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, - repl_req_ptr_t ctx, trace_id_t tid = 0) override; + repl_req_ptr_t ctx, bool part_of_batch = false, trace_id_t tid = 0) override; folly::Future< std::error_code > async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, bool part_of_batch = false, trace_id_t tid = 0) override; diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 47778d9a8..636fa5f7c 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -94,7 +94,7 @@ class TestReplicatedDB : public homestore::ReplDevListener { struct journal_header { uint64_t data_size; uint64_t data_pattern; - uint64_t key_id; //put it in header to test duplication in alloc_local_blks + uint64_t key_id; // put it in header to test duplication in alloc_local_blks }; journal_header jheader; uint64_t key_id; @@ -151,6 +151,9 @@ class TestReplicatedDB : public homestore::ReplDevListener { if (ctx->is_proposer()) { g_helper->runner().next_task(); } } + void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, + std::vector< MultiBlkId > const& blkids, cintrusive< repl_req_ctx >& ctx) override {} + bool on_pre_commit(int64_t lsn, const sisl::blob& header, const sisl::blob& key, 
cintrusive< repl_req_ctx >& ctx) override { LOGINFOMOD(replication, "[Replica={}] Received pre-commit on lsn={} dsn={}", g_helper->replica_num(), lsn, @@ -172,7 +175,7 @@ class TestReplicatedDB : public homestore::ReplDevListener { cintrusive< repl_req_ctx >& ctx) override { LOGINFOMOD(replication, "[Replica={}] Received error={} on key={}", g_helper->replica_num(), enum_name(error), *(r_cast< uint64_t const* >(key.cbytes()))); - g_helper->runner().comp_promise_.setException(folly::make_exception_wrapper(error)); + g_helper->runner().comp_promise_.setException(folly::make_exception_wrapper< ReplServiceError >(error)); } AsyncReplResult<> create_snapshot(shared< snapshot_context > context) override { @@ -318,8 +321,9 @@ class TestReplicatedDB : public homestore::ReplDevListener { void free_user_snp_ctx(void*& user_snp_ctx) override {} - ReplResult get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size, cintrusive< homestore::repl_req_ctx >& hs_ctx) override { - auto jheader = r_cast(header.cbytes()); + ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size, + cintrusive< homestore::repl_req_ctx >& hs_ctx) override { + auto jheader = r_cast< test_req::journal_header const* >(header.cbytes()); Key k{.id_ = jheader->key_id}; auto iter = inmem_db_.find(k); if (iter != inmem_db_.end()) { @@ -357,7 +361,7 @@ class TestReplicatedDB : public homestore::ReplDevListener { test_common::HSTestHelper::create_sgs(data_size, max_size_per_iov, req->jheader.data_pattern); } - repl_dev()->async_alloc_write(req->header_blob(), req->key_blob(), req->write_sgs, req, s_uniq_num); + repl_dev()->async_alloc_write(req->header_blob(), req->key_blob(), req->write_sgs, req, false, s_uniq_num); } void validate_db_data() { @@ -590,7 +594,8 @@ class RaftReplDevTestBase : public testing::Test { LOGINFO("Run on worker threads to schedule append on repldev for {} Bytes.", block_size); g_helper->runner().set_task([this, block_size, db, data_size]() { static std::normal_distribution<> num_blks_gen{3.0, 2.0}; - uint64_t size = data_size == nullptr ? std::abs(std::lround(num_blks_gen(g_re))) * block_size : *data_size; + uint64_t size = + data_size == nullptr ? 
std::abs(std::lround(num_blks_gen(g_re))) * block_size : *data_size; this->generate_writes(size, block_size, db); }); if (wait_for_commit) { g_helper->runner().execute().get(); } @@ -631,11 +636,11 @@ class RaftReplDevTestBase : public testing::Test { auto data_size = std::max(1L, std::abs(std::lround(num_blks_gen(g_re)))) * block_size; ASSERT_GT(data_size, 0); LOGINFO("data_size larger than 0, go ahead, data_size= {}.", data_size); - static std::atomic s_uniq_num{0}; + static std::atomic< uint32_t > s_uniq_num{0}; auto req = intrusive(new TestReplicatedDB::test_req()); req->jheader.data_size = data_size; req->jheader.data_pattern = ((long long)rand() << 32) | ++s_uniq_num; - //overwrite the key_id with the id passed in + // overwrite the key_id with the id passed in req->jheader.key_id = id; req->key_id = id; @@ -650,17 +655,15 @@ class RaftReplDevTestBase : public testing::Test { db->repl_dev()->async_alloc_write(req->header_blob(), req->key_blob(), req->write_sgs, req); }); - if (!wait_for_commit) { - return ReplServiceError::OK; + if (!wait_for_commit) { return ReplServiceError::OK; } + try { + g_helper->runner().execute().get(); + LOGDEBUG("write data task complete, id={}", id) + } catch (const ReplServiceError& e) { + LOGERRORMOD(replication, "[Replica={}] Error in writing data: id={}, error={}", g_helper->replica_num(), id, + enum_name(e)); + return e; } - try { - g_helper->runner().execute().get(); - LOGDEBUG("write data task complete, id={}", id) - } catch (const ReplServiceError& e) { - LOGERRORMOD(replication, "[Replica={}] Error in writing data: id={}, error={}", g_helper->replica_num(), - id, enum_name(e)); - return e; - } written_entries_ += 1; LOGINFO("wait_for_commit={}", written_entries_); diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index aaec8851f..c161df521 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -109,6 +109,9 @@ class SoloReplDevTest : public testing::Test { } } + void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, + std::vector< MultiBlkId > const& blkids, cintrusive< repl_req_ctx >& ctx) override {} + AsyncReplResult<> create_snapshot(shared< snapshot_context > context) override { return make_async_success<>(); } @@ -127,7 +130,8 @@ class SoloReplDevTest : public testing::Test { void on_rollback(int64_t lsn, const sisl::blob& header, const sisl::blob& key, cintrusive< repl_req_ctx >& ctx) override {} - ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size, cintrusive< homestore::repl_req_ctx >& hs_ctx) override { + ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size, + cintrusive< homestore::repl_req_ctx >& hs_ctx) override { return blk_alloc_hints{}; } From 4f6fe3784c3c7e0770328fd2d878680ff56d7782 Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Wed, 23 Apr 2025 00:39:15 +0800 Subject: [PATCH 100/170] reduce io number in simulate_no_space_left and disable_leader_push_data flip test (#694) --- conanfile.py | 2 +- src/tests/test_raft_repl_dev.cpp | 20 +++----------------- 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/conanfile.py b/conanfile.py index c13df147c..0117e5bb4 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.10.0" + version = "6.10.1" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/tests/test_raft_repl_dev.cpp 
b/src/tests/test_raft_repl_dev.cpp index ab40e9ea5..6e21a64e8 100644 --- a/src/tests/test_raft_repl_dev.cpp +++ b/src/tests/test_raft_repl_dev.cpp @@ -117,7 +117,7 @@ TEST_F(RaftReplDevTest, Write_With_Diabled_Leader_Push_Data) { LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); g_helper->sync_for_test_start(); - this->write_on_leader(100, true /* wait_for_commit */); + this->write_on_leader(20, true /* wait_for_commit */); g_helper->sync_for_verify_start(); @@ -133,27 +133,13 @@ TEST_F(RaftReplDevTest, Write_With_Handling_No_Space_Left) { LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); g_helper->sync_for_test_start(); - // this test is slow, so use a smaller number of entries to write in each attempt - uint64_t entries_per_attempt = 50; - this->write_on_leader(entries_per_attempt, true /* wait_for_commit */); + this->write_on_leader(20, true /* wait_for_commit */); g_helper->sync_for_verify_start(); + LOGINFO("Validate all data written so far by reading them"); this->validate_data(); - g_helper->sync_for_cleanup_start(); - - LOGINFO("Restart all the homestore replicas"); - g_helper->restart(); - g_helper->sync_for_test_start(); - // Reassign the leader to replica 0, in case restart switched leaders - this->assign_leader(0); - - LOGINFO("Post restart write the data again on the leader"); - this->write_on_leader(entries_per_attempt, true /* wait_for_commit */); - - LOGINFO("Validate all data written (including pre-restart data) by reading them"); - this->validate_data(); g_helper->sync_for_cleanup_start(); g_helper->remove_flip("simulate_no_space_left"); } From 5d301ea2407f22191834846dbbd6456b8f743b6c Mon Sep 17 00:00:00 2001 From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com> Date: Tue, 22 Apr 2025 16:41:13 -0700 Subject: [PATCH 101/170] Fix nightly Jenkins project (#697) --- .jenkins/jenkinsfile_nightly | 38 +++++++++++++++++++++--------------- conanfile.py | 2 +- 2 files changed, 23 insertions(+), 17 deletions(-) diff --git a/.jenkins/jenkinsfile_nightly b/.jenkins/jenkinsfile_nightly index 7efd9b935..7100a0230 100644 --- a/.jenkins/jenkinsfile_nightly +++ b/.jenkins/jenkinsfile_nightly @@ -1,5 +1,5 @@ pipeline { - agent { label 'sds-builder-2204' } + agent { label 'sds-builder-v5' } triggers { cron('TZ=US/Pacific\nH H(0-2) * * *') } @@ -8,7 +8,7 @@ pipeline { ORG = 'sds' ECR_URL = 'hub.tess.io' ARTIFACTORY_PASS = credentials('ARTIFACTORY_PASS') - CONAN_USER = 'sds' + CONAN_USER = 'oss' failed_stage = "" } stages { @@ -26,6 +26,7 @@ pipeline { VER = sh(script: "grep -m 1 ' version =' conanfile.py | awk '{print \$3}' | tr -d '\n' | tr -d '\"'", returnStdout: true) NIGHTLY_TAG = "master-nightly-debug-4.0" ECR_PATH = "${ECR_URL}/${ORG}/${PROJECT}" + CONAN_FLAGS="--name ${PROJECT} --user ${CONAN_USER} --channel ${NIGHTLY_TAG}" failed_stage = "" } } @@ -40,20 +41,25 @@ pipeline { } stage("Build") { steps { - sh "conan create --build missing -o homestore:sanitize=True -pr debug . 
${PROJECT}/${VER}@" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_index_btree' -exec cp {} .jenkins/test_index_btree \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_index_crash_recovery' -exec cp {} .jenkins/test_index_crash_recovery \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_meta_blk_mgr' -exec cp {} .jenkins/test_meta_blk_mgr \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_log_store' -exec cp {} .jenkins/test_log_store \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_home_raft_logstore' -exec cp {} .jenkins/test_home_raft_logstore \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_log_store_long_run' -exec cp {} .jenkins/test_log_store_long_run \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_data_service' -exec cp {} .jenkins/test_data_service \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_raft_repl_dev' -exec cp {} .jenkins/test_raft_repl_dev \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_solo_repl_dev' -exec cp {} .jenkins/test_solo_repl_dev \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*bin/scripts/index_test.py' -exec install -Dm755 {} .jenkins/index_test.py \\; " - sh "find ${CONAN_USER_HOME} -type f -wholename '*bin/scripts/log_meta_test.py' -exec install -Dm755 {} .jenkins/log_meta_test.py \\; " - sh "find ${CONAN_USER_HOME} -type f -wholename '*bin/scripts/data_test.py' -exec install -Dm755 {} .jenkins/data_test.py \\; " - sh "find ${CONAN_USER_HOME} -type f -wholename '*bin/scripts/long_running.py' -exec install -Dm755 {} .jenkins/long_running.py \\; " + sh ''' + hostname + echo $NODE_NAME + conan create --build missing -s:h build_type=Debug -o ${PROJECT}/*:sanitize=True ${CONAN_FLAGS} . 
+ + find /home/jenkins -type f -wholename '*/test_index_btree' -exec cp {} .jenkins/test_index_btree \\; + find /home/jenkins -type f -wholename '*/test_index_crash_recovery' -exec cp {} .jenkins/test_index_crash_recovery \\; + find /home/jenkins -type f -wholename '*/test_meta_blk_mgr' -exec cp {} .jenkins/test_meta_blk_mgr \\; + find /home/jenkins -type f -wholename '*/test_log_store' -exec cp {} .jenkins/test_log_store \\; + find /home/jenkins -type f -wholename '*/test_home_raft_logstore' -exec cp {} .jenkins/test_home_raft_logstore \\; + find /home/jenkins -type f -wholename '*/test_log_store_long_run' -exec cp {} .jenkins/test_log_store_long_run \\; + find /home/jenkins -type f -wholename '*/test_data_service' -exec cp {} .jenkins/test_data_service \\; + find /home/jenkins -type f -wholename '*/test_raft_repl_dev' -exec cp {} .jenkins/test_raft_repl_dev \\; + find /home/jenkins -type f -wholename '*/test_solo_repl_dev' -exec cp {} .jenkins/test_solo_repl_dev \\; + find /home/jenkins -type f -wholename '*/scripts/index_test.py' -exec install -Dm755 {} .jenkins/index_test.py \\; + find /home/jenkins -type f -wholename '*/scripts/log_meta_test.py' -exec install -Dm755 {} .jenkins/log_meta_test.py \\; + find /home/jenkins -type f -wholename '*/scripts/data_test.py' -exec install -Dm755 {} .jenkins/data_test.py \\; + find /home/jenkins -type f -wholename '*/scripts/long_running.py' -exec install -Dm755 {} .jenkins/long_running.py \\; + ''' } post { failure { diff --git a/conanfile.py b/conanfile.py index 0117e5bb4..bbccb37e7 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.10.1" + version = "6.10.2" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" From 2047529970543bf35562d7eaef1d8685164d3ecf Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Wed, 23 Apr 2025 16:48:34 +0800 Subject: [PATCH 102/170] fix repl lsn (#699) --- conanfile.py | 2 +- src/lib/replication/repl_dev/raft_repl_dev.cpp | 2 +- src/lib/replication/repl_dev/raft_repl_dev.h | 2 +- src/lib/replication/repl_dev/raft_state_machine.cpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/conanfile.py b/conanfile.py index bbccb37e7..55d990705 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.10.2" + version = "6.10.3" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index f5e333d06..4f380c0d9 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1381,7 +1381,7 @@ nuraft::cb_func::ReturnCode RaftReplDev::raft_event(nuraft::cb_func::Type type, auto raft_req = r_cast< nuraft::req_msg* >(param->ctx); auto const& entries = raft_req->log_entries(); - auto start_lsn = to_repl_lsn(raft_req->get_last_log_idx() + 1); + auto start_lsn = raft_req->get_last_log_idx() + 1; if (entries.size() == 0) { RD_LOGT(NO_TRACE_ID, "Raft channel: Received no entry, leader committed lsn {}", raft_req->get_commit_idx()); diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 696e98737..19b672e7b 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -253,7 +253,7 @@ class RaftReplDev : public ReplDev, uint32_t get_blk_size() const 
override; repl_lsn_t get_last_commit_lsn() const override { return m_commit_upto_lsn.load(); } void set_last_commit_lsn(repl_lsn_t lsn) { m_commit_upto_lsn.store(lsn); } - repl_lsn_t get_last_append_lsn() override { return raft_server()->get_last_log_idx() + 1; /*to_repl_lsn*/ } + repl_lsn_t get_last_append_lsn() override { return raft_server()->get_last_log_idx(); } bool is_destroy_pending() const; bool is_destroyed() const; diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index 022d718a6..458320944 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -50,7 +50,7 @@ ReplServiceError RaftStateMachine::propose_to_raft(repl_req_ptr_t rreq) { return ReplServiceError::OK; } -repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entry& lentry, int64_t lsn /*repl_lsn*/) { +repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entry& lentry, int64_t lsn) { // Validate the journal entry and see if it needs to be transformed repl_journal_entry* jentry = r_cast< repl_journal_entry* >(lentry.get_buf().data_begin()); RELEASE_ASSERT_EQ(jentry->major_version, repl_journal_entry::JOURNAL_ENTRY_MAJOR, From ff453b21a2b954933fe6344a0bf7b171028fb959 Mon Sep 17 00:00:00 2001 From: Sanal Date: Thu, 24 Apr 2025 09:33:15 -0700 Subject: [PATCH 103/170] Make a single on_commit listener function. (#700) Make a single on_commit listener function. List of multiblkids could point to different contigious areas of data. --- conanfile.py | 2 +- src/include/homestore/replication/repl_dev.h | 16 +--------------- src/lib/replication/repl_dev/raft_repl_dev.cpp | 2 +- src/lib/replication/repl_dev/solo_repl_dev.cpp | 4 ++-- src/tests/test_common/raft_repl_test_base.hpp | 10 ++++------ src/tests/test_solo_repl_dev.cpp | 12 +++++------- 6 files changed, 14 insertions(+), 32 deletions(-) diff --git a/conanfile.py b/conanfile.py index 55d990705..698ad5865 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.10.3" + version = "6.11.0" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index 08c73ee09..36c07e819 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -256,20 +256,6 @@ class ReplDevListener { void set_repl_dev(shared< ReplDev > rdev) { m_repl_dev = rdev; } shared< ReplDev > repl_dev() { return m_repl_dev.lock(); } - /// @brief Called when the log entry has been committed in the replica set. - /// - /// This function is called from a dedicated commit thread which is different from the original thread calling - /// replica_set::write(). There is only one commit thread, and lsn is guaranteed to be monotonically increasing. - /// - /// @param lsn - The log sequence number - /// @param header - Header originally passed with replica_set::write() api - /// @param key - Key originally passed with replica_set::write() api - /// @param blkids - List of blkids where data is written to the storage engine. 
- /// @param ctx - Context passed as part of the replica_set::write() api - /// - virtual void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, MultiBlkId const& blkids, - cintrusive< repl_req_ctx >& ctx) = 0; - /// @brief Called when the log entry has been committed in the replica set. /// /// This function is called from a dedicated commit thread which is different from the original thread calling @@ -402,7 +388,7 @@ class ReplDevListener { virtual void on_no_space_left(repl_lsn_t lsn, chunk_num_t chunk_id) { return; } /// @brief when restart, after all the logs are replayed and before joining raft group, notify the upper layer - virtual void on_log_replay_done(const group_id_t& group_id){}; + virtual void on_log_replay_done(const group_id_t& group_id) {}; private: std::weak_ptr< ReplDev > m_repl_dev; diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 4f380c0d9..97f7ed272 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1020,7 +1020,7 @@ void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { } else if (rreq->op_code() == journal_type_t::HS_CTRL_REPLACE) { replace_member(rreq); } else { - m_listener->on_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq->local_blkid(), rreq); + m_listener->on_commit(rreq->lsn(), rreq->header(), rreq->key(), {rreq->local_blkid()}, rreq); } if (!recovery) { diff --git a/src/lib/replication/repl_dev/solo_repl_dev.cpp b/src/lib/replication/repl_dev/solo_repl_dev.cpp index 1db373150..bc6bdb8bb 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.cpp +++ b/src/lib/replication/repl_dev/solo_repl_dev.cpp @@ -61,7 +61,7 @@ void SoloReplDev::write_journal(repl_req_ptr_t rreq) { if (cur_lsn < lsn) { m_commit_upto.compare_exchange_strong(cur_lsn, lsn); } data_service().commit_blk(rreq->local_blkid()); - m_listener->on_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq->local_blkid(), rreq); + m_listener->on_commit(rreq->lsn(), rreq->header(), rreq->key(), {rreq->local_blkid()}, rreq); decr_pending_request_num(); }); } @@ -92,7 +92,7 @@ void SoloReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx auto cur_lsn = m_commit_upto.load(); if (cur_lsn < lsn) { m_commit_upto.compare_exchange_strong(cur_lsn, lsn); } - m_listener->on_commit(lsn, header, key, blkid, nullptr); + m_listener->on_commit(lsn, header, key, {blkid}, nullptr); } folly::Future< std::error_code > SoloReplDev::async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 636fa5f7c..6a4be1b41 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -125,16 +125,17 @@ class TestReplicatedDB : public homestore::ReplDevListener { TestReplicatedDB() = default; virtual ~TestReplicatedDB() = default; - void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, MultiBlkId const& blkids, - cintrusive< repl_req_ctx >& ctx) override { + void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, + std::vector< MultiBlkId > const& blkids, cintrusive< repl_req_ctx >& ctx) override { ASSERT_EQ(header.size(), sizeof(test_req::journal_header)); + ASSERT_EQ(blkids.size(), 1); auto jheader = r_cast< test_req::journal_header const* >(header.cbytes()); Key k{.id_ = *(r_cast< uint64_t const* >(key.cbytes()))}; Value v{.lsn_ 
= lsn, .data_size_ = jheader->data_size, .data_pattern_ = jheader->data_pattern, - .blkid_ = blkids, + .blkid_ = blkids[0], .id_ = k.id_}; LOGINFOMOD(replication, "[Replica={}] Received commit on lsn={} dsn={} key={} value[blkid={} pattern={}]", @@ -151,9 +152,6 @@ class TestReplicatedDB : public homestore::ReplDevListener { if (ctx->is_proposer()) { g_helper->runner().next_task(); } } - void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, - std::vector< MultiBlkId > const& blkids, cintrusive< repl_req_ctx >& ctx) override {} - bool on_pre_commit(int64_t lsn, const sisl::blob& header, const sisl::blob& key, cintrusive< repl_req_ctx >& ctx) override { LOGINFOMOD(replication, "[Replica={}] Received pre-commit on lsn={} dsn={}", g_helper->replica_num(), lsn, diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index c161df521..03de50531 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -97,21 +97,19 @@ class SoloReplDevTest : public testing::Test { Listener(SoloReplDevTest& test) : m_test{test} {} virtual ~Listener() = default; - void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, MultiBlkId const& blkids, - cintrusive< repl_req_ctx >& ctx) override { + void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, + std::vector< MultiBlkId > const& blkids, cintrusive< repl_req_ctx >& ctx) override { LOGINFO("Received on_commit lsn={}", lsn); + HS_REL_ASSERT(!blkids.empty(), "Invalid blkids size"); if (ctx == nullptr) { - m_test.validate_replay(*repl_dev(), lsn, header, key, blkids); + m_test.validate_replay(*repl_dev(), lsn, header, key, blkids[0]); } else { auto req = boost::static_pointer_cast< test_repl_req >(ctx); - req->written_blkids = std::move(blkids); + req->written_blkids = blkids[0]; m_test.on_write_complete(*repl_dev(), req); } } - void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, - std::vector< MultiBlkId > const& blkids, cintrusive< repl_req_ctx >& ctx) override {} - AsyncReplResult<> create_snapshot(shared< snapshot_context > context) override { return make_async_success<>(); } From 2d5899044921654fab5e83ac4f2076dcfebb7cf2 Mon Sep 17 00:00:00 2001 From: Ravi Nagarjun Akella Date: Mon, 21 Apr 2025 14:42:55 -0700 Subject: [PATCH 104/170] Add unit test to trigger eviction --- conanfile.py | 2 +- src/lib/common/homestore_config.fbs | 6 +++- src/lib/homestore.cpp | 2 +- src/lib/index/index_cp.cpp | 24 +++++++++------ src/lib/index/index_cp.hpp | 10 +++++-- src/lib/index/wb_cache.cpp | 4 +-- src/tests/btree_helpers/btree_test_helper.hpp | 3 +- src/tests/test_index_btree.cpp | 30 ++++++++++++++++++- 8 files changed, 62 insertions(+), 19 deletions(-) diff --git a/conanfile.py b/conanfile.py index 698ad5865..344563c1d 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.11.0" + version = "6.11.1" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs index ade32ba0e..227fc7b9f 100644 --- a/src/lib/common/homestore_config.fbs +++ b/src/lib/common/homestore_config.fbs @@ -130,7 +130,11 @@ table Generic { cache_max_throttle_cnt : uint32 = 4; // writeback cache max q depth - cache_min_throttle_cnt : uint32 = 4; // writeback cache min q deoth + cache_min_throttle_cnt : uint32 = 4; // writeback cache min q depth + + cache_hashmap_nbuckets : uint32 
= 1000000; // num buckets for sisl::SimpleHashmap used in wbcache + + cache_evictor_npartitions: uint32 = 1000; // num partitions for lru evictor in the cache // if this value is set to 0, no sanity check will be run; sanity_check_level: uint32 = 1 (hotswap); diff --git a/src/lib/homestore.cpp b/src/lib/homestore.cpp index 7296b8721..2475362f9 100644 --- a/src/lib/homestore.cpp +++ b/src/lib/homestore.cpp @@ -261,7 +261,7 @@ void HomeStore::do_start() { const auto& inp_params = HomeStoreStaticConfig::instance().input; uint64_t cache_size = resource_mgr().get_cache_size(); - m_evictor = std::make_shared< sisl::LRUEvictor >(cache_size, 1000); + m_evictor = std::make_shared< sisl::LRUEvictor >(cache_size, HS_DYNAMIC_CONFIG(generic.cache_evictor_npartitions)); if (m_before_services_starting_cb) { m_before_services_starting_cb(); } diff --git a/src/lib/index/index_cp.cpp b/src/lib/index/index_cp.cpp index 122667726..584b58355 100644 --- a/src/lib/index/index_cp.cpp +++ b/src/lib/index/index_cp.cpp @@ -63,8 +63,8 @@ void IndexCPContext::add_to_txn_journal(uint32_t index_ordinal, const IndexBuffe } } -void IndexCPContext::add_to_dirty_list(const IndexBufferPtr& buf) { - m_dirty_buf_list.push_back(buf); +void IndexCPContext::add_to_dirty_list(const IndexBufferPtr& buf, const BtreeNodePtr& node) { + m_dirty_buf_list.push_back(std::make_pair(buf, node)); buf->set_state(index_buf_state_t::DIRTY); m_dirty_buf_count.increment(1); } @@ -75,7 +75,7 @@ void IndexCPContext::prepare_flush_iteration() { m_dirty_buf_it = m_dirty_buf_li std::optional< IndexBufferPtr > IndexCPContext::next_dirty() { if (m_dirty_buf_it == m_dirty_buf_list.end()) { return std::nullopt; } - IndexBufferPtr ret = *m_dirty_buf_it; + IndexBufferPtr ret = (*m_dirty_buf_it).first; ++m_dirty_buf_it; return ret; } @@ -88,12 +88,14 @@ std::string IndexCPContext::to_string() { // Display all buffers and its dependencies and state. std::unordered_map< IndexBuffer*, std::vector< IndexBuffer* > > parents; - m_dirty_buf_list.foreach_entry([&parents](IndexBufferPtr buf) { + m_dirty_buf_list.foreach_entry([&parents](dirty_buf_entry_t entry) { // Add this buf to his children. + auto& buf = entry.first; parents[buf->m_up_buffer.get()].emplace_back(buf.get()); }); - m_dirty_buf_list.foreach_entry([&str, &parents](IndexBufferPtr buf) { + m_dirty_buf_list.foreach_entry([&str, &parents](dirty_buf_entry_t entry) { + auto& buf = entry.first; fmt::format_to(std::back_inserter(str), "{}", buf->to_string()); auto first = true; for (const auto& p : parents[buf.get()]) { @@ -117,11 +119,13 @@ void IndexCPContext::to_string_dot(const std::string& filename) { // Mapping from a node to all its parents in the graph. std::unordered_map< IndexBuffer*, std::vector< IndexBuffer* > > parents; - m_dirty_buf_list.foreach_entry([&parents](IndexBufferPtr buf) { + m_dirty_buf_list.foreach_entry([&parents](dirty_buf_entry_t entry) { + auto& buf = entry.first; // Add this buf to his children. 
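The hunks above turn two hardcoded tunables into config knobs: cache_evictor_npartitions now sizes the sisl::LRUEvictor (previously fixed at 1000 partitions) and cache_hashmap_nbuckets sizes the write-back cache's hashmap (previously 100000). Below is a minimal sketch of why an LRU evictor is partitioned at all; PartitionedLru and touch() are hypothetical names, not sisl or HomeStore APIs.

    #include <cstdint>
    #include <list>
    #include <mutex>
    #include <vector>

    // Each partition owns its own mutex and LRU list, so concurrent touches
    // on different keys rarely contend. More partitions means less lock
    // contention, at the cost of a less globally accurate LRU ordering.
    class PartitionedLru {
    public:
        explicit PartitionedLru(uint32_t npartitions) : parts_(npartitions) {}

        void touch(uint64_t key) {
            auto& p = parts_[key % parts_.size()]; // route the key to one partition
            std::lock_guard< std::mutex > lg{p.mtx};
            p.lru.remove(key);     // O(n) here; a real evictor keeps an iterator map
            p.lru.push_front(key); // most-recently-used entries sit at the front
        }

    private:
        struct Partition {
            std::mutex mtx;
            std::list< uint64_t > lru;
        };
        std::vector< Partition > parts_; // npartitions must be > 0
    };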
parents[buf->m_up_buffer.get()].emplace_back(buf.get()); }); - m_dirty_buf_list.foreach_entry([&file, &parents, this](IndexBufferPtr buf) { + m_dirty_buf_list.foreach_entry([&file, &parents, this](dirty_buf_entry_t entry) { + auto& buf = entry.first; std::vector< std::string > colors = {"lightgreen", "lightcoral", "lightyellow"}; auto sbuf = BtreeNode::to_string_buf(buf->raw_buffer()); auto pos = sbuf.find("LEAF"); @@ -149,7 +153,8 @@ uint16_t IndexCPContext::num_dags() { // count number of buffers whose up_buffers are nullptr uint16_t count = 0; std::unique_lock lg{m_flush_buffer_mtx}; - m_dirty_buf_list.foreach_entry([&count](IndexBufferPtr buf) { + m_dirty_buf_list.foreach_entry([&count](dirty_buf_entry_t entry) { + auto &buf = entry.first; if (buf->m_up_buffer == nullptr) { count++; } }); return count; @@ -176,7 +181,8 @@ std::string IndexCPContext::to_string_with_dags() { std::unique_lock lg{m_flush_buffer_mtx}; // Create the graph - m_dirty_buf_list.foreach_entry([&get_insert_buf, &group_roots](IndexBufferPtr buf) { + m_dirty_buf_list.foreach_entry([&get_insert_buf, &group_roots](dirty_buf_entry_t entry) { + auto& buf = entry.first; if (buf->m_up_buffer == nullptr) { auto dgn = get_insert_buf(buf); group_roots.emplace_back(dgn); diff --git a/src/lib/index/index_cp.hpp b/src/lib/index/index_cp.hpp index d7bd124df..3ff267d67 100644 --- a/src/lib/index/index_cp.hpp +++ b/src/lib/index/index_cp.hpp @@ -133,13 +133,15 @@ struct IndexCPContext : public VDevCPContext { }; #pragma pack() + using dirty_buf_entry_t = std::pair< IndexBufferPtr, BtreeNodePtr >; + public: std::atomic< uint64_t > m_num_nodes_added{0}; std::atomic< uint64_t > m_num_nodes_removed{0}; - sisl::ConcurrentInsertVector< IndexBufferPtr > m_dirty_buf_list; + sisl::ConcurrentInsertVector< dirty_buf_entry_t > m_dirty_buf_list; sisl::atomic_counter< int64_t > m_dirty_buf_count{0}; std::mutex m_flush_buffer_mtx; - sisl::ConcurrentInsertVector< IndexBufferPtr >::iterator m_dirty_buf_it; + sisl::ConcurrentInsertVector< dirty_buf_entry_t >::iterator m_dirty_buf_it; iomgr::FiberManagerLib::mutex m_txn_journal_mtx; sisl::io_blob_safe m_txn_journal_buf; @@ -156,7 +158,9 @@ struct IndexCPContext : public VDevCPContext { sisl::io_blob_safe const& journal_buf() const { return m_txn_journal_buf; } - void add_to_dirty_list(const IndexBufferPtr& buf); + // The BtreeNodePtr is added added only to increment the ref count + // which is used by the wbcache to evict the node + void add_to_dirty_list(const IndexBufferPtr& buf, const BtreeNodePtr& node); bool any_dirty_buffers() const; void prepare_flush_iteration(); std::optional< IndexBufferPtr > next_dirty(); diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp index fdd635589..10e8082b6 100644 --- a/src/lib/index/wb_cache.cpp +++ b/src/lib/index/wb_cache.cpp @@ -45,7 +45,7 @@ IndexWBCacheBase& wb_cache() { IndexWBCache::IndexWBCache(const std::shared_ptr< VirtualDev >& vdev, std::pair< meta_blk*, sisl::byte_view > sb, const std::shared_ptr< sisl::Evictor >& evictor, uint32_t node_size) : m_vdev{vdev}, - m_cache{evictor, 100000, node_size, + m_cache{evictor, HS_DYNAMIC_CONFIG(generic.cache_hashmap_nbuckets), node_size, [](const BtreeNodePtr& node) -> BlkId { return static_cast< IndexBtreeNode* >(node.get())->m_idx_buf->m_blkid; }, @@ -132,7 +132,7 @@ void IndexWBCache::write_buf(const BtreeNodePtr& node, const IndexBufferPtr& buf } else { if (node != nullptr) { m_cache.upsert(node); } LOGTRACEMOD(wbcache, "add to dirty list cp {} {}", cp_ctx->id(), buf->to_string()); - r_cast< 
IndexCPContext* >(cp_ctx)->add_to_dirty_list(buf); + r_cast< IndexCPContext* >(cp_ctx)->add_to_dirty_list(buf, node); resource_mgr().inc_dirty_buf_size(m_node_size); } } diff --git a/src/tests/btree_helpers/btree_test_helper.hpp b/src/tests/btree_helpers/btree_test_helper.hpp index 1480f5358..3ab8632e6 100644 --- a/src/tests/btree_helpers/btree_test_helper.hpp +++ b/src/tests/btree_helpers/btree_test_helper.hpp @@ -327,7 +327,8 @@ struct BtreeTestHelper { auto req = BtreeSingleGetRequest{copy_key.get(), out_v.get()}; req.enable_route_tracing(); const auto ret = m_bt->get(req); - ASSERT_EQ(ret, btree_status_t::success) << "Missing key " << key << " in btree but present in shadow map"; + ASSERT_EQ(ret, btree_status_t::success) << "Missing key " << key << " in btree but present in shadow map" << + " - status=" << enum_name(ret); ASSERT_EQ((const V&)req.value(), value) << "Found value in btree doesn't return correct data for key=" << key; }); diff --git a/src/tests/test_index_btree.cpp b/src/tests/test_index_btree.cpp index 6b5ff27ab..98a82e48b 100644 --- a/src/tests/test_index_btree.cpp +++ b/src/tests/test_index_btree.cpp @@ -39,7 +39,7 @@ SISL_OPTION_GROUP( (num_iters, "", "num_iters", "number of iterations for rand ops", ::cxxopts::value< uint32_t >()->default_value("500"), "number"), (num_entries, "", "num_entries", "number of entries to test with", - ::cxxopts::value< uint32_t >()->default_value("5000"), "number"), + ::cxxopts::value< uint32_t >()->default_value("10000"), "number"), (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), "seconds"), (disable_merge, "", "disable_merge", "disable_merge", ::cxxopts::value< bool >()->default_value("0"), ""), (operation_list, "", "operation_list", "operation list instead of default created following by percentage", @@ -192,6 +192,34 @@ TYPED_TEST(BtreeTest, RandomInsert) { this->get_all(); } +TYPED_TEST(BtreeTest, TriggerCacheEviction) { + // restart homestore with smaller cache % + HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { + s.resource_limits.cache_size_percent = 1u; + HS_SETTINGS_FACTORY().save(); + }); + + this->restart_homestore(); + + LOGINFO("TriggerCacheEviction test start"); + const auto num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); + LOGINFO("Step 1: Do insert for {} entries", num_entries); + for (uint32_t i{0}; i < num_entries; ++i) { + this->put(i, btree_put_type::INSERT); + // this->print(); + } + + this->get_all(); + + // reset cache pct + HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { + s.resource_limits.cache_size_percent = 65u; + HS_SETTINGS_FACTORY().save(); + }); + + LOGINFO("TriggerCacheEviction test end"); +} + TYPED_TEST(BtreeTest, SequentialRemove) { LOGINFO("SequentialRemove test start"); // Forward sequential insert From a633946b9a94dd6442005eb350a2800b781d504b Mon Sep 17 00:00:00 2001 From: Ravi Nagarjun Akella Date: Thu, 24 Apr 2025 15:29:38 -0700 Subject: [PATCH 105/170] Use the index_buffer.is_clean() to determine if it is safe to evict a btree node from cache --- src/lib/index/index_cp.cpp | 24 +++++++++--------------- src/lib/index/index_cp.hpp | 10 +++------- src/lib/index/wb_cache.cpp | 4 ++-- src/tests/test_index_btree.cpp | 2 +- 4 files changed, 15 insertions(+), 25 deletions(-) diff --git a/src/lib/index/index_cp.cpp b/src/lib/index/index_cp.cpp index 584b58355..122667726 100644 --- a/src/lib/index/index_cp.cpp +++ b/src/lib/index/index_cp.cpp @@ -63,8 +63,8 @@ void IndexCPContext::add_to_txn_journal(uint32_t index_ordinal, 
const IndexBuffe } } -void IndexCPContext::add_to_dirty_list(const IndexBufferPtr& buf, const BtreeNodePtr& node) { - m_dirty_buf_list.push_back(std::make_pair(buf, node)); +void IndexCPContext::add_to_dirty_list(const IndexBufferPtr& buf) { + m_dirty_buf_list.push_back(buf); buf->set_state(index_buf_state_t::DIRTY); m_dirty_buf_count.increment(1); } @@ -75,7 +75,7 @@ void IndexCPContext::prepare_flush_iteration() { m_dirty_buf_it = m_dirty_buf_li std::optional< IndexBufferPtr > IndexCPContext::next_dirty() { if (m_dirty_buf_it == m_dirty_buf_list.end()) { return std::nullopt; } - IndexBufferPtr ret = (*m_dirty_buf_it).first; + IndexBufferPtr ret = *m_dirty_buf_it; ++m_dirty_buf_it; return ret; } @@ -88,14 +88,12 @@ std::string IndexCPContext::to_string() { // Display all buffers and its dependencies and state. std::unordered_map< IndexBuffer*, std::vector< IndexBuffer* > > parents; - m_dirty_buf_list.foreach_entry([&parents](dirty_buf_entry_t entry) { + m_dirty_buf_list.foreach_entry([&parents](IndexBufferPtr buf) { // Add this buf to his children. - auto& buf = entry.first; parents[buf->m_up_buffer.get()].emplace_back(buf.get()); }); - m_dirty_buf_list.foreach_entry([&str, &parents](dirty_buf_entry_t entry) { - auto& buf = entry.first; + m_dirty_buf_list.foreach_entry([&str, &parents](IndexBufferPtr buf) { fmt::format_to(std::back_inserter(str), "{}", buf->to_string()); auto first = true; for (const auto& p : parents[buf.get()]) { @@ -119,13 +117,11 @@ void IndexCPContext::to_string_dot(const std::string& filename) { // Mapping from a node to all its parents in the graph. std::unordered_map< IndexBuffer*, std::vector< IndexBuffer* > > parents; - m_dirty_buf_list.foreach_entry([&parents](dirty_buf_entry_t entry) { - auto& buf = entry.first; + m_dirty_buf_list.foreach_entry([&parents](IndexBufferPtr buf) { // Add this buf to his children. 
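This revert works together with the new eviction test the patch installs in wb_cache.cpp further below: rather than pinning the BtreeNodePtr so its refcount stays above 1, evictability is now decided by asking the node's index buffer whether it is clean. A distilled sketch of that predicate follows; the surrounding types are stand-ins, though index_buf_state_t mirrors the state enum used in these hunks.

    #include <atomic>
    #include <cstdint>

    enum class index_buf_state_t : uint8_t { CLEAN, DIRTY, FLUSHING };

    struct IndexBuffer {
        std::atomic< index_buf_state_t > m_state{index_buf_state_t::CLEAN};
        bool is_clean() const { return m_state.load() == index_buf_state_t::CLEAN; }
    };

    // Evictor callback: a cached btree node may be dropped only when its
    // write-back buffer holds no unflushed changes in the current CP.
    bool can_evict(IndexBuffer const& buf) { return buf.is_clean(); }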
parents[buf->m_up_buffer.get()].emplace_back(buf.get()); }); - m_dirty_buf_list.foreach_entry([&file, &parents, this](dirty_buf_entry_t entry) { - auto& buf = entry.first; + m_dirty_buf_list.foreach_entry([&file, &parents, this](IndexBufferPtr buf) { std::vector< std::string > colors = {"lightgreen", "lightcoral", "lightyellow"}; auto sbuf = BtreeNode::to_string_buf(buf->raw_buffer()); auto pos = sbuf.find("LEAF"); @@ -153,8 +149,7 @@ uint16_t IndexCPContext::num_dags() { // count number of buffers whose up_buffers are nullptr uint16_t count = 0; std::unique_lock lg{m_flush_buffer_mtx}; - m_dirty_buf_list.foreach_entry([&count](dirty_buf_entry_t entry) { - auto &buf = entry.first; + m_dirty_buf_list.foreach_entry([&count](IndexBufferPtr buf) { if (buf->m_up_buffer == nullptr) { count++; } }); return count; @@ -181,8 +176,7 @@ std::string IndexCPContext::to_string_with_dags() { std::unique_lock lg{m_flush_buffer_mtx}; // Create the graph - m_dirty_buf_list.foreach_entry([&get_insert_buf, &group_roots](dirty_buf_entry_t entry) { - auto& buf = entry.first; + m_dirty_buf_list.foreach_entry([&get_insert_buf, &group_roots](IndexBufferPtr buf) { if (buf->m_up_buffer == nullptr) { auto dgn = get_insert_buf(buf); group_roots.emplace_back(dgn); diff --git a/src/lib/index/index_cp.hpp b/src/lib/index/index_cp.hpp index 3ff267d67..d7bd124df 100644 --- a/src/lib/index/index_cp.hpp +++ b/src/lib/index/index_cp.hpp @@ -133,15 +133,13 @@ struct IndexCPContext : public VDevCPContext { }; #pragma pack() - using dirty_buf_entry_t = std::pair< IndexBufferPtr, BtreeNodePtr >; - public: std::atomic< uint64_t > m_num_nodes_added{0}; std::atomic< uint64_t > m_num_nodes_removed{0}; - sisl::ConcurrentInsertVector< dirty_buf_entry_t > m_dirty_buf_list; + sisl::ConcurrentInsertVector< IndexBufferPtr > m_dirty_buf_list; sisl::atomic_counter< int64_t > m_dirty_buf_count{0}; std::mutex m_flush_buffer_mtx; - sisl::ConcurrentInsertVector< dirty_buf_entry_t >::iterator m_dirty_buf_it; + sisl::ConcurrentInsertVector< IndexBufferPtr >::iterator m_dirty_buf_it; iomgr::FiberManagerLib::mutex m_txn_journal_mtx; sisl::io_blob_safe m_txn_journal_buf; @@ -158,9 +156,7 @@ struct IndexCPContext : public VDevCPContext { sisl::io_blob_safe const& journal_buf() const { return m_txn_journal_buf; } - // The BtreeNodePtr is added added only to increment the ref count - // which is used by the wbcache to evict the node - void add_to_dirty_list(const IndexBufferPtr& buf, const BtreeNodePtr& node); + void add_to_dirty_list(const IndexBufferPtr& buf); bool any_dirty_buffers() const; void prepare_flush_iteration(); std::optional< IndexBufferPtr > next_dirty(); diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp index 10e8082b6..749b530d9 100644 --- a/src/lib/index/wb_cache.cpp +++ b/src/lib/index/wb_cache.cpp @@ -51,7 +51,7 @@ IndexWBCache::IndexWBCache(const std::shared_ptr< VirtualDev >& vdev, std::pair< }, [](const sisl::CacheRecord& rec) -> bool { const auto& hnode = (sisl::SingleEntryHashNode< BtreeNodePtr >&)rec; - return (hnode.m_value->m_refcount.test_le(1)); + return static_cast< IndexBtreeNode* >(hnode.m_value.get())->m_idx_buf->is_clean(); }}, m_node_size{node_size}, m_meta_blk{sb.first} { @@ -132,7 +132,7 @@ void IndexWBCache::write_buf(const BtreeNodePtr& node, const IndexBufferPtr& buf } else { if (node != nullptr) { m_cache.upsert(node); } LOGTRACEMOD(wbcache, "add to dirty list cp {} {}", cp_ctx->id(), buf->to_string()); - r_cast< IndexCPContext* >(cp_ctx)->add_to_dirty_list(buf, node); + r_cast< 
IndexCPContext* >(cp_ctx)->add_to_dirty_list(buf); resource_mgr().inc_dirty_buf_size(m_node_size); } } diff --git a/src/tests/test_index_btree.cpp b/src/tests/test_index_btree.cpp index 98a82e48b..6083140bf 100644 --- a/src/tests/test_index_btree.cpp +++ b/src/tests/test_index_btree.cpp @@ -39,7 +39,7 @@ SISL_OPTION_GROUP( (num_iters, "", "num_iters", "number of iterations for rand ops", ::cxxopts::value< uint32_t >()->default_value("500"), "number"), (num_entries, "", "num_entries", "number of entries to test with", - ::cxxopts::value< uint32_t >()->default_value("10000"), "number"), + ::cxxopts::value< uint32_t >()->default_value("7000"), "number"), (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), "seconds"), (disable_merge, "", "disable_merge", "disable_merge", ::cxxopts::value< bool >()->default_value("0"), ""), (operation_list, "", "operation_list", "operation list instead of default created following by percentage", From 9f426e098003595c5bffd0e7b1e05776284c439b Mon Sep 17 00:00:00 2001 From: Yaming Kuang <1477567+yamingk@users.noreply.github.com> Date: Thu, 24 Apr 2025 21:16:21 -0700 Subject: [PATCH 106/170] Issue 696 Support remove_repl_dev for solo repl dev (#698) * Issue 696 Support remove_repl_dev for solo repl dev * trigger destroy repl dev listener in solo repl dev remove api --- conanfile.py | 2 +- src/include/homestore/replication_service.hpp | 4 +++ .../replication/repl_dev/raft_repl_dev.cpp | 1 + .../replication/service/generic_repl_svc.cpp | 27 +++++++++++++++++-- .../replication/service/raft_repl_service.cpp | 3 +++ src/tests/test_common/hs_repl_test_common.hpp | 2 ++ src/tests/test_solo_repl_dev.cpp | 1 + 7 files changed, 37 insertions(+), 3 deletions(-) diff --git a/conanfile.py b/conanfile.py index 344563c1d..7c1a5d566 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.11.1" + version = "6.12.1" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/replication_service.hpp b/src/include/homestore/replication_service.hpp index bac805dd5..448bb9afe 100644 --- a/src/include/homestore/replication_service.hpp +++ b/src/include/homestore/replication_service.hpp @@ -75,6 +75,10 @@ class ReplApplication { // Listener corresponding to the ReplDev which will be used to perform the precommit/commit/rollback. virtual shared< ReplDevListener > create_repl_dev_listener(group_id_t group_id) = 0; + // Called when the repl dev is destroyed. This interface provides the application a chance to cleanup any resources + // assocated with this listener; + virtual void destroy_repl_dev_listener(group_id_t group_id) = 0; + // Called after all the repl devs are found upon restart of the homestore instance. 
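The new destroy_repl_dev_listener() hook makes listener ownership symmetric: create_repl_dev_listener() hands a listener to HomeStore, and this callback is the application's chance to release whatever backs it. A minimal consumer-side sketch; MyListener and MyReplApp are assumed names, not HomeStore types.

    #include <map>
    #include <memory>
    #include <mutex>
    #include <boost/uuid/uuid.hpp>

    struct MyListener {}; // stands in for the application's ReplDevListener subclass

    class MyReplApp /* : public homestore::ReplApplication */ {
        std::mutex mtx_;
        std::map< boost::uuids::uuid, std::shared_ptr< MyListener > > listeners_;

    public:
        void destroy_repl_dev_listener(boost::uuids::uuid group_id) /* override */ {
            std::lock_guard< std::mutex > lg{mtx_};
            listeners_.erase(group_id); // drop the application's reference; the
                                        // shared_ptr cleans up if it was the last one
        }
    };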
// it is a nice place for upper layer to recovery anything depends on repl_devs virtual void on_repl_devs_init_completed() = 0; diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 97f7ed272..5080d4689 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1341,6 +1341,7 @@ void RaftReplDev::permanent_destroy() { m_data_journal->remove_store(); logstore_service().destroy_log_dev(m_data_journal->logdev_id()); m_stage.update([](auto* stage) { *stage = repl_dev_stage_t::PERMANENT_DESTROYED; }); + // we should destroy repl_dev superblk only after all the resources are cleaned up, so that is crash recovery // occurs, we have a chance to find the stale repl_dev and reclaim all the stale resources. m_rd_sb.destroy(); diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index 20a5f8436..e2932bef7 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include "common/homestore_assert.hpp" #include "replication/service/generic_repl_svc.h" #include "replication/service/raft_repl_service.h" @@ -78,7 +79,7 @@ hs_stats GenericReplService::get_cap_stats() const { ///////////////////// SoloReplService specializations and CP Callbacks ///////////////////////////// SoloReplService::SoloReplService(cshared< ReplApplication >& repl_app) : GenericReplService{repl_app} {} -SoloReplService::~SoloReplService() {}; +SoloReplService::~SoloReplService(){}; void SoloReplService::start() { for (auto const& [buf, mblk] : m_sb_bufs) { @@ -144,7 +145,29 @@ AsyncReplResult< shared< ReplDev > > SoloReplService::create_repl_dev(group_id_t } folly::SemiFuture< ReplServiceError > SoloReplService::remove_repl_dev(group_id_t group_id) { - return folly::makeSemiFuture< ReplServiceError >(ReplServiceError::NOT_IMPLEMENTED); + // RD_LOGI("Removing repl dev for group_id={}", boost::uuids::to_string(group_id)); + auto rdev = get_repl_dev(group_id); + if (rdev.hasError()) { return folly::makeSemiFuture(rdev.error()); } + + auto rdev_ptr = rdev.value(); + + // 1. Firstly stop the repl dev which waits for any outstanding requests to finish + rdev_ptr->stop(); + + // 2. detaches both ways: + // detach rdev from its listener and listener from rdev; + rdev_ptr->detach_listener(); + { + // 3. remove from rd map which finally call SoloReplDev's destructor because this is the last one holding ref to + // this instance; + std::unique_lock lg(m_rd_map_mtx); + m_rd_map.erase(group_id); + } + + // 4. 
now destroy the upper layer's listener instance; + m_repl_app->destroy_repl_dev_listener(group_id); + + return folly::makeSemiFuture(ReplServiceError::OK); } void SoloReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) { diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index 280aa0032..d70593a94 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -412,6 +412,7 @@ folly::SemiFuture< ReplServiceError > RaftReplService::remove_repl_dev(group_id_ } auto ret = std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value())->destroy_group(); + decr_pending_request_num(); return ret; } @@ -589,6 +590,8 @@ void RaftReplService::gc_repl_devs() { // Therefore, we perform it outside the lock scope and then remove group from m_rd_map. for (const auto& group_id : groups_to_leave) { m_msg_mgr->leave_group(group_id); + // notify consumer to cleanup any resources associated with the listener itself; + m_repl_app->destroy_repl_dev_listener(group_id); { std::unique_lock lg(m_rd_map_mtx); m_rd_map.erase(group_id); diff --git a/src/tests/test_common/hs_repl_test_common.hpp b/src/tests/test_common/hs_repl_test_common.hpp index 7b93cccb2..4393b13d5 100644 --- a/src/tests/test_common/hs_repl_test_common.hpp +++ b/src/tests/test_common/hs_repl_test_common.hpp @@ -115,6 +115,8 @@ class HSReplTestHelper : public HSTestHelper { create_repl_dev_listener(homestore::group_id_t group_id) override { return helper_.get_listener(group_id); } + void destroy_repl_dev_listener(homestore::group_id_t) override {} + void on_repl_devs_init_completed() { LOGINFO("Repl dev init completed CB called"); } std::pair< std::string, uint16_t > lookup_peer(homestore::replica_id_t replica_id) const override { diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index 03de50531..8415b46db 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -156,6 +156,7 @@ class SoloReplDevTest : public testing::Test { shared< ReplDevListener > create_repl_dev_listener(uuid_t) override { return std::make_shared< Listener >(m_test); } + void destroy_repl_dev_listener(uuid_t) override {} void on_repl_devs_init_completed() { LOGINFO("Repl dev init completed CB called"); } std::pair< std::string, uint16_t > lookup_peer(uuid_t uuid) const override { return std::make_pair("", 0u); } replica_id_t get_my_repl_id() const override { return hs_utils::gen_random_uuid(); } From 460695bfedab4189aedae140e4fbaf77a52f0fb0 Mon Sep 17 00:00:00 2001 From: yuwmao <148639999+yuwmao@users.noreply.github.com> Date: Fri, 25 Apr 2025 15:09:31 +0800 Subject: [PATCH 107/170] Set priority when create RaftReplDev (#695) --- conanfile.py | 2 +- .../homestore/replication/repl_decls.h | 6 ++ src/lib/common/homestore_config.fbs | 6 ++ .../replication/repl_dev/raft_repl_dev.cpp | 9 ++- src/lib/replication/repl_dev/solo_repl_dev.h | 7 +- .../replication/service/generic_repl_svc.cpp | 4 +- .../replication/service/raft_repl_service.cpp | 21 +++++- .../replication/service/raft_repl_service.h | 4 + src/tests/test_common/hs_repl_test_common.hpp | 17 +++++ src/tests/test_raft_repl_dev.cpp | 74 +++++++++++++++++++ 10 files changed, 140 insertions(+), 10 deletions(-) diff --git a/conanfile.py b/conanfile.py index 7c1a5d566..cfffdf0ba 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.12.1" + version = "6.12.2" 
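With remove_repl_dev() implemented for the solo service (it previously returned NOT_IMPLEMENTED), a caller can tear down a device and its listener in one call. A hypothetical call site, blocking on the returned SemiFuture; group_id is assumed to be in scope.

    // remove_repl_dev() returns folly::SemiFuture< ReplServiceError > and
    // resolves once the dev is stopped, detached and erased (steps 1-4 above).
    auto err = homestore::hs()->repl_service().remove_repl_dev(group_id).get();
    if (err != homestore::ReplServiceError::OK) {
        LOGERROR("remove_repl_dev failed for group={} err={}",
                 boost::uuids::to_string(group_id), enum_name(err));
    }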
homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/replication/repl_decls.h b/src/include/homestore/replication/repl_decls.h index 90f2c67f7..54435c1c7 100644 --- a/src/include/homestore/replication/repl_decls.h +++ b/src/include/homestore/replication/repl_decls.h @@ -79,6 +79,12 @@ struct peer_info { uint64_t replication_idx_; // The elapsed time since the last successful response from this peer, set to 0 on leader uint64_t last_succ_resp_us_; + // The priority for leader election + uint32_t priority_; + // The peer is learner or not + bool is_learner_; + // The peer is new joiner or not + bool is_new_joiner_; }; struct replica_member_info { diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs index 227fc7b9f..a661da497 100644 --- a/src/lib/common/homestore_config.fbs +++ b/src/lib/common/homestore_config.fbs @@ -284,6 +284,12 @@ table Consensus { // Reading snapshot objects will be done by a background thread asynchronously // instead of synchronous read by Raft worker threads use_bg_thread_for_snapshot_io: bool = true; + + // Maximum number of election timeout rounds to wait during a prioritized leader election process. + // Every election timeout will compare its priority with the target_priority(max priority of the peers initially) + // then decay the target_priority and wait again until its priority >= target_priority. This setting helps us to set proper priority for peers. + // 0 means all members have the same priority. + max_wait_rounds_of_priority_election: uint32 = 2; } table HomeStoreSettings { diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 5080d4689..92bc155ba 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1168,7 +1168,10 @@ std::vector< peer_info > RaftReplDev::get_replication_status() const { for (auto const& pinfo : rep_status) { pi.emplace_back(peer_info{.id_ = boost::lexical_cast< replica_id_t >(pinfo.id_), .replication_idx_ = pinfo.last_log_idx_, - .last_succ_resp_us_ = pinfo.last_succ_resp_us_}); + .last_succ_resp_us_ = pinfo.last_succ_resp_us_, + .priority_ = pinfo.priority_, + .is_learner_ = pinfo.is_learner_, + .is_new_joiner_ = pinfo.is_new_joiner_}); } return pi; } @@ -1271,8 +1274,8 @@ nuraft::ptr< nuraft::cluster_config > RaftReplDev::load_config() { if (!js.contains("config")) { auto cluster_conf = nuraft::cs_new< nuraft::cluster_config >(); - cluster_conf->get_servers().push_back( - nuraft::cs_new< nuraft::srv_config >(m_raft_server_id, my_replica_id_str())); + cluster_conf->get_servers().push_back(nuraft::cs_new< nuraft::srv_config >( + m_raft_server_id, 0, my_replica_id_str(), "", false, raft_leader_priority)); js["config"] = serialize_cluster_config(*cluster_conf); } return deserialize_cluster_config(js["config"]); diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index 0fcbeb2aa..932e74511 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -53,7 +53,12 @@ class SoloReplDev : public ReplDev { bool is_leader() const override { return true; } replica_id_t get_leader_id() const override { return m_group_id; } std::vector< peer_info > get_replication_status() const override { - return std::vector< peer_info >{peer_info{.id_ = m_group_id, .replication_idx_ = 0, .last_succ_resp_us_ = 0}}; + return std::vector< 
peer_info >{peer_info{.id_ = m_group_id, + .replication_idx_ = 0, + .last_succ_resp_us_ = 0, + .priority_ = 1, + .is_learner_ = false, + .is_new_joiner_ = false}}; } bool is_ready_for_traffic() const override { return true; } void purge() override {} diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index e2932bef7..3fb0357fc 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -117,8 +117,8 @@ void SoloReplService::stop() { hs()->data_service().stop(); } -AsyncReplResult< shared< ReplDev > > SoloReplService::create_repl_dev(group_id_t group_id, - std::set< replica_id_t > const& members) { +AsyncReplResult< shared< ReplDev > > +SoloReplService::create_repl_dev(group_id_t group_id, std::set< replica_id_t > const& members) { superblk< repl_dev_superblk > rd_sb{get_meta_blk_name()}; rd_sb.create(); rd_sb->group_id = group_id; diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index d70593a94..c1016e452 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -59,6 +59,17 @@ ReplServiceError RaftReplService::to_repl_error(nuraft::cmd_result_code code) { return ret; } +// NuRaft priority decay coefficient is set to 0.8(currently not configurable). For more details, please refer to +// https://github.com/eBay/NuRaft/blob/master/docs/leader_election_priority.md +int32_t RaftReplService::compute_raft_follower_priority() { + auto max_wait_round = std::min(raft_priority_election_round_upper_limit, + HS_DYNAMIC_CONFIG(consensus.max_wait_rounds_of_priority_election)); + if (max_wait_round == 0) { return raft_leader_priority; } + auto priority = 1 + static_cast< int32_t >( + std::ceil(raft_leader_priority * std::pow(raft_priority_decay_coefficient, max_wait_round))); + return priority; +} + RaftReplService::RaftReplService(cshared< ReplApplication >& repl_app) : GenericReplService{repl_app} { m_config_sb_bufs.reserve(100); meta_service().register_handler( @@ -344,14 +355,18 @@ AsyncReplResult< shared< ReplDev > > RaftReplService::create_repl_dev(group_id_t return make_async_error< shared< ReplDev > >(to_repl_error(status.error())); } + auto follower_priority = compute_raft_follower_priority(); + auto my_id = m_repl_app->get_my_repl_id(); for (auto& member : members) { if (member == my_id) { continue; } // Skip myself do { - auto const result = m_msg_mgr->add_member(group_id, member).get(); + auto srv_config = nuraft::srv_config(nuraft_mesg::to_server_id(member), 0, boost::uuids::to_string(member), "", + false, follower_priority); + auto const result = m_msg_mgr->add_member(group_id, srv_config).get(); if (result) { - LOGINFOMOD(replication, "Groupid={}, new member={} added", boost::uuids::to_string(group_id), - boost::uuids::to_string(member)); + LOGINFOMOD(replication, "Groupid={}, new member={} added with priority={}", boost::uuids::to_string(group_id), + boost::uuids::to_string(member), follower_priority); break; } else if (result.error() != nuraft::CONFIG_CHANGING) { LOGWARNMOD(replication, "Groupid={}, add member={} failed with error={}", diff --git a/src/lib/replication/service/raft_repl_service.h b/src/lib/replication/service/raft_repl_service.h index 953ba95e9..87479aa01 100644 --- a/src/lib/replication/service/raft_repl_service.h +++ b/src/lib/replication/service/raft_repl_service.h @@ -33,6 +33,9 @@ namespace homestore { constexpr auto 
cert_change_timeout = std::chrono::seconds(1200); constexpr auto cert_check_sleep = std::chrono::seconds(1); +constexpr int32_t raft_leader_priority = 100; +constexpr double raft_priority_decay_coefficient = 0.8; +constexpr uint32_t raft_priority_election_round_upper_limit = 5; struct repl_dev_superblk; class RaftReplDev; @@ -57,6 +60,7 @@ class RaftReplService : public GenericReplService, ~RaftReplService() override; static ReplServiceError to_repl_error(nuraft::cmd_result_code code); + int32_t compute_raft_follower_priority(); ///////////////////// Overrides of nuraft_mesg::MessagingApplication //////////////////// std::string lookup_peer(nuraft_mesg::peer_id_t const&) override; diff --git a/src/tests/test_common/hs_repl_test_common.hpp b/src/tests/test_common/hs_repl_test_common.hpp index 4393b13d5..92ff45a69 100644 --- a/src/tests/test_common/hs_repl_test_common.hpp +++ b/src/tests/test_common/hs_repl_test_common.hpp @@ -17,6 +17,8 @@ */ #pragma once +#include "raft_repl_test_base.hpp" + #include #include #include @@ -35,6 +37,8 @@ #include #include "test_common/homestore_test_common.hpp" +#include + SISL_OPTION_GROUP(test_repl_common_setup, (replicas, "", "replicas", "Total number of replicas", ::cxxopts::value< uint32_t >()->default_value("3"), "number"), @@ -298,6 +302,19 @@ class HSReplTestHelper : public HSTestHelper { auto v = hs()->repl_service().create_repl_dev(repl_group_id, members).get(); ASSERT_EQ(v.hasValue(), true) << "Error in creating repl dev for group_id=" << boost::uuids::to_string(repl_group_id).c_str(); + auto& raftService = dynamic_cast< RaftReplService& >(hs()->repl_service()); + auto follower_priority = raftService.compute_raft_follower_priority(); + auto repl_dev = v.value(); + ASSERT_EQ(my_replica_id_, repl_dev->get_leader_id()); + auto peer_info = repl_dev->get_replication_status(); + for (auto pinfo : peer_info) { + LOGINFO("Replica={} has priority={}", boost::uuids::to_string(pinfo.id_), pinfo.priority_); + if (pinfo.id_ == my_replica_id_) { + ASSERT_EQ(raft_leader_priority, pinfo.priority_); + } else { + ASSERT_EQ(follower_priority, pinfo.priority_); + } + } } } diff --git a/src/tests/test_raft_repl_dev.cpp b/src/tests/test_raft_repl_dev.cpp index 6e21a64e8..f6d458943 100644 --- a/src/tests/test_raft_repl_dev.cpp +++ b/src/tests/test_raft_repl_dev.cpp @@ -484,6 +484,80 @@ TEST_F(RaftReplDevTest, LargeDataWrite) { g_helper->sync_for_cleanup_start(); } +TEST_F(RaftReplDevTest, PriorityLeaderElection) { + LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); + g_helper->sync_for_test_start(); + uint64_t entries_per_attempt = SISL_OPTIONS["num_io"].as< uint64_t >(); + if (g_helper->replica_num() == 0) { + auto leader = this->wait_and_get_leader_id(); + ASSERT_EQ(leader, g_helper->my_replica_id()); + } + this->write_on_leader(entries_per_attempt, true /* wait_for_commit */); + + g_helper->sync_for_verify_start(); + LOGINFO("Validate all data written so far by reading them"); + this->validate_data(); + g_helper->sync_for_cleanup_start(); + + LOGINFO("Restart leader"); + if (g_helper->replica_num() == 0) { g_helper->restart_homestore(); } + g_helper->sync_for_test_start(); + + LOGINFO("Validate leader switched"); + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + auto leader = this->wait_and_get_leader_id(); + if (g_helper->replica_num() == 0) { ASSERT_NE(leader, g_helper->my_replica_id()); } + g_helper->sync_for_verify_start(); + + if (leader == g_helper->my_replica_id()) { + LOGINFO("Resign and trigger a priority leader 
election"); + // resign and trigger a priority leader election + g_helper->restart_homestore(); + } + g_helper->sync_for_test_start(); + + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + leader = this->wait_and_get_leader_id(); + LOGINFO("Validate leader switched back to initial replica"); + if (g_helper->replica_num() == 0) { ASSERT_EQ(leader, g_helper->my_replica_id()); } + g_helper->sync_for_verify_start(); + + LOGINFO("Post restart write the data again on the leader"); + this->write_on_leader(entries_per_attempt, true /* wait_for_commit */); + + LOGINFO("Validate all data written (including pre-restart data) by reading them"); + this->validate_data(); + g_helper->sync_for_cleanup_start(); +} + +TEST_F(RaftReplDevTest, ComputePriority) { + g_helper->sync_for_test_start(); + auto& raftService = dynamic_cast< RaftReplService& >(hs()->repl_service()); + + HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { s.consensus.max_wait_rounds_of_priority_election = 0; }); + HS_SETTINGS_FACTORY().save(); + ASSERT_EQ(raftService.compute_raft_follower_priority(), raft_leader_priority); + + for (auto i = 1; i <= int(raft_priority_election_round_upper_limit); i++) { + HS_SETTINGS_FACTORY().modifiable_settings( + [i](auto& s) { s.consensus.max_wait_rounds_of_priority_election = i; }); + HS_SETTINGS_FACTORY().save(); + auto follower_priority = raftService.compute_raft_follower_priority(); + // Simulate nuraft algorithm + auto decayed_priority = raft_leader_priority; + for (auto j = 1; j <= i; j++) { + int gap = std::max((int)10, decayed_priority / 5); + decayed_priority = std::max(1, decayed_priority - gap); + } + LOGINFO("Follower priority={} decayed_priority={}", follower_priority, decayed_priority); + ASSERT_TRUE(follower_priority >= decayed_priority); + } + // Set back to default value + HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { s.consensus.max_wait_rounds_of_priority_election = 2; }); + HS_SETTINGS_FACTORY().save(); + g_helper->sync_for_cleanup_start(); +} + int main(int argc, char* argv[]) { int parsed_argc = argc; char** orig_argv = argv; From ecdae2090d34b508f2c6c9ff54cfe8c95bbdbb21 Mon Sep 17 00:00:00 2001 From: yawzhang Date: Fri, 25 Apr 2025 18:22:10 +0800 Subject: [PATCH 108/170] add traceid for replace member --- conanfile.py | 2 +- src/include/homestore/replication_service.hpp | 2 +- src/lib/replication/repl_dev/raft_repl_dev.cpp | 8 +++----- src/lib/replication/repl_dev/raft_repl_dev.h | 2 +- src/lib/replication/service/generic_repl_svc.cpp | 3 ++- src/lib/replication/service/generic_repl_svc.h | 3 ++- src/lib/replication/service/raft_repl_service.cpp | 5 +++-- src/lib/replication/service/raft_repl_service.h | 3 ++- 8 files changed, 15 insertions(+), 13 deletions(-) diff --git a/conanfile.py b/conanfile.py index cfffdf0ba..2de89c213 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.12.2" + version = "6.12.3" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/replication_service.hpp b/src/include/homestore/replication_service.hpp index 448bb9afe..23ee2422c 100644 --- a/src/include/homestore/replication_service.hpp +++ b/src/include/homestore/replication_service.hpp @@ -43,7 +43,7 @@ class ReplicationService { virtual AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, const replica_member_info& member_in, - uint32_t commit_quorum = 0) const = 0; + uint32_t commit_quorum = 
0, uint64_t trace_id = 0) const = 0; /// @brief Get the repl dev for a given group id if it is already created or opened /// @param group_id Group id interested in diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 92bc155ba..ea618b694 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -137,12 +137,10 @@ bool RaftReplDev::join_group() { } AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum) { - // Fixme: traceID for replace member - uint64_t trace_id = 0; - + const replica_member_info& member_in, uint32_t commit_quorum, + uint64_t trace_id) { if (is_stopping()) { - LOGINFO("repl dev is being shutdown!"); + LOGINFO("repl dev is being shutdown! trace_id={}", trace_id); return make_async_error<>(ReplServiceError::STOPPING); } incr_pending_request_num(); diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 19b672e7b..0dec2c45b 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -225,7 +225,7 @@ class RaftReplDev : public ReplDev, bool bind_data_service(); bool join_group(); AsyncReplResult<> replace_member(const replica_member_info& member_out, const replica_member_info& member_in, - uint32_t commit_quorum); + uint32_t commit_quorum, uint64_t trace_id = 0); folly::SemiFuture< ReplServiceError > destroy_group(); //////////////// All ReplDev overrides/implementation /////////////////////// diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index 3fb0357fc..5ac65981a 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -191,7 +191,8 @@ void SoloReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki } AsyncReplResult<> SoloReplService::replace_member(group_id_t group_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum) const { + const replica_member_info& member_in, uint32_t commit_quorum, + uint64_t trace_id) const { return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); } diff --git a/src/lib/replication/service/generic_repl_svc.h b/src/lib/replication/service/generic_repl_svc.h index a7325ceca..ab2fd4bf4 100644 --- a/src/lib/replication/service/generic_repl_svc.h +++ b/src/lib/replication/service/generic_repl_svc.h @@ -90,7 +90,8 @@ class SoloReplService : public GenericReplService { folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) override; void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum = 0) const override; + const replica_member_info& member_in, uint32_t commit_quorum = 0, + uint64_t trace_id = 0) const override; }; class SoloReplServiceCPHandler : public CPCallbacks { diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index c1016e452..b1b4b9a89 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -477,7 +477,8 @@ void RaftReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki } AsyncReplResult<> 
RaftReplService::replace_member(group_id_t group_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum) const { + const replica_member_info& member_in, uint32_t commit_quorum, + uint64_t trace_id) const { if (is_stopping()) return make_async_error<>(ReplServiceError::STOPPING); incr_pending_request_num(); auto rdev_result = get_repl_dev(group_id); @@ -487,7 +488,7 @@ AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, const rep } return std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value()) - ->replace_member(member_out, member_in, commit_quorum) + ->replace_member(member_out, member_in, commit_quorum, trace_id) .via(&folly::InlineExecutor::instance()) .thenValue([this](auto&& e) mutable { if (e.hasError()) { diff --git a/src/lib/replication/service/raft_repl_service.h b/src/lib/replication/service/raft_repl_service.h index 87479aa01..5f70efd0e 100644 --- a/src/lib/replication/service/raft_repl_service.h +++ b/src/lib/replication/service/raft_repl_service.h @@ -79,7 +79,8 @@ class RaftReplService : public GenericReplService, folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) override; void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum = 0) const override; + const replica_member_info& member_in, uint32_t commit_quorum = 0, + uint64_t trace_id = 0) const override; private: RaftReplDev* raft_group_config_found(sisl::byte_view const& buf, void* meta_cookie); From 45d7195f7b75658e0a504863406e5557a975c5e1 Mon Sep 17 00:00:00 2001 From: Yaming Kuang Date: Mon, 28 Apr 2025 11:12:18 -0700 Subject: [PATCH 109/170] adopt api signature change --- conanfile.py | 2 +- src/lib/replication/repl_dev/raft_repl_dev.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conanfile.py b/conanfile.py index 2de89c213..26c04a6aa 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.12.3" + version = "6.12.4" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index ea618b694..182df64e3 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1764,7 +1764,7 @@ void RaftReplDev::report_blk_metrics_if_needed(repl_req_ptr_t rreq) { void RaftReplDev::pause_statemachine() { if (!raft_server()->is_state_machine_execution_paused()) { - raft_server()->pause_state_machine_exeuction(); + raft_server()->pause_state_machine_execution(); while (!raft_server()->wait_for_state_machine_pause(100)) { RD_LOGD(NO_TRACE_ID, "wait for statemachine pause!"); } From 31a8b42ad52c1e621be7366942c338bfa0524138 Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Tue, 29 Apr 2025 12:43:34 +0800 Subject: [PATCH 110/170] support handling config rollback and add periodical notification of the lastest committed lsn to upper layer (#703) --- conanfile.py | 2 +- src/include/homestore/replication/repl_dev.h | 20 +++++++++----- .../replication/repl_dev/raft_repl_dev.cpp | 27 +++++++------------ src/lib/replication/repl_dev/raft_repl_dev.h | 5 +--- .../repl_dev/raft_state_machine.cpp | 2 +- src/lib/replication/repl_dev/solo_repl_dev.h | 4 --- src/tests/test_common/raft_repl_test_base.hpp | 
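The trace_id threaded through replace_member() above lets the control plane stamp one correlation id across the whole replace flow, so the trace_id={} log lines match up across leader and followers. A hypothetical call site; group_id, member_out and member_in are assumed to be populated in the surrounding scope.

    #include <chrono>

    // Any nonzero id works; passing 0 (the default) keeps the untraced behavior.
    uint64_t const tid =
        uint64_t(std::chrono::steady_clock::now().time_since_epoch().count());
    homestore::hs()->repl_service().replace_member(group_id, member_out, member_in,
                                                   0 /* commit_quorum */, tid);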
12 +++++++++ src/tests/test_solo_repl_dev.cpp | 3 +++ 8 files changed, 42 insertions(+), 33 deletions(-) diff --git a/conanfile.py b/conanfile.py index 26c04a6aa..ae4092886 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.12.4" + version = "6.13.1" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index 36c07e819..60a0f8430 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -270,6 +270,14 @@ class ReplDevListener { virtual void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, std::vector< MultiBlkId > const& blkids, cintrusive< repl_req_ctx >& ctx) = 0; + /// @brief periodically called to notify the lastest committed lsn to the listener. + /// NOTE: this callback will block the thread of flushing the latest committed lsn into repl_dev superblk as DC_LSN, + /// pls take care if there is any heavy or blocking operation in this callback. + /// + /// @param lsn - The lasted committed log sequence number so far + /// + virtual void notify_committed_lsn(int64_t lsn) = 0; + /// @brief Called when the log entry has been received by the replica dev. /// /// On recovery, this is called from a random worker thread before the raft server is started. It is @@ -307,6 +315,10 @@ class ReplDevListener { virtual void on_rollback(int64_t lsn, const sisl::blob& header, const sisl::blob& key, cintrusive< repl_req_ctx >& ctx) = 0; + /// @brief Called when the config log entry has been rolled back. + /// @param lsn - The log sequence number getting rolled back + virtual void on_config_rollback(int64_t lsn) = 0; + /// @brief Called when the replDev is created after restart. The consumer is expected to recover all the modules /// necessary to replay/commit the logs. 
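Because notify_committed_lsn() runs on the thread that flushes the durable commit LSN (per the NOTE above), a listener that needs to react should publish and return rather than block. A minimal sketch of that pattern; MyListener is an assumed name.

    #include <atomic>
    #include <cstdint>

    struct MyListener /* : public homestore::ReplDevListener */ {
        std::atomic< int64_t > last_committed_lsn_{-1};

        void notify_committed_lsn(int64_t lsn) /* override */ {
            // Cheap and non-blocking: just publish the value. Consumers such
            // as log truncation or metrics read it from their own threads.
            last_committed_lsn_.store(lsn, std::memory_order_relaxed);
        }
    };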
virtual void on_restart() = 0; @@ -385,10 +397,10 @@ class ReplDevListener { /// @brief ask upper layer to handle no_space_left event // @param lsn - on which repl_lsn no_space_left happened // @param chunk_id - on which chunk no_space_left happened - virtual void on_no_space_left(repl_lsn_t lsn, chunk_num_t chunk_id) { return; } + virtual void on_no_space_left(repl_lsn_t lsn, chunk_num_t chunk_id) = 0; /// @brief when restart, after all the logs are replayed and before joining raft group, notify the upper layer - virtual void on_log_replay_done(const group_id_t& group_id) {}; + virtual void on_log_replay_done(const group_id_t& group_id){}; private: std::weak_ptr< ReplDev > m_repl_dev; @@ -505,10 +517,6 @@ class ReplDev { } } - // pause/resume statemachine(commiting thread) - virtual void pause_statemachine() = 0; - virtual void resume_statemachine() = 0; - // complete all the requests that are in progress and start refusing new reqs virtual void quiesce_reqs() = 0; diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 182df64e3..415525e74 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1043,6 +1043,13 @@ void RaftReplDev::handle_config_commit(const repl_lsn_t lsn, raft_cluster_config } } +void RaftReplDev::handle_config_rollback(const repl_lsn_t lsn, raft_cluster_config_ptr_t& conf) { + RD_LOGD(NO_TRACE_ID, "roll back config on lsn {}", lsn); + // keep this variable in case it is needed later + (void)conf; + m_listener->on_config_rollback(lsn); +} + void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) { if (err == ReplServiceError::OK) { return; } RD_LOGE(rreq->traceID(), "Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); @@ -1465,12 +1472,14 @@ nuraft::cb_func::ReturnCode RaftReplDev::raft_event(nuraft::cb_func::Type type, } void RaftReplDev::flush_durable_commit_lsn() { + auto const lsn = m_commit_upto_lsn.load(); + m_listener->notify_committed_lsn(lsn); + if (is_destroyed()) { RD_LOGI(NO_TRACE_ID, "Raft repl dev is destroyed, ignore flush durable commit lsn"); return; } - auto const lsn = m_commit_upto_lsn.load(); RD_LOGT(NO_TRACE_ID, "Flushing durable commit lsn to {}", lsn); std::unique_lock lg{m_sb_mtx}; m_rd_sb->durable_commit_lsn = lsn; @@ -1762,22 +1771,6 @@ void RaftReplDev::report_blk_metrics_if_needed(repl_req_ptr_t rreq) { } } -void RaftReplDev::pause_statemachine() { - if (!raft_server()->is_state_machine_execution_paused()) { - raft_server()->pause_state_machine_execution(); - while (!raft_server()->wait_for_state_machine_pause(100)) { - RD_LOGD(NO_TRACE_ID, "wait for statemachine pause!"); - } - } -} - -void RaftReplDev::resume_statemachine() { - if (raft_server()->is_state_machine_execution_paused()) { - raft_server()->resume_state_machine_execution(); - RD_LOGD(NO_TRACE_ID, "statemachine is resumed!"); - } -} - void RaftReplDev::quiesce_reqs() { // all the block allocation happens in rreq->init. 
so after we wait for all the pending req has been initialized we // can make sure diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 0dec2c45b..bd6a6c448 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -277,6 +277,7 @@ class RaftReplDev : public ReplDev, void handle_commit(repl_req_ptr_t rreq, bool recovery = false); void handle_config_commit(const repl_lsn_t lsn, raft_cluster_config_ptr_t& new_conf); void handle_rollback(repl_req_ptr_t rreq); + void handle_config_rollback(const repl_lsn_t lsn, raft_cluster_config_ptr_t& old_conf); repl_req_ptr_t repl_key_to_req(repl_key const& rkey) const; repl_req_ptr_t applier_create_req(repl_key const& rkey, journal_type_t code, sisl::blob const& user_header, sisl::blob const& key, uint32_t data_size, bool is_data_channel, @@ -356,10 +357,6 @@ class RaftReplDev : public ReplDev, */ bool need_skip_processing(const repl_lsn_t lsn) { return lsn <= m_rd_sb->last_snapshot_lsn; } - // pause/resume statemachine(commiting thread) - void pause_statemachine(); - void resume_statemachine(); - void quiesce_reqs(); void resume_accepting_reqs(); diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index 458320944..3c68c07b5 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -246,7 +246,7 @@ void RaftStateMachine::commit_config(const ulong log_idx, raft_cluster_config_pt void RaftStateMachine::rollback_config(const ulong log_idx, raft_cluster_config_ptr_t& conf) { RD_LOGD(NO_TRACE_ID, "Raft channel: Rollback cluster conf , log_idx = {}", log_idx); - // TODO:add more logic here if necessary + m_rd.handle_config_rollback(s_cast< repl_lsn_t >(log_idx), conf); } void RaftStateMachine::rollback_ext(const nuraft::state_machine::ext_op_params& params) { diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index 932e74511..63838f254 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -79,10 +79,6 @@ class SoloReplDev : public ReplDev { uint32_t get_blk_size() const override; - // pause/resume statemachine(commiting thread) - void pause_statemachine() override { return; } - void resume_statemachine() override { return; } - void quiesce_reqs() override { return; } void resume_accepting_reqs() override { return; } diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 6a4be1b41..0dbd539e3 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -176,6 +176,18 @@ class TestReplicatedDB : public homestore::ReplDevListener { g_helper->runner().comp_promise_.setException(folly::make_exception_wrapper< ReplServiceError >(error)); } + void notify_committed_lsn(int64_t lsn) override { + LOGINFOMOD(replication, "[Replica={}] Received notify_committed_lsn={}", g_helper->replica_num(), lsn); + } + + void on_config_rollback(int64_t lsn) override { + LOGINFOMOD(replication, "[Replica={}] Received config rollback at lsn={}", g_helper->replica_num(), lsn); + } + void on_no_space_left(repl_lsn_t lsn, chunk_num_t chunk_id) override { + LOGINFOMOD(replication, "[Replica={}] Received no_space_left at lsn={}, chunk_id={}", g_helper->replica_num(), + lsn, chunk_id); + } + AsyncReplResult<> create_snapshot(shared< 
snapshot_context > context) override { std::lock_guard< std::mutex > lock(m_snapshot_lock); auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index 8415b46db..13bcc52b0 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -141,6 +141,9 @@ class SoloReplDevTest : public testing::Test { } void on_replace_member(const replica_member_info& member_out, const replica_member_info& member_in) override {} void on_destroy(const group_id_t& group_id) override {} + void notify_committed_lsn(int64_t lsn) override {} + void on_config_rollback(int64_t lsn) override {} + void on_no_space_left(repl_lsn_t lsn, chunk_num_t chunk_id) override {} }; class Application : public ReplApplication { From 2dc44585526820ab4953c38601c211226d8847c2 Mon Sep 17 00:00:00 2001 From: yawzhang Date: Mon, 28 Apr 2025 16:09:41 +0800 Subject: [PATCH 111/170] fix: init rkey with trace id --- conanfile.py | 2 +- src/lib/replication/repl_dev/raft_repl_dev.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conanfile.py b/conanfile.py index ae4092886..2a79125ff 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.13.1" + version = "6.13.2" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 415525e74..082f4fac4 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -338,7 +338,7 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& } auto status = init_req_ctx( - rreq, repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)}, + rreq, repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1), .traceID = tid}, data.size ? 
journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true /* is_proposer */, header, key, data.size, m_listener); From b8d2b789ecf38da2fc16c31fc741ea78429b845c Mon Sep 17 00:00:00 2001 From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com> Date: Tue, 29 Apr 2025 11:16:32 -0700 Subject: [PATCH 112/170] Long remove test (#702) * add long running for remove * Fix index crash recovery and enable prefix in memory UT --- conanfile.py | 2 +- src/include/homestore/btree/btree.hpp | 3 +- .../homestore/btree/detail/btree_common.ipp | 84 ++- .../homestore/btree/detail/btree_internal.hpp | 6 +- .../homestore/btree/detail/btree_node.hpp | 25 +- .../homestore/btree/detail/btree_node_mgr.ipp | 10 +- .../btree/detail/btree_remove_impl.ipp | 38 +- .../homestore/btree/detail/simple_node.hpp | 16 +- .../homestore/index/index_internal.hpp | 3 + src/include/homestore/index/index_table.hpp | 680 ++++++++++++++---- src/include/homestore/index_service.hpp | 1 + src/lib/common/homestore_config.fbs | 6 + src/lib/index/index_cp.cpp | 85 ++- src/lib/index/index_cp.hpp | 4 +- src/lib/index/index_service.cpp | 15 +- src/lib/index/wb_cache.cpp | 163 +++-- src/lib/index/wb_cache.hpp | 10 + src/tests/CMakeLists.txt | 2 +- src/tests/btree_helpers/btree_test_helper.hpp | 19 +- src/tests/test_index_btree.cpp | 2 + src/tests/test_index_crash_recovery.cpp | 250 ++++++- src/tests/test_mem_btree.cpp | 18 +- src/tests/test_scripts/index_test.py | 25 +- 23 files changed, 1181 insertions(+), 286 deletions(-) diff --git a/conanfile.py b/conanfile.py index 2a79125ff..59bdcb513 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.13.2" + version = "6.13.3" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/btree/btree.hpp b/src/include/homestore/btree/btree.hpp index 0fec83ddd..3ba74623f 100644 --- a/src/include/homestore/btree/btree.hpp +++ b/src/include/homestore/btree/btree.hpp @@ -201,9 +201,10 @@ class Btree { uint64_t get_btree_node_cnt() const; uint64_t get_child_node_cnt(bnodeid_t bnodeid) const; void to_string(bnodeid_t bnodeid, std::string& buf) const; - void to_custom_string_internal(bnodeid_t bnodeid, std::string& buf, to_string_cb_t< K, V > const& cb) const; + void to_custom_string_internal(bnodeid_t bnodeid, std::string& buf, to_string_cb_t< K, V > const& cb, int nindent=-1) const; void to_dot_keys(bnodeid_t bnodeid, std::string& buf, std::map< uint32_t, std::vector< uint64_t > >& l_map, std::map< uint64_t, BtreeVisualizeVariables >& info_map) const; + void sanity_sub_tree(bnodeid_t bnodeid=0) const; void validate_sanity_child(const BtreeNodePtr& parent_node, uint32_t ind) const; void validate_sanity_next_child(const BtreeNodePtr& parent_node, uint32_t ind) const; void print_node(const bnodeid_t& bnodeid) const; diff --git a/src/include/homestore/btree/detail/btree_common.ipp b/src/include/homestore/btree/detail/btree_common.ipp index ecda7e138..43e0c7c60 100644 --- a/src/include/homestore/btree/detail/btree_common.ipp +++ b/src/include/homestore/btree/detail/btree_common.ipp @@ -148,23 +148,27 @@ void Btree< K, V >::to_string(bnodeid_t bnodeid, std::string& buf) const { template < typename K, typename V > void Btree< K, V >::to_custom_string_internal(bnodeid_t bnodeid, std::string& buf, - to_string_cb_t< K, V > const& cb) const { + to_string_cb_t< K, V > const& cb, int nindent) const { BtreeNodePtr node; locktype_t acq_lock = locktype_t::READ; 
if (read_and_lock_node(bnodeid, node, acq_lock, acq_lock, nullptr) != btree_status_t::success) { return; } - fmt::format_to(std::back_inserter(buf), "{}\n", node->to_custom_string(cb)); + if (nindent < 0) { + nindent = node->level(); + } + std::string tabs(3 * (nindent - node->level()), ' '); + fmt::format_to(std::back_inserter(buf), "{}{}\n", tabs, node->to_custom_string(cb)); if (!node->is_leaf()) { uint32_t i = 0; while (i < node->total_entries()) { BtreeLinkInfo p; node->get_nth_value(i, &p, false); - to_custom_string_internal(p.bnode_id(), buf, cb); + to_custom_string_internal(p.bnode_id(), buf, cb, nindent); ++i; } - if (node->has_valid_edge()) { to_custom_string_internal(node->edge_id(), buf, cb); } + if (node->has_valid_edge()) { to_custom_string_internal(node->edge_id(), buf, cb, nindent); } } unlock_node(node, acq_lock); } @@ -222,6 +226,35 @@ uint64_t Btree< K, V >::count_keys(bnodeid_t bnodeid) const { return result; } +template < typename K, typename V > +void Btree< K, V >::sanity_sub_tree(bnodeid_t bnodeid) const { + if (bnodeid == 0) { + bnodeid = m_root_node_info.bnode_id(); + } + BtreeNodePtr node; + if (auto ret = read_node_impl(bnodeid, node); ret != btree_status_t::success) { + LOGINFO("reading node failed for bnodeid: {} reason: {}", bnodeid, ret); + } else { + if (node->is_leaf()) { + return; + } + uint32_t nentries = node->has_valid_edge() ? node->total_entries() + 1 : node->total_entries(); + std::vector< bnodeid_t > child_id_list; + child_id_list.reserve(nentries); + BT_REL_ASSERT_NE(node->has_valid_edge() && node->next_bnode() != empty_bnodeid, true, "node {} has valid edge and next id is not empty", node->to_string()); + for (uint32_t i = 0; i < nentries; ++i) { + validate_sanity_child(node, i); + BtreeLinkInfo child_info; + node->get_nth_value(i, &child_info, false /* copy */); + child_id_list.push_back(child_info.bnode_id()); + } + for (auto child_id : child_id_list) { + sanity_sub_tree(child_id); + } + } +} + template < typename K, typename V > void Btree< K, V >::validate_sanity_child(const BtreeNodePtr& parent_node, uint32_t ind) const { BtreeLinkInfo child_info; @@ -240,26 +273,33 @@ void Btree< K, V >::validate_sanity_child(const BtreeNodePtr& parent_node, uint3 } return; } - child_node->get_first_key(&child_first_key); - child_node->get_last_key(&child_last_key); - BT_REL_ASSERT_LE(child_first_key.compare(&child_last_key), 0); - if (ind == parent_node->total_entries()) { + BT_REL_ASSERT_NE(child_node->is_node_deleted(), true, "child node {} is deleted", child_node->to_string()); + if (ind >= parent_node->total_entries()) { BT_REL_ASSERT_EQ(parent_node->has_valid_edge(), true); - if (ind > 0) { - parent_node->get_nth_key< K >(ind - 1, &parent_key, false); - BT_REL_ASSERT_GT(child_first_key.compare(&parent_key), 0); - BT_REL_ASSERT_LT(parent_key.compare_start(&child_first_key), 0); + if (ind > 0) { + parent_key = parent_node->get_nth_key< K >(ind - 1, false); } - } else { - parent_node->get_nth_key< K >(ind, &parent_key, false); - BT_REL_ASSERT_LE(child_first_key.compare(&parent_key), 0) - BT_REL_ASSERT_LE(child_last_key.compare(&parent_key), 0) - BT_REL_ASSERT_GE(parent_key.compare_start(&child_first_key), 0) - BT_REL_ASSERT_GE(parent_key.compare_start(&child_first_key), 0) - if (ind != 0) { - parent_node->get_nth_key< K >(ind - 1, &parent_key, false); - BT_REL_ASSERT_GT(child_first_key.compare(&parent_key), 0) - BT_REL_ASSERT_LT(parent_key.compare_start(&child_first_key), 0) + } else { + parent_key = parent_node->get_nth_key< K >(ind, false); + } + K previous_parent_key; + if (ind > 0
&& parent_node->total_entries() > 0) { + previous_parent_key = parent_node->get_nth_key< K >(ind - 1, false); + } + for (uint32_t i = 0; i < child_node->total_entries(); ++i) { + K cur_child_key = child_node->get_nth_key< K >(i, false); + if (ind < parent_node->total_entries()) { + BT_REL_ASSERT_LE(cur_child_key.compare(parent_key), 0, " child {} {}-th key is greater than its parent's {} {}-th key", child_node->to_string(), i, parent_node->to_string(), ind); + if (ind > 0) { + BT_REL_ASSERT_GT(cur_child_key.compare(previous_parent_key), 0, + " child {} {}-th key is less than its parent's {} {}-th key", child_node->to_string(), + i, parent_node->to_string(), ind - 1); + } + + } else { + BT_REL_ASSERT_GT(cur_child_key.compare(parent_key), 0, " child {} {}-th key is not greater than its parent's {} {}-th key", child_node->to_string(), i, parent_node->to_string(), ind); } } } diff --git a/src/include/homestore/btree/detail/btree_internal.hpp b/src/include/homestore/btree/detail/btree_internal.hpp index 7dbc50c0a..8f2b267ac 100644 --- a/src/include/homestore/btree/detail/btree_internal.hpp +++ b/src/include/homestore/btree/detail/btree_internal.hpp @@ -250,19 +250,19 @@ struct BtreeConfig { uint64_t m_min_keys_in_node{0}; #endif bool m_rebalance_turned_on{false}; - bool m_merge_turned_on{true}; btree_node_type m_leaf_node_type{btree_node_type::VAR_OBJECT}; btree_node_type m_int_node_type{btree_node_type::VAR_KEY}; std::string m_btree_name; // Unique name for the btree - + bool m_merge_turned_on{true}; + uint8_t m_max_merge_level{1}; private: uint32_t m_suggested_min_size; // Precomputed values uint32_t m_ideal_fill_size; public: BtreeConfig(uint32_t node_size, const std::string& btree_name = "") : - m_node_size{node_size}, m_btree_name{btree_name.empty() ? std::string("btree") : btree_name} { + m_node_size{node_size}, m_btree_name{btree_name.empty() ? std::string("btree") : btree_name}{ set_node_data_size(node_size - 512); // Just put estimate at this point of time. } diff --git a/src/include/homestore/btree/detail/btree_node.hpp b/src/include/homestore/btree/detail/btree_node.hpp index 988b683cf..1c45501aa 100644 --- a/src/include/homestore/btree/detail/btree_node.hpp +++ b/src/include/homestore/btree/detail/btree_node.hpp @@ -86,9 +86,9 @@ struct persistent_hdr_t { auto sedge = (edge_info.m_bnodeid == empty_bnodeid) ? "" : fmt::format(" edge={}.{}", edge_info.m_bnodeid, edge_info.m_link_version); - return fmt::format("id={}{}{} {} level={} nentries={}{} mod_cp={}", node_id, snext, sedge, - leaf ? "LEAF" : "INTERIOR", level, nentries, (node_deleted == 0x1) ? " Deleted" : "", - modified_cp_id); + return fmt::format("id={}{}{} {} level={} nentries={} mod_cp={}{}", node_id, snext, sedge, + leaf ? "LEAF" : "INTERIOR", level, nentries, modified_cp_id, + node_deleted == 0x1 ? " Deleted" : " LIVE"); } }; #pragma pack() @@ -119,7 +119,6 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { m_trans_hdr.max_keys_in_node = cfg.m_max_keys_in_node; m_trans_hdr.min_keys_in_node = cfg.m_min_keys_in_node; #endif - } virtual ~BtreeNode() = default; @@ -368,9 +367,10 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { std::string to_custom_string(to_string_cb_t< K, V > const& cb) const { std::string snext = (this->next_bnode() == empty_bnodeid) ? "" : fmt::format(" next_node={}", this->next_bnode()); - auto str = fmt::format("id={}.{} level={} nEntries={} {}{} node_gen={} ", this->node_id(), this->link_version(), - this->level(), this->total_entries(), (this->is_leaf() ?
"LEAF" : "INTERIOR"), snext, - this->node_gen()); + auto str = + fmt::format("id={}.{} level={} nEntries={} {}{} node_gen={} {} ", this->node_id(), this->link_version(), + this->level(), this->total_entries(), (this->is_leaf() ? "LEAF" : "INTERIOR"), snext, + this->node_gen(), this->is_node_deleted() ? " **DELETED**" : ""); if (this->has_valid_edge()) { fmt::format_to(std::back_inserter(str), " edge={}.{}", this->edge_info().m_bnodeid, this->edge_info().m_link_version); @@ -396,12 +396,6 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { } fmt::format_to(std::back_inserter(str), "]"); } - - // Should not happen - if (this->is_node_deleted()) { - fmt::format_to(std::back_inserter(str), " **DELETED** "); - } - return str; } @@ -537,10 +531,9 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { virtual uint32_t occupied_size() const { return (node_data_size() - available_size()); } bool is_merge_needed(const BtreeConfig& cfg) const { + if (level() > cfg.m_max_merge_level) { return false; } #ifdef _PRERELEASE - if (min_keys_in_node()) { - return total_entries() < min_keys_in_node(); - } + if (min_keys_in_node()) { return total_entries() < min_keys_in_node(); } #endif return (occupied_size() < cfg.suggested_min_size()); } diff --git a/src/include/homestore/btree/detail/btree_node_mgr.ipp b/src/include/homestore/btree/detail/btree_node_mgr.ipp index a5b0317de..aa536a728 100644 --- a/src/include/homestore/btree/detail/btree_node_mgr.ipp +++ b/src/include/homestore/btree/detail/btree_node_mgr.ipp @@ -334,11 +334,11 @@ void Btree< K, V >::_start_of_lock(const BtreeNodePtr& node, locktype_t ltype, c info.node = node.get(); if (ltype == locktype_t::WRITE) { bt_thread_vars()->wr_locked_nodes.push_back(info); - LOGTRACEMOD(btree, "ADDING node {} to write locked nodes list, its size={}", (void*)info.node, + LOGTRACEMOD(btree, "ADDING node {} to write locked nodes list, its size={}", info.node->node_id(), bt_thread_vars()->wr_locked_nodes.size()); } else if (ltype == locktype_t::READ) { bt_thread_vars()->rd_locked_nodes.push_back(info); - LOGTRACEMOD(btree, "ADDING node {} to read locked nodes list, its size={}", (void*)info.node, + LOGTRACEMOD(btree, "ADDING node {} to read locked nodes list, its size={}", info.node->node_id(), bt_thread_vars()->rd_locked_nodes.size()); } else { DEBUG_ASSERT(false, "Invalid locktype_t {}", ltype); @@ -355,7 +355,7 @@ bool Btree< K, V >::remove_locked_node(const BtreeNodePtr& node, locktype_t ltyp if (info.node == node.get()) { *out_info = info; pnode_infos->pop_back(); - LOGTRACEMOD(btree, "REMOVING node {} from {} locked nodes list, its size = {}", (void*)info.node, + LOGTRACEMOD(btree, "REMOVING node {} from {} locked nodes list, its size = {}",info.node->node_id(), (ltype == locktype_t::WRITE) ? "write" : "read", pnode_infos->size()); return true; } else if (pnode_infos->size() > 1) { @@ -364,7 +364,7 @@ bool Btree< K, V >::remove_locked_node(const BtreeNodePtr& node, locktype_t ltyp *out_info = info; pnode_infos->at(pnode_infos->size() - 2) = pnode_infos->back(); pnode_infos->pop_back(); - LOGTRACEMOD(btree, "REMOVING node {} from {} locked nodes list, its size = {}", (void*)info.node, + LOGTRACEMOD(btree, "REMOVING node {} from {} locked nodes list, its size = {}", info.node->node_id(), (ltype == locktype_t::WRITE) ? 
"write" : "read", pnode_infos->size()); return true; } @@ -390,7 +390,7 @@ template < typename K, typename V > uint64_t Btree< K, V >::end_of_lock(const BtreeNodePtr& node, locktype_t ltype) { btree_locked_node_info info; if (!remove_locked_node(node, ltype, &info)) { - DEBUG_ASSERT(false, "Expected node = {} is not there in locked_node_list", (void*)node.get()); + DEBUG_ASSERT(false, "Expected node = {} is not there in locked_node_list", node->node_id()); return 0; } // DEBUG_ASSERT_EQ(node.get(), info.node); diff --git a/src/include/homestore/btree/detail/btree_remove_impl.ipp b/src/include/homestore/btree/detail/btree_remove_impl.ipp index 82213dcc6..ccfe0f584 100644 --- a/src/include/homestore/btree/detail/btree_remove_impl.ipp +++ b/src/include/homestore/btree/detail/btree_remove_impl.ipp @@ -246,6 +246,11 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const _src_cursor_info src_cursor; total_size = leftmost_node->occupied_size(); + uint32_t expected_entities = leftmost_node->total_entries(); +#ifdef _PRERELEASE + const uint64_t max_keys = leftmost_node->max_keys_in_node(); +#endif + for (auto indx = start_idx + 1; indx <= end_idx; ++indx) { if (indx == parent_node->total_entries()) { BT_NODE_LOG_ASSERT(parent_node->has_valid_edge(), parent_node, @@ -271,6 +276,10 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const // Only option is to rebalance the nodes across. If we are asked not to do so, skip it. if (!m_bt_cfg.m_rebalance_turned_on) { ret = btree_status_t::merge_not_required; + BT_NODE_LOG( + DEBUG, parent_node, + "MERGE disqualified for parent node {} leftmost_node {}! num_nodes {} is more than old_nodes.size() {}", + parent_node->to_string(), leftmost_node->to_string(), num_nodes, old_nodes.size()); goto out; } } @@ -279,6 +288,10 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const if (leftmost_node->occupied_size() > balanced_size) { // If for some reason balancing increases the current size, give up. // TODO: Is this a real case, isn't happening would mean some sort of bug in calculation of is_merge_needed? + BT_NODE_LOG( + DEBUG, parent_node, + "MERGE disqualified for parent node {} leftmost_node {}! current size {} is more than balanced size {}", + parent_node->to_string(), leftmost_node->to_string(), leftmost_node->occupied_size(), balanced_size); BT_NODE_DBG_ASSERT(false, leftmost_node, "Didn't expect current size is more than balanced size without rebalancing"); ret = btree_status_t::merge_not_required; @@ -294,7 +307,19 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const leftmost_src.ith_nodes.push_back(i); // TODO: check whether value size of the node is greater than available_size? If so nentries is 0. 
Suppose if a // node contains one entry and the value size is much bigger than available size - auto const nentries = old_nodes[i]->num_entries_by_size(0, available_size); + auto nentries = old_nodes[i]->num_entries_by_size(0, available_size); + +#ifdef _PRERELEASE + if (max_keys) { + if (expected_entities + nentries > max_keys) { + nentries = max_keys - expected_entities; + expected_entities = max_keys; + } else { + expected_entities += nentries; + } + } +#endif + if ((old_nodes[i]->total_entries() - nentries) == 0) { // Entire node goes in available_size -= old_nodes[i]->occupied_size(); if (i >= old_nodes.size() - 1) { @@ -353,13 +378,22 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const // better merge next time. if (new_nodes.size() > old_nodes.size()) { ret = btree_status_t::merge_not_required; + BT_NODE_LOG( + DEBUG, parent_node, + "MERGE disqualified for parent node {} leftmost_node {}! new nodes size {} is more than old nodes size {}", + parent_node->to_string(), leftmost_node->to_string(), new_nodes.size(), old_nodes.size()); goto out; } // There is a case where we are rebalancing and the second node which rebalanced didn't move any size, in that case // the first node is going to be exactly same and we will do again merge, so bail out here. - if ((new_nodes.size() == old_nodes.size()) && (old_nodes[0]->occupied_size() >= new_nodes[0]->occupied_size())) { + if ((new_nodes.size() == old_nodes.size()) && (old_nodes[0]->occupied_size() == new_nodes[0]->occupied_size())) { ret = btree_status_t::merge_not_required; + BT_NODE_LOG(DEBUG, parent_node, + "MERGE disqualified for parent node {} leftmost_node {}! old nodes occupied size {} is the same " + "as new nodes occupied size {}", + parent_node->to_string(), leftmost_node->to_string(), old_nodes[0]->occupied_size(), + new_nodes[0]->occupied_size()); goto out; } diff --git a/src/include/homestore/btree/detail/simple_node.hpp b/src/include/homestore/btree/detail/simple_node.hpp index e85d1190c..25a87c1c1 100644 --- a/src/include/homestore/btree/detail/simple_node.hpp +++ b/src/include/homestore/btree/detail/simple_node.hpp @@ -166,6 +166,14 @@ class SimpleNode : public VariantNode< K, V > { nentries = std::min(nentries, other.total_entries() - start_idx); nentries = std::min(nentries, this->get_available_entries()); +#ifdef _PRERELEASE + const uint64_t max_keys = this->max_keys_in_node(); + if (max_keys) { + if (this->total_entries() + nentries > max_keys) { + nentries = max_keys - this->total_entries(); + } + } +#endif uint32_t sz = nentries * get_nth_obj_size(0); if (sz != 0) { std::memcpy(get_nth_obj(this->total_entries()), other.get_nth_obj_const(start_idx), sz); } this->add_entries(nentries); @@ -213,10 +221,10 @@ class SimpleNode : public VariantNode< K, V > { std::string to_string(bool print_friendly = false) const override { auto snext = this->next_bnode() == empty_bnodeid ? "" : fmt::format("next_node={}", this->next_bnode()); - auto str = fmt::format("{}id={} level={} nEntries={} {} {} ", + auto str = fmt::format("{}id={} level={} nEntries={} {} {} {}", (print_friendly ? "------------------------------------------------------------\n" : ""), this->node_id(), this->level(), this->total_entries(), - (this->is_leaf() ? "LEAF" : "INTERIOR"), snext); + (this->is_leaf() ? "LEAF" : "INTERIOR"), snext, this->is_node_deleted() ?
" Deleted" : " LIVE"); if (this->has_valid_edge()) { fmt::format_to(std::back_inserter(str), " edge={}.{}", this->edge_info().m_bnodeid, this->edge_info().m_link_version); @@ -379,9 +387,9 @@ class SimpleNode : public VariantNode< K, V > { return (this->node_data_area_const() + (get_nth_obj_size(ind) * ind)); } - void set_nth_key(uint32_t ind, BtreeKey* key) { + void set_nth_key(uint32_t ind, const BtreeKey& key) { uint8_t* entry = this->node_data_area() + (get_nth_obj_size(ind) * ind); - sisl::blob const b = key->serialize(); + sisl::blob const b = key.serialize(); memcpy(entry, b.cbytes(), b.size()); } diff --git a/src/include/homestore/index/index_internal.hpp b/src/include/homestore/index/index_internal.hpp index 1ff444650..c411edf70 100644 --- a/src/include/homestore/index/index_internal.hpp +++ b/src/include/homestore/index/index_internal.hpp @@ -75,6 +75,8 @@ class IndexTableBase { virtual void stop() = 0; virtual void repair_node(IndexBufferPtr const& buf) = 0; virtual void repair_root_node(IndexBufferPtr const& buf) = 0; + virtual void delete_stale_children(IndexBufferPtr const& buf) = 0; + virtual void audit_tree() = 0; }; enum class index_buf_state_t : uint8_t { @@ -94,6 +96,7 @@ struct IndexBuffer : public sisl::ObjLifeCounter< IndexBuffer > { cp_id_t m_created_cp_id{-1}; // CP id when this buffer is created. std::atomic< index_buf_state_t > m_state{index_buf_state_t::CLEAN}; // Is buffer yet to persist? uint8_t* m_bytes{nullptr}; // Actual data buffer + uint32_t m_node_level{0}; //levels of the node in the btree std::shared_ptr< IndexBuffer > m_up_buffer; // Parent buffer in the chain to persisted sisl::atomic_counter< int > m_wait_for_down_buffers{0}; // Number of children need to wait for before persisting diff --git a/src/include/homestore/index/index_table.hpp b/src/include/homestore/index/index_table.hpp index a693ddc9e..31c793bdf 100644 --- a/src/include/homestore/index/index_table.hpp +++ b/src/include/homestore/index/index_table.hpp @@ -60,6 +60,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { IndexTable(uuid_t uuid, uuid_t parent_uuid, uint32_t user_sb_size, const BtreeConfig& cfg) : Btree< K, V >{cfg}, m_sb{"index"} { + this->m_bt_cfg.m_merge_turned_on = HS_DYNAMIC_CONFIG(btree.merge_turned_on); + this->m_bt_cfg.m_max_merge_level = HS_DYNAMIC_CONFIG(btree.max_merge_level); // Create a superblk for the index table and create MetaIndexBuffer corresponding to that m_sb.create(sizeof(index_table_sb)); m_sb->uuid = uuid; @@ -77,6 +79,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } IndexTable(superblk< index_table_sb >&& sb, const BtreeConfig& cfg) : Btree< K, V >{cfg}, m_sb{std::move(sb)} { + this->m_bt_cfg.m_merge_turned_on = HS_DYNAMIC_CONFIG(btree.merge_turned_on); + this->m_bt_cfg.m_max_merge_level = HS_DYNAMIC_CONFIG(btree.max_merge_level); m_sb_buffer = std::make_shared< MetaIndexBuffer >(m_sb); // After recovery, we see that root node is empty, which means that after btree is created, we crashed. 
@@ -98,6 +102,11 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } } + void audit_tree() override { + cp_mgr().cp_guard(); + Btree< K, V >::sanity_sub_tree(); + } + btree_status_t destroy() override { if (is_stopping()) return btree_status_t::stopping; incr_pending_request_num(); @@ -181,6 +190,22 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } } + void delete_stale_children(IndexBufferPtr const& idx_buf) override { + if (!idx_buf->is_meta_buf() && idx_buf->m_created_cp_id == -1) { + BtreeNode* n = this->init_node(idx_buf->raw_buffer(), idx_buf->blkid().to_integer(), false /* init_buf */, + BtreeNode::identify_leaf_node(idx_buf->raw_buffer())); + static_cast< IndexBtreeNode* >(n)->attach_buf(idx_buf); + auto cpg = cp_mgr().cp_guard(); + idx_buf->m_dirtied_cp_id = cpg->id(); + BtreeNodePtr bn = BtreeNodePtr{n}; + + if (!bn->is_leaf()) { + LOGTRACEMOD(wbcache, "delete_stale_links cp={} buf={}", cpg->id(), idx_buf->to_string()); + delete_stale_links(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC)); + } + } + } + void repair_node(IndexBufferPtr const& idx_buf) override { if (idx_buf->is_meta_buf()) { // We cannot repair the meta buf on its own, we need to repair the root node which modifies the @@ -230,6 +255,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { node->set_checksum(); auto prev_state = idx_node->m_idx_buf->m_state.exchange(index_buf_state_t::DIRTY); + idx_node->m_idx_buf->m_node_level = node->level(); if (prev_state == index_buf_state_t::CLEAN) { // It was clean before, dirtying it first time, add it to the wb_cache list to flush if (idx_node->m_idx_buf->m_dirtied_cp_id != -1) { @@ -243,6 +269,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { (int)prev_state, (int)index_buf_state_t::FLUSHING, "Writing on a node buffer which was currently in flushing state on cur_cp={} buffer_cp_id={}", cp_ctx->id(), idx_node->m_idx_buf->m_dirtied_cp_id); + BT_DBG_ASSERT_EQ(idx_node->m_idx_buf->m_dirtied_cp_id, cp_ctx->id(), + "Writing a node which was not acquired by this cp"); } return btree_status_t::success; } @@ -294,6 +322,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { void free_node_impl(const BtreeNodePtr& node, void* context) override { auto n = static_cast< IndexBtreeNode* >(node.get()); + n->m_idx_buf->m_node_level = node->level(); wb_cache().free_buf(n->m_idx_buf, r_cast< CPContext* >(context)); } @@ -314,175 +343,562 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { return btree_status_t::success; } - btree_status_t repair_links(BtreeNodePtr const& parent_node, void* cp_ctx) { - BT_LOG(DEBUG, "Repairing links for parent node [{}]", parent_node->to_string()); - // TODO: is it possible that repairing many nodes causes an increase to level of btree? If so, then this needs - // to be handled. 
Get the last key in the node - auto const last_parent_key = parent_node->get_last_key< K >(); - auto const is_parent_edge_node = parent_node->has_valid_edge(); - if ((parent_node->total_entries() == 0) && !is_parent_edge_node) { - BT_LOG_ASSERT(false, "Parent node={} is empty and not an edge node but was asked to repair", - parent_node->node_id()); - return btree_status_t::not_found; + btree_status_t delete_stale_links(BtreeNodePtr const& parent_node, void* cp_ctx) { + LOGTRACEMOD(wbcache, "deleting stale links for parent node [{}]", parent_node->to_string()); + BtreeNodeList free_nodes; + auto nentries = parent_node->total_entries(); + uint32_t deleted = 0; + for (uint32_t i = nentries; i-- > 0;) { + BtreeLinkInfo cur_child_info; + BtreeNodePtr child_node; + parent_node->get_nth_value(i, &cur_child_info, false /* copy */); + if (auto ret = read_node_impl(cur_child_info.bnode_id(), child_node); ret == btree_status_t::success) { + if (child_node->is_node_deleted()) { + LOGTRACEMOD(wbcache, "Deleting stale child node [{}] for parent node [{}]", child_node->to_string(), + parent_node->to_string()); + child_node->set_node_deleted(); + free_node_impl(child_node, cp_ctx); + + if (i > 0) { + BtreeLinkInfo pre_child_info; + parent_node->get_nth_value(i - 1, &pre_child_info, false /* copy */); + // auto ckey = parent_node->get_nth_key< K >(i-1, true); + // parent_node->set_nth_key(i-1, ckey); + parent_node->update(i, pre_child_info); + parent_node->remove(i - 1); + } else { + parent_node->remove(i); + } + + LOGTRACEMOD(wbcache, "so far parent node [{}]", parent_node->to_string()); + // free_nodes.push_back(child_node); + deleted++; + } + } else { + LOGTRACEMOD(wbcache, "Failed to read child node {} for parent node [{}] reason {}", + cur_child_info.bnode_id(), parent_node->to_string(), ret); + } } + if (parent_node->has_valid_edge()) { + auto edge_info = parent_node->get_edge_value(); + BtreeNodePtr edge_node; + if (auto ret = read_node_impl(edge_info.bnode_id(), edge_node); ret == btree_status_t::success) { + if (edge_node->is_node_deleted()) { + LOGTRACEMOD(wbcache, "Deleting stale edge node [{}] for parent node [{}]", edge_node->to_string(), + parent_node->to_string()); + edge_node->set_node_deleted(); + free_node_impl(edge_node, cp_ctx); + if (parent_node->total_entries() == 0) { + parent_node->invalidate_edge(); + } else { + BtreeLinkInfo last_child_info; + parent_node->get_nth_value(parent_node->total_entries() - 1, &last_child_info, + false /* copy */); + parent_node->set_edge_value(last_child_info); + parent_node->remove(parent_node->total_entries() - 1); + LOGTRACEMOD(wbcache, "Replacing edge with previous child node [{}] for parent node [{}]", + last_child_info.bnode_id(), parent_node->to_string()); + } - // Get all original child ids as a support to check if we are beyond the last child node - std::set< bnodeid_t > orig_child_ids; - for (uint32_t i = 0; i < parent_node->total_entries(); ++i) { - BtreeLinkInfo link_info; - parent_node->get_nth_value(i, &link_info, true); - orig_child_ids.insert(link_info.bnode_id()); + deleted++; + } + } else { + LOGTRACEMOD(wbcache, "Failed to read edge node {} for parent node [{}] reason {}", + edge_node->to_string(), parent_node->to_string(), ret); + } } - BT_LOG(INFO, "Repairing node=[{}] with last_parent_key={}", parent_node->to_string(), - last_parent_key.to_string()); - - // Get the first child node and its link info - BtreeLinkInfo child_info; - BtreeNodePtr child_node; - auto ret = this->get_child_and_lock_node(parent_node, 0, child_info, 
child_node, locktype_t::READ, - locktype_t::READ, cp_ctx); - if (ret != btree_status_t::success) { - BT_LOG_ASSERT(false, "Parent node={} repair failed, because first child_node get has failed with ret={}", - parent_node->node_id(), enum_name(ret)); + if (deleted /*free_nodes.size()*/) { + btree_status_t ret = btree_status_t::success; + + if ((parent_node->total_entries() == 0) && !parent_node->has_valid_edge()) { + parent_node->set_node_deleted(); + LOGTRACEMOD(wbcache, + "Freeing parent node=[{}] because it is empty and not an edge node but had stale children", + parent_node->to_string()); + ret = write_node_impl(parent_node, cp_ctx); + free_node_impl(parent_node, cp_ctx); + LOGTRACEMOD(wbcache, + "Accomplishing deleting stale links. After removing {} stale links, parent node is [{}]", + deleted, parent_node->to_string()); + } else { + ret = write_node_impl(parent_node, cp_ctx); + if (ret != btree_status_t::success) { + LOGTRACEMOD(wbcache, "Failed to write parent node [{}] after deleting stale links", + parent_node->to_string()); + } else { + LOGTRACEMOD( + wbcache, + "Accomplishing deleting stale links. After removing {} stale links, parent node is [{}]", + deleted, parent_node->to_string()); + } + } + // auto ret = transact_nodes({}, free_nodes, parent_node, nullptr, cp_ctx); return ret; + } else { + LOGTRACEMOD(wbcache, "Accomplishing deleting stale links. No stale links found for parent node [{}]", + parent_node->to_string()); } + return btree_status_t::success; + } - // Keep a copy of the node buffer, in case we need to revert back - uint8_t* tmp_buffer = new uint8_t[this->m_node_size]; - std::memcpy(tmp_buffer, parent_node->m_phys_node_buf, this->m_node_size); + // + btree_status_t repair_links(BtreeNodePtr const& parent_node, void* cp_ctx) { + LOGTRACEMOD(wbcache, "Repairing links for parent node [{}]", parent_node->to_string()); + // TODO: is it possible that repairing many nodes causes an increase to level of btree? If so, then this + // needs to be handled. 
Get the last key in the node + + auto last_parent_key = parent_node->get_last_key< K >(); + auto const is_parent_edge_node = parent_node->has_valid_edge(); + if ((parent_node->total_entries() == 0) && !is_parent_edge_node) { + BT_LOG_ASSERT(false, "Parent node={} is empty and not an edge node but was asked to repair", + parent_node->node_id()); + return btree_status_t::not_found; + } + // Get all original child ids as a support to check if we are beyond the last child node + std::unordered_map< bnodeid_t, K > orig_child_infos; + for (uint32_t i = 0; i < parent_node->total_entries(); ++i) { + BtreeLinkInfo link_info; + parent_node->get_nth_value(i, &link_info, true); + orig_child_infos[link_info.bnode_id()] = parent_node->get_nth_key< K >(i, false /* copy */); + } + LOGTRACEMOD(wbcache, "Repairing node=[{}] with last_parent_key={}", parent_node->to_string(), + last_parent_key.to_string()); + + // Get the first child node and its link info + BtreeLinkInfo child_info; + BtreeNodePtr child_node; + BtreeNodePtr pre_child_node; + auto ret = this->get_child_and_lock_node(parent_node, 0, child_info, child_node, locktype_t::READ, + locktype_t::READ, cp_ctx); + if (ret != btree_status_t::success) { + BT_LOG_ASSERT(false, "Parent node={} repair failed, because first child_node get has failed with ret={}", parent_node->node_id(), enum_name(ret)); + return ret; + } + // Update the last key of the parent for this case: + // 1- the last key of the parent (P) is X + // 2- find the last non-deleted child (A) and its last key (here Y): + // start from the first child, store the last key of the child node, then traverse to the next sibling + // 2-1- if Y is greater than the parent's last key, traverse the parent's siblings until reaching a + // sibling whose keys are greater than Y, or the end of the list (name this parent sibling node F) + // 2-2- put the last key of F into the last key of P + // 2-3- set F as the next of A + BtreeNodeList siblings; + BtreeNodePtr next_cur_child; + BT_DBG_ASSERT(parent_node->has_valid_edge() || parent_node->total_entries(), + "parent node {} has neither a valid edge nor any entries", parent_node->to_string()); + if (parent_node->total_entries() > 0) { + auto updated_last_key = last_parent_key; + K last_child_last_key; + K last_child_neighbor_key; + BtreeNodePtr cur_child; + BtreeLinkInfo cur_child_info; + + bool found_child = false; + uint32_t nentries = parent_node->total_entries() + (parent_node->has_valid_edge() ? 1 : 0);
+ + for (uint32_t i = nentries; i-- > 0;) { + parent_node->get_nth_value(i, &cur_child_info, false /* copy */); + if (auto ret = read_node_impl(cur_child_info.bnode_id(), cur_child); ret == btree_status_t::success) { + if (!cur_child->is_node_deleted() && cur_child->total_entries()) { + last_child_last_key = cur_child->get_last_key< K >(); + if (cur_child->next_bnode() != empty_bnodeid && + read_node_impl(cur_child->next_bnode(), next_cur_child) == btree_status_t::success) { + LOGTRACEMOD(wbcache, + "Last child last key {} for child_node [{}] parent node [{}], next neighbor is [{}]", last_child_last_key.to_string(), + cur_child->to_string(), parent_node->to_string(), + next_cur_child->to_string()); + found_child = true; + break; + } + found_child = true; + break; + } + LOGTRACEMOD(wbcache, "Skipping child node {}, so we need to check the next child node", + cur_child->to_string()); + } + } + + if (found_child) { + LOGTRACEMOD(wbcache, "Last child last key {} for parent node {}, child_node {}", + last_child_last_key.to_string(), parent_node->to_string(), cur_child->to_string()); + if (last_child_last_key.compare(last_parent_key) > 0) { + if (next_cur_child) { + last_child_neighbor_key = next_cur_child->get_last_key< K >(); + LOGTRACEMOD(wbcache, + "Last child_key of child [{}] is greater than its parent's [{}] and its next neighbor key is {}", cur_child->to_string(), + parent_node->to_string(), last_child_neighbor_key.to_string()); + } else { + LOGTRACEMOD( + wbcache, + "Last child_key of child [{}] is greater than its parent's [{}] and it has no next neighbor", cur_child->to_string(), parent_node->to_string()); + } + + // 2-1: traverse the parent's siblings until reaching one whose keys are greater than Y, or the end + // of the list (put the passed siblings in a list; the stopping sibling is F) + BtreeNodePtr sibling; + BtreeNodePtr true_sibling; + BtreeLinkInfo sibling_info; + + auto sibling_node_id = parent_node->next_bnode(); + while (sibling_node_id != empty_bnodeid) { + if (auto ret = read_node_impl(sibling_node_id, sibling); ret == btree_status_t::success) { + if (sibling->is_node_deleted()) { + // Do we need to free the sibling node here?
+ siblings.push_back(sibling); + sibling_node_id = sibling->next_bnode(); + LOGTRACEMOD(wbcache, "Sibling node [{}] is deleted, continue to next sibling", + sibling->to_string()); + continue; + } + auto sibling_last_key = sibling->get_last_key< K >(); + if (next_cur_child && sibling_last_key.compare(last_child_neighbor_key) < 0) { + siblings.push_back(sibling); + sibling_node_id = sibling->next_bnode(); + } else { + true_sibling = sibling; + break; + } + } + } + if (true_sibling) { + LOGTRACEMOD(wbcache, "True sibling [{}] for parent_node {}", + true_sibling->to_string(), + parent_node->to_string()); + } else { + LOGTRACEMOD(wbcache, "No true sibling found for parent_node [{}]", + parent_node->to_string()); + } + if (sibling_node_id != empty_bnodeid) { + last_parent_key = last_child_last_key; + parent_node->set_next_bnode(true_sibling->node_id()); + for (auto sibling : siblings) { + LOGTRACEMOD(wbcache, "Sibling list [{}]", sibling->to_string()); + } + LOGTRACEMOD(wbcache, "True sibling [{}]", true_sibling->to_string()); + BtreeLinkInfo first_child_info; + parent_node->get_nth_value(0, &first_child_info, false); + } } else { - BT_LOG(INFO, "Found an empty interior node {} with maybe all childs deleted", - cur_parent->node_id()); + LOGTRACEMOD(wbcache, + "No undeleted child found for parent_node [{}], keep normal repair (regular recovery)", parent_node->to_string()); + next_cur_child = nullptr; } - } else { - // Update edge and finish - BT_LOG(INFO, "Repairing node={}, child_node=[{}] is an edge node, end loop", cur_parent->node_id(), - child_node->to_string()); - child_node->set_next_bnode(empty_bnodeid); - write_node_impl(child_node, cp_ctx); - cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); } - break; } - auto const child_last_key = child_node->get_last_key< K >(); - BT_LOG(INFO, "Repairing node={}, child_node=[{}] child_last_key={}", cur_parent->node_id(), - child_node->to_string(), child_last_key.to_string()); - - // Check if we are beyond the last child node. - // - // There can be cases where the child level merge is successfully persisted but the parent level is not. - // In this case, you may have your rightmost child node with last key greater than the last_parent_key. - // That's why here we have to check if the child node is one of the original child nodes first. - if (!is_parent_edge_node && !orig_child_ids.contains(child_node->node_id())) { - if (child_node->total_entries() == 0 || child_last_key.compare(last_parent_key) > 0) { - // We have reached a child beyond this parent, we can stop now + // Keep a copy of the node buffer, in case we need to revert back + uint8_t* tmp_buffer = new uint8_t[this->m_node_size]; + std::memcpy(tmp_buffer, parent_node->m_phys_node_buf, this->m_node_size); + + // Remove all the entries in parent_node and let walk across child_nodes rebuild this node + parent_node->remove_all(this->m_bt_cfg); + + // Walk across all child nodes until it gets the last_parent_key and keep fixing them. 
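// In outline, the rebuild loop below behaves like this simplified sketch (locking, parent splits,
// and the deleted-child bookkeeping that follows are elided):
//
//   cur_parent = parent_node;
//   child = first child of parent_node;
//   while (child is not beyond last_parent_key) {
//       if (!child->is_node_deleted()) { cur_parent->insert(child->get_last_key(), link to child); }
//       child = node at child->next_bnode();
//   }
//
// i.e. the parent's separator keys are reconstructed from the surviving children's last keys by
// walking the sibling chain.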
+ auto cur_parent = parent_node; + BtreeNodeList new_parent_nodes; + do { + if (child_node->has_valid_edge() || (child_node->is_leaf() && child_node->next_bnode() == empty_bnodeid)) { + if (child_node->is_node_deleted()) { + // Edge node is merged, we need to set the current last entry as edge + if (cur_parent->total_entries() > 0) { + auto prev_val = V{}; + cur_parent->get_nth_value(cur_parent->total_entries() - 1, &prev_val, true); + cur_parent->remove(cur_parent->total_entries() - 1); + cur_parent->set_edge_value(prev_val); + LOGTRACEMOD(wbcache, + "Repairing node={}, child_node=[{}] is deleted, set previous as edge_value={}", + cur_parent->node_id(), child_node->to_string(), prev_val.to_string()); + } else { + LOGTRACEMOD(wbcache, "Found an empty interior node {} with maybe all children deleted", + cur_parent->node_id()); + } + } else { + // Update edge and finish + if (is_parent_edge_node) { + cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), + child_node->link_version()}); + } else { + auto tsib_id = find_true_sibling(cur_parent); + if (tsib_id != empty_bnodeid) { + cur_parent->set_next_bnode(tsib_id); + LOGTRACEMOD(wbcache, + "True sibling [{}] for parent_node [{}], so don't add child [{}] here", + tsib_id, cur_parent->to_string(), child_node->to_string()); + } else { + cur_parent->set_next_bnode(empty_bnodeid); + // If this child node previously belonged to this parent node, add it as the edge; otherwise skip it + if (orig_child_infos.contains(child_node->node_id())) { + cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), + child_node->link_version()}); + LOGTRACEMOD(wbcache, + "Child node [{}] is an edge node and previously belonged to this parent, so we need to add it as the edge", + child_node->to_string()); + } else { + LOGTRACEMOD(wbcache, "No true sibling found for parent_node [{}]", + cur_parent->to_string()); + } + BT_REL_ASSERT(cur_parent->total_entries() != 0 || cur_parent->has_valid_edge(), + "Parent node [{}] cannot be empty", cur_parent->to_string()); + } + } + break; + } break; } auto child_last_key = child_node->get_last_key< K >(); LOGTRACEMOD(wbcache, "Repairing node={}, child_node=[{}] child_last_key={}", cur_parent->node_id(), child_node->to_string(), child_last_key.to_string()); + + // Check if we are beyond the last child node. + // + // There can be cases where the child level merge is successfully persisted but the parent level is + // not. In this case, you may have your rightmost child node with last key greater than the + // last_parent_key. That's why here we have to check if the child node is one of the original child + // nodes first. + if (!is_parent_edge_node && !orig_child_infos.contains(child_node->node_id())) { + if (child_last_key.compare(last_parent_key) > 0) { + // We have reached a child beyond this parent, we can stop now + // TODO: if the child's last key is less than the parent's last key, the parent node should be updated; + // this case can potentially break the btree for put and remove op.
+ break; + } + if (child_node->total_entries() == 0) { + // this child has no entries, but maybe in the middle of the parent node, we need to update the key + // of parent as previous one and go on + LOGTRACEMOD(wbcache, + "Reach to an empty child node {}, and this child doesn't belong to this parent; Hence loop ends", child_node->to_string()); + // now update the next of parent node by skipping all deleted siblings of this parent node + auto valid_sibling = cur_parent->next_bnode(); + while (valid_sibling != empty_bnodeid) { + BtreeNodePtr sibling; + if (read_node_impl(valid_sibling, sibling) == btree_status_t::success) { + if (sibling->is_node_deleted()) { + valid_sibling = sibling->next_bnode(); + continue; + } + // cur_parent->set_next_bnode(sibling->node_id()); + break; + } + LOGTRACEMOD(wbcache, "Failed to read child node {} for parent node [{}] reason {}", + valid_sibling, cur_parent->to_string(), ret); + } + if (valid_sibling != empty_bnodeid) { + cur_parent->set_next_bnode(valid_sibling); + LOGTRACEMOD(wbcache, "Repairing node={}, child_node=[{}] is an edge node, end loop", + cur_parent->node_id(), child_node->to_string()); + + } else { + cur_parent->set_next_bnode(empty_bnodeid); + LOGTRACEMOD(wbcache, "Repairing node={}, child_node=[{}] is an edge node, end loop", + cur_parent->node_id(), child_node->to_string()); + } + + break; + } } - new_parent->set_next_bnode(cur_parent->next_bnode()); - cur_parent->set_next_bnode(new_parent->node_id()); - new_parent->set_level(cur_parent->level()); - cur_parent->inc_link_version(); + if (!cur_parent->has_room_for_put(btree_put_type::INSERT, K::get_max_size(), + BtreeLinkInfo::get_fixed_size())) { + // No room in the parent_node, let us split the parent_node and continue + auto new_parent = this->alloc_interior_node(); + if (new_parent == nullptr) { + ret = btree_status_t::space_not_avail; + break; + } - new_parent_nodes.push_back(new_parent); - cur_parent = std::move(new_parent); - } + new_parent->set_next_bnode(cur_parent->next_bnode()); + cur_parent->set_next_bnode(new_parent->node_id()); + new_parent->set_level(cur_parent->level()); + cur_parent->inc_link_version(); - // Insert the last key of the child node into parent node - if (!child_node->is_node_deleted()) { - cur_parent->insert(cur_parent->total_entries(), - child_node->total_entries() > 0 ? 
child_last_key : last_parent_key, - BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); - if (child_node->total_entries() == 0) { - // There should be at most one empty child node per parent - if we find one, we should stop here - BT_LOG(INFO, "Repairing node={}, child_node=[{}] is empty, end loop", cur_parent->node_id(), - child_node->to_string()); - break; + new_parent_nodes.push_back(new_parent); + cur_parent = std::move(new_parent); } - } else { - // Node deleted indicates it's freed & no longer used during recovery - BT_LOG(INFO, "Repairing node={}, child node=[{}] is deleted, skipping the insert", - cur_parent->node_id(), child_node->to_string()); - } - BT_LOG(INFO, "Repairing node={}, repaired so_far=[{}]", cur_parent->node_id(), cur_parent->to_string()); + // Insert the last key of the child node into parent node + if (!child_node->is_node_deleted()) { + if (child_node->total_entries() == 0) { + if (orig_child_infos.contains(child_node->node_id())) { + child_last_key = orig_child_infos[child_node->node_id()]; + LOGTRACEMOD(wbcache, + "Reach to an empty child node [{}], but not the end of the parent node, so we need to update the key of parent node as original one {}", + child_node->to_string(), child_last_key.to_string()); + } else { + LOGTRACEMOD(wbcache, + "Reach to an empty child node [{}] but not belonging to this parent (probably next parent sibling); Hence end loop", child_node->to_string()); + break; + } + } + cur_parent->insert(cur_parent->total_entries(), child_last_key, + BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); + } else { + // Node deleted indicates it's freed & no longer used during recovery + LOGTRACEMOD(wbcache, "Repairing node={}, child node=[{}] is deleted, skipping the insert", + cur_parent->node_id(), child_node->to_string()); + if (pre_child_node) { + // We need to update the next of the previous child node to this child node + + LOGTRACEMOD(wbcache, + "Repairing node={}, child_node=[{}] is deleted, set next of previous child node [{}] to this child node [{}]", cur_parent->node_id(), child_node->to_string(), + pre_child_node->to_string(), child_node->next_bnode()); + pre_child_node->set_next_bnode(child_node->next_bnode()); + // repairing the next of previous child node + // We need to set the state of the previous child node to clean, so that it can be flushed + IndexBtreeNode* idx_node = static_cast< IndexBtreeNode* >(pre_child_node.get()); + idx_node->m_idx_buf->set_state(index_buf_state_t::CLEAN); + write_node_impl(pre_child_node, cp_ctx); + // update the key of last entry of the parent with the last key of deleted child + child_last_key = orig_child_infos[child_node->node_id()]; + LOGTRACEMOD(wbcache, "updating parent [{}] current last key with {}", cur_parent->to_string(), + child_last_key.to_string()); + // update it here to go to the next child node and unlock this node + LOGTRACEMOD(wbcache, "update the child node next to the next of previous child node"); + child_node->set_next_bnode(child_node->next_bnode()); + } + } - // Move to the next child node - auto const next_node_id = child_node->next_bnode(); - this->unlock_node(child_node, locktype_t::READ); - if (next_node_id == empty_bnodeid) { - // This can be a deleted edge node - only check if it is still valid + LOGTRACEMOD(wbcache, "Repairing node={}, repaired so_far=[{}]", cur_parent->node_id(), + cur_parent->to_string()); + + // Move to the next child node + auto const next_node_id = child_node->next_bnode(); + this->unlock_node(child_node, locktype_t::READ); 
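// The deleted-child branch above reduces to this simplified sketch (buffer state handling elided):
//
//   pre_child_node->set_next_bnode(child_node->next_bnode());  // unlink the dead child
//   child_last_key = orig_child_infos[child_node->node_id()];  // reuse its old separator key
//
// so the live predecessor inherits the deleted child's position in the sibling chain, while the
// parent keeps a separator that still covers the deleted child's key range.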
if (!child_node->is_node_deleted()) {
-                BT_LOG_ASSERT(false,
-                              "Child node={} next_node_id is empty, while its not a edge node, parent_node={} "
-                              "repair is partial",
-                              child_node->node_id(), parent_node->node_id());
-                ret = btree_status_t::not_found;
+                // We need to free the child node
+                pre_child_node = child_node;
+            }
+            if (next_node_id == empty_bnodeid) {
+                // This can be a deleted edge node - only check if it is still valid
+                if (!child_node->is_node_deleted()) {
+                    BT_LOG_ASSERT(false,
+                                  "Child node={} next_node_id is empty, while it's not an edge node, parent_node={} repair is partial",
+                                  child_node->node_id(), parent_node->node_id());
+                    ret = btree_status_t::not_found;
+                }
+                child_node = nullptr;
+                break;
+            }
+            if (next_cur_child && next_node_id == next_cur_child->node_id()) {
+                // We are at the last child node, we can stop now
+                LOGTRACEMOD(
+                    wbcache,
+                    "REACH Repairing node={}, child_node=[{}] is the true child of sibling parent; Hence, end loop",
+                    child_node->node_id(), next_cur_child->to_string());
+                child_node = nullptr;
+                break;
+            }
+            ret = this->read_and_lock_node(next_node_id, child_node, locktype_t::READ, locktype_t::READ, cp_ctx);
+            if (ret != btree_status_t::success) {
+                BT_LOG_ASSERT(false, "Parent node={} repair is partial, because reading the child_node has failed with ret={}",
+                              parent_node->node_id(), enum_name(ret));
+                child_node = nullptr;
+                break;
+            }
+
+        } while (true);
+
+        if (child_node) { this->unlock_node(child_node, locktype_t::READ); }
+        // If the last parent has a key less than the last child key, then we need to update the parent node with
+        // the last child key, provided it doesn't have an edge.
+        auto last_parent = parent_node;
+        if (new_parent_nodes.size() > 0) { last_parent = new_parent_nodes[new_parent_nodes.size() - 1]; }
+        if (last_parent->total_entries() && !last_parent->has_valid_edge()) {
+            if (last_parent->compare_nth_key(last_parent_key, last_parent->total_entries() - 1) < 0) {
+                BtreeLinkInfo child_info;
+                last_parent->get_nth_value(last_parent->total_entries() - 1, &child_info, false /* copy */);
+                parent_node->update(parent_node->total_entries() - 1, last_parent_key, child_info);
+                LOGTRACEMOD(wbcache, "Repairing parent node={} with last_parent_key={} and child_info={}",
+                            parent_node->node_id(), last_parent_key.to_string(), child_info.to_string());
+            }
+            // If the last key of the children is less than the last key of the parent, then we need to update the
+            // last key of the non-interior child
+            if (last_parent->level() > 1 && !last_parent->has_valid_edge()) {
+                // read last child
+                BtreeNodePtr last_child;
+                BtreeLinkInfo child_info;
+                auto total_entries = last_parent->total_entries();
+                last_parent->get_nth_value(total_entries - 1, &child_info, false /* copy */);
+                if (ret = read_node_impl(child_info.bnode_id(), last_child); ret == btree_status_t::success) {
+                    // get last key of cur child
+                    auto last_child_key = last_child->get_last_key< K >();
+                    BtreeLinkInfo last_child_info;
+                    last_child->get_nth_value(last_child->total_entries() - 1, &last_child_info, false /* copy*/);
+                    if (last_parent->compare_nth_key(last_child_key, total_entries - 1) > 0) {
+                        auto cur_child_st = last_child->to_string();
+                        last_child->update(last_child->total_entries() - 1, last_parent_key, last_child_info);
+                        LOGTRACEMOD(wbcache,
+                                    "Updating interior child node={} with last_parent_key={} and child_info={}",
+                                    cur_child_st, last_parent_key.to_string(), last_child_info.to_string());
+                        write_node_impl(last_child, cp_ctx);
+                    }
+                }
            }
-            child_node = nullptr;
-            break;
        }
-        ret = this->read_and_lock_node(next_node_id, child_node, locktype_t::READ, locktype_t::READ, cp_ctx);
-        if (ret != btree_status_t::success) {
-            BT_LOG_ASSERT(false, "Parent node={} repair is partial, because child_node get has failed with ret={}",
-                          parent_node->node_id(), enum_name(ret));
-            child_node = nullptr;
-            break;
+        if (ret == btree_status_t::success) {
+            // Make write_buf happy for the parent node in case of multiple writes (stale repair and link repair)
+            IndexBtreeNode* p_node = static_cast< IndexBtreeNode* >(parent_node.get());
+            p_node->m_idx_buf->set_state(index_buf_state_t::CLEAN);
+            ret = transact_nodes(new_parent_nodes, {}, parent_node, nullptr, cp_ctx);
         }
-    } while (true);

-    if (child_node) { this->unlock_node(child_node, locktype_t::READ); }
+        if (ret != btree_status_t::success) {
+            BT_LOG(ERROR, "An error occurred status={} during repair of parent_node={}, aborting the repair",
+                   enum_name(ret), parent_node->node_id());
+            std::memcpy(parent_node->m_phys_node_buf, tmp_buffer, this->m_bt_cfg.node_size());
+        }

-    if (parent_node->total_entries() == 0 && !parent_node->has_valid_edge()) {
-        // We shouldn't have an empty interior node in the tree, let's delete it.
-        // The buf will be released by the caller
-        BT_LOG(INFO, "Parent node={} is empty, deleting it", parent_node->node_id());
-        parent_node->set_node_deleted();
+        delete[] tmp_buffer;
+        return ret;
     }

-    if (ret == btree_status_t::success) {
-        ret = transact_nodes(new_parent_nodes, {}, parent_node, nullptr, cp_ctx);
+    bnodeid_t find_true_sibling(BtreeNodePtr const& node) {
+        if (node == nullptr) return empty_bnodeid;
+        bnodeid_t sibling_id = empty_bnodeid;
+        if (node->has_valid_edge()) {
+            sibling_id = node->get_edge_value().bnode_id();
+        } else {
+            sibling_id = node->next_bnode();
         }
+        if (sibling_id == empty_bnodeid) {
+            return empty_bnodeid;
+        } else {
+            BtreeNodePtr sibling_node;
+            if (read_node_impl(sibling_id, sibling_node) != btree_status_t::success) { return empty_bnodeid; }

-    if (ret != btree_status_t::success) {
-        BT_LOG(ERROR, "An error occurred status={} during repair of parent_node={}, aborting the repair",
-               enum_name(ret), parent_node->node_id());
-        std::memcpy(parent_node->m_phys_node_buf, tmp_buffer, this->m_bt_cfg.node_size());
+            if (sibling_node->is_node_deleted()) {
+                LOGTRACEMOD(wbcache, "Sibling node [{}] is not the sibling for parent_node {}",
+                            sibling_node->to_string(), node->to_string());
+                return find_true_sibling(sibling_node);
+            } else {
+                return sibling_id;
+            }
         }
+        return sibling_id;
+    }

-    delete[] tmp_buffer;
-    return ret;
+    K get_last_true_child_key(BtreeNodePtr const& parent_node) {
+        uint32_t nentries = parent_node->total_entries() + (parent_node->has_valid_edge() ? 1 : 0);
+        BtreeLinkInfo cur_child_info;
+        BtreeNodePtr cur_child;
+        for (uint32_t i = nentries; i-- > 0;) {
+            parent_node->get_nth_value(i, &cur_child_info, false /* copy */);
+            if (auto ret = read_node_impl(cur_child_info.bnode_id(), cur_child); ret == btree_status_t::success) {
+                if (!cur_child->is_node_deleted()) {
+                    if (cur_child->total_entries()) {
+                        return cur_child->get_last_key< K >();
+                    } else {
+                        LOGTRACEMOD(wbcache, "Last valid child {} has no entries", cur_child->to_string());
+                    }
+                }
+            }
+        }
+        return K{}; // Fallback when no valid child key is found, to avoid falling off the end of the function
     }
 };
 } // namespace homestore
diff --git a/src/include/homestore/index_service.hpp b/src/include/homestore/index_service.hpp
index 801cace13..e14f6c18f 100644
--- a/src/include/homestore/index_service.hpp
+++ b/src/include/homestore/index_service.hpp
@@ -89,6 +89,7 @@ class IndexService {
     // the following methods are used wb_cache , which will not used by upper layer. so graceful shutdown just skips
     // them for now.
     void repair_index_node(uint32_t ordinal, IndexBufferPtr const& node_buf);
+    void parent_recover(uint32_t ordinal, IndexBufferPtr const& node_buf);
     void update_root(uint32_t ordinal, IndexBufferPtr const& node_buf);

     IndexWBCacheBase& wb_cache() {
diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs
index a661da497..cf400ee73 100644
--- a/src/lib/common/homestore_config.fbs
+++ b/src/lib/common/homestore_config.fbs
@@ -57,6 +57,12 @@ table Btree {
     max_nodes_to_rebalance: uint32 = 3;

     mem_btree_page_size: uint32 = 8192;
+
+    /* Maximum level of btree merge operation enabled while removing keys. */
+    max_merge_level: uint8 = 1;
+
+    /* Merge enabled */
+    merge_turned_on: bool = true;
 }

 table Cache {
diff --git a/src/lib/index/index_cp.cpp b/src/lib/index/index_cp.cpp
index 122667726..fd411526a 100644
--- a/src/lib/index/index_cp.cpp
+++ b/src/lib/index/index_cp.cpp
@@ -58,6 +58,7 @@ void IndexCPContext::add_to_txn_journal(uint32_t index_ordinal, const IndexBuffe
             rec->append(op_t::child_new, buf->blkid());
         }
         for (auto const& buf : freed_bufs) {
+            rec->free_node_level = buf->m_node_level;
             rec->append(op_t::child_freed, buf->blkid());
         }
     }
@@ -235,7 +236,78 @@ std::map< BlkId, IndexBufferPtr > IndexCPContext::recover(sisl::byte_view sb) {
         cur_ptr += rec->size();
         LOGTRACEMOD(wbcache, "Recovered txn record: {}: {}", t, rec->to_string());
     }
+    auto modifyBuffer = [](IndexBufferPtr& buffer) {
+        IndexBufferPtr up_buf = buffer->m_up_buffer;
+        auto real_up_buf = up_buf;
+        while (real_up_buf && real_up_buf->m_node_freed) {
+            real_up_buf = real_up_buf->m_up_buffer;
+        }
+        if (real_up_buf != up_buf) {
+            up_buf->remove_down_buffer(buffer);
+            buffer->m_up_buffer = real_up_buf;
+            real_up_buf->add_down_buffer(buffer);
+            LOGTRACEMOD(wbcache, "Change upbuffer from {} to {}", up_buf->to_string(),
+                        buffer->m_up_buffer->to_string());
+        }
+    };
+#if 0
+    auto dag_print = [](const std::map< BlkId, IndexBufferPtr >& dags, std::string delimiter) {
+        int index = 1;
+        for (const auto& [blkid, bufferPtr] : dags) {
+            LOGTRACEMOD(wbcache, "{}{} - blkid {} buffer {} ", delimiter, index++, blkid.to_integer(),
+                        bufferPtr->to_string());
+        }
+    };
+    LOGTRACEMOD(wbcache, "Before modify : \n ");
+    dag_print(buf_map, "Before: ");
+#endif
+    for (auto& [blkid, bufferPtr] : buf_map) {
+        modifyBuffer(bufferPtr);
+    }
+    // LOGTRACEMOD(wbcache, "\n\n\nAFTER modify : \n ");
+    // dag_print(buf_map, "After: ");

+    auto sanityCheck = [](const std::map< BlkId, IndexBufferPtr >& dags) {
+        for (const auto& [blkid, bufferPtr] : dags) {
+            auto up_buffer = bufferPtr->m_up_buffer;
+            if (up_buffer) {
+                HS_REL_ASSERT(
+                    !up_buffer->m_node_freed,
+                    "Sanity check failed: Buffer {} blkid {} has an up_buffer {} blkid that is marked as freed.",
+                    bufferPtr->to_string(), blkid.to_integer(), up_buffer->to_string(),
+                    up_buffer->blkid().to_integer());
+                HS_REL_ASSERT_EQ(up_buffer->m_created_cp_id, -1,
+                                 "Sanity check failed: Buffer {} has an up_buffer {} that was just created",
+                                 bufferPtr->to_string(), up_buffer->to_string());
+                HS_REL_ASSERT_EQ(up_buffer->m_index_ordinal, bufferPtr->m_index_ordinal,
+                                 "Sanity check failed: Buffer {} has an up_buffer {} that has a different index_ordinal.",
+                                 bufferPtr->to_string(), up_buffer->to_string());
+                HS_REL_ASSERT(!bufferPtr->is_meta_buf(),
+                              "Sanity check failed: down buffer {} is a meta buffer of up buffer {}",
+                              bufferPtr->to_string(), up_buffer->to_string());
+                HS_REL_ASSERT(
+                    !up_buffer->m_wait_for_down_buffers.testz(),
+                    "Sanity check failed: Buffer {} has an up_buffer {} that has zero m_wait_for_down_buffers.",
+                    bufferPtr->to_string(), up_buffer->to_string());
+#ifdef _PRERELEASE
+                HS_DBG_ASSERT(up_buffer->is_in_down_buffers(bufferPtr),
+                              "Sanity check failed: up_buffer {} hasn't {} as a down_buffer.", up_buffer->to_string(),
+                              bufferPtr->to_string());
+#endif
+            }
+            HS_REL_ASSERT(!bufferPtr->m_node_freed || bufferPtr->m_wait_for_down_buffers.testz(),
+                          "Sanity check failed: Freed buffer {} has non-zero m_wait_for_down_buffers.",
+                          bufferPtr->to_string());
+#ifdef _PRERELEASE
+            HS_DBG_ASSERT(bufferPtr->m_wait_for_down_buffers.test_eq(bufferPtr->m_down_buffers.size()),
+                          "Sanity check failed: Buffer {} has a mismatch between down_buffers_count and "
+                          "m_wait_for_down_buffers.",
+                          bufferPtr->to_string());
+#endif
+        }
+    };
+    sanityCheck(buf_map);
     return buf_map;
 }

@@ -264,7 +336,14 @@ void IndexCPContext::process_txn_record(txn_record const* rec, std::map< BlkId,
     }

     if (up_buf) {
-        auto real_up_buf = (up_buf->m_created_cp_id == cpg->id()) ? up_buf->m_up_buffer : up_buf;
+        auto real_up_buf = up_buf;
+        if (up_buf->m_created_cp_id == cpg->id()) {
+            real_up_buf = up_buf->m_up_buffer;
+        } else if (up_buf->m_node_freed) {
+            real_up_buf = up_buf->m_up_buffer;
+            LOGTRACEMOD(wbcache, "\n\n change upbuffer from {} to {}\n\n", up_buf->to_string(),
+                        real_up_buf->to_string());
+        }

 #ifndef NDEBUG
         // if (!is_sibling_link || (buf->m_up_buffer == real_up_buf)) { return buf;}
@@ -299,6 +378,7 @@ void IndexCPContext::process_txn_record(txn_record const* rec, std::map< BlkId,
     for (uint8_t idx{0}; idx < rec->num_freed_ids; ++idx) {
         auto freed_buf = rec_to_buf(rec, false /* is_meta */, rec->blk_id(cur_idx++),
                                     inplace_child_buf ? inplace_child_buf : parent_buf);
+        freed_buf->m_node_level = rec->free_node_level;
         freed_buf->m_node_freed = true;
     }
 }
@@ -337,6 +417,9 @@ std::string IndexCPContext::txn_record::to_string() const {

     fmt::format_to(std::back_inserter(str), ", freed_ids=[");
     add_to_string(str, idx, num_freed_ids);
+    if (num_freed_ids) {
+        fmt::format_to(std::back_inserter(str), ", freed_node_level= {}", (uint8_t)(free_node_level));
+    };
     fmt::format_to(std::back_inserter(str), "{}", (is_parent_meta ? ", parent is meta" : ""));
     return str;
 }
diff --git a/src/lib/index/index_cp.hpp b/src/lib/index/index_cp.hpp
index d7bd124df..dffb3113c 100644
--- a/src/lib/index/index_cp.hpp
+++ b/src/lib/index/index_cp.hpp
@@ -37,7 +37,8 @@ struct IndexCPContext : public VDevCPContext {
         uint8_t has_inplace_parent : 1; // Do we have parent_id in the list of ids. It will be first
         uint8_t has_inplace_child : 1;  // Do we have child_id in the list of ids. It will be second
         uint8_t is_parent_meta : 1;     // Is the parent buffer a meta buffer
-        uint8_t reserved1 : 5;
+        uint8_t free_node_level : 4;    // Free/created node level
+        uint8_t reserved1 : 1;
         uint8_t num_new_ids;
         uint8_t num_freed_ids;
         uint8_t reserved{0};
@@ -48,6 +49,7 @@ struct IndexCPContext : public VDevCPContext {
             has_inplace_parent{0x0},
             has_inplace_child{0x0},
             is_parent_meta{0x0},
+            free_node_level{0x0},
             num_new_ids{0},
             num_freed_ids{0},
             index_ordinal{ordinal} {}
diff --git a/src/lib/index/index_service.cpp b/src/lib/index/index_service.cpp
index 8e8f47bef..76da72842 100644
--- a/src/lib/index/index_service.cpp
+++ b/src/lib/index/index_service.cpp
@@ -88,6 +88,9 @@ void IndexService::start() {
     std::unique_lock lg(m_index_map_mtx);
     for (const auto& [_, tbl] : m_index_map) {
         tbl->recovery_completed();
+#ifdef _PRERELEASE
+        tbl->audit_tree();
+#endif
     }
     // Force taking cp after recovery done. This makes sure that the index table is in consistent state and dirty buffer
     // after recovery can be added to dirty list for flushing in the new cp
@@ -161,6 +164,16 @@ void IndexService::repair_index_node(uint32_t ordinal, IndexBufferPtr const& nod
     }
 }

+void IndexService::parent_recover(uint32_t ordinal, IndexBufferPtr const& node_buf) {
+    auto tbl = get_index_table(node_buf->m_index_ordinal);
+    if (tbl) {
+        tbl->delete_stale_children(node_buf);
+    } else {
+        HS_DBG_ASSERT(false, "Index corresponding to ordinal={} has not been loaded yet, unexpected",
+                      node_buf->m_index_ordinal);
+    }
+}
+
 void IndexService::update_root(uint32_t ordinal, IndexBufferPtr const& node_buf) {
     auto tbl = get_index_table(ordinal);
     if (tbl) {
@@ -264,7 +277,7 @@ void IndexBuffer::remove_down_buffer(const IndexBufferPtr& buf) {
             }
         }
     }
-    HS_DBG_ASSERT(found, "Down buffer is linked to up_buf, but up_buf doesn't have down_buf in its list");
+    HS_DBG_ASSERT(found, "Down buffer {} is linked to up_buf, but up_buf {} doesn't have down_buf in its list",
+                  buf->to_string(), buf->m_up_buffer ? buf->m_up_buffer->to_string() : std::string("nullptr"));
 #endif
 }

diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp
index 749b530d9..66abd4b37 100644
--- a/src/lib/index/wb_cache.cpp
+++ b/src/lib/index/wb_cache.cpp
@@ -106,6 +106,7 @@ BtreeNodePtr IndexWBCache::alloc_buf(node_initializer_t&& node_initializer) {
     idx_buf->m_created_cp_id = cpg->id();
     idx_buf->m_dirtied_cp_id = cpg->id();
     auto node = node_initializer(idx_buf);
+    idx_buf->m_node_level = node->level();

     if (!m_in_recovery) {
         // Add the node to the cache. Skip if we are in recovery mode.
@@ -127,6 +128,7 @@ void IndexWBCache::write_buf(const BtreeNodePtr& node, const IndexBufferPtr& buf
             auto const& sb = r_cast< MetaIndexBuffer* >(buf.get())->m_sb;
             meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk());
         } else {
+            LOGTRACEMOD(wbcache, "write buf [{}] in recovery mode", buf->to_string());
             m_vdev->sync_write(r_cast< const char* >(buf->raw_buffer()), m_node_size, buf->m_blkid);
         }
     } else {
@@ -141,7 +143,7 @@ void IndexWBCache::read_buf(bnodeid_t id, BtreeNodePtr& node, node_initializer_t
     auto const blkid = BlkId{id};

 retry:
-    // Check if the blkid is already in cache, if not load and put it into the cache
+    // Check if the blkid is already in the cache; if not, load it and put it into the cache
     if (!m_in_recovery && m_cache.get(blkid, node)) { return; }

     // Read the buffer from virtual device
@@ -179,6 +181,7 @@ bool IndexWBCache::get_writable_buf(const BtreeNodePtr& node, CPContext* context
         // If its not clean, we do deep copy.
auto new_buf = std::make_shared< IndexBuffer >(idx_buf->m_blkid, m_node_size, m_vdev->align_size()); new_buf->m_created_cp_id = idx_buf->m_created_cp_id; + new_buf->m_node_level = idx_buf->m_node_level; std::memcpy(new_buf->raw_buffer(), idx_buf->raw_buffer(), m_node_size); node->update_phys_buf(new_buf->raw_buffer()); @@ -299,11 +302,11 @@ void IndexWBCache::transact_bufs(uint32_t index_ordinal, IndexBufferPtr const& p if (new_node_bufs.empty() && freed_node_bufs.empty()) { // This is an update for meta, root transaction. - if (child_buf->m_created_cp_id != -1) { - DEBUG_ASSERT_EQ(child_buf->m_created_cp_id, icp_ctx->id(), - "Root buffer is not created by current cp (for split root), its not expected"); + if (child_buf->m_created_cp_id < icp_ctx->id()) { + icp_ctx->add_to_txn_journal(index_ordinal, parent_buf, child_buf, {}, {}); + } else { + icp_ctx->add_to_txn_journal(index_ordinal, parent_buf, nullptr, {child_buf}, {}); } - icp_ctx->add_to_txn_journal(index_ordinal, parent_buf, nullptr, {child_buf}, {}); } else { icp_ctx->add_to_txn_journal(index_ordinal, child_buf->m_up_buffer /* real up buffer */, child_buf, new_node_bufs, freed_node_bufs); @@ -312,15 +315,14 @@ void IndexWBCache::transact_bufs(uint32_t index_ordinal, IndexBufferPtr const& p // log new nodes and freed nodes and parent and child static uint32_t txn_id = 0; static int last_cp_id = -2; - static std::string txn = ""; + std::string txn = ""; if (last_cp_id != icp_ctx->id()) { last_cp_id = icp_ctx->id(); txn_id = 0; - txn = ""; } if (new_node_bufs.empty() && freed_node_bufs.empty()) { - fmt::format_to(std::back_inserter(txn), "\n{} - parent=[{}] child=[{}] new=[{}] freed=[{}]", txn_id, + fmt::format_to(std::back_inserter(txn), "{} - parent=[{}] child=[{}] new=[{}] freed=[{}]", txn_id, (parent_buf && parent_buf->blkid().to_integer() != 0) ? std::to_string(parent_buf->blkid().to_integer()) : "empty", @@ -341,10 +343,10 @@ void IndexWBCache::transact_bufs(uint32_t index_ordinal, IndexBufferPtr const& p ? 
std::to_string(child_buf->blkid().to_integer())
                                   : "empty";
-        fmt::format_to(std::back_inserter(txn), "\n{} - parent={} child={} new=[{}] freed=[{}]", txn_id, parent_str,
+        fmt::format_to(std::back_inserter(txn), ": {} - parent={} child={} new=[{}] freed=[{}]", txn_id, parent_str,
                        child_str, new_nodes, freed_nodes);
     }
-    LOGTRACEMOD(wbcache, "\ttranasction till now: cp: {} \n{}\n", icp_ctx->id(), txn);
+    LOGTRACEMOD(wbcache, "transaction till now: cp: {} {}", icp_ctx->id(), txn);
     txn_id++;
 #endif
 #if 0
@@ -448,15 +450,7 @@ void IndexWBCache::load_buf(IndexBufferPtr const& buf) {
     }
 }

-struct DagNode {
-    IndexBufferPtr buffer;
-    std::vector< shared< DagNode > > children;
-};
-
-using DagPtr = std::shared_ptr< DagNode >;
-using DagMap = std::map< IndexBufferPtr, DagPtr >;
-
-static DagMap generate_dag_buffers(std::map< BlkId, IndexBufferPtr >& bufmap) {
+IndexWBCache::DagMap IndexWBCache::generate_dag_buffers(std::map< BlkId, IndexBufferPtr >& bufmap) {
     std::vector< IndexBufferPtr > bufs;
     std::ranges::transform(bufmap, std::back_inserter(bufs), [](const auto& pair) { return pair.second; });

@@ -498,7 +492,7 @@ static DagMap generate_dag_buffers(std::map< BlkId, IndexBufferPtr >& bufmap) {
     return generateDagMap(bufs);
 }

-static std::string to_string_dag_bufs(DagMap& dags, cp_id_t cp_id = 0) {
+std::string IndexWBCache::to_string_dag_bufs(DagMap& dags, cp_id_t cp_id) {
     std::string str{fmt::format("#_of_dags={}\n", dags.size())};
     int cnt = 1;
     for (const auto& [_, dag] : dags) {
@@ -509,6 +503,7 @@ static std::string to_string_dag_bufs(DagMap& dags, cp_id_t cp_id = 0) {
             stack.pop_back();
             auto snew = node->buffer->m_created_cp_id == cp_id ? "NEW" : "";
             auto sfree = node->buffer->m_node_freed ? "FREED" : "";
+            load_buf(node->buffer);
             fmt::format_to(std::back_inserter(str), "{}{}-{} {} {}\n", std::string(level * 4, ' '), index,
                            node->buffer->to_string(), snew, sfree);
             int c = node->children.size();
@@ -557,10 +552,8 @@ void IndexWBCache::recover(sisl::byte_view sb) {
         return log;
     };

-    std::string log = fmt::format("Recovering bufs (#of bufs = {}) before processing them\n", bufs.size());
-    LOGTRACEMOD(wbcache, "{}\n{}", log, detailed_log(bufs, {}));
     auto dags = generate_dag_buffers(bufs);
-    LOGTRACEMOD(wbcache, "Before recovery: {}", to_string_dag_bufs(dags, icp_ctx->id()));
+    LOGTRACEMOD(wbcache, "before processing recovery DAGS:\n {}\n\n\n\n", to_string_dag_bufs(dags, icp_ctx->id()));
 #endif

     // At this point, we have the DAG structure (up/down dependency graph), exactly the same as prior to crash, with one
@@ -576,27 +569,60 @@ void IndexWBCache::recover(sisl::byte_view sb) {
     // On the second pass, we only take part of the parents/siblings and then repair them, if needed.
     std::vector< IndexBufferPtr > pending_bufs;
     std::vector< IndexBufferPtr > deleted_bufs;
+    std::multiset< IndexBufferPtr, bool (*)(const IndexBufferPtr&, const IndexBufferPtr&) >
+        potential_parent_recovered_bufs(
+            [](const IndexBufferPtr& a, const IndexBufferPtr& b) { return a->m_node_level < b->m_node_level; });
+
+    LOGTRACEMOD(wbcache, "\n\n\nRecovery processing begins\n\n\n");
     for (auto const& [_, buf] : bufs) {
+        load_buf(buf);
+
         if (buf->m_node_freed) {
-            // Freed node
-            load_buf(buf);
+            LOGTRACEMOD(wbcache, "recovering free buf {}", buf->to_string());
             if (was_node_committed(buf)) {
                 // Mark this buffer as deleted, so that we can avoid using it anymore when repairing its parent's link
                 r_cast< persistent_hdr_t* >(buf->m_bytes)->node_deleted = true;
-                write_buf(nullptr, buf, icp_ctx);
+                write_buf(nullptr, buf, icp_ctx); // no need to write it here !!
                 deleted_bufs.push_back(buf);
                 pending_bufs.push_back(buf->m_up_buffer);
+                LOGINFOMOD(wbcache, "Freeing deleted buf {} and adding up buffer to pending {}", buf->to_string(),
+                           buf->m_up_buffer->to_string());
             } else {
                 // (Up) buffer is not committed, node need to be kept and (potentially) repaired later
-                buf->m_node_freed = false;
-                if (buf->m_created_cp_id == icp_ctx->id()) {
-                    // New nodes need to be commited first
+                if (buf->m_created_cp_id != icp_ctx->id()) {
+                    LOGTRACEMOD(wbcache,
+                                "NOT FREE committing buffer {} node deleted is false reason: node committed?={} "
+                                "up committed?={}",
+                                buf->to_string(), was_node_committed(buf), was_node_committed(buf->m_up_buffer));
+                    buf->m_node_freed = false;
+                    r_cast< persistent_hdr_t* >(buf->m_bytes)->node_deleted = false;
                     m_vdev->commit_blk(buf->m_blkid);
+                    // This can happen when the children have moved to one of the right parent's siblings and the
+                    // previous node was then deleted but not committed during the crash (its up buffer is not
+                    // committed), while its children were already committed and freed (or changed).
+                    if (buf->m_node_level) { potential_parent_recovered_bufs.insert(buf); }
+                } else {
+                    LOGINFO("deleting and creating new buf {}", buf->to_string());
+                    deleted_bufs.push_back(buf);
+                }
+                // 1- The up buffer was dirtied by the same cp, so it is not committed and we don't need to repair it;
+                //    remove this buf from its down-waiting list (possibly recursively going up).
+                // 2- The up buffer was created and freed in the same cp, so it is not committed and we don't need to
+                //    repair it.
+                if (buf->m_up_buffer) {
+                    LOGTRACEMOD(wbcache, "remove_down_buffer {} from up buffer {}", buf->to_string(),
+                                buf->m_up_buffer->to_string());
+                    buf->m_up_buffer->remove_down_buffer(buf);
+                    if (buf->m_up_buffer->m_wait_for_down_buffers.testz()) {
+                        // if the up buffer has an up buffer, then we need to decrement its wait_for_down_buffers
+                        LOGINFOMOD(wbcache,
+                                   "\n\npruning up_buffer due to zero dependency of child\n up buffer {}\n buffer {}",
+                                   buf->m_up_buffer ?
buf->m_up_buffer->to_string() : std::string("nullptr"),
+                                   buf->to_string());
+                        update_up_buffer_counters(buf->m_up_buffer /*,visited_bufs*/);
+                    }
+                    buf->m_up_buffer = nullptr;
                 }
-                pending_bufs.push_back(buf);
-                buf->m_wait_for_down_buffers.increment(1); // Purely for recover_buf() counter consistency
             }
         } else if (buf->m_created_cp_id == icp_ctx->id()) {
+            LOGTRACEMOD(wbcache, "recovering new buf {}", buf->to_string());
             // New node
             if (was_node_committed(buf) && was_node_committed(buf->m_up_buffer)) {
                 // Both current and up buffer is commited, we can safely commit the current block
@@ -605,31 +631,63 @@ void IndexWBCache::recover(sisl::byte_view sb) {
             } else {
                 // Up buffer is not committed, we need to repair it first
                 buf->m_up_buffer->remove_down_buffer(buf);
-                // buf->m_up_buffer = nullptr;
                 if (buf->m_up_buffer->m_wait_for_down_buffers.testz()) {
                     // if the up buffer has an up buffer, then we need to decrement its wait_for_down_buffers
+                    LOGINFOMOD(wbcache, "\npruning due to zero dependency of child\n up buffer {} \n buffer \n{}",
+                               buf->m_up_buffer ? buf->m_up_buffer->to_string() : std::string("nullptr"),
+                               buf->to_string());
                     update_up_buffer_counters(buf->m_up_buffer);
                 }
+//                buf->m_up_buffer = nullptr;
             }
         }
     }
-
+    LOGTRACEMOD(wbcache, "\n\n\nRecovery processing ends\n\n\n");
 #ifdef _PRERELEASE
     LOGINFOMOD(wbcache, "Index Recovery detected {} nodes out of {} as new/freed nodes to be recovered in prev cp={}",
                pending_bufs.size(), bufs.size(), icp_ctx->id());
-    LOGTRACEMOD(wbcache, "All unclean bufs list\n{}", detailed_log(bufs, pending_bufs));
-    LOGTRACEMOD(wbcache, "After recovery: {}", to_string_dag_bufs(dags, icp_ctx->id()));
-#endif
+    // add deleted bufs to logs here as well
+    auto modified_dags = generate_dag_buffers(bufs);
+    LOGTRACEMOD(wbcache, "All unclean bufs list\n{}", detailed_log({}, pending_bufs));
+    LOGTRACEMOD(wbcache, "After recovery: {}", to_string_dag_bufs(modified_dags, icp_ctx->id()));

-    for (auto const& buf : pending_bufs) {
-        recover_buf(buf);
-        if (buf->m_bytes != nullptr && r_cast< persistent_hdr_t* >(buf->m_bytes)->node_deleted) {
+#endif
+    uint32_t cnt = 0;
+    LOGTRACEMOD(wbcache, "Potential parent recovered bufs (#of bufs = {})",
+                potential_parent_recovered_bufs.size());
+    for (auto const& buf : potential_parent_recovered_bufs) {
+        LOGTRACEMOD(wbcache, " {} - check stale recovered buf {}", cnt++, buf->to_string());
+    }
+    // This step is needed since there is a case where all (or some) children of an interior node are freed (after
+    // moving to a previous sibling parent) and, after the crash, this node has stale links to its children
+    cnt = 0;
+    std::vector< IndexBufferPtr > buffers_to_repair;
+    for (auto const& buf : potential_parent_recovered_bufs) {
+        LOGTRACEMOD(wbcache, " {} - potential parent recovered buf {}", cnt, buf->to_string());
+        parent_recover(buf);
+        if (buf->m_bytes == nullptr || r_cast< persistent_hdr_t* >(buf->m_bytes)->node_deleted) {
             // This buffer was marked as deleted during repair, so we also need to free it
             deleted_bufs.push_back(buf);
+        } else {
+            // This buffer was not marked as deleted during repair, so we need to repair it
+            buffers_to_repair.push_back(buf);
         }
     }

+    // Let all unfreed buffers be repaired first. This is important so that all stale links are detected and removed
+    // before the actual repair (which depends on finding the true siblings).
+    for (auto const& buf : buffers_to_repair) {
+        LOGTRACEMOD(wbcache, "recover and repairing unfreed non-stale link interior node buf {}", buf->to_string());
+        index_service().repair_index_node(buf->m_index_ordinal, buf);
+    }
+    // actual recover is done here in recovery path
+    for (auto const& buf : pending_bufs) {
+        LOGTRACEMOD(wbcache, "recover and repairing up_buffer buf {}", buf->to_string());
+        recover_buf(buf);
+    }
     for (auto const& buf : deleted_bufs) {
+        LOGTRACEMOD(wbcache, "freeing buf after repairing (last step) {}", buf->to_string());
         m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(icp_ctx));
     }

@@ -637,6 +695,9 @@ void IndexWBCache::recover(sisl::byte_view sb) {
     m_vdev->recovery_completed();
 }

+void IndexWBCache::parent_recover(IndexBufferPtr const& buf) {
+    index_service().parent_recover(buf->m_index_ordinal, buf);
+}
 // If buf->m_wait_for_down_buffers.testz() is true (meaning it has no dependency on any other buffer), we can
 // decrement the wait_for_down_buffers of its up buffer. If that up buffer in turn drops to zero, we keep going up
 // and decrement its up buffer's counter, recursively.
 void IndexWBCache::update_up_buffer_counters(IndexBufferPtr const& buf) {
     if (buf == nullptr || !buf->m_wait_for_down_buffers.testz() || buf->m_up_buffer == nullptr) {
-        LOGINFOMOD(wbcache, "Finish decrementing wait_for_down_buffers");
+        LOGINFOMOD(wbcache, "Finish decrementing wait_for_down_buffers\n");
         return;
     }
     auto grand_buf = buf->m_up_buffer;
-    grand_buf->remove_down_buffer(buf);
     LOGINFOMOD(wbcache,
-               "Decrementing wait_for_down_buffers for buffer {} due to zero dependency of child {}, Keep going up",
+               "Decrementing wait_for_down_buffers due to zero dependency of child for grand_buffer {} up_buffer {}, "
+               "Keep going up",
                grand_buf->to_string(), buf->to_string());
+    grand_buf->remove_down_buffer(buf);
+    buf->m_up_buffer = nullptr;
     update_up_buffer_counters(grand_buf);
 }

@@ -686,7 +749,7 @@ bool IndexWBCache::was_node_committed(IndexBufferPtr const& buf) {

     // If the node is freed, then it can be considered committed as long as its up buffer was committed
     if (buf->m_node_freed) {
-        HS_DBG_ASSERT(buf->m_up_buffer, "Buf was marked deleted, but doesn't have an up_buffer");
+        HS_DBG_ASSERT(buf->m_up_buffer, "Buf {} was marked deleted, but doesn't have an up_buffer", buf->to_string());
         return was_node_committed(buf->m_up_buffer);
     }

@@ -698,8 +761,7 @@ bool IndexWBCache::was_node_committed(IndexBufferPtr const& buf) {
 //////////////////// CP Related API section /////////////////////////////////
 folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) {
-    LOGTRACEMOD(wbcache, "Starting Index CP Flush with cp \ndag={}\n\n cp context {}", cp_ctx->to_string_with_dags(),
-                cp_ctx->to_string());
+    LOGTRACEMOD(wbcache, "Starting Index CP Flush with cp \ndag={}", cp_ctx->to_string_with_dags());
     // #ifdef _PRERELEASE
     //     static int id = 0;
     //     auto filename = "cp_" + std::to_string(id++) + "_" + std::to_string(rand() % 100) + ".dot";
@@ -754,8 +816,8 @@ folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) {
 }

 void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx,
                                    IndexBufferPtr const& buf, bool part_of_batch) {
-#ifdef _PRERELEASE
     static std::once_flag flag;
+#ifdef _PRERELEASE
     if (hs()->crash_simulator().is_crashed()) {
         std::call_once(flag, []() { LOGINFO("Crash simulation is ongoing; aid simulation by not flushing."); });
         return;
     }
@@ -779,19 +841,20 @@ void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const
         if (!sb.is_empty()) { meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk()); }
         process_write_completion(cp_ctx, buf);
     } else if (buf->m_node_freed) {
-        LOGTRACEMOD(wbcache, "Not flushing buf {} as it was freed, its here for merely dependency", cp_ctx->id(),
+        LOGTRACEMOD(wbcache, "cp {} Not flushing buf {} as it was freed; it's here merely for dependency", cp_ctx->id(),
                     buf->to_string());
         process_write_completion(cp_ctx, buf);
     } else {
-        LOGTRACEMOD(wbcache, "Flushing cp {} buf {}", cp_ctx->id(), buf->to_string());
         m_vdev->async_write(r_cast< const char* >(buf->raw_buffer()), m_node_size, buf->m_blkid, part_of_batch)
             .thenValue([buf, cp_ctx](auto) {
                 try {
                     auto& pthis = s_cast< IndexWBCache& >(wb_cache());
                     pthis.process_write_completion(cp_ctx, buf);
-                } catch (const std::runtime_error& e) { LOGERROR("Failed to access write-back cache: {}", e.what()); }
+                } catch (const std::runtime_error& e) {
+                    std::call_once(flag,
+                                   []() { LOGERROR("Crash simulation is ongoing; aid simulation by not flushing."); });
+                }
             });
-        if (!part_of_batch) { m_vdev->submit_batch(); }
     }
 }
diff --git a/src/lib/index/wb_cache.hpp b/src/lib/index/wb_cache.hpp
index 7d10d7f54..82221ab0d 100644
--- a/src/lib/index/wb_cache.hpp
+++ b/src/lib/index/wb_cache.hpp
@@ -61,6 +61,13 @@ class IndexWBCache : public IndexWBCacheBase {
     folly::Future< bool > async_cp_flush(IndexCPContext* context);
     IndexBufferPtr copy_buffer(const IndexBufferPtr& cur_buf, const CPContext* cp_ctx) const;
     void recover(sisl::byte_view sb) override;
+    struct DagNode {
+        IndexBufferPtr buffer;
+        std::vector< shared< DagNode > > children;
+    };
+
+    using DagPtr = std::shared_ptr< DagNode >;
+    using DagMap = std::map< IndexBufferPtr, DagPtr >;

 private:
     void start_flush_threads();
@@ -77,6 +84,9 @@ class IndexWBCache : public IndexWBCacheBase {
                      IndexBufferPtrList& bufs);
     void recover_buf(IndexBufferPtr const& buf);
+    void parent_recover(IndexBufferPtr const& buf);
+    std::string to_string_dag_bufs(DagMap& dags, cp_id_t cp_id = 0);
+    DagMap generate_dag_buffers(std::map< BlkId, IndexBufferPtr >& bufmap);
     bool was_node_committed(IndexBufferPtr const& buf);
     void load_buf(IndexBufferPtr const& buf);
     void update_up_buffer_counters(IndexBufferPtr const& buf);
diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt
index cba159954..f255ea81b 100644
--- a/src/tests/CMakeLists.txt
+++ b/src/tests/CMakeLists.txt
@@ -34,7 +34,7 @@ if (${build_nonio_tests})
     set(TEST_MEMBTREE_SOURCE_FILES test_mem_btree.cpp)
     add_executable(test_mem_btree ${TEST_MEMBTREE_SOURCE_FILES})
-    target_link_libraries(test_mem_btree ${COMMON_TEST_DEPS} GTest::gtest)
+    target_link_libraries(test_mem_btree homestore ${COMMON_TEST_DEPS} GTest::gtest)

     add_test(NAME MemBtree COMMAND test_mem_btree)
     set_tests_properties(MemBtree PROPERTIES TIMEOUT 1200)
diff --git a/src/tests/btree_helpers/btree_test_helper.hpp b/src/tests/btree_helpers/btree_test_helper.hpp
index 3ab8632e6..227fd0ee0 100644
--- a/src/tests/btree_helpers/btree_test_helper.hpp
+++ b/src/tests/btree_helpers/btree_test_helper.hpp
@@ -25,7 +25,7 @@
 #include
 #include
 #include
-
+#include "common/homestore_config.hpp"
 #include 
"test_common/range_scheduler.hpp" #include "shadow_map.hpp" @@ -44,8 +44,17 @@ struct BtreeTestHelper { void SetUp() { m_cfg.m_leaf_node_type = T::leaf_node_type; m_cfg.m_int_node_type = T::interior_node_type; + if (SISL_OPTIONS.count("disable_merge")) { + HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { + s.btree.merge_turned_on = false; + HS_SETTINGS_FACTORY().save(); + }); + } + HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { + s.btree.max_merge_level = SISL_OPTIONS["max_merge_level"].as< uint8_t >(); + HS_SETTINGS_FACTORY().save(); + }); m_max_range_input = SISL_OPTIONS["num_entries"].as< uint32_t >(); - if (SISL_OPTIONS.count("disable_merge")) { m_cfg.m_merge_turned_on = false; } if (m_is_multi_threaded) { std::mutex mtx; @@ -225,16 +234,14 @@ struct BtreeTestHelper { rreq.enable_route_tracing(); bool removed = (m_bt->remove(rreq) == btree_status_t::success); - if(care_success) { + if (care_success) { ASSERT_EQ(removed, m_shadow_map.exists(*pk)) << "Removal of key " << pk->key() << " status doesn't match with shadow"; if (removed) { m_shadow_map.remove_and_check(*pk, *existing_v); } - }else { + } else { // Do not care if the key is not present in the btree, just cleanup the shadow map m_shadow_map.erase(*pk); } - - } void remove_random() { diff --git a/src/tests/test_index_btree.cpp b/src/tests/test_index_btree.cpp index 6083140bf..c62ce2f84 100644 --- a/src/tests/test_index_btree.cpp +++ b/src/tests/test_index_btree.cpp @@ -49,6 +49,8 @@ SISL_OPTION_GROUP( (init_device, "", "init_device", "init device", ::cxxopts::value< bool >()->default_value("1"), ""), (cleanup_after_shutdown, "", "cleanup_after_shutdown", "cleanup after shutdown", ::cxxopts::value< bool >()->default_value("1"), ""), + (max_merge_level, "", "max_merge_level", "max merge level", ::cxxopts::value< uint8_t >()->default_value("127"), + ""), (seed, "", "seed", "random engine seed, use random if not defined", ::cxxopts::value< uint64_t >()->default_value("0"), "number")) diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp index e13c886c9..9121f7240 100644 --- a/src/tests/test_index_crash_recovery.cpp +++ b/src/tests/test_index_crash_recovery.cpp @@ -49,6 +49,8 @@ SISL_OPTION_GROUP( ""), (min_keys_in_node, "", "min_keys_in_node", "min_keys_in_node", ::cxxopts::value< uint32_t >()->default_value("6"), ""), + (max_merge_level, "", "max_merge_level", "max merge level", ::cxxopts::value< uint8_t >()->default_value("1"), ""), + (disable_merge, "", "disable_merge", "disable_merge", ::cxxopts::value< bool >()->default_value("0"), ""), (operation_list, "", "operation_list", "operation list instead of default created following by percentage", ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), (preload_size, "", "preload_size", "number of entries to preload tree with", @@ -329,6 +331,15 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT LOGINFO("Reset btree with uuid {} - erase shadow map {}", boost::uuids::to_string(uuid), m_shadow_filename); } + void destroy_btree() { + hs()->index_service().remove_index_table(this->m_bt); + this->m_bt->destroy(); + this->trigger_cp(true); + this->m_shadow_map.range_erase(0, SISL_OPTIONS["num_entries"].as< uint32_t >() - 1); + this->m_shadow_map.save(m_shadow_filename); + LOGINFO("destroy btree - erase shadow map {}", m_shadow_filename); + } + void restart_homestore(uint32_t shutdown_delay_sec = 3) override { this->params(HS_SERVICE::INDEX).index_svc_cbs = new 
TestIndexServiceCallbacks(this);
         LOGINFO("\n\n\n\n\n\n shutdown homestore for index service Test\n\n\n\n\n");
@@ -441,8 +452,9 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT
         LOGINFO("Sanity check passed for {} keys!", count);
     }

-    void crash_and_recover(OperationList& operations, std::string filename = "") {
-        // this->print_keys("Btree prior to CP and susbsequent simulated crash: ");
+    void crash_and_recover(std::string& flip, OperationList& operations, std::string filename = "") {
+        this->remove_flip(flip);
+        // this->print_keys("Btree prior to CP and subsequent simulated crash: ");
         LOGINFO("Before Crash: {} keys in shadow map and it is actually {} keys in tree - operations size {}",
                 this->m_shadow_map.size(), tree_key_count(), operations.size());

@@ -461,7 +473,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT
             LOGINFO("Visualize the tree file after recovery : {}", rec_filename);
             this->visualize_keys(rec_filename);
         }
-        // this->print_keys("Post crash and recovery, btree structure: ");
+        // this->print_keys("Post crash and recovery, btree structure: ");
         sanity_check(operations);
         // Added to the index service right after recovery. Not needed here
         // test_common::HSTestHelper::trigger_cp(true);
@@ -473,7 +485,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT
             LOGINFO("Visualize the tree after reapply {}", re_filename);
             this->visualize_keys(re_filename);
         }
-        // this->print_keys("Post reapply, btree structure: ");
+        // this->print_keys("Post reapply, btree structure: ");

         this->get_all();
         LOGINFO("After reapply: {} keys in shadow map and actually {} in tress", this->m_shadow_map.size(),
@@ -580,7 +592,7 @@ TYPED_TEST(IndexCrashTest, SplitCrash1) {
             // LOGINFO("\t\t\t\t\t\t\t\t\t\t\t\t\tupserting entry {}", k);
             this->put(k, btree_put_type::INSERT, true /* expect_success */);
         }
-        this->crash_and_recover(operations, fmt::format("recover_tree_crash_{}.dot", i + 1));
+        this->crash_and_recover(flips[i], operations, fmt::format("recover_tree_crash_{}.dot", i + 1));
         if (renew_btree_after_crash) { this->reset_btree(); };
     }
 }
@@ -709,8 +721,152 @@ TYPED_TEST(IndexCrashTest, long_running_put_crash) {
             }
         } else {
             // remove the flips so that they do not get triggered erroneously
-            this->remove_flip(flip);
-            this->crash_and_recover(operations, fmt::format("long_tree_{}", round));
+            this->crash_and_recover(flip, operations, fmt::format("long_tree_{}", round));
         }
         if (elapsed_time - last_progress_time > 30) {
             last_progress_time = elapsed_time;
             print_time = true;
         }
         if (print_time) {
             LOGINFO("\n\n\n\t\t\tProgress: {} rounds of total {} ({:.2f}%) completed - Elapsed time: {:.0f} seconds of "
                     "total {} ({:.2f}%) - {} keys of maximum {} keys ({:.2f}%) inserted\n\n\n",
                     round, rounds, round * 100.0 / rounds, elapsed_time, this->m_run_time,
                     elapsed_time * 100.0 / this->m_run_time, this->tree_key_count(), num_entries,
                     this->tree_key_count() * 100.0 / num_entries);
         }
         // this->print_keys(fmt::format("reapply: after round {}", round));
         if (renew_btree_after_crash) { this->reset_btree(); };
     }
+    this->destroy_btree();
+    log_obj_life_counter();
+}
+
+TYPED_TEST(IndexCrashTest, long_running_remove_crash) {
+
+    // Define the test knobs and helpers up front
+    auto const num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >();
+    auto const rounds = SISL_OPTIONS["num_rounds"].as< uint32_t >();
+    auto const num_entries_per_rounds = SISL_OPTIONS["num_entries_per_rounds"].as< uint32_t >();
+    bool load_mode = SISL_OPTIONS.count("load_from_file");
+    bool save_mode = SISL_OPTIONS.count("save_to_file");
+    SequenceGenerator generator(100 /*putFreq*/, 0 /* removeFreq*/, 0 /*start_range*/, num_entries - 1 /*end_range*/);
+    vector< std::string > flips = {"crash_flush_on_merge_at_parent", "crash_flush_on_merge_at_left_child"/*,
+                                   "crash_flush_on_freed_child"*/};
+
+    std::string flip = "";
+    OperationList operations;
+    auto m_start_time = Clock::now();
+    auto time_to_stop = [this, m_start_time]() { return (get_elapsed_time_sec(m_start_time) > this->m_run_time); };
+    double elapsed_time, progress_percent, last_progress_time = 0;
+    bool renew_btree_after_crash = false;
+    auto cur_flip_idx = 0;
+    std::uniform_int_distribution<> dis(1, 100);
+    int flip_percentage = 90; // Set the desired percentage here
+    bool normal_execution = true;
+    bool clean_shutdown = true;
+    // if it is safe, then delete all previous save files
+    if (save_mode) {
+        std::filesystem::remove_all("/tmp/operations_*.txt");
+        std::filesystem::remove_all("/tmp/flips_history.txt");
+    }
+    // init tree
+    LOGINFO("Step 0: Fill up the tree with {} entries", num_entries);
+    if (load_mode) {
+        operations = SequenceGenerator::load_from_file(fmt::format("/tmp/operations_0.txt"));
+    } else {
+        operations = generator.generateOperations(num_entries, true /* reset */);
+        if (save_mode) { SequenceGenerator::save_to_file(fmt::format("/tmp/operations_0.txt"), operations); }
+    }
+    // auto opstr = SequenceGenerator::printOperations(operations);
+    // LOGINFO("Lets before crash print operations\n{}", opstr);

+    for (auto [k, _] : operations) {
+        this->put(k, btree_put_type::INSERT, true /* expect_success */);
+    }
+    generator.setPutFrequency(0);
+    generator.setRemoveFrequency(100);
+
+    // Trigger the cp to make sure the preload is persisted
+    LOGINFO("Step 0-1: Flush all the entries so far");
+    test_common::HSTestHelper::trigger_cp(true);
+    this->get_all();
+    this->m_shadow_map.save(this->m_shadow_filename);
+    // this->print_keys("reapply: after preload");
+    this->visualize_keys("tree_after_preload.dot");
+
+    for (uint32_t round = 1; round <= rounds && !time_to_stop() && this->tree_key_count() >= num_entries_per_rounds;
+         round++) {
+        LOGINFO("\n\n\n\n\n\nRound {} of {}\n\n\n\n\n\n", round, rounds);
+        // this->print_keys(fmt::format("before round {}",round));
+        bool print_time = false;
+        elapsed_time = get_elapsed_time_sec(m_start_time);
+        if (load_mode) {
+            std::ifstream file("/tmp/flips_history.txt");
+            std::string line;
+            bool found = false;
+            for (uint32_t i = 0; i < round && std::getline(file, line); i++) {
+                if (i == round - 1) {
+                    found = true;
+                    break;
+                }
+            }
+            if (found && !line.empty()) {
+                if (line == "normal") {
+                    normal_execution = true;
+                } else {
+                    normal_execution = false;
+                    flip = line;
+                    LOGINFO("Step 1-{}: Set flag {}", round, flip);
+                    this->set_basic_flip(flip, 1, 100);
+                }
+            }
+            file.close();
+        } else {
+            if (dis(g_re) <= flip_percentage) {
+                flip = flips[cur_flip_idx++ % flips.size()];
+                LOGINFO("Step 1-{}: Set flag {}", round, flip);
+                this->set_basic_flip(flip, 1, 100);
+                normal_execution = false;
+            } else {
+                normal_execution = true;
+                LOGINFO("Step 1-{}: No flip set", round);
+            }
+            if (save_mode) {
+                // save the flip name to a file for later use
+                std::ofstream file("/tmp/flips_history.txt", std::ios::app);
+                if (file.is_open()) { file << (normal_execution ? 
"normal" : flip) << "\n"; } + file.close(); + } + } + if (load_mode) { + operations = SequenceGenerator::load_from_file(fmt::format("/tmp/operations_{}.txt", round)); + } else { + operations = generator.generateOperations(num_entries_per_rounds, renew_btree_after_crash /* reset */); + if (save_mode) { + SequenceGenerator::save_to_file(fmt::format("/tmp/operations_{}.txt", round), operations); + } + } + // LOGINFO("Lets before crash print operations\n{}", SequenceGenerator::printOperations(operations)); + for (auto [k, _] : operations) { + this->remove_one(k, true /* expect_success */); + if (!time_to_stop()) { + static bool print_alert = false; + if (print_alert) { + LOGINFO("It is time to stop but let's finish this round and then stop!"); + print_alert = false; + } + } + } + if (normal_execution) { + if (clean_shutdown) { + this->m_shadow_map.save(this->m_shadow_filename); + this->restart_homestore(); + } else { + test_common::HSTestHelper::trigger_cp(true); + this->get_all(); + } + } else { + this->crash_and_recover(flip, operations, fmt::format("long_tree_{}", round)); } if (elapsed_time - last_progress_time > 30) { last_progress_time = elapsed_time; @@ -726,13 +882,17 @@ TYPED_TEST(IndexCrashTest, long_running_put_crash) { // this->print_keys(fmt::format("reapply: after round {}", round)); if (renew_btree_after_crash) { this->reset_btree(); }; } + this->print_keys(fmt::format("tree at end")); + this->destroy_btree(); + log_obj_life_counter(); } // Basic reverse and forward order remove with different flip points TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { vector< std::string > flip_points = { - "crash_flush_on_merge_at_parent", "crash_flush_on_merge_at_left_child", - // "crash_flush_on_freed_child", + "crash_flush_on_merge_at_parent", + "crash_flush_on_merge_at_left_child", + "crash_flush_on_freed_child", }; for (size_t i = 0; i < flip_points.size(); ++i) { @@ -742,7 +902,7 @@ TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { LOGINFO("=== Testing flip point: {} - {} ===", i + 1, flip_point); // Populate some keys [1,num_entries) and trigger cp to persist - LOGINFO("Step {}-1: Populate some keys and flush", i + 1); + LOGINFO("Step {}-0: Populate some keys and flush", i + 1); auto const num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); for (auto k = 0u; k < num_entries; ++k) { this->put(k, btree_put_type::INSERT, true /* expect_success */); @@ -750,10 +910,8 @@ TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { test_common::HSTestHelper::trigger_cp(true); this->m_shadow_map.save(this->m_shadow_filename); - this->visualize_keys("tree_merge_full.dot"); - // Split keys into batches and remove the last one in reverse order - LOGINFO("Step {}-2: Set crash flag, remove some keys in reverse order", i + 1); + LOGINFO("\n\n\n\n\n\n\n\n\n\n\n\n\n\nStep {}-1: Set crash flag {}", i + 1, flip_point); int batch_num = 4; { int n = batch_num; @@ -763,20 +921,21 @@ TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { for (auto k = r; k >= l; --k) { ops.emplace_back(k, OperationType::Remove); } - LOGINFO("Step {}-2-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, r, l); - + LOGINFO("Step {}-1-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, r, l); + this->print_keys(fmt::format("Print before Step {}-1-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, + batch_num, r, l)); this->set_basic_flip(flip_point); for (auto [k, _] : ops) { this->remove_one(k, true); } - this->visualize_keys("tree_merge_before_first_crash.dot"); - - LOGINFO("Step {}-2-2: Trigger cp to crash", i + 1); 
-            this->crash_and_recover(ops);
+            LOGINFO("Step {}-1-2: Trigger cp to crash", i + 1);
+            this->crash_and_recover(flip_point, ops);
         }
+        this->print_keys(fmt::format("Print after recover Step {}-1-3: flip {}", i + 1, flip_point));

         // Remove the next batch of keys in forward order
-        LOGINFO("Step {}-3: Remove another batch in ascending order", i + 1) {
+        LOGINFO("\n\n\n\n\n\n\n\n\n\n\n\n\n\nStep {}-2: Set crash flag {}", i + 1, flip_point);
+        {
             int n = batch_num - 1;
             auto r = num_entries * n / batch_num - 1;
             auto l = num_entries * (n - 1) / batch_num;
@@ -784,21 +943,47 @@ TYPED_TEST(IndexCrashTest, MergeRemoveBasic) {
             for (auto k = l; k <= r; ++k) {
                 ops.emplace_back(k, OperationType::Remove);
             }
+            LOGINFO("Step {}-2-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, l, r);
+            this->print_keys(fmt::format("Print before Step {}-2-1: Remove keys in batch {}/{} ({} to {})", i + 1, n,
+                                         batch_num, l, r));
+            this->set_basic_flip(flip_point);
+            for (auto [k, _] : ops) {
+                this->remove_one(k, true);
+            }
+            LOGINFO("Step {}-2-2: Trigger cp to crash", i + 1);
+            this->crash_and_recover(flip_point, ops);
+        }
+        this->print_keys(fmt::format("Print after recover Step {}-2-3: flip {}", i + 1, flip_point));
+
+        // Remove the next batch of keys in random order
+        LOGINFO("\n\n\n\n\n\n\n\n\n\n\n\n\n\nStep {}-3: Set crash flag {}", i + 1, flip_point);
+        {
+            int n = batch_num - 2;
+            auto r = num_entries * n / batch_num - 1;
+            auto l = num_entries * (n - 1) / batch_num;
+            SequenceGenerator generator(0, 100, l, r);
+            generator.fillRange(l, r);
+            OperationList ops = generator.generateOperations(r - l + 1, false);
+            LOGINFO("Step {}-3-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, l, r);
             this->set_basic_flip(flip_point);
             for (auto [k, _] : ops) {
                 this->remove_one(k, true);
             }
-            this->visualize_keys("tree_merge_before_second_crash.dot");
+            this->print_keys(fmt::format("Print before Step {}-3: Remove keys in batch {}/{} ({} to {})", i + 1, n,
+                                         batch_num, l, r));

             LOGINFO("Step {}-3-2: Trigger cp to crash", i + 1);
-            this->crash_and_recover(ops);
+            this->crash_and_recover(flip_point, ops);
         }
+        this->print_keys(fmt::format("Print after recover Step {}-3-3: flip {}", i + 1, flip_point));

         // Remove the next batch of keys in random order
-        LOGINFO("Step {}-4: Remove another batch in random order", i + 1) {
-            int n = batch_num - 2;
+        LOGINFO("\n\n\n\n\n\n\n\n\n\n\n\n\n\nStep {}-4: Set crash flag {} Remove another batch in random order",
+                i + 1, flip_point);
+        {
+            int n = batch_num - 3;
             auto r = num_entries * n / batch_num - 1;
             auto l = num_entries * (n - 1) / batch_num;
             SequenceGenerator generator(0, 100, l, r);
@@ -806,21 +991,17 @@ TYPED_TEST(IndexCrashTest, MergeRemoveBasic) {
             OperationList ops = generator.generateOperations(r - l + 1, false);

             LOGINFO("Step {}-4-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, l, r);
-
+            this->print_keys(fmt::format("Print before Step {}-4-1: Remove keys in batch {}/{} ({} to {})", i + 1, n,
+                                         batch_num, l, r));
             this->set_basic_flip(flip_point);
             for (auto [k, _] : ops) {
                 this->remove_one(k, true);
             }
-            this->visualize_keys("tree_merge_before_third_crash.dot");

             LOGINFO("Step {}-4-2: Trigger cp to crash", i + 1);
-            this->crash_and_recover(ops);
+            this->crash_and_recover(flip_point, ops);
         }
+        this->print_keys(fmt::format("Print after recover Step {}-4-3: flip {}", i + 1, flip_point));

-        LOGINFO("Step {}-5: Cleanup the tree", i + 1);
-        for (auto k = 0u; k < num_entries; ++k) {
-            this->remove_one(k, false);
-        }
 test_common::HSTestHelper::trigger_cp(true);
         this->get_all();
     }
@@ -962,11 +1143,14 @@ int main(int argc, char* argv[]) {
     SISL_OPTIONS_LOAD(parsed_argc, argv, logging, test_index_crash_recovery, iomgr, test_common_setup);
     sisl::logging::SetLogger("test_index_crash_recovery");
     spdlog::set_pattern("[%D %T%z] [%^%L%$] [%t] %v");
-
     if (SISL_OPTIONS.count("seed")) {
         auto seed = SISL_OPTIONS["seed"].as< uint64_t >();
         LOGINFO("Using seed {} to sow the random generation", seed);
         g_re.seed(seed);
+    } else {
+        auto seed = std::chrono::system_clock::now().time_since_epoch().count();
+        LOGINFO("No seed provided. Using randomly generated seed: {}", seed);
+        g_re.seed(seed);
     }

 #ifdef _PRERELEASE
diff --git a/src/tests/test_mem_btree.cpp b/src/tests/test_mem_btree.cpp
index 141fcf5e2..50f8df9cd 100644
--- a/src/tests/test_mem_btree.cpp
+++ b/src/tests/test_mem_btree.cpp
@@ -42,6 +42,8 @@ SISL_OPTION_GROUP(
     (num_entries, "", "num_entries", "number of entries to test with",
      ::cxxopts::value< uint32_t >()->default_value("10000"), "number"),
     (disable_merge, "", "disable_merge", "disable_merge", ::cxxopts::value< bool >()->default_value("0"), ""),
+    (max_merge_level, "", "max_merge_level", "max merge level", ::cxxopts::value< uint8_t >()->default_value("127"),
+     ""),
     (num_threads, "", "num_threads", "number of threads", ::cxxopts::value< uint32_t >()->default_value("2"), "number"),
     (num_fibers, "", "num_fibers", "number of fibers", ::cxxopts::value< uint32_t >()->default_value("10"), "number"),
     (operation_list, "", "operation_list", "operation list instead of default created following by percentage",
@@ -107,13 +109,18 @@ struct BtreeTest : public BtreeTestHelper< TestType >, public ::testing::Test {
 #ifdef _PRERELEASE
         this->m_cfg.m_max_keys_in_node = SISL_OPTIONS["max_keys_in_node"].as< uint32_t >();
 #endif
+        this->m_cfg.m_max_merge_level = SISL_OPTIONS["max_merge_level"].as< uint8_t >();
+        this->m_cfg.m_merge_turned_on = !SISL_OPTIONS["disable_merge"].as< bool >();
+        // Merge is not supported yet for the prefix interval btree, so keep it disabled for that type
+        if constexpr (std::is_same_v< TestType, PrefixIntervalBtreeTest >) {
+            this->m_cfg.m_merge_turned_on = false;
+        }
         this->m_bt = std::make_shared< typename T::BtreeType >(this->m_cfg);
     }
 };

-// TODO Enable PrefixIntervalBtreeTest later
-using BtreeTypes = testing::Types< /* PrefixIntervalBtreeTest, */ FixedLenBtreeTest, VarKeySizeBtreeTest,
-                                   VarValueSizeBtreeTest, VarObjSizeBtreeTest >;
+using BtreeTypes = testing::Types< FixedLenBtreeTest, VarKeySizeBtreeTest,
+                                   VarValueSizeBtreeTest, VarObjSizeBtreeTest, PrefixIntervalBtreeTest >;

 TYPED_TEST_SUITE(BtreeTest, BtreeTypes);

 TYPED_TEST(BtreeTest, SequentialInsert) {
@@ -308,6 +315,11 @@ struct BtreeConcurrentTest : public BtreeTestHelper< TestType >, public ::testin
 #ifdef _PRERELEASE
         this->m_cfg.m_max_keys_in_node = SISL_OPTIONS["max_keys_in_node"].as< uint32_t >();
 #endif
+        this->m_cfg.m_max_merge_level = SISL_OPTIONS["max_merge_level"].as< uint8_t >();
+        this->m_cfg.m_merge_turned_on = !SISL_OPTIONS["disable_merge"].as< bool >();
+        if constexpr (std::is_same_v< TestType, PrefixIntervalBtreeTest >) {
+            this->m_cfg.m_merge_turned_on = false;
+        }
         this->m_bt = std::make_shared< typename T::BtreeType >(this->m_cfg);
     }

diff --git a/src/tests/test_scripts/index_test.py b/src/tests/test_scripts/index_test.py
index 564bd61c5..54059cf0a 100755
--- a/src/tests/test_scripts/index_test.py
+++ b/src/tests/test_scripts/index_test.py
@@ -21,8 +21,8 @@ def run_test(options, type):
     print("Test completed")


-def run_crash_test(options):
-    cmd_opts = f"--gtest_filter=IndexCrashTest/0.long_running_put_crash 
--gtest_break_on_failure --log_mods=wbcache:trace --max_keys_in_node={options['max_keys_in_node']} --num_entries_per_rounds={options['num_entries_per_rounds']} --init_device={options['init_device']} {options['log_mods']} --run_time={options['run_time']} --num_entries={options['num_entries']} --num_rounds={options['num_rounds']} {options['dev_list']} "
+def run_crash_test(options, crash_type='put', type=0):
+    cmd_opts = f"--gtest_filter=IndexCrashTest/{type}.long_running_{crash_type}_crash --gtest_break_on_failure --min_keys_in_node={options['min_keys_in_node']} --max_keys_in_node={options['max_keys_in_node']} --num_entries_per_rounds={options['num_entries_per_rounds']} --init_device={options['init_device']} {options['log_mods']} --run_time={options['run_time']} --num_entries={options['num_entries']} --num_rounds={options['num_rounds']} {options['dev_list']} "
     # print(f"Running test with options: {cmd_opts}")
     try:
         subprocess.check_call(f"{options['dirpath']}test_index_crash_recovery {cmd_opts}", stderr=subprocess.STDOUT,
@@ -99,7 +99,19 @@ def long_running_crash_put(options):
     options['run_time'] = 14400  # 4 hours
     options['preload_size'] = 1024
     print(f"options: {options}")
-    run_crash_test(options)
+    run_crash_test(options, 'put', 0)
+    print("Long running crash put completed")
+
+
+def long_running_crash_remove(options):
+    print("Long running crash remove started")
+    options['num_entries'] = 1000
+    options['init_device'] = True
+    options['run_time'] = 14400  # 4 hours
+    options['num_entries_per_rounds'] = 100
+    options['min_keys_in_node'] = 2
+    options['max_keys_in_node'] = 10
+    print(f"options: {options}")
+    run_crash_test(options, 'remove', 0)
+    print("Long running crash remove completed")


@@ -120,9 +132,14 @@ def main():

 def long_running(*args):
     options = parse_arguments()
+    for i in range(50):
+        print(f"Iteration {i + 1}")
+        long_running_crash_remove(options)
+    for i in range(5):
+        print(f"Iteration {i + 1}")
+        long_running_crash_put(options)
     long_runnig_index(options)
     long_running_clean_shutdown(options)
-    long_running_crash_put(options)


 if __name__ == "__main__":

From 2a86b8f383114b27e8c49e1f1c6bf5388455a429 Mon Sep 17 00:00:00 2001
From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com>
Date: Wed, 30 Apr 2025 14:53:44 -0700
Subject: [PATCH 113/170] Bump up hub.tess.io/sds/sds_develop in DockerFile
 (#709)

---
 .jenkins/Dockerfile | 2 +-
 conanfile.py        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.jenkins/Dockerfile b/.jenkins/Dockerfile
index 20c4489b0..dcfdd9d65 100644
--- a/.jenkins/Dockerfile
+++ b/.jenkins/Dockerfile
@@ -1,5 +1,5 @@
 # ########## ####### ############
-FROM hub.tess.io/sds/sds_develop:4.x-latest
+FROM hub.tess.io/sds/sds_develop:7.x-latest
 LABEL description="Automated HomeStore compilation"

 WORKDIR /output
diff --git a/conanfile.py b/conanfile.py
index 59bdcb513..9dbe2898e 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@ class HomestoreConan(ConanFile):
     name = "homestore"
-    version = "6.13.3"
+    version = "6.13.4"

     homepage = "https://github.com/eBay/Homestore"
     description = "HomeStore Storage Engine"

From a74fd29776097e332fd63d91592dcc4b3df076e3 Mon Sep 17 00:00:00 2001
From: Brian Szmyd
Date: Wed, 30 Apr 2025 17:26:44 -0600
Subject: [PATCH 114/170] Move sanitizer builds to its own location. (#710)
---
 conanfile.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/conanfile.py b/conanfile.py
index 9dbe2898e..8101ade90 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -65,7 +65,12 @@ def imports(self):

     def layout(self):
         self.folders.source = "."
-        self.folders.build = join("build", str(self.settings.build_type))
+        if self.options.get_safe("sanitize"):
+            self.folders.build = join("build", "Sanitized")
+        elif self.options.get_safe("coverage"):
+            self.folders.build = join("build", "Coverage")
+        else:
+            self.folders.build = join("build", str(self.settings.build_type))
         self.folders.generators = join(self.folders.build, "generators")

         self.cpp.source.includedirs = ["src/include"]

From 679e8fb54622a67178d86b2ac3d67d07c65a23a6 Mon Sep 17 00:00:00 2001
From: Sanal
Date: Thu, 1 May 2025 10:09:18 -0700
Subject: [PATCH 115/170] Add async_write, alloc blks for solo repl dev. (#706)

Add support for async write of data and journal, and alloc blks for solo repl
dev. Raft repl dev doesn't support these operations. This is needed for
nublocks, where freed blkids also need to be written to the journal; the freed
blocks are obtained after writing the new blkids to the index.

Add APIs for allocation and write that take a vector of blkids. Raft repl dev
currently uses only a single blkid. Tests for solo repl dev are changed to
support a vector of blkids.
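As a rough usage sketch (not part of the patch): the three new ReplDev calls are meant to be driven in the order the message above describes - allocate, write the data, then journal. The helper below is hypothetical and assumes an already-started solo repl dev; only alloc_blks(), async_write() and async_write_journal() come from this change.

#include <homestore/replication/repl_dev.h>

// Hedged sketch: exercises the new ReplDev APIs end to end.
// header/key/value are caller-owned buffers; error handling is minimal.
void write_through_repl_dev(homestore::ReplDev& dev, sisl::blob const& header,
                            sisl::blob const& key, sisl::sg_list const& value) {
    // Step 1: Allocate blkids for the payload; the engine may return several
    // MultiBlkIds when a single contiguous allocation is not available.
    std::vector< homestore::MultiBlkId > blkids;
    homestore::blk_alloc_hints hints;
    if (dev.alloc_blks(uint32_cast(value.size), hints, blkids)) { return; }

    // Step 2: Write the data into the allocated blocks.
    dev.async_write(blkids, value, false /* part_of_batch */)
        .thenValue([&dev, blkids, header, key, sz = uint32_cast(value.size)](std::error_code ec) {
            if (ec) { return; }
            // Step 3: Journal the operation; the listener's on_commit is driven from here.
            dev.async_write_journal(blkids, header, key, sz, nullptr /* ctx */);
        });
}

Note that the lambda captures dev by reference purely for brevity; a real caller would have to guarantee the repl dev outlives the write.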
---
 conanfile.py                                   |   4 +-
 src/include/homestore/blkdata_service.hpp      |  26 ++-
 src/include/homestore/replication/repl_dev.h   |  57 +++++-
 src/lib/blkdata_svc/blkdata_service.cpp        |  31 ++-
 src/lib/replication/repl_dev/common.cpp        |  41 +++-
 src/lib/replication/repl_dev/raft_repl_dev.cpp |  13 +-
 src/lib/replication/repl_dev/raft_repl_dev.h   |  18 ++
 src/lib/replication/repl_dev/solo_repl_dev.cpp | 102 +++++++++-
 src/lib/replication/repl_dev/solo_repl_dev.h   |   9 +-
 src/tests/test_solo_repl_dev.cpp               | 179 ++++++++++++------
 10 files changed, 386 insertions(+), 94 deletions(-)

diff --git a/conanfile.py b/conanfile.py
index 8101ade90..74801da73 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@ class HomestoreConan(ConanFile):
     name = "homestore"
-    version = "6.13.4"
+    version = "6.13.5"

     homepage = "https://github.com/eBay/Homestore"
     description = "HomeStore Storage Engine"
@@ -54,7 +54,7 @@ def build_requirements(self):
     def requirements(self):
         self.requires("iomgr/[^11.3]@oss/master", transitive_headers=True)
         self.requires("sisl/[^12.2]@oss/master", transitive_headers=True)
-        self.requires("nuraft_mesg/[>=3.7.5]@oss/main", transitive_headers=True)
+        self.requires("nuraft_mesg/[~3.8.0]@oss/main", transitive_headers=True)
         self.requires("farmhash/cci.20190513@", transitive_headers=True)

         if self.settings.arch in ['x86', 'x86_64']:
diff --git a/src/include/homestore/blkdata_service.hpp b/src/include/homestore/blkdata_service.hpp
index 33a5fe2ac..e1992b983 100644
--- a/src/include/homestore/blkdata_service.hpp
+++ b/src/include/homestore/blkdata_service.hpp
@@ -114,6 +114,18 @@ class BlkDataService {
     folly::Future< std::error_code > async_write(sisl::sg_list const& sgs, MultiBlkId const& in_blkids,
                                                  bool part_of_batch = false);

+    /**
+     * @brief : asynchronous write with input block ids;
+     *
+     * @param sgs : the data buffer that needs to be written
+     * @param in_blkids : input block ids that this write should be written to;
+     * @param part_of_batch : is this write part of a batch;
+     */
+    folly::Future< std::error_code > async_write(sisl::sg_list const& sgs, std::vector< MultiBlkId > const& in_blkids,
+                                                 bool part_of_batch = false);
+
     /**
      * @brief Asynchronously reads data from the specified block ID into the provided buffer.
      *
@@ -147,7 +159,8 @@ class BlkDataService {
     BlkAllocStatus commit_blk(MultiBlkId const& bid);

     /**
-     * @brief Allocates a contiguous block of disk space of the given size.
+     * @brief Allocates a contiguous block of disk space of the given size. This API should be called when the
+     * consumer expects all blks to be allocated on the same chunk.
      *
      * @param size The size of the block to allocate, in bytes.
      * @param hints Hints for how to allocate the block.
@@ -156,6 +169,17 @@ class BlkDataService {
      */
     BlkAllocStatus alloc_blks(uint32_t size, blk_alloc_hints const& hints, MultiBlkId& out_blkids);

+    /**
+     * @brief Allocates blocks of disk space of the given size. This API should be called when blk allocation
+     * across different chunks is possible and acceptable to the consumer.
+     *
+     * @param size The size of the block to allocate, in bytes.
+     * @param hints Hints for how to allocate the block.
+     * @param out_blkids Output parameter that will be filled with the IDs of the allocated blocks.
+     * @return The status of the block allocation attempt.
+     */
+    BlkAllocStatus alloc_blks(uint32_t size, blk_alloc_hints const& hints, std::vector< BlkId >& out_blkids);
+
     /**
      * @brief Asynchronously frees the specified block IDs.
      * It is asynchronous because it might need to wait for pending read to complete if same block is being read and not
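To make the intended call pattern at the data-service level concrete, here is a small hedged sketch. The helper name is hypothetical; data_service() is assumed to be the usual homestore service accessor (mirroring meta_service() and index_service() used elsewhere in this series), and MultiBlkId is assumed to be constructible from a single BlkId.

#include <homestore/blkdata_service.hpp>

// Hedged sketch: allocate with the chunk-agnostic overload above, then write
// with the vector-of-blkids async_write. sgs must total `size` bytes.
folly::Future< std::error_code > alloc_and_write(sisl::sg_list const& sgs, uint32_t size) {
    auto& svc = homestore::data_service();

    // Chunk-agnostic allocation: each returned BlkId may live on a different chunk.
    std::vector< homestore::BlkId > bids;
    homestore::blk_alloc_hints hints;
    if (svc.alloc_blks(size, hints, bids) != homestore::BlkAllocStatus::SUCCESS) {
        return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::no_space_on_device));
    }

    // The vector async_write overload takes MultiBlkIds; wrap each allocated BlkId.
    std::vector< homestore::MultiBlkId > mbids;
    for (auto const& b : bids) { mbids.emplace_back(b); }
    return svc.async_write(sgs, mbids, false /* part_of_batch */);
}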
+ if (!m_local_blkids.empty()) + return m_local_blkids[0]; + else + return dummy_blkid; + } + + std::vector< MultiBlkId >& local_blkids() { return m_local_blkids; } RemoteBlkId const& remote_blkid() const { return m_remote_blkid; } const char* data() const { DEBUG_ASSERT(m_data != nullptr, @@ -141,6 +150,7 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: bool has_state(repl_req_state_t s) const { return m_state.load() & uint32_cast(s); } repl_journal_entry const* journal_entry() const { return m_journal_entry; } uint32_t journal_entry_size() const; + uint32_t blkids_serialized_size() const; bool is_localize_pending() const { return m_is_jentry_localize_pending; } bool has_linked_data() const { return (m_op_code == journal_type_t::HS_DATA_LINKED); } @@ -149,6 +159,7 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: /////////////////////// Non modifiers methods ////////////////// std::string to_string() const; std::string to_compact_string() const; + std::string blkids_to_string() const; Clock::time_point created_time() const { return m_start_time; } void set_created_time() { m_start_time = Clock::now(); } bool is_expired() const; @@ -195,7 +206,7 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: bool save_fetched_data(sisl::GenericClientResponse const& fetched_data, uint8_t const* data, uint32_t data_size); void set_remote_blkid(RemoteBlkId const& rbid) { m_remote_blkid = rbid; } - void set_local_blkid(MultiBlkId const& lbid) { m_local_blkid = lbid; } // Only used during recovery + void set_local_blkids(std::vector< MultiBlkId > const& lbids) { m_local_blkids = std::move(lbids); } void set_is_volatile(bool is_volatile) { m_is_volatile.store(is_volatile); } void set_lsn(int64_t lsn); void add_state(repl_req_state_t s); @@ -226,9 +237,10 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: std::atomic< bool > m_is_volatile{true}; // Is the log still in memory and not flushed to disk yet /////////////// Data related section ///////////////// - MultiBlkId m_local_blkid; // Local BlkId for the data - RemoteBlkId m_remote_blkid; // Corresponding remote blkid for the data - uint8_t const* m_data; // Raw data pointer containing the actual data + static inline MultiBlkId dummy_blkid; + std::vector< MultiBlkId > m_local_blkids; // Local BlkId for the data + RemoteBlkId m_remote_blkid; // Corresponding remote blkid for the data + uint8_t const* m_data; // Raw data pointer containing the actual data /////////////// Journal/Buf related section ///////////////// std::variant< std::unique_ptr< uint8_t[] >, raft_buf_ptr_t > m_journal_buf; // Buf for the journal entry @@ -400,7 +412,7 @@ class ReplDevListener { virtual void on_no_space_left(repl_lsn_t lsn, chunk_num_t chunk_id) = 0; /// @brief when restart, after all the logs are replayed and before joining raft group, notify the upper layer - virtual void on_log_replay_done(const group_id_t& group_id){}; + virtual void on_log_replay_done(const group_id_t& group_id) {}; private: std::weak_ptr< ReplDev > m_repl_dev; @@ -411,6 +423,39 @@ class ReplDev { ReplDev() = default; virtual ~ReplDev() { detach_listener(); } + /// @brief Allocates blkids from the storage engine to write the value into. Storage + /// engine returns a blkid_list in cases where single contiguous blocks are not + /// available. + /// + /// @param data_size - Size of the data. + /// @param hints - Specify block allocation hints. 
+    /// @param out_blkids - List of blkids, which may not be contiguous.
+    virtual std::error_code alloc_blks(uint32_t data_size, const blk_alloc_hints& hints,
+                                       std::vector< MultiBlkId >& out_blkids) = 0;
+
+    /// @brief Write data locally using the specified blkids. Data is split across the blkids.
+    /// @param blkids - List of blkids where data will be written.
+    /// @param value - vector of io buffers that contain value for the key.
+    /// @param part_of_batch - Is this write part of a batch? If so, submit_batch needs to be called
+    /// at the end.
+    /// @return A Future with std::error_code to notify whether it has successfully written the data, or an error
+    /// code in case of failure
+    virtual folly::Future< std::error_code > async_write(const std::vector< MultiBlkId >& blkids,
+                                                         sisl::sg_list const& value, bool part_of_batch = false,
+                                                         trace_id_t tid = 0) = 0;
+
+    /// @brief Creates a log/journal entry with the given blkids and calls the on_commit listener callback.
+    /// @param blkids - List of blkids where data was written.
+    /// @param header - Blob representing the header (it is opaque and will be copied
+    /// as-is to the journal entry)
+    /// @param key - Blob representing the key (it is opaque and will be copied as-is to
+    /// the journal entry).
+    /// @param data_size - Size of the data.
+    /// @param ctx - User supplied context which will be passed to listener callbacks
+    virtual void async_write_journal(const std::vector< MultiBlkId >& blkids, sisl::blob const& header,
+                                     sisl::blob const& key, uint32_t data_size, repl_req_ptr_t ctx,
+                                     trace_id_t tid = 0) = 0;
+
     /// @brief Replicate the data to the replica set. This method goes through the
     /// following steps:
     /// Step 1: Allocates blkid from the storage engine to write the value into. Storage

diff --git a/src/lib/blkdata_svc/blkdata_service.cpp b/src/lib/blkdata_svc/blkdata_service.cpp
index 58cc36c61..1219ed00e 100644
--- a/src/lib/blkdata_svc/blkdata_service.cpp
+++ b/src/lib/blkdata_svc/blkdata_service.cpp
@@ -208,10 +208,35 @@ folly::Future< std::error_code > BlkDataService::async_write(sisl::sg_list const
     }
 }
 
+folly::Future< std::error_code >
+BlkDataService::async_write(sisl::sg_list const& sgs, std::vector< MultiBlkId > const& blkids, bool part_of_batch) {
+    if (is_stopping()) return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled));
+    incr_pending_request_num();
+    static thread_local std::vector< folly::Future< std::error_code > > s_futs;
+    s_futs.clear();
+    for (const auto& blkid : blkids) {
+        s_futs.emplace_back(async_write(sgs, blkid, part_of_batch));
+    }
+    decr_pending_request_num();
+    return collect_all_futures(s_futs);
+}
+
 BlkAllocStatus BlkDataService::alloc_blks(uint32_t size, const blk_alloc_hints& hints, MultiBlkId& out_blkids) {
     if (is_stopping()) return BlkAllocStatus::FAILED;
     incr_pending_request_num();
-    HS_DBG_ASSERT_EQ(size % m_blk_size, 0, "Non aligned size requested");
+    HS_DBG_ASSERT_EQ(size % m_blk_size, 0, "Non aligned size requested size={} blk_size={}", size, m_blk_size);
+    blk_count_t nblks = static_cast< blk_count_t >(size / m_blk_size);
+
+    auto ret = m_vdev->alloc_blks(nblks, hints, out_blkids);
+    decr_pending_request_num();
+    return ret;
+}
+
+BlkAllocStatus BlkDataService::alloc_blks(uint32_t size, const blk_alloc_hints& hints,
+                                          std::vector< BlkId >& out_blkids) {
+    if (is_stopping()) return BlkAllocStatus::FAILED;
+    incr_pending_request_num();
+    HS_DBG_ASSERT_EQ(size % m_blk_size, 0, "Non aligned size requested size={} blk_size={}", size, m_blk_size);
blk_count_t nblks = static_cast< blk_count_t >(size / m_blk_size); auto ret = m_vdev->alloc_blks(nblks, hints, out_blkids); @@ -271,8 +296,8 @@ void BlkDataService::start() { void BlkDataService::stop() { start_stopping(); - // we have no way to track the completion of each async io in detail which should be done in iomanager level, so we - // just wait for 3 seconds, and we expect each io will be completed within this time. + // we have no way to track the completion of each async io in detail which should be done in iomanager level, so + // we just wait for 3 seconds, and we expect each io will be completed within this time. // TODO: find a better solution to track the completion of these aysnc calls std::this_thread::sleep_for(std::chrono::milliseconds(3000)); diff --git a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp index 6a39256f9..2782a36a5 100644 --- a/src/lib/replication/repl_dev/common.cpp +++ b/src/lib/replication/repl_dev/common.cpp @@ -63,7 +63,7 @@ repl_req_ctx::~repl_req_ctx() { } void repl_req_ctx::create_journal_entry(bool is_raft_buf, int32_t server_id) { - uint32_t val_size = has_linked_data() ? m_local_blkid.serialized_size() : 0; + uint32_t val_size = has_linked_data() ? blkids_serialized_size() : 0; uint32_t entry_size = sizeof(repl_journal_entry) + m_header.size() + m_key.size() + val_size; if (is_raft_buf) { @@ -94,14 +94,25 @@ void repl_req_ctx::create_journal_entry(bool is_raft_buf, int32_t server_id) { } if (has_linked_data()) { - auto const b = m_local_blkid.serialize(); - std::memcpy(raw_ptr, b.cbytes(), b.size()); + for (const auto& blkid : m_local_blkids) { + auto const b = blkid.serialize(); + std::memcpy(raw_ptr, b.cbytes(), b.size()); + raw_ptr += b.size(); + } } } uint32_t repl_req_ctx::journal_entry_size() const { return sizeof(repl_journal_entry) + m_header.size() + m_key.size() + - (has_linked_data() ? m_local_blkid.serialized_size() : 0); + (has_linked_data() ? 
blkids_serialized_size() : 0); +} + +uint32_t repl_req_ctx::blkids_serialized_size() const { + uint32_t blkids_serialized_size = 0; + for (const auto& blkid : m_local_blkids) { + blkids_serialized_size += blkid.serialized_size(); + } + return blkids_serialized_size; } void repl_req_ctx::change_raft_journal_buf(raft_buf_ptr_t new_buf, bool adjust_hdr_key) { @@ -128,7 +139,7 @@ ReplServiceError repl_req_ctx::alloc_local_blks(cshared< ReplDevListener >& list // if the committed_blk_id is already present, use it and skip allocation and commitment LOGINFOMOD(replication, "[traceID={}] For Repl_key=[{}] data already exists, skip", rkey().traceID, rkey().to_string()); - m_local_blkid = hints_result.value().committed_blk_id.value(); + m_local_blkids.emplace_back(hints_result.value().committed_blk_id.value()); add_state(repl_req_state_t::BLK_ALLOCATED); add_state(repl_req_state_t::DATA_RECEIVED); add_state(repl_req_state_t::DATA_WRITTEN); @@ -138,14 +149,19 @@ ReplServiceError repl_req_ctx::alloc_local_blks(cshared< ReplDevListener >& list return ReplServiceError::OK; } + std::vector< BlkId > blkids; auto status = data_service().alloc_blks(sisl::round_up(uint32_cast(data_size), data_service().get_blk_size()), - hints_result.value(), m_local_blkid); + hints_result.value(), blkids); if (status != BlkAllocStatus::SUCCESS) { LOGWARNMOD(replication, "[traceID={}] block allocation failure, repl_key=[{}], status=[{}]", rkey().traceID, rkey(), status); DEBUG_ASSERT_EQ(status, BlkAllocStatus::SUCCESS, "Unable to allocate blks"); return ReplServiceError::NO_SPACE_LEFT; } + + for (auto& blkid : blkids) { + m_local_blkids.emplace_back(blkid); + } add_state(repl_req_state_t::BLK_ALLOCATED); return ReplServiceError::OK; } @@ -246,7 +262,7 @@ std::string repl_req_ctx::to_string() const { return fmt::format("repl_key=[{}], lsn={} state=[{}] m_headersize={} m_keysize={} is_proposer={} " "local_blkid={} remote_blkid={}", m_rkey.to_string(), m_lsn, req_state_name(uint32_cast(state())), m_header.size(), m_key.size(), - m_is_proposer, m_local_blkid.to_string(), m_remote_blkid.blkid.to_string()); + m_is_proposer, blkids_to_string(), m_remote_blkid.blkid.to_string()); } std::string repl_req_ctx::to_compact_string() const { @@ -255,7 +271,16 @@ std::string repl_req_ctx::to_compact_string() const { } return fmt::format("dsn={} term={} lsn={} op={} local_blkid={} state=[{}]", m_rkey.dsn, m_rkey.term, m_lsn, - enum_name(m_op_code), m_local_blkid.to_string(), req_state_name(uint32_cast(state()))); + enum_name(m_op_code), blkids_to_string(), req_state_name(uint32_cast(state()))); +} + +std::string repl_req_ctx::blkids_to_string() const { + std::string str = fmt::format("["); + for (const auto& blkid : m_local_blkids) { + fmt::format_to(std::back_inserter(str), "{} ", blkid.to_string()); + } + fmt::format_to(std::back_inserter(str), "]"); + return str; } bool repl_req_ctx::is_expired() const { diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 082f4fac4..88aa9d6c3 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -337,10 +337,13 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& } } - auto status = init_req_ctx( - rreq, repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1), .traceID = tid}, - data.size ? 
journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true /* is_proposer */, header, - key, data.size, m_listener); + auto status = init_req_ctx(rreq, + repl_key{.server_id = server_id(), + .term = raft_server()->get_term(), + .dsn = m_next_dsn.fetch_add(1), + .traceID = tid}, + data.size ? journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, + true /* is_proposer */, header, key, data.size, m_listener); if (status != ReplServiceError::OK) { RD_LOGI(tid, "Initializing rreq failed error={}, failing this req", status); @@ -1659,7 +1662,7 @@ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx MultiBlkId entry_blkid; entry_blkid.deserialize(entry_to_val(jentry), true /* copy */); data_size = entry_blkid.blk_count() * get_blk_size(); - rreq->set_local_blkid(entry_blkid); + rreq->set_local_blkids({entry_blkid}); rreq->add_state(repl_req_state_t::BLK_ALLOCATED); rreq->add_state(repl_req_state_t::DATA_RECEIVED); } diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index bd6a6c448..42d100ebb 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -229,6 +229,24 @@ class RaftReplDev : public ReplDev, folly::SemiFuture< ReplServiceError > destroy_group(); //////////////// All ReplDev overrides/implementation /////////////////////// + virtual std::error_code alloc_blks(uint32_t size, const blk_alloc_hints& hints, + std::vector< MultiBlkId >& out_blkids) override { + RD_REL_ASSERT(false, "NOT SUPPORTED"); + return std::make_error_code(std::errc::operation_not_supported); + } + virtual folly::Future< std::error_code > async_write(const std::vector< MultiBlkId >& blkids, + sisl::sg_list const& value, bool part_of_batch = false, + trace_id_t tid = 0) override { + RD_REL_ASSERT(false, "NOT SUPPORTED"); + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_not_supported)); + } + + virtual void async_write_journal(const std::vector< MultiBlkId >& blkids, sisl::blob const& header, + sisl::blob const& key, uint32_t data_size, repl_req_ptr_t ctx, + trace_id_t tid = 0) override { + RD_REL_ASSERT(false, "NOT SUPPORTED"); + } + void async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, repl_req_ptr_t ctx, bool part_of_batch = false, trace_id_t tid = 0) override; folly::Future< std::error_code > async_read(MultiBlkId const& blkid, sisl::sg_list& sgs, uint32_t size, diff --git a/src/lib/replication/repl_dev/solo_repl_dev.cpp b/src/lib/replication/repl_dev/solo_repl_dev.cpp index bc6bdb8bb..587cb8b2e 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.cpp +++ b/src/lib/replication/repl_dev/solo_repl_dev.cpp @@ -39,7 +39,7 @@ void SoloReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& // If it is header only entry, directly write to the journal if (rreq->has_linked_data() && !rreq->has_state(repl_req_state_t::DATA_WRITTEN)) { // Write the data - data_service().async_write(value, rreq->local_blkid()).thenValue([this, rreq = std::move(rreq)](auto&& err) { + data_service().async_write(value, rreq->local_blkids()).thenValue([this, rreq = std::move(rreq)](auto&& err) { HS_REL_ASSERT(!err, "Error in writing data"); // TODO: Find a way to return error to the Listener write_journal(std::move(rreq)); }); @@ -60,12 +60,92 @@ void SoloReplDev::write_journal(repl_req_ptr_t rreq) { auto cur_lsn = m_commit_upto.load(); if (cur_lsn < lsn) { 
m_commit_upto.compare_exchange_strong(cur_lsn, lsn); } - data_service().commit_blk(rreq->local_blkid()); - m_listener->on_commit(rreq->lsn(), rreq->header(), rreq->key(), {rreq->local_blkid()}, rreq); + for (const auto& blkid : rreq->local_blkids()) { + data_service().commit_blk(blkid); + } + m_listener->on_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq->local_blkids(), rreq); decr_pending_request_num(); }); } +std::error_code SoloReplDev::alloc_blks(uint32_t data_size, const blk_alloc_hints& hints, + std::vector< MultiBlkId >& out_blkids) { + if (is_stopping()) { return std::make_error_code(std::errc::operation_canceled); } + + incr_pending_request_num(); + std::vector< BlkId > blkids; + auto status = + data_service().alloc_blks(sisl::round_up(uint32_cast(data_size), data_service().get_blk_size()), hints, blkids); + if (status != BlkAllocStatus::SUCCESS) { + DEBUG_ASSERT_EQ(status, BlkAllocStatus::SUCCESS, "Unable to allocate blks"); + decr_pending_request_num(); + return std::make_error_code(std::errc::no_space_on_device); + } + for (auto& blkid : blkids) { + out_blkids.emplace_back(blkid); + } + decr_pending_request_num(); + return std::error_code{}; +} + +folly::Future< std::error_code > SoloReplDev::async_write(const std::vector< MultiBlkId >& blkids, + sisl::sg_list const& value, bool part_of_batch, + trace_id_t tid) { + if (is_stopping()) { + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); + } + + incr_pending_request_num(); + HS_REL_ASSERT_GT(blkids.size(), 0, "Empty blkid vec"); + std::vector< folly::Future< std::error_code > > futs; + futs.reserve(blkids.size()); + sisl::sg_iterator sg_it{value.iovs}; + + for (const auto& blkid : blkids) { + auto sgs_size = blkid.blk_count() * data_service().get_blk_size(); + const auto iovs = sg_it.next_iovs(sgs_size); + uint32_t total_size = 0; + for (auto& iov : iovs) { + total_size += iov.iov_len; + } + if (total_size != sgs_size) { + LOGINFO("Block size mismatch total_size={} sgs_size={}", total_size, sgs_size); + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::invalid_argument)); + } + sisl::sg_list sgs{sgs_size, iovs}; + futs.emplace_back(data_service().async_write(sgs, blkid, part_of_batch)); + } + + return folly::collectAllUnsafe(futs).thenValue([this](auto&& v_res) { + for (const auto& err_c : v_res) { + if (sisl_unlikely(err_c.value())) { + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::io_error)); + } + } + + decr_pending_request_num(); + return folly::makeFuture< std::error_code >(std::error_code{}); + }); +} + +void SoloReplDev::async_write_journal(const std::vector< MultiBlkId >& blkids, sisl::blob const& header, + sisl::blob const& key, uint32_t data_size, repl_req_ptr_t rreq, trace_id_t tid) { + if (is_stopping()) { return; } + incr_pending_request_num(); + + // We expect clients to provide valid repl req ctx with blocks allocated. + HS_REL_ASSERT(rreq, "Invalid repl req ctx"); + rreq->add_state(repl_req_state_t::BLK_ALLOCATED); + rreq->set_local_blkids(blkids); + auto status = rreq->init(repl_key{.server_id = 0, .term = 1, .dsn = 1, .traceID = tid}, + data_size ? journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true, header, + key, data_size, m_listener); + HS_REL_ASSERT_EQ(status, ReplServiceError::OK, "Error in initializing repl req context."); + + // Write to journal. 
+ write_journal(std::move(rreq)); +} + void SoloReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx) { repl_journal_entry const* entry = r_cast< repl_journal_entry const* >(buf.bytes()); uint32_t remain_size = buf.size() - sizeof(repl_journal_entry); @@ -83,22 +163,27 @@ void SoloReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx raw_ptr += entry->key_size; remain_size -= entry->key_size; - sisl::blob value_blob{raw_ptr, remain_size}; - MultiBlkId blkid; - if (remain_size) { blkid.deserialize(value_blob, true /* copy */); } + std::vector< MultiBlkId > blkids; + while (remain_size > 0) { + MultiBlkId blkid; + sisl::blob value_blob{raw_ptr, sizeof(BlkId)}; + blkid.deserialize(value_blob, true /* copy */); + raw_ptr += sizeof(BlkId); + remain_size -= sizeof(BlkId); + blkids.push_back(blkid); + } m_listener->on_pre_commit(lsn, header, key, nullptr); auto cur_lsn = m_commit_upto.load(); if (cur_lsn < lsn) { m_commit_upto.compare_exchange_strong(cur_lsn, lsn); } - m_listener->on_commit(lsn, header, key, {blkid}, nullptr); + m_listener->on_commit(lsn, header, key, blkids, nullptr); } folly::Future< std::error_code > SoloReplDev::async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, bool part_of_batch, trace_id_t tid) { if (is_stopping()) { - LOGINFO("repl dev is being shutdown!"); return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); } incr_pending_request_num(); @@ -109,7 +194,6 @@ folly::Future< std::error_code > SoloReplDev::async_read(MultiBlkId const& bid, folly::Future< std::error_code > SoloReplDev::async_free_blks(int64_t, MultiBlkId const& bid, trace_id_t tid) { if (is_stopping()) { - LOGINFO("repl dev is being shutdown!"); return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); } incr_pending_request_num(); diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index 63838f254..35f089ec5 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -39,7 +39,14 @@ class SoloReplDev : public ReplDev { SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existing); virtual ~SoloReplDev() = default; - // TODO: implement graceful shutdown for solo repl dev + virtual std::error_code alloc_blks(uint32_t data_size, const blk_alloc_hints& hints, + std::vector< MultiBlkId >& out_blkids) override; + virtual folly::Future< std::error_code > async_write(const std::vector< MultiBlkId >& blkids, + sisl::sg_list const& value, bool part_of_batch = false, + trace_id_t tid = 0) override; + virtual void async_write_journal(const std::vector< MultiBlkId >& blkids, sisl::blob const& header, + sisl::blob const& key, uint32_t data_size, repl_req_ptr_t ctx, + trace_id_t tid = 0) override; void async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, repl_req_ptr_t ctx, bool part_of_batch = false, trace_id_t tid = 0) override; diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index 13bcc52b0..4d271efcb 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -63,22 +63,15 @@ struct test_repl_req : public repl_req_ctx { sisl::byte_array header; sisl::byte_array key; sisl::sg_list write_sgs; - sisl::sg_list read_sgs; - MultiBlkId written_blkids; + std::vector< MultiBlkId > written_blkids; - test_repl_req() { - write_sgs.size = 0; - read_sgs.size = 0; - } + 
test_repl_req() { write_sgs.size = 0; } ~test_repl_req() { for (auto const& iov : write_sgs.iovs) { iomanager.iobuf_free(uintptr_cast(iov.iov_base)); } - - for (auto const& iov : read_sgs.iovs) { - iomanager.iobuf_free(uintptr_cast(iov.iov_base)); - } } + struct journal_header { uint32_t key_size; uint64_t key_pattern; @@ -100,12 +93,11 @@ class SoloReplDevTest : public testing::Test { void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, std::vector< MultiBlkId > const& blkids, cintrusive< repl_req_ctx >& ctx) override { LOGINFO("Received on_commit lsn={}", lsn); - HS_REL_ASSERT(!blkids.empty(), "Invalid blkids size"); if (ctx == nullptr) { - m_test.validate_replay(*repl_dev(), lsn, header, key, blkids[0]); + m_test.validate_replay(*repl_dev(), lsn, header, key, blkids); } else { auto req = boost::static_pointer_cast< test_repl_req >(ctx); - req->written_blkids = blkids[0]; + req->written_blkids = std::move(blkids); m_test.on_write_complete(*repl_dev(), req); } } @@ -231,60 +223,116 @@ class SoloReplDevTest : public testing::Test { rdev->async_alloc_write(*req->header, req->key ? *req->key : sisl::blob{}, req->write_sgs, req); } + void async_write_data_and_journal(uint32_t key_size, uint64_t data_size, uint32_t max_size_per_iov) { + data_size = data_size == 0 ? g_block_size : data_size; + auto req = intrusive< test_repl_req >(new test_repl_req()); + req->header = sisl::make_byte_array(sizeof(test_repl_req::journal_header)); + auto hdr = r_cast< test_repl_req::journal_header* >(req->header->bytes()); + hdr->key_size = key_size; + hdr->key_pattern = ((long long)rand() << 32) | rand(); + hdr->data_size = data_size; + hdr->data_pattern = ((long long)rand() << 32) | rand(); + + if (key_size != 0) { + req->key = sisl::make_byte_array(key_size); + HSTestHelper::fill_data_buf(req->key->bytes(), key_size, hdr->key_pattern); + } + + req->write_sgs = HSTestHelper::create_sgs(data_size, max_size_per_iov, hdr->data_pattern); + + auto& rdev = (rand() % 2) ? m_repl_dev1 : m_repl_dev2; + + auto const cap = hs()->repl_service().get_cap_stats(); + LOGDEBUG("Before write, cap stats: used={} total={}", cap.used_capacity, cap.total_capacity); + + std::vector< MultiBlkId > blkids; + blk_alloc_hints hints; + auto err = rdev->alloc_blks(data_size, hints, blkids); + RELEASE_ASSERT(!err, "Error during alloc_blks"); + RELEASE_ASSERT(!blkids.empty(), "Empty blkids"); + + rdev->async_write(blkids, req->write_sgs).thenValue([this, rdev, blkids, data_size, req](auto&& err) { + RELEASE_ASSERT(!err, "Error during async_write"); + rdev->async_write_journal(blkids, *req->header, req->key ? 
*req->key : sisl::blob{}, data_size, req); + }); + } + void validate_replay(ReplDev& rdev, int64_t lsn, sisl::blob const& header, sisl::blob const& key, - MultiBlkId const& blkids) { + std::vector< MultiBlkId > const& blkids) { + if (blkids.empty()) { + m_task_waiter.one_complete(); + return; + } + auto const jhdr = r_cast< test_repl_req::journal_header const* >(header.cbytes()); HSTestHelper::validate_data_buf(key.cbytes(), key.size(), jhdr->key_pattern); - - uint32_t size = blkids.blk_count() * g_block_size; - if (size) { - auto read_sgs = HSTestHelper::create_sgs(size, size); - LOGINFO("[{}] Validating replay of lsn={} blkid = {}", boost::uuids::to_string(rdev.group_id()), lsn, - blkids.to_string()); - rdev.async_read(blkids, read_sgs, size) - .thenValue([this, hdr = *jhdr, read_sgs, lsn, blkids, &rdev](auto&& err) { - RELEASE_ASSERT(!err, "Error during async_read"); - HS_REL_ASSERT_EQ(hdr.data_size, read_sgs.size, "journal hdr data size mismatch with actual size"); - - for (auto const& iov : read_sgs.iovs) { - HSTestHelper::validate_data_buf(uintptr_cast(iov.iov_base), iov.iov_len, hdr.data_pattern); - iomanager.iobuf_free(uintptr_cast(iov.iov_base)); - } - LOGINFO("[{}] Replay of lsn={} blkid={} validated successfully", - boost::uuids::to_string(rdev.group_id()), lsn, blkids.to_string()); - m_task_waiter.one_complete(); - }); - } else { - m_task_waiter.one_complete(); + uint64_t total_io = blkids.size(); + auto io_count = std::make_shared< std::atomic< uint64_t > >(0); + for (const auto& blkid : blkids) { + uint32_t size = blkid.blk_count() * g_block_size; + if (size) { + auto read_sgs = HSTestHelper::create_sgs(size, size); + LOGDEBUG("[{}] Validating replay of lsn={} blkid = {}", boost::uuids::to_string(rdev.group_id()), lsn, + blkid.to_string()); + rdev.async_read(blkid, read_sgs, size) + .thenValue([this, io_count, total_io, hdr = *jhdr, read_sgs, lsn, blkid, &rdev](auto&& err) { + RELEASE_ASSERT(!err, "Error during async_read"); + // HS_REL_ASSERT_EQ(hdr.data_size, read_sgs.size, + // "journal hdr data size mismatch with actual size"); + + for (auto const& iov : read_sgs.iovs) { + HSTestHelper::validate_data_buf(uintptr_cast(iov.iov_base), iov.iov_len, hdr.data_pattern); + iomanager.iobuf_free(uintptr_cast(iov.iov_base)); + } + LOGDEBUG("[{}] Replay of lsn={} blkid={} validated successfully", + boost::uuids::to_string(rdev.group_id()), lsn, blkid.to_string()); + + io_count->fetch_add(1); + if (*io_count == total_io) { m_task_waiter.one_complete(); } + }); + } else { + m_task_waiter.one_complete(); + } } } void on_write_complete(ReplDev& rdev, intrusive< test_repl_req > req) { - // If we did send some data to the repl_dev, validate it by doing async_read - if (req->write_sgs.size != 0) { - req->read_sgs = HSTestHelper::create_sgs(req->write_sgs.size, req->write_sgs.size); - - auto const cap = hs()->repl_service().get_cap_stats(); - LOGINFO("Write complete with cap stats: used={} total={}", cap.used_capacity, cap.total_capacity); - - rdev.async_read(req->written_blkids, req->read_sgs, req->read_sgs.size) - .thenValue([this, &rdev, req](auto&& err) { - RELEASE_ASSERT(!err, "Error during async_read"); - - LOGINFO("[{}] Write complete with lsn={} for size={} blkids={}", - boost::uuids::to_string(rdev.group_id()), req->lsn(), req->write_sgs.size, - req->written_blkids.to_string()); - auto hdr = r_cast< test_repl_req::journal_header* >(req->header->bytes()); - HS_REL_ASSERT_EQ(hdr->data_size, req->read_sgs.size, - "journal hdr data size mismatch with actual size"); - - for (auto const& 
iov : req->read_sgs.iovs) { - HSTestHelper::validate_data_buf(uintptr_cast(iov.iov_base), iov.iov_len, hdr->data_pattern); - } - m_io_runner.next_task(); - }); - } else { + if (req->written_blkids.empty()) { m_io_runner.next_task(); + return; + } + + // If we did send some data to the repl_dev, validate it by doing async_read + auto io_count = std::make_shared< std::atomic< uint64_t > >(0); + for (const auto blkid : req->written_blkids) { + if (req->write_sgs.size != 0) { + auto const cap = hs()->repl_service().get_cap_stats(); + LOGDEBUG("Write complete with cap stats: used={} total={}", cap.used_capacity, cap.total_capacity); + + auto sgs_size = blkid.blk_count() * g_block_size; + auto read_sgs = HSTestHelper::create_sgs(sgs_size, sgs_size); + rdev.async_read(blkid, read_sgs, read_sgs.size) + .thenValue([this, io_count, blkid, &rdev, sgs_size, read_sgs, req](auto&& err) { + RELEASE_ASSERT(!err, "Error during async_read"); + + LOGINFO("[{}] Write complete with lsn={} for size={} blkid={}", + boost::uuids::to_string(rdev.group_id()), req->lsn(), sgs_size, blkid.to_string()); + auto hdr = r_cast< test_repl_req::journal_header* >(req->header->bytes()); + // HS_REL_ASSERT_EQ(hdr->data_size, read_sgs.size, + // "journal hdr data size mismatch with actual size"); + + for (auto const& iov : read_sgs.iovs) { + LOGDEBUG("Read data blkid={} len={} data={}", blkid.to_integer(), iov.iov_len, + *(uint64_t*)iov.iov_base); + HSTestHelper::validate_data_buf(uintptr_cast(iov.iov_base), iov.iov_len, hdr->data_pattern); + iomanager.iobuf_free(uintptr_cast(iov.iov_base)); + } + io_count->fetch_add(1); + if (*io_count == req->written_blkids.size()) { m_io_runner.next_task(); } + }); + } else { + m_io_runner.next_task(); + } } } }; @@ -319,6 +367,19 @@ TEST_F(SoloReplDevTest, TestHeaderOnly) { this->m_task_waiter.start([this]() { this->restart(); }).get(); } +TEST_F(SoloReplDevTest, TestAsyncWriteJournal) { + LOGINFO("Step 1: run on worker threads to schedule write for random bytes ranging {}-{}.", 0, 1 * Mi); + this->m_io_runner.set_task([this]() { + uint32_t nblks = rand() % ((1 * Mi) / g_block_size); + uint32_t key_size = rand() % 512 + 8; + this->async_write_data_and_journal(key_size, nblks * g_block_size, g_block_size); + }); + + this->m_io_runner.execute().get(); + LOGINFO("Step 2: Restart homestore and validate replay data.", g_block_size); + this->m_task_waiter.start([this]() { this->restart(); }).get(); +} + SISL_OPTION_GROUP(test_solo_repl_dev, (block_size, "", "block_size", "block size to io", ::cxxopts::value< uint32_t >()->default_value("4096"), "number")); From 7e0a4043268a4055ad258e42f296eb7d1beff2ec Mon Sep 17 00:00:00 2001 From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com> Date: Thu, 1 May 2025 11:12:20 -0700 Subject: [PATCH 116/170] Revert btree config file (#711) --- .github/workflows/build_dependencies.yml | 1 + conanfile.py | 2 +- src/include/homestore/index/index_table.hpp | 4 ---- src/lib/common/homestore_config.fbs | 6 ------ src/tests/btree_helpers/btree_test_helper.hpp | 13 +++---------- 5 files changed, 5 insertions(+), 21 deletions(-) diff --git a/.github/workflows/build_dependencies.yml b/.github/workflows/build_dependencies.yml index 1e73061a1..122f825af 100644 --- a/.github/workflows/build_dependencies.yml +++ b/.github/workflows/build_dependencies.yml @@ -214,6 +214,7 @@ jobs: with: limit-access-to-actor: true detached: true + connect-timeout-seconds: 60 if: ${{ inputs.testing == 'True' }} - name: Create and Test Package diff --git a/conanfile.py 
b/conanfile.py
index 74801da73..4f7f2de3d 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@ class HomestoreConan(ConanFile):
     name = "homestore"
-    version = "6.13.5"
+    version = "6.13.6"
 
     homepage = "https://github.com/eBay/Homestore"
     description = "HomeStore Storage Engine"

diff --git a/src/include/homestore/index/index_table.hpp b/src/include/homestore/index/index_table.hpp
index 31c793bdf..9d8387a32 100644
--- a/src/include/homestore/index/index_table.hpp
+++ b/src/include/homestore/index/index_table.hpp
@@ -60,8 +60,6 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
     IndexTable(uuid_t uuid, uuid_t parent_uuid, uint32_t user_sb_size, const BtreeConfig& cfg) :
             Btree< K, V >{cfg}, m_sb{"index"} {
-        this->m_bt_cfg.m_merge_turned_on = HS_DYNAMIC_CONFIG(btree.merge_turned_on);
-        this->m_bt_cfg.m_max_merge_level = HS_DYNAMIC_CONFIG(btree.max_merge_level);
         // Create a superblk for the index table and create MetaIndexBuffer corresponding to that
         m_sb.create(sizeof(index_table_sb));
         m_sb->uuid = uuid;
@@ -79,8 +77,6 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
     }
 
     IndexTable(superblk< index_table_sb >&& sb, const BtreeConfig& cfg) : Btree< K, V >{cfg}, m_sb{std::move(sb)} {
-        this->m_bt_cfg.m_merge_turned_on = HS_DYNAMIC_CONFIG(btree.merge_turned_on);
-        this->m_bt_cfg.m_max_merge_level = HS_DYNAMIC_CONFIG(btree.max_merge_level);
         m_sb_buffer = std::make_shared< MetaIndexBuffer >(m_sb);
 
         // After recovery, we see that root node is empty, which means that after btree is created, we crashed.

diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs
index cf400ee73..a661da497 100644
--- a/src/lib/common/homestore_config.fbs
+++ b/src/lib/common/homestore_config.fbs
@@ -57,12 +57,6 @@ table Btree {
     max_nodes_to_rebalance: uint32 = 3;
 
     mem_btree_page_size: uint32 = 8192;
-
-    /* Maximum level of btree merge operation enabled while removig keys. */
-    max_merge_level: uint8 = 1;
-
-    /* Merge enabled */
-    merge_turned_on: bool = true;
 }
 
 table Cache {

diff --git a/src/tests/btree_helpers/btree_test_helper.hpp b/src/tests/btree_helpers/btree_test_helper.hpp
index 227fd0ee0..0ff207f0d 100644
--- a/src/tests/btree_helpers/btree_test_helper.hpp
+++ b/src/tests/btree_helpers/btree_test_helper.hpp
@@ -44,16 +44,9 @@ struct BtreeTestHelper {
     void SetUp() {
         m_cfg.m_leaf_node_type = T::leaf_node_type;
         m_cfg.m_int_node_type = T::interior_node_type;
-        if (SISL_OPTIONS.count("disable_merge")) {
-            HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) {
-                s.btree.merge_turned_on = false;
-                HS_SETTINGS_FACTORY().save();
-            });
-        }
-        HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) {
-            s.btree.max_merge_level = SISL_OPTIONS["max_merge_level"].as< uint8_t >();
-            HS_SETTINGS_FACTORY().save();
-        });
+        m_cfg.m_max_merge_level = SISL_OPTIONS["max_merge_level"].as< uint8_t >();
+        if (SISL_OPTIONS.count("disable_merge")) { m_cfg.m_merge_turned_on = false; }
+
         m_max_range_input = SISL_OPTIONS["num_entries"].as< uint32_t >();
 
         if (m_is_multi_threaded) {

From f932e2b9ac7d1f1dbec2e0d9362069112b26e517 Mon Sep 17 00:00:00 2001
From: Sanal
Date: Mon, 5 May 2025 11:02:39 -0700
Subject: [PATCH 117/170] Use application context in req struct for multi put
 and multi remove. (#712)

shift() is now passed the app context, which gives users additional control
over what to serialize.
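As a minimal sketch of how a consumer might use the new parameter (MyCtx and
its stride member are hypothetical, not part of this change; the in-tree
TestIntervalKey simply does "m_offset += n" and ignores app_ctx):

    // Hypothetical consumer key whose shift distance depends on app context.
    struct MyCtx { int stride{1}; };

    class MyIntervalKey : public homestore::BtreeIntervalKey {
    public:
        void shift(int n, void* app_ctx) override {
            // app_ctx steers how far one logical step moves the key.
            int const stride = app_ctx ? static_cast< MyCtx* >(app_ctx)->stride : 1;
            m_offset += n * stride;
        }
        // ... remaining BtreeKey overrides elided ...
    private:
        uint32_t m_offset{0};
    };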
--- conanfile.py | 2 +- src/include/homestore/btree/btree_kv.hpp | 4 ++-- .../homestore/btree/detail/btree_mutate_impl.ipp | 6 +++--- .../homestore/btree/detail/btree_remove_impl.ipp | 3 ++- .../homestore/btree/detail/prefix_node.hpp | 16 +++++++++------- .../homestore/btree/detail/variant_node.hpp | 14 ++++++++------ src/tests/btree_helpers/btree_test_kvs.hpp | 4 ++-- src/tests/btree_helpers/shadow_map.hpp | 13 ++++++------- src/tests/test_btree_node.cpp | 11 +++++------ 9 files changed, 38 insertions(+), 35 deletions(-) diff --git a/conanfile.py b/conanfile.py index 4f7f2de3d..810a307e5 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.13.6" + version = "6.13.7" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/btree/btree_kv.hpp b/src/include/homestore/btree/btree_kv.hpp index c995a7cc9..cbadc3830 100644 --- a/src/include/homestore/btree/btree_kv.hpp +++ b/src/include/homestore/btree/btree_kv.hpp @@ -61,7 +61,7 @@ class BtreeKey { // integers, but it needs to be able to get next or prev key from a given key in the key range class BtreeIntervalKey : public BtreeKey { public: - virtual void shift(int n) = 0; + virtual void shift(int n, void* app_ctx) = 0; virtual int distance(BtreeKey const& from) const = 0; bool is_interval_key() const override { return true; } @@ -142,7 +142,7 @@ class BtreeValue { class BtreeIntervalValue : public BtreeValue { public: - virtual void shift(int n) = 0; + virtual void shift(int n, void* app_ctx) = 0; virtual sisl::blob serialize_prefix() const = 0; virtual sisl::blob serialize_suffix() const = 0; diff --git a/src/include/homestore/btree/detail/btree_mutate_impl.ipp b/src/include/homestore/btree/detail/btree_mutate_impl.ipp index 3cfc19a18..5247a6e22 100644 --- a/src/include/homestore/btree/detail/btree_mutate_impl.ipp +++ b/src/include/homestore/btree/detail/btree_mutate_impl.ipp @@ -175,15 +175,15 @@ btree_status_t Btree< K, V >::mutate_write_leaf_node(const BtreeNodePtr& my_node if constexpr (std::is_same_v< ReqT, BtreeRangePutRequest< K > >) { K last_failed_key; ret = to_variant_node(my_node)->multi_put(req.working_range(), req.input_range().start_key(), *req.m_newval, - req.m_put_type, &last_failed_key, req.m_filter_cb); + req.m_put_type, &last_failed_key, req.m_filter_cb, req.m_app_context); if (ret == btree_status_t::has_more) { req.shift_working_range(std::move(last_failed_key), true /* make it including last_failed_key */); } else if (ret == btree_status_t::success) { req.shift_working_range(); } } else if constexpr (std::is_same_v< ReqT, BtreeSinglePutRequest >) { - ret = to_variant_node(my_node)->put(req.key(), req.value(), req.m_put_type, req.m_existing_val, - req.m_filter_cb); + ret = + to_variant_node(my_node)->put(req.key(), req.value(), req.m_put_type, req.m_existing_val, req.m_filter_cb); COUNTER_INCREMENT(m_metrics, btree_obj_count, 1); } diff --git a/src/include/homestore/btree/detail/btree_remove_impl.ipp b/src/include/homestore/btree/detail/btree_remove_impl.ipp index ccfe0f584..66955b6c7 100644 --- a/src/include/homestore/btree/detail/btree_remove_impl.ipp +++ b/src/include/homestore/btree/detail/btree_remove_impl.ipp @@ -34,7 +34,8 @@ btree_status_t Btree< K, V >::do_remove(const BtreeNodePtr& my_node, locktype_t if constexpr (std::is_same_v< ReqT, BtreeSingleRemoveRequest >) { if ((modified = my_node->remove_one(req.key(), nullptr, req.m_outval))) { ++removed_count; } } else if 
constexpr (std::is_same_v< ReqT, BtreeRangeRemoveRequest< K > >) { - removed_count = to_variant_node(my_node)->multi_remove(req.working_range(), req.m_filter_cb); + removed_count = + to_variant_node(my_node)->multi_remove(req.working_range(), req.m_filter_cb, req.m_app_context); modified = (removed_count != 0); req.shift_working_range(); } else if constexpr (std::is_same_v< ReqT, BtreeRemoveAnyRequest< K > >) { diff --git a/src/include/homestore/btree/detail/prefix_node.hpp b/src/include/homestore/btree/detail/prefix_node.hpp index 072b4f654..cbcdc5257 100644 --- a/src/include/homestore/btree/detail/prefix_node.hpp +++ b/src/include/homestore/btree/detail/prefix_node.hpp @@ -176,12 +176,13 @@ class FixedPrefixNode : public VariantNode< K, V > { /// batch_upsert_decision_t::remove, the entry is removed from the node. /// batch_upsert_decision_t::keep, the entry is not modified and the method moves on to the /// next entry. + /// @param app_ctx User supplied private context data. /// @return An optional key that was not upserted due to lack of space in the node. /// If all keys were upserted successfully, the method returns std::nullopt. /// If the method ran out of space in the node, the method returns the key that was last upserted btree_status_t multi_put(BtreeKeyRange< K > const& keys, BtreeKey const& first_input_key, BtreeValue const& val, - btree_put_type put_type, K* last_failed_key, - put_filter_cb_t const& filter_cb = nullptr) override { + btree_put_type put_type, K* last_failed_key, put_filter_cb_t const& filter_cb = nullptr, + void* app_ctx = nullptr) override { DEBUG_ASSERT_EQ(this->is_leaf(), true, "Multi put entries on node are supported only for leaf nodes"); if constexpr (std::is_base_of_v< BtreeIntervalKey, K > && std::is_base_of_v< BtreeIntervalValue, V >) { uint32_t modified{0}; @@ -189,7 +190,7 @@ class FixedPrefixNode : public VariantNode< K, V > { uint16_t prefix_slot{std::numeric_limits< uint16_t >::max()}; K cur_key = keys.start_key(); - if (!keys.is_start_inclusive()) { cur_key.shift(1); } + if (!keys.is_start_inclusive()) { cur_key.shift(1, app_ctx); } if (!has_room(1u)) { return btree_status_t::space_not_avail; } bool upserted_all{false}; @@ -233,11 +234,11 @@ class FixedPrefixNode : public VariantNode< K, V > { prefix_slot = add_prefix(cur_key, val); } V new_val{s_cast< V const& >(val)}; - new_val.shift(s_cast< K const& >(cur_key).distance(first_input_key)); + new_val.shift(s_cast< K const& >(cur_key).distance(first_input_key), app_ctx); write_suffix(idx, prefix_slot, cur_key, new_val); } - cur_key.shift(1); + cur_key.shift(1, app_ctx); if (!has_room(1u)) { break; } if (decision != put_filter_decision::remove) { ++idx; } @@ -274,11 +275,12 @@ class FixedPrefixNode : public VariantNode< K, V > { * * @return Returns number of objects removed */ - uint32_t multi_remove(BtreeKeyRange< K > const& keys, remove_filter_cb_t const& filter_cb = nullptr) override { + uint32_t multi_remove(BtreeKeyRange< K > const& keys, remove_filter_cb_t const& filter_cb = nullptr, + void* app_ctx = nullptr) override { DEBUG_ASSERT_EQ(this->is_leaf(), true, "remove_batch api is supported only for leaf node"); if constexpr (std::is_base_of_v< BtreeIntervalKey, K > && std::is_base_of_v< BtreeIntervalValue, V >) { K cur_key = keys.start_key(); - if (!keys.is_start_inclusive()) { cur_key.shift(1); } + if (!keys.is_start_inclusive()) { cur_key.shift(1, app_ctx); } uint32_t num_removed{0}; auto [_, idx] = this->find(cur_key, nullptr, false); diff --git 
a/src/include/homestore/btree/detail/variant_node.hpp b/src/include/homestore/btree/detail/variant_node.hpp index 004313ce1..283ca114a 100644 --- a/src/include/homestore/btree/detail/variant_node.hpp +++ b/src/include/homestore/btree/detail/variant_node.hpp @@ -195,8 +195,8 @@ class VariantNode : public StoreSpecificBtreeNode { /// is used as a filter to remove anything that needn't be updated. /// @return A status code indicating whether the operation was successful. /// - virtual btree_status_t put(BtreeKey const &key, BtreeValue const &val, btree_put_type put_type, - BtreeValue *existing_val, put_filter_cb_t const &filter_cb = nullptr) { + virtual btree_status_t put(BtreeKey const& key, BtreeValue const& val, btree_put_type put_type, + BtreeValue* existing_val, put_filter_cb_t const& filter_cb = nullptr) { LOGMSG_ASSERT_EQ(magic(), BTREE_NODE_MAGIC, "Magic mismatch on btree_node {}", get_persistent_header_const()->to_string()); auto ret = btree_status_t::success; @@ -210,7 +210,7 @@ class VariantNode : public StoreSpecificBtreeNode { if (existing_val) { get_nth_value(idx, existing_val, true); } if (filter_cb && filter_cb(get_nth_key< K >(idx, false), get_nth_value(idx, false), val) != - put_filter_decision::replace) { + put_filter_decision::replace) { LOGINFO("Filter callback rejected the update for key {}", key.to_string()); return btree_status_t::filtered_out; } @@ -229,7 +229,7 @@ class VariantNode : public StoreSpecificBtreeNode { } update(idx, key, val); } else if (put_type == btree_put_type::UPSERT) { - found ? update(idx, key, val) : (void) insert(idx, key, val); + found ? update(idx, key, val) : (void)insert(idx, key, val); } else { DEBUG_ASSERT(false, "Wrong put_type {}", put_type); } @@ -251,13 +251,14 @@ class VariantNode : public StoreSpecificBtreeNode { /// put_filter_decision::replace, the entry is upserted with the new value. /// put_filter_decision::remove, the entry is removed from the node. /// put_filter_decision::keep, the entry is not modified and the method moves on to the next entry. + /// @param app_ctx User supplied private context data. /// @return Btree status typically . /// If all keys were upserted successfully, the method returns btree_status_t::success. /// If the method ran out of space in the node, the method returns the key that was last put and the status /// as btree_status_t::has_more virtual btree_status_t multi_put(BtreeKeyRange< K > const& keys, BtreeKey const&, BtreeValue const& val, btree_put_type put_type, K* last_failed_key, - put_filter_cb_t const& filter_cb = nullptr) { + put_filter_cb_t const& filter_cb = nullptr, void* app_ctx = nullptr) { if (put_type != btree_put_type::UPDATE) { DEBUG_ASSERT(false, "For non-interval keys multi-put should be really update and cannot insert"); return btree_status_t::not_supported; @@ -291,7 +292,8 @@ class VariantNode : public StoreSpecificBtreeNode { } ///////////////////////////////////////// Remove related APIs of the node ///////////////////////////////////////// - virtual uint32_t multi_remove(BtreeKeyRange< K > const& keys, remove_filter_cb_t const& filter_cb = nullptr) { + virtual uint32_t multi_remove(BtreeKeyRange< K > const& keys, remove_filter_cb_t const& filter_cb = nullptr, + void* usr_ctx = nullptr) { DEBUG_ASSERT_EQ(this->is_leaf(), true, "Multi put entries on node are supported only for leaf nodes"); // Match the key range to get start and end idx. 
If none of the ranges here matches, we have to return not_found diff --git a/src/tests/btree_helpers/btree_test_kvs.hpp b/src/tests/btree_helpers/btree_test_kvs.hpp index 0aebc77bc..86d83a35c 100644 --- a/src/tests/btree_helpers/btree_test_kvs.hpp +++ b/src/tests/btree_helpers/btree_test_kvs.hpp @@ -319,7 +319,7 @@ class TestIntervalKey : public BtreeIntervalKey { static uint32_t get_fixed_size() { return sizeof(TestIntervalKey); } /////////////////// Overriding methods of BtreeIntervalKey ///////////////// - void shift(int n) override { m_offset += n; } + void shift(int n, void* app_ctx) override { m_offset += n; } int distance(BtreeKey const& f) const override { TestIntervalKey const& from = s_cast< TestIntervalKey const& >(f); @@ -536,7 +536,7 @@ class TestIntervalValue : public BtreeIntervalValue { } ///////////////////////////// Overriding methods of BtreeIntervalValue ////////////////////////// - void shift(int n) override { m_offset += n; } + void shift(int n, void* app_ctx) override { m_offset += n; } sisl::blob serialize_prefix() const override { return sisl::blob{uintptr_cast(const_cast< uint32_t* >(&m_base_val)), uint32_cast(sizeof(uint32_t))}; diff --git a/src/tests/btree_helpers/shadow_map.hpp b/src/tests/btree_helpers/shadow_map.hpp index 3e8c998ef..6e7310c3f 100644 --- a/src/tests/btree_helpers/shadow_map.hpp +++ b/src/tests/btree_helpers/shadow_map.hpp @@ -3,7 +3,6 @@ #include "btree_test_kvs.hpp" - template < typename K, typename V > class ShadowMap { private: @@ -12,11 +11,11 @@ class ShadowMap { uint32_t m_max_keys; using mutex = iomgr::FiberManagerLib::shared_mutex; mutex m_mutex; -//#define SHOWM(X) cout << #X " = " << (X) << endl -// void testPrint(std::map< uint32_t, std::string >& m_map, int i) { -// SHOWM(m[i]); -// SHOWM(m.find(i)->first); -// } + // #define SHOWM(X) cout << #X " = " << (X) << endl + // void testPrint(std::map< uint32_t, std::string >& m_map, int i) { + // SHOWM(m[i]); + // SHOWM(m.find(i)->first); + // } public: ShadowMap(uint32_t num_keys) : m_range_scheduler(num_keys), m_max_keys{num_keys} {} @@ -41,7 +40,7 @@ class ShadowMap { for (uint32_t i{0}; i < count; ++i) { K key{start_k + i}; V range_value{val}; - if constexpr (std::is_same_v< V, TestIntervalValue >) { range_value.shift(i); } + if constexpr (std::is_same_v< V, TestIntervalValue >) { range_value.shift(i, nullptr); } m_map.insert_or_assign(key, range_value); } m_range_scheduler.put_keys(start_k, start_k + count - 1); diff --git a/src/tests/test_btree_node.cpp b/src/tests/test_btree_node.cpp index 2b1a02e71..3046a45bd 100644 --- a/src/tests/test_btree_node.cpp +++ b/src/tests/test_btree_node.cpp @@ -92,12 +92,11 @@ struct NodeTest : public testing::Test { auto expected_status = btree_status_t::success; if (m_shadow_map.contains(key)) { - expected_status = put_type != btree_put_type::INSERT - ? btree_status_t::success - : btree_status_t::already_exists; + expected_status = + put_type != btree_put_type::INSERT ? 
btree_status_t::success : btree_status_t::already_exists;
         }
-        ASSERT_EQ(status, expected_status) << "Expected put of key " << k << " of put_type " << enum_name(put_type)
-                                           << " to be " << expected_status;
+        ASSERT_EQ(status, expected_status)
+            << "Expected put of key " << k << " of put_type " << enum_name(put_type) << " to be " << expected_status;
 
         if (expected_status == btree_status_t::success) {
             m_shadow_map.insert(std::make_pair(key, value));
         } else {
@@ -131,7 +130,7 @@ struct NodeTest : public testing::Test {
         for (uint32_t i{0}; i < count; ++i) {
             K key{k + i};
             V range_value{value};
-            if constexpr (std::is_same_v< V, TestIntervalValue >) { range_value.shift(i); }
+            if constexpr (std::is_same_v< V, TestIntervalValue >) { range_value.shift(i, nullptr); }
 
             if (m_shadow_map.find(key) != m_shadow_map.end()) {
                 if (put_type != btree_put_type::INSERT) { m_shadow_map.insert_or_assign(key, range_value); }

From f30f0d4452519e2e87edc6c3d3bfd07c50ad80bf Mon Sep 17 00:00:00 2001
From: Yaming Kuang <1477567+yamingk@users.noreply.github.com>
Date: Mon, 5 May 2025 19:04:13 -0700
Subject: [PATCH 118/170] Issue 713: Fix index table destroy race with
 wb_cache cp flush (#714)
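The race: IndexTable::destroy() can run while a wb_cache cp flush is still
copying the table's superblk into its MetaIndexBuffer. The fix marks the
buffer invalid at destroy time (the new m_valid flag) so the flush path can
skip it. A rough sketch of the flush-side check follows; flush_meta_buffer()
is illustrative, not the actual wb_cache function, and only the m_valid flag
is from this patch:

    // Sketch only: skip a meta buffer whose index table was destroyed
    // concurrently, instead of copying from a freed superblk.
    void flush_meta_buffer(shared< MetaIndexBuffer > const& buf) {
        if (!buf->m_valid) { return; } // destroy() already ran; nothing to persist
        buf->copy_sb_to_buf();         // safe only while the superblk is alive
    }

---
 conanfile.py                                  |   2 +-
 .../homestore/index/index_internal.hpp        |   3 +-
 src/include/homestore/index/index_table.hpp   | 744 +++++++++---------
 src/lib/index/index_service.cpp               |   4 +-
 src/lib/index/wb_cache.cpp                    |  27 +-
 5 files changed, 401 insertions(+), 379 deletions(-)

diff --git a/conanfile.py b/conanfile.py
index 810a307e5..495c144cd 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@ class HomestoreConan(ConanFile):
     name = "homestore"
-    version = "6.13.7"
+    version = "6.13.8"
 
     homepage = "https://github.com/eBay/Homestore"
     description = "HomeStore Storage Engine"

diff --git a/src/include/homestore/index/index_internal.hpp b/src/include/homestore/index/index_internal.hpp
index c411edf70..6918f9741 100644
--- a/src/include/homestore/index/index_internal.hpp
+++ b/src/include/homestore/index/index_internal.hpp
@@ -96,7 +96,7 @@ struct IndexBuffer : public sisl::ObjLifeCounter< IndexBuffer > {
     cp_id_t m_created_cp_id{-1}; // CP id when this buffer is created.
     std::atomic< index_buf_state_t > m_state{index_buf_state_t::CLEAN}; // Is buffer yet to persist?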
uint8_t* m_bytes{nullptr}; // Actual data buffer - uint32_t m_node_level{0}; //levels of the node in the btree + uint32_t m_node_level{0}; // levels of the node in the btree std::shared_ptr< IndexBuffer > m_up_buffer; // Parent buffer in the chain to persisted sisl::atomic_counter< int > m_wait_for_down_buffers{0}; // Number of children need to wait for before persisting @@ -145,6 +145,7 @@ struct MetaIndexBuffer : public IndexBuffer { virtual ~MetaIndexBuffer(); void copy_sb_to_buf(); + bool m_valid{true}; superblk< index_table_sb >& m_sb; }; diff --git a/src/include/homestore/index/index_table.hpp b/src/include/homestore/index/index_table.hpp index 9d8387a32..121a136fd 100644 --- a/src/include/homestore/index/index_table.hpp +++ b/src/include/homestore/index/index_table.hpp @@ -109,6 +109,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { auto cpg = cp_mgr().cp_guard(); Btree< K, V >::destroy_btree(cpg.context(cp_consumer_t::INDEX_SVC)); m_sb.destroy(); + m_sb_buffer->m_valid = false; decr_pending_request_num(); return btree_status_t::success; } @@ -438,421 +439,436 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } // - btree_status_t repair_links(BtreeNodePtr const& parent_node, void* cp_ctx) { - LOGTRACEMOD(wbcache, "Repairing links for parent node [{}]", parent_node->to_string()); - // TODO: is it possible that repairing many nodes causes an increase to level of btree? If so, then this - // needs to be handled. Get the last key in the node - - auto last_parent_key = parent_node->get_last_key< K >(); - auto const is_parent_edge_node = parent_node->has_valid_edge(); - if ((parent_node->total_entries() == 0) && !is_parent_edge_node) { - BT_LOG_ASSERT(false, "Parent node={} is empty and not an edge node but was asked to repair", - parent_node->node_id()); - return btree_status_t::not_found; - } + btree_status_t repair_links(BtreeNodePtr const& parent_node, void* cp_ctx) { + LOGTRACEMOD(wbcache, "Repairing links for parent node [{}]", parent_node->to_string()); + // TODO: is it possible that repairing many nodes causes an increase to level of btree? If so, then this + // needs to be handled. 
Get the last key in the node + + auto last_parent_key = parent_node->get_last_key< K >(); + auto const is_parent_edge_node = parent_node->has_valid_edge(); + if ((parent_node->total_entries() == 0) && !is_parent_edge_node) { + BT_LOG_ASSERT(false, "Parent node={} is empty and not an edge node but was asked to repair", + parent_node->node_id()); + return btree_status_t::not_found; + } - // Get all original child ids as a support to check if we are beyond the last child node - std::unordered_map< bnodeid_t, K > orig_child_infos; - for (uint32_t i = 0; i < parent_node->total_entries(); ++i) { - BtreeLinkInfo link_info; - parent_node->get_nth_value(i, &link_info, true); - orig_child_infos[link_info.bnode_id()] = parent_node->get_nth_key< K >(i, false /* copy */); - } - LOGTRACEMOD(wbcache, "Repairing node=[{}] with last_parent_key={}", parent_node->to_string(), - last_parent_key.to_string()); + // Get all original child ids as a support to check if we are beyond the last child node + std::unordered_map< bnodeid_t, K > orig_child_infos; + for (uint32_t i = 0; i < parent_node->total_entries(); ++i) { + BtreeLinkInfo link_info; + parent_node->get_nth_value(i, &link_info, true); + orig_child_infos[link_info.bnode_id()] = parent_node->get_nth_key< K >(i, false /* copy */); + } + LOGTRACEMOD(wbcache, "Repairing node=[{}] with last_parent_key={}", parent_node->to_string(), + last_parent_key.to_string()); + + // Get the first child node and its link info + BtreeLinkInfo child_info; + BtreeNodePtr child_node; + BtreeNodePtr pre_child_node; + auto ret = this->get_child_and_lock_node(parent_node, 0, child_info, child_node, locktype_t::READ, + locktype_t::READ, cp_ctx); + if (ret != btree_status_t::success) { + BT_LOG_ASSERT(false, "Parent node={} repair failed, because first child_node get has failed with ret={}", + parent_node->node_id(), enum_name(ret)); + return ret; + } - // Get the first child node and its link info - BtreeLinkInfo child_info; - BtreeNodePtr child_node; - BtreeNodePtr pre_child_node; - auto ret = this->get_child_and_lock_node(parent_node, 0, child_info, child_node, locktype_t::READ, - locktype_t::READ, cp_ctx); - if (ret != btree_status_t::success) { - BT_LOG_ASSERT(false, "Parent node={} repair failed, because first child_node get has failed with ret={}", parent_node->node_id(), enum_name(ret)); - return ret; - } + // update the last key of parent for issue + // 1- last key is X for parent (P) + // 2- check the non deleted last child (A) last key (here is Y) + // start from first child and store the last key of the child node, then traverse to next sibling + // 2-1- if this is greater than parent last key, traverse for sibling of parent until reaches to + // siblings which has keys more than Y or end of list (name this parent sibling node F), + // 2-2- Put last key of F to last key of P + // 2-3 - set F as Next of A + BtreeNodeList siblings; + BtreeNodePtr next_cur_child; + BT_DBG_ASSERT(parent_node->has_valid_edge() || parent_node->total_entries(), + "parent node {} doesn't have valid edge and no entries ", parent_node->to_string()); + if (parent_node->total_entries() > 0) { + auto updated_last_key = last_parent_key; + K last_child_last_key; + K last_child_neighbor_key; + BtreeNodePtr cur_child; + BtreeLinkInfo cur_child_info; - // update the last key of parent for issue - // 1- last key is X for parent (P) - // 2- check the non deleted last child (A) last key (here is Y) - // start from first child and store the last key of the child node, then traverse to next sibling - // 2-1- 
if this is greater than parent last key, traverse for sibling of parent until reaches to - //siblings which has keys more than Y or end of list (name this parent sibling node F), - // 2-2- Put last key of F to last key of P - // 2-3 - set F as Next of A - BtreeNodeList siblings; - BtreeNodePtr next_cur_child; - BT_DBG_ASSERT(parent_node->has_valid_edge() || parent_node->total_entries(), - "parent node {} doesn't have valid edge and no entries ", parent_node->to_string()); - if (parent_node->total_entries() > 0) { - auto updated_last_key = last_parent_key; - K last_child_last_key; - K last_child_neighbor_key; - BtreeNodePtr cur_child; - BtreeLinkInfo cur_child_info; - - bool found_child = false; - uint32_t nentries = parent_node->total_entries() + parent_node->has_valid_edge() ? 1 : 0; - - for (uint32_t i = nentries; i-- > 0;) { - parent_node->get_nth_value(i, &cur_child_info, false /* copy */); - if (auto ret = read_node_impl(cur_child_info.bnode_id(), cur_child); ret == - btree_status_t::success) { - if (!cur_child->is_node_deleted() && cur_child->total_entries()) { - last_child_last_key = cur_child->get_last_key< K >(); - if (cur_child->next_bnode() != empty_bnodeid && - read_node_impl(cur_child->next_bnode(), next_cur_child) == btree_status_t::success) { - LOGTRACEMOD(wbcache, - "Last child last key {} for child_node [{}] parent node [{}], next neigbor is [{}]", last_child_last_key.to_string(), - cur_child->to_string(), parent_node->to_string(), - next_cur_child->to_string()); - found_child = true; - break; - } + bool found_child = false; + uint32_t nentries = parent_node->total_entries() + parent_node->has_valid_edge() ? 1 : 0; + + for (uint32_t i = nentries; i-- > 0;) { + parent_node->get_nth_value(i, &cur_child_info, false /* copy */); + if (auto ret = read_node_impl(cur_child_info.bnode_id(), cur_child); ret == btree_status_t::success) { + if (!cur_child->is_node_deleted() && cur_child->total_entries()) { + last_child_last_key = cur_child->get_last_key< K >(); + if (cur_child->next_bnode() != empty_bnodeid && + read_node_impl(cur_child->next_bnode(), next_cur_child) == btree_status_t::success) { + LOGTRACEMOD( + wbcache, + "Last child last key {} for child_node [{}] parent node [{}], next neigbor is [{}]", + last_child_last_key.to_string(), cur_child->to_string(), parent_node->to_string(), + next_cur_child->to_string()); found_child = true; break; } - LOGTRACEMOD(wbcache, "PASSING child node {} so we need to check next child node", - cur_child->to_string()); + found_child = true; + break; } + LOGTRACEMOD(wbcache, "PASSING child node {} so we need to check next child node", + cur_child->to_string()); } + } - if (found_child) { - LOGTRACEMOD(wbcache, "Last child last key {} for parent node {}, child_node {}", - last_child_last_key.to_string(), parent_node->to_string(), cur_child->to_string()); - if (last_child_last_key.compare(last_parent_key) > 0) { - if (next_cur_child) { - last_child_neighbor_key = next_cur_child->get_last_key< K >(); - LOGTRACEMOD(wbcache, - "Voila !! 
last child_key of child [{}] is greater than its parents [{}] and its next neighbor key is {}", cur_child->to_string(), - parent_node->to_string(), last_child_neighbor_key.to_string()); - } else { - LOGTRACEMOD( - wbcache, - "Last child_key of child [{}] is greater than its parents [{}] and it has no next neighbor", cur_child->to_string(), parent_node->to_string()); - } + if (found_child) { + LOGTRACEMOD(wbcache, "Last child last key {} for parent node {}, child_node {}", + last_child_last_key.to_string(), parent_node->to_string(), cur_child->to_string()); + if (last_child_last_key.compare(last_parent_key) > 0) { + if (next_cur_child) { + last_child_neighbor_key = next_cur_child->get_last_key< K >(); + LOGTRACEMOD(wbcache, + "Voila !! last child_key of child [{}] is greater than its parents [{}] and its " + "next neighbor key is {}", + cur_child->to_string(), parent_node->to_string(), + last_child_neighbor_key.to_string()); + } else { + LOGTRACEMOD( + wbcache, + "Last child_key of child [{}] is greater than its parents [{}] and it has no next neighbor", + cur_child->to_string(), parent_node->to_string()); + } - // 2-1 traverse for sibling of parent until reaches to siblings which has keys more than 7563 -// or end - // of list (put all siblings in a list, here is F) , - BtreeNodePtr sibling; - BtreeNodePtr true_sibling; - BtreeLinkInfo sibling_info; - - auto sibling_node_id = parent_node->next_bnode(); - while (sibling_node_id != empty_bnodeid) { - if (auto ret = read_node_impl(sibling_node_id, sibling); ret == btree_status_t::success) { - if (sibling->is_node_deleted()) { - // Do we need to free the sibling node here? - siblings.push_back(sibling); - sibling_node_id = sibling->next_bnode(); - LOGTRACEMOD(wbcache, "Sibling node [{}] is deleted, continue to next sibling", - sibling->to_string()); - continue; - } - auto sibling_last_key = sibling->get_last_key< K >(); - if (next_cur_child && sibling_last_key.compare(last_child_neighbor_key) < 0) { - siblings.push_back(sibling); - sibling_node_id = sibling->next_bnode(); - } else { - true_sibling = sibling; - break; - } + // 2-1 traverse for sibling of parent until reaches to siblings which has keys more than 7563 + // or end + // of list (put all siblings in a list, here is F) , + BtreeNodePtr sibling; + BtreeNodePtr true_sibling; + BtreeLinkInfo sibling_info; + + auto sibling_node_id = parent_node->next_bnode(); + while (sibling_node_id != empty_bnodeid) { + if (auto ret = read_node_impl(sibling_node_id, sibling); ret == btree_status_t::success) { + if (sibling->is_node_deleted()) { + // Do we need to free the sibling node here? 
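+                        // Note that the deleted sibling is only recorded and skipped here; this scan does
+                        // not free it, and the collected "siblings" list is used further below purely for
+                        // the "Sibling list" trace log.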
+ siblings.push_back(sibling); + sibling_node_id = sibling->next_bnode(); + LOGTRACEMOD(wbcache, "Sibling node [{}] is deleted, continue to next sibling", + sibling->to_string()); + continue; } - } - if (true_sibling) { - LOGTRACEMOD(wbcache, "True sibling [{}] for parent_node {}", - true_sibling->to_string(), - parent_node->to_string()); - } else { - LOGTRACEMOD(wbcache, "No true sibling found for parent_node [{}]", - parent_node->to_string()); - } - if (sibling_node_id != empty_bnodeid) { - last_parent_key = last_child_last_key; - parent_node->set_next_bnode(true_sibling->node_id()); - for (auto sibling : siblings) { - LOGTRACEMOD(wbcache, "Sibling list [{}]", sibling->to_string()); + auto sibling_last_key = sibling->get_last_key< K >(); + if (next_cur_child && sibling_last_key.compare(last_child_neighbor_key) < 0) { + siblings.push_back(sibling); + sibling_node_id = sibling->next_bnode(); + } else { + true_sibling = sibling; + break; } - LOGTRACEMOD(wbcache, "True sibling [{}]", true_sibling->to_string()); - BtreeLinkInfo first_child_info; - parent_node->get_nth_value(0, &first_child_info, false); } + } + if (true_sibling) { + LOGTRACEMOD(wbcache, "True sibling [{}] for parent_node {}", true_sibling->to_string(), + parent_node->to_string()); } else { - LOGTRACEMOD(wbcache, - "No undeleted child found for parent_node [{}], keep normal repair (regular recovery)", parent_node->to_string()); - next_cur_child = nullptr; + LOGTRACEMOD(wbcache, "No true sibling found for parent_node [{}]", parent_node->to_string()); + } + if (sibling_node_id != empty_bnodeid) { + last_parent_key = last_child_last_key; + parent_node->set_next_bnode(true_sibling->node_id()); + for (auto sibling : siblings) { + LOGTRACEMOD(wbcache, "Sibling list [{}]", sibling->to_string()); + } + LOGTRACEMOD(wbcache, "True sibling [{}]", true_sibling->to_string()); + BtreeLinkInfo first_child_info; + parent_node->get_nth_value(0, &first_child_info, false); } + } else { + LOGTRACEMOD(wbcache, + "No undeleted child found for parent_node [{}], keep normal repair (regular recovery)", + parent_node->to_string()); + next_cur_child = nullptr; } } + } - // Keep a copy of the node buffer, in case we need to revert back - uint8_t* tmp_buffer = new uint8_t[this->m_node_size]; - std::memcpy(tmp_buffer, parent_node->m_phys_node_buf, this->m_node_size); - - // Remove all the entries in parent_node and let walk across child_nodes rebuild this node - parent_node->remove_all(this->m_bt_cfg); - - // Walk across all child nodes until it gets the last_parent_key and keep fixing them. 
-            auto cur_parent = parent_node;
-            BtreeNodeList new_parent_nodes;
-            do {
-                if (child_node->has_valid_edge() || (child_node->is_leaf() && child_node->next_bnode() == empty_bnodeid)) {
-                    if (child_node->is_node_deleted()) {
-                        // Edge node is merged, we need to set the current last entry as edge
-                        if (cur_parent->total_entries() > 0) {
-                            auto prev_val = V{};
-                            cur_parent->get_nth_value(cur_parent->total_entries() - 1, &prev_val, true);
-                            cur_parent->remove(cur_parent->total_entries() - 1);
-                            cur_parent->set_edge_value(prev_val);
-                            LOGTRACEMOD(wbcache,
-                                        "Reparing node={}, child_node=[{}] is deleted, set previous as edge_value={}",
-                                        cur_parent->node_id(), child_node->to_string(), prev_val.to_string());
-                        } else {
-                            LOGTRACEMOD(wbcache, "Found an empty interior node {} with maybe all childs deleted",
-                                        cur_parent->node_id());
-                        }
+        // Keep a copy of the node buffer, in case we need to revert back
+        uint8_t* tmp_buffer = new uint8_t[this->m_node_size];
+        std::memcpy(tmp_buffer, parent_node->m_phys_node_buf, this->m_node_size);
+
+        // Remove all the entries in parent_node and let walk across child_nodes rebuild this node
+        parent_node->remove_all(this->m_bt_cfg);
+
+        // Walk across all child nodes until it gets the last_parent_key and keep fixing them.
+        auto cur_parent = parent_node;
+        BtreeNodeList new_parent_nodes;
+        do {
+            if (child_node->has_valid_edge() || (child_node->is_leaf() && child_node->next_bnode() == empty_bnodeid)) {
+                if (child_node->is_node_deleted()) {
+                    // Edge node is merged, we need to set the current last entry as edge
+                    if (cur_parent->total_entries() > 0) {
+                        auto prev_val = V{};
+                        cur_parent->get_nth_value(cur_parent->total_entries() - 1, &prev_val, true);
+                        cur_parent->remove(cur_parent->total_entries() - 1);
+                        cur_parent->set_edge_value(prev_val);
+                        LOGTRACEMOD(wbcache,
+                                    "Repairing node={}, child_node=[{}] is deleted, set previous as edge_value={}",
+                                    cur_parent->node_id(), child_node->to_string(), prev_val.to_string());
+                    } else {
+                        LOGTRACEMOD(wbcache, "Found an empty interior node {} with possibly all children deleted",
+                                    cur_parent->node_id());
+                    }
+                } else {
+                    // Update edge and finish
+                    if (is_parent_edge_node) {
+                        cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), child_node->link_version()});
                    } else {
-                        // Update edge and finish
-                        if (is_parent_edge_node) {
-                            cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(),
-                                                                     child_node->link_version()});
+                        auto tsib_id = find_true_sibling(cur_parent);
+                        if (tsib_id != empty_bnodeid) {
+                            cur_parent->set_next_bnode(tsib_id);
+                            LOGTRACEMOD(wbcache,
+                                        "True sibling [{}] for parent_node [{}], So don't add child [{}] here ",
+                                        tsib_id, cur_parent->to_string(), child_node->to_string());
                        } else {
-                            auto tsib_id = find_true_sibling(cur_parent);
-                            if (tsib_id != empty_bnodeid) {
-                                cur_parent->set_next_bnode(tsib_id);
+                            cur_parent->set_next_bnode(empty_bnodeid);
+                            // if this child node previously belonged to this parent node, we need to add it, but as
+                            // the edge; otherwise, not this node
+                            if (orig_child_infos.contains(child_node->node_id())) {
+                                cur_parent->set_edge_value(
+                                    BtreeLinkInfo{child_node->node_id(), child_node->link_version()});
                                LOGTRACEMOD(wbcache,
-                                            "True sibling [{}] for parent_node [{}], So don't add child [{}] here ",
-                                            tsib_id, cur_parent->to_string(), child_node->to_string());
+                                            "Child node [{}] is an edge node and previously belonged to this parent, "
+                                            "so we need to add it as edge",
+                                            child_node->to_string());
                            } else {
-                                cur_parent->set_next_bnode(empty_bnodeid);
-                                // if this child node previously 
belonged to this parent node, we need to add it but as edge o.w, not this node - if (orig_child_infos.contains(child_node->node_id())){ - cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), - child_node->link_version()}); - LOGTRACEMOD(wbcache, - "Child node [{}] is an edge node and previously belong to this parent, so we need to add it as edge", - child_node->to_string()); - } else { - LOGTRACEMOD(wbcache, "No true sibling found for parent_node [{}]", - cur_parent->to_string()); - } - BT_REL_ASSERT(cur_parent->total_entries() != 0 || cur_parent->has_valid_edge(), - "Parent node [{}] cannot be empty", cur_parent->to_string()); + LOGTRACEMOD(wbcache, "No true sibling found for parent_node [{}]", + cur_parent->to_string()); } + BT_REL_ASSERT(cur_parent->total_entries() != 0 || cur_parent->has_valid_edge(), + "Parent node [{}] cannot be empty", cur_parent->to_string()); } - -// -// } - break; } + + // + // } break; } + break; + } - auto child_last_key = child_node->get_last_key< K >(); - LOGTRACEMOD(wbcache, "Repairing node={}, child_node=[{}] child_last_key={}", cur_parent->node_id(), - child_node->to_string(), child_last_key.to_string()); - - // Check if we are beyond the last child node. - // - // There can be cases where the child level merge is successfully persisted but the parent level is - // not. In this case, you may have your rightmost child node with last key greater than the - // last_parent_key. That's why here we have to check if the child node is one of the original child - // nodes first. - if (!is_parent_edge_node && !orig_child_infos.contains(child_node->node_id())) { - if (child_last_key.compare(last_parent_key) > 0) { - // We have reached a child beyond this parent, we can stop now - // TODO this case if child last key is less than last parent key to update the parent node. - // this case can potentially break the btree for put and remove op. - break; - } - if (child_node->total_entries() == 0) { - // this child has no entries, but maybe in the middle of the parent node, we need to update the key - // of parent as previous one and go on - LOGTRACEMOD(wbcache, - "Reach to an empty child node {}, and this child doesn't belong to this parent; Hence loop ends", child_node->to_string()); - // now update the next of parent node by skipping all deleted siblings of this parent node - auto valid_sibling = cur_parent->next_bnode(); - while (valid_sibling != empty_bnodeid) { - BtreeNodePtr sibling; - if (read_node_impl(valid_sibling, sibling) == btree_status_t::success) { - if (sibling->is_node_deleted()) { - valid_sibling = sibling->next_bnode(); - continue; - } - // cur_parent->set_next_bnode(sibling->node_id()); - break; + auto child_last_key = child_node->get_last_key< K >(); + LOGTRACEMOD(wbcache, "Repairing node={}, child_node=[{}] child_last_key={}", cur_parent->node_id(), + child_node->to_string(), child_last_key.to_string()); + + // Check if we are beyond the last child node. + // + // There can be cases where the child level merge is successfully persisted but the parent level is + // not. In this case, you may have your rightmost child node with last key greater than the + // last_parent_key. That's why here we have to check if the child node is one of the original child + // nodes first. 
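+        // For example: if parent P's last key is 20 and a child-level merge was persisted without the
+        // matching parent-level update, walking the child chain can reach a node with last key 25 that
+        // was never P's child. Hence membership in orig_child_infos is tested before the key comparison
+        // below decides whether to stop.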
+ if (!is_parent_edge_node && !orig_child_infos.contains(child_node->node_id())) { + if (child_last_key.compare(last_parent_key) > 0) { + // We have reached a child beyond this parent, we can stop now + // TODO this case if child last key is less than last parent key to update the parent node. + // this case can potentially break the btree for put and remove op. + break; + } + if (child_node->total_entries() == 0) { + // this child has no entries, but maybe in the middle of the parent node, we need to update the key + // of parent as previous one and go on + LOGTRACEMOD(wbcache, + "Reach to an empty child node {}, and this child doesn't belong to this parent; Hence " + "loop ends", + child_node->to_string()); + // now update the next of parent node by skipping all deleted siblings of this parent node + auto valid_sibling = cur_parent->next_bnode(); + while (valid_sibling != empty_bnodeid) { + BtreeNodePtr sibling; + if (read_node_impl(valid_sibling, sibling) == btree_status_t::success) { + if (sibling->is_node_deleted()) { + valid_sibling = sibling->next_bnode(); + continue; } - LOGTRACEMOD(wbcache, "Failed to read child node {} for parent node [{}] reason {}", - valid_sibling, cur_parent->to_string(), ret); - } - if (valid_sibling != empty_bnodeid) { - cur_parent->set_next_bnode(valid_sibling); - LOGTRACEMOD(wbcache, "Repairing node={}, child_node=[{}] is an edge node, end loop", - cur_parent->node_id(), child_node->to_string()); - - } else { - cur_parent->set_next_bnode(empty_bnodeid); - LOGTRACEMOD(wbcache, "Repairing node={}, child_node=[{}] is an edge node, end loop", - cur_parent->node_id(), child_node->to_string()); + // cur_parent->set_next_bnode(sibling->node_id()); + break; } - - break; + LOGTRACEMOD(wbcache, "Failed to read child node {} for parent node [{}] reason {}", + valid_sibling, cur_parent->to_string(), ret); } - } + if (valid_sibling != empty_bnodeid) { + cur_parent->set_next_bnode(valid_sibling); + LOGTRACEMOD(wbcache, "Repairing node={}, child_node=[{}] is an edge node, end loop", + cur_parent->node_id(), child_node->to_string()); - if (!cur_parent->has_room_for_put(btree_put_type::INSERT, K::get_max_size(), - BtreeLinkInfo::get_fixed_size())) { - // No room in the parent_node, let us split the parent_node and continue - auto new_parent = this->alloc_interior_node(); - if (new_parent == nullptr) { - ret = btree_status_t::space_not_avail; - break; + } else { + cur_parent->set_next_bnode(empty_bnodeid); + LOGTRACEMOD(wbcache, "Repairing node={}, child_node=[{}] is an edge node, end loop", + cur_parent->node_id(), child_node->to_string()); } - new_parent->set_next_bnode(cur_parent->next_bnode()); - cur_parent->set_next_bnode(new_parent->node_id()); - new_parent->set_level(cur_parent->level()); - cur_parent->inc_link_version(); + break; + } + } - new_parent_nodes.push_back(new_parent); - cur_parent = std::move(new_parent); + if (!cur_parent->has_room_for_put(btree_put_type::INSERT, K::get_max_size(), + BtreeLinkInfo::get_fixed_size())) { + // No room in the parent_node, let us split the parent_node and continue + auto new_parent = this->alloc_interior_node(); + if (new_parent == nullptr) { + ret = btree_status_t::space_not_avail; + break; } - // Insert the last key of the child node into parent node - if (!child_node->is_node_deleted()) { - if (child_node->total_entries() == 0) { - if (orig_child_infos.contains(child_node->node_id())) { - child_last_key = orig_child_infos[child_node->node_id()]; - LOGTRACEMOD(wbcache, - "Reach to an empty child node [{}], but not the end 
of the parent node, so we need to update the key of parent node as original one {}", - child_node->to_string(), child_last_key.to_string()); - } else { - LOGTRACEMOD(wbcache, - "Reach to an empty child node [{}] but not belonging to this parent (probably next parent sibling); Hence end loop", child_node->to_string()); - break; - } - } - cur_parent->insert(cur_parent->total_entries(), child_last_key, - BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); - } else { - // Node deleted indicates it's freed & no longer used during recovery - LOGTRACEMOD(wbcache, "Repairing node={}, child node=[{}] is deleted, skipping the insert", - cur_parent->node_id(), child_node->to_string()); - if (pre_child_node) { - // We need to update the next of the previous child node to this child node + new_parent->set_next_bnode(cur_parent->next_bnode()); + cur_parent->set_next_bnode(new_parent->node_id()); + new_parent->set_level(cur_parent->level()); + cur_parent->inc_link_version(); - LOGTRACEMOD(wbcache, - "Repairing node={}, child_node=[{}] is deleted, set next of previous child node [{}] to this child node [{}]", cur_parent->node_id(), child_node->to_string(), - pre_child_node->to_string(), child_node->next_bnode()); - pre_child_node->set_next_bnode(child_node->next_bnode()); - // repairing the next of previous child node - // We need to set the state of the previous child node to clean, so that it can be flushed - IndexBtreeNode* idx_node = static_cast< IndexBtreeNode* >(pre_child_node.get()); - idx_node->m_idx_buf->set_state(index_buf_state_t::CLEAN); - write_node_impl(pre_child_node, cp_ctx); - // update the key of last entry of the parent with the last key of deleted child + new_parent_nodes.push_back(new_parent); + cur_parent = std::move(new_parent); + } + + // Insert the last key of the child node into parent node + if (!child_node->is_node_deleted()) { + if (child_node->total_entries() == 0) { + if (orig_child_infos.contains(child_node->node_id())) { child_last_key = orig_child_infos[child_node->node_id()]; - LOGTRACEMOD(wbcache, "updating parent [{}] current last key with {}", cur_parent->to_string(), - child_last_key.to_string()); - // update it here to go to the next child node and unlock this node - LOGTRACEMOD(wbcache, "update the child node next to the next of previous child node"); - child_node->set_next_bnode(child_node->next_bnode()); + LOGTRACEMOD(wbcache, + "Reach to an empty child node [{}], but not the end of the parent node, so we need " + "to update the key of parent node as original one {}", + child_node->to_string(), child_last_key.to_string()); + } else { + LOGTRACEMOD(wbcache, + "Reach to an empty child node [{}] but not belonging to this parent (probably next " + "parent sibling); Hence end loop", + child_node->to_string()); + break; } } + cur_parent->insert(cur_parent->total_entries(), child_last_key, + BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); + } else { + // Node deleted indicates it's freed & no longer used during recovery + LOGTRACEMOD(wbcache, "Repairing node={}, child node=[{}] is deleted, skipping the insert", + cur_parent->node_id(), child_node->to_string()); + if (pre_child_node) { + // We need to update the next of the previous child node to this child node + + LOGTRACEMOD(wbcache, + "Repairing node={}, child_node=[{}] is deleted, set next of previous child node [{}] " + "to this child node [{}]", + cur_parent->node_id(), child_node->to_string(), pre_child_node->to_string(), + child_node->next_bnode()); + 
pre_child_node->set_next_bnode(child_node->next_bnode()); + // repairing the next of previous child node + // We need to set the state of the previous child node to clean, so that it can be flushed + IndexBtreeNode* idx_node = static_cast< IndexBtreeNode* >(pre_child_node.get()); + idx_node->m_idx_buf->set_state(index_buf_state_t::CLEAN); + write_node_impl(pre_child_node, cp_ctx); + // update the key of last entry of the parent with the last key of deleted child + child_last_key = orig_child_infos[child_node->node_id()]; + LOGTRACEMOD(wbcache, "updating parent [{}] current last key with {}", cur_parent->to_string(), + child_last_key.to_string()); + // update it here to go to the next child node and unlock this node + LOGTRACEMOD(wbcache, "update the child node next to the next of previous child node"); + child_node->set_next_bnode(child_node->next_bnode()); + } + } - LOGTRACEMOD(wbcache, "Repairing node={}, repaired so_far=[{}]", cur_parent->node_id(), - cur_parent->to_string()); + LOGTRACEMOD(wbcache, "Repairing node={}, repaired so_far=[{}]", cur_parent->node_id(), + cur_parent->to_string()); - // Move to the next child node - auto const next_node_id = child_node->next_bnode(); - this->unlock_node(child_node, locktype_t::READ); + // Move to the next child node + auto const next_node_id = child_node->next_bnode(); + this->unlock_node(child_node, locktype_t::READ); + if (!child_node->is_node_deleted()) { + // We need to free the child node + pre_child_node = child_node; + } + if (next_node_id == empty_bnodeid) { + // This can be a deleted edge node - only check if it is still valid if (!child_node->is_node_deleted()) { - // We need to free the child node - pre_child_node = child_node; - } - if (next_node_id == empty_bnodeid) { - // This can be a deleted edge node - only check if it is still valid - if (!child_node->is_node_deleted()) { - BT_LOG_ASSERT(false, - "Child node={} next_node_id is empty, while its not a edge node, parent_node={} repair is partial", child_node->node_id(), parent_node->node_id()); - ret = btree_status_t::not_found; - } - child_node = nullptr; - break; - } - if (next_cur_child && next_node_id == next_cur_child->node_id()) { - // We are at the last child node, we can stop now - LOGTRACEMOD( - wbcache, - "REACH Repairing node={}, child_node=[{}] is the true child of sibling parent; Hence, end loop", child_node->node_id(), next_cur_child->to_string()); - child_node = nullptr; - break; - } - ret = this->read_and_lock_node(next_node_id, child_node, locktype_t::READ, locktype_t::READ, cp_ctx); - if (ret != btree_status_t::success) { - BT_LOG_ASSERT(false, "Parent node={} repair is partial, because child_node get has failed with ret={}", - parent_node->node_id(), enum_name(ret)); - child_node = nullptr; - break; + BT_LOG_ASSERT(false, + "Child node={} next_node_id is empty, while its not a edge node, parent_node={} " + "repair is partial", + child_node->node_id(), parent_node->node_id()); + ret = btree_status_t::not_found; } + child_node = nullptr; + break; + } + if (next_cur_child && next_node_id == next_cur_child->node_id()) { + // We are at the last child node, we can stop now + LOGTRACEMOD( + wbcache, + "REACH Repairing node={}, child_node=[{}] is the true child of sibling parent; Hence, end loop", + child_node->node_id(), next_cur_child->to_string()); + child_node = nullptr; + break; + } + ret = this->read_and_lock_node(next_node_id, child_node, locktype_t::READ, locktype_t::READ, cp_ctx); + if (ret != btree_status_t::success) { + BT_LOG_ASSERT(false, "Parent node={} 
repair is partial, because child_node get has failed with ret={}", + parent_node->node_id(), enum_name(ret)); + child_node = nullptr; + break; + } - } while (true); - - if (child_node) { this->unlock_node(child_node, locktype_t::READ); } - // if last parent has the key less than the last child key, then we need to update the parent node with - // the last child key if it doesn't have edge. - auto last_parent = parent_node; - if (new_parent_nodes.size() > 0) { last_parent = new_parent_nodes[new_parent_nodes.size() - 1]; } - if (last_parent->total_entries() && !last_parent->has_valid_edge()) { - if (last_parent->compare_nth_key(last_parent_key, last_parent->total_entries() - 1) < 0) { - BtreeLinkInfo child_info; - last_parent->get_nth_value(last_parent->total_entries() - 1, &child_info, false /* copy */); - parent_node->update(parent_node->total_entries() - 1, last_parent_key, child_info); - LOGTRACEMOD(wbcache, "Repairing parent node={} with last_parent_key={} and child_info={}", - parent_node->node_id(), last_parent_key.to_string(), child_info.to_string()); - } - // if last key of children is less than the last key of parent, then we need to update the last key of non interior child - if (last_parent->level() > 1 && !last_parent->has_valid_edge()) { - // read last child - BtreeNodePtr last_child; - BtreeLinkInfo child_info; - auto total_entries = last_parent->total_entries(); - last_parent->get_nth_value(total_entries - 1, &child_info, false /* copy */); - if (ret = read_node_impl(child_info.bnode_id(), last_child); ret == btree_status_t::success) { - // get last key of cur child - auto last_child_key = last_child->get_last_key< K >(); - BtreeLinkInfo last_child_info; - last_child->get_nth_value(last_child->total_entries() - 1, &last_child_info, false /* copy*/); - if (last_parent->compare_nth_key(last_child_key, total_entries - 1) > 0) { - auto cur_child_st = last_child->to_string(); - last_child->update(last_child->total_entries() - 1, last_parent_key, last_child_info); - LOGTRACEMOD(wbcache, - "Updating interior child node={} with last_parent_key={} and child_info={}", - cur_child_st, last_parent_key.to_string(), last_child_info.to_string()); - write_node_impl(last_child, cp_ctx); - } + } while (true); + + if (child_node) { this->unlock_node(child_node, locktype_t::READ); } + // if last parent has the key less than the last child key, then we need to update the parent node with + // the last child key if it doesn't have edge. 
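+    // last_parent below is the rightmost parent produced by this repair: parent_node itself when no split
+    // happened, otherwise the newest entry in new_parent_nodes. It drives the last-key fix-up that follows.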
+ auto last_parent = parent_node; + if (new_parent_nodes.size() > 0) { last_parent = new_parent_nodes[new_parent_nodes.size() - 1]; } + if (last_parent->total_entries() && !last_parent->has_valid_edge()) { + if (last_parent->compare_nth_key(last_parent_key, last_parent->total_entries() - 1) < 0) { + BtreeLinkInfo child_info; + last_parent->get_nth_value(last_parent->total_entries() - 1, &child_info, false /* copy */); + parent_node->update(parent_node->total_entries() - 1, last_parent_key, child_info); + LOGTRACEMOD(wbcache, "Repairing parent node={} with last_parent_key={} and child_info={}", + parent_node->node_id(), last_parent_key.to_string(), child_info.to_string()); + } + // if last key of children is less than the last key of parent, then we need to update the last key of non + // interior child + if (last_parent->level() > 1 && !last_parent->has_valid_edge()) { + // read last child + BtreeNodePtr last_child; + BtreeLinkInfo child_info; + auto total_entries = last_parent->total_entries(); + last_parent->get_nth_value(total_entries - 1, &child_info, false /* copy */); + if (ret = read_node_impl(child_info.bnode_id(), last_child); ret == btree_status_t::success) { + // get last key of cur child + auto last_child_key = last_child->get_last_key< K >(); + BtreeLinkInfo last_child_info; + last_child->get_nth_value(last_child->total_entries() - 1, &last_child_info, false /* copy*/); + if (last_parent->compare_nth_key(last_child_key, total_entries - 1) > 0) { + auto cur_child_st = last_child->to_string(); + last_child->update(last_child->total_entries() - 1, last_parent_key, last_child_info); + LOGTRACEMOD(wbcache, + "Updating interior child node={} with last_parent_key={} and child_info={}", + cur_child_st, last_parent_key.to_string(), last_child_info.to_string()); + write_node_impl(last_child, cp_ctx); } } } + } - if (ret == btree_status_t::success) { - // Make write_buf happy for the parent node in case of multiple write (stale repair and link repair) - IndexBtreeNode* p_node = static_cast< IndexBtreeNode* >(parent_node.get()); - p_node->m_idx_buf->set_state(index_buf_state_t::CLEAN); - ret = transact_nodes(new_parent_nodes, {}, parent_node, nullptr, cp_ctx); - } - - if (ret != btree_status_t::success) { - BT_LOG(ERROR, "An error occurred status={} during repair of parent_node={}, aborting the repair", - enum_name(ret), parent_node->node_id()); - std::memcpy(parent_node->m_phys_node_buf, tmp_buffer, this->m_bt_cfg.node_size()); - } + if (ret == btree_status_t::success) { + // Make write_buf happy for the parent node in case of multiple write (stale repair and link repair) + IndexBtreeNode* p_node = static_cast< IndexBtreeNode* >(parent_node.get()); + p_node->m_idx_buf->set_state(index_buf_state_t::CLEAN); + ret = transact_nodes(new_parent_nodes, {}, parent_node, nullptr, cp_ctx); + } - delete[] tmp_buffer; - return ret; + if (ret != btree_status_t::success) { + BT_LOG(ERROR, "An error occurred status={} during repair of parent_node={}, aborting the repair", + enum_name(ret), parent_node->node_id()); + std::memcpy(parent_node->m_phys_node_buf, tmp_buffer, this->m_bt_cfg.node_size()); } + delete[] tmp_buffer; + return ret; + } + bnodeid_t find_true_sibling(BtreeNodePtr const& node) { if (node == nullptr) return empty_bnodeid; bnodeid_t sibling_id = empty_bnodeid; @@ -868,7 +884,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { if (read_node_impl(sibling_id, sibling_node) != btree_status_t::success) { return empty_bnodeid; } if (sibling_node->is_node_deleted()) { - 
LOGTRACEMOD(wbcache, "Sibling node [{}] is not the sibling for parent_node {}", sibling_node->to_string(), node->to_string());
+            LOGTRACEMOD(wbcache, "Sibling node [{}] is not the sibling for parent_node {}",
+                        sibling_node->to_string(), node->to_string());
             return find_true_sibling(sibling_node);
         } else {
             return sibling_id;
@@ -894,7 +911,6 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
             }
         }
     }
-
 };
 
 } // namespace homestore
diff --git a/src/lib/index/index_service.cpp b/src/lib/index/index_service.cpp
index 76da72842..c22e70b15 100644
--- a/src/lib/index/index_service.cpp
+++ b/src/lib/index/index_service.cpp
@@ -277,7 +277,8 @@ void IndexBuffer::remove_down_buffer(const IndexBufferPtr& buf) {
             }
         }
     }
-    HS_DBG_ASSERT(found, "Down buffer {} is linked to up_buf, but up_buf {} doesn't have down_buf in its list", buf->to_string(), buf->m_up_buffer? buf->m_up_buffer->to_string(): std::string("nulptr"));
+    HS_DBG_ASSERT(found, "Down buffer {} is linked to up_buf, but up_buf {} doesn't have down_buf in its list",
+                  buf->to_string(), buf->m_up_buffer ? buf->m_up_buffer->to_string() : std::string("nullptr"));
 #endif
 }
 
@@ -307,6 +308,7 @@ MetaIndexBuffer::~MetaIndexBuffer() {
         hs_utils::iobuf_free(m_bytes, sisl::buftag::metablk);
         m_bytes = nullptr;
     }
+    m_valid = false;
 }
 
 void MetaIndexBuffer::copy_sb_to_buf() { std::memcpy(m_bytes, m_sb.raw_buf()->cbytes(), m_sb.size()); }
diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp
index 66abd4b37..4c81bd8c0 100644
--- a/src/lib/index/wb_cache.cpp
+++ b/src/lib/index/wb_cache.cpp
@@ -45,7 +45,7 @@ IndexWBCacheBase& wb_cache() {
 IndexWBCache::IndexWBCache(const std::shared_ptr< VirtualDev >& vdev, std::pair< meta_blk*, sisl::byte_view > sb,
                            const std::shared_ptr< sisl::Evictor >& evictor, uint32_t node_size) :
         m_vdev{vdev},
-        m_cache{evictor, HS_DYNAMIC_CONFIG(generic.cache_hashmap_nbuckets), node_size, 
+        m_cache{evictor, HS_DYNAMIC_CONFIG(generic.cache_hashmap_nbuckets), node_size,
                 [](const BtreeNodePtr& node) -> BlkId {
                     return static_cast< IndexBtreeNode* >(node.get())->m_idx_buf->m_blkid;
                 },
@@ -597,7 +597,9 @@ void IndexWBCache::recover(sisl::byte_view sb) {
                     buf->m_node_freed = false;
                     r_cast< persistent_hdr_t* >(buf->m_bytes)->node_deleted = false;
                     m_vdev->commit_blk(buf->m_blkid);
-                    // it can happen when children moved to one of right parent sibling and then the previous node is deleted but not commited during crash (upbuffer is not committed). but its children already committed. and freed (or changed)
+                    // This can happen when children were moved to one of the right parent siblings and the previous
+                    // node was deleted but not committed during the crash (its up buffer is not committed), while its
+                    // children were already committed and freed (or changed)
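+                    // In other words, the node is resurrected here: its freed/deleted markers are cleared
+                    // and the blk is committed back, and interior nodes recovered this way are collected
+                    // below so that their possibly-stale child links can be repaired afterwards.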
                    if (buf->m_node_level) { potential_parent_recovered_bufs.insert(buf); }
                } else {
                    LOGINFO("deleting and creating new buf {}", buf->to_string());
@@ -638,7 +640,7 @@ void IndexWBCache::recover(sisl::byte_view sb) {
                                buf->to_string());
                    update_up_buffer_counters(buf->m_up_buffer);
                }
-// buf->m_up_buffer = nullptr;
+            // buf->m_up_buffer = nullptr;
            }
        }
    }
@@ -653,25 +655,23 @@ void IndexWBCache::recover(sisl::byte_view sb) {
 #endif
 
     uint32_t cnt = 0;
-    LOGTRACEMOD(wbcache, "Potential parent recovered bufs (#of bufs = {})",
-                potential_parent_recovered_bufs.size());
+    LOGTRACEMOD(wbcache, "Potential parent recovered bufs (#of bufs = {})", potential_parent_recovered_bufs.size());
     for (auto const& buf : potential_parent_recovered_bufs) {
         LOGTRACEMOD(wbcache, " {} - check stale recovered buf {}", cnt++, buf->to_string());
     }
     // This step is needed since there is a case where all(or some) children of an interior node is freed (after moving
     // to a previous sibling parent) and after crash, this node has stale links to its children
     cnt = 0;
-    std::vector<IndexBufferPtr> buffers_to_repair;
+    std::vector< IndexBufferPtr > buffers_to_repair;
     for (auto const& buf : potential_parent_recovered_bufs) {
         LOGTRACEMOD(wbcache, " {} - potential parent recovered buf {}", cnt, buf->to_string());
         parent_recover(buf);
         if (buf->m_bytes == nullptr || r_cast< persistent_hdr_t* >(buf->m_bytes)->node_deleted) {
             // This buffer was marked as deleted during repair, so we also need to free it
             deleted_bufs.push_back(buf);
-        }else
-        {
-            // This buffer was not marked as deleted during repair, so we need to repair it
-            buffers_to_repair.push_back(buf);
+        } else {
+            // This buffer was not marked as deleted during repair, so we need to repair it
+            buffers_to_repair.push_back(buf);
         }
     }
     // let all unfreed buffers to be repaired first. 
This is important to let detect and remove all stale links first
@@ -837,8 +837,11 @@ void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const
     if (buf->is_meta_buf()) {
         LOGTRACEMOD(wbcache, "Flushing cp {} meta buf {} possibly because of root split", cp_ctx->id(),
                     buf->to_string());
-        auto const& sb = r_cast< MetaIndexBuffer* >(buf.get())->m_sb;
-        if (!sb.is_empty()) { meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk()); }
+        auto const sb_buf = r_cast< MetaIndexBuffer* >(buf.get());
+        if (sb_buf->m_valid) {
+            auto const& sb = sb_buf->m_sb;
+            if (!sb.is_empty()) { meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk()); }
+        }
         process_write_completion(cp_ctx, buf);
     } else if (buf->m_node_freed) {
         LOGTRACEMOD(wbcache, "cp {} Not flushing buf {} as it was freed, its here for merely dependency", cp_ctx->id(),

From e87feeaabed4af83579b5a7b6634af184703ac67 Mon Sep 17 00:00:00 2001
From: Yaming Kuang <1477567+yamingk@users.noreply.github.com>
Date: Thu, 8 May 2025 10:00:23 -0700
Subject: [PATCH 119/170] =?UTF-8?q?Issue=20716:=20Fix=20log=20periodic=20c?=
 =?UTF-8?q?ancel=5Ftimer=20issue=20and=20solo=20repl=20dev=20init/destroy?=
 =?UTF-8?q?=20ra=E2=80=A6=20(#715)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Fix log periodic cancel_timer issue and solo repl dev init/destroy race issue

---
 conanfile.py                                    |  2 +-
 src/lib/logstore/log_dev.cpp                    |  7 ++++---
 src/lib/replication/repl_dev/solo_repl_dev.cpp  | 18 +++++++++++++++++-
 src/lib/replication/repl_dev/solo_repl_dev.h    |  5 ++++-
 .../replication/service/generic_repl_svc.cpp    | 17 ++++++++++-------
 5 files changed, 36 insertions(+), 13 deletions(-)

diff --git a/conanfile.py b/conanfile.py
index 495c144cd..786603d39 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@ class HomestoreConan(ConanFile):
 
 class HomestoreConan(ConanFile):
     name = "homestore"
-    version = "6.13.8"
+    version = "6.13.9"
     homepage = "https://github.com/eBay/Homestore"
     description = "HomeStore Storage Engine"
 
diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp
index a23b7c900..355c59490 100644
--- a/src/lib/logstore/log_dev.cpp
+++ b/src/lib/logstore/log_dev.cpp
@@ -168,9 +168,10 @@ void LogDev::start_timer() {
 
 void LogDev::stop_timer() {
     if (m_flush_timer_hdl != iomgr::null_timer_handle) {
-        // cancel the timer
-        iomanager.run_on_wait(logstore_service().flush_thread(),
-                              [this]() { iomanager.cancel_timer(m_flush_timer_hdl, true); });
+        iomanager.run_on_forget(logstore_service().flush_thread(), [this]() {
+            iomanager.cancel_timer(m_flush_timer_hdl, true);
+            m_flush_timer_hdl = iomgr::null_timer_handle;
+        });
     }
 }
 
diff --git a/src/lib/replication/repl_dev/solo_repl_dev.cpp b/src/lib/replication/repl_dev/solo_repl_dev.cpp
index 587cb8b2e..2379068d1 100644
--- a/src/lib/replication/repl_dev/solo_repl_dev.cpp
+++ b/src/lib/replication/repl_dev/solo_repl_dev.cpp
@@ -1,6 +1,7 @@
 #include 
 #include "replication/repl_dev/solo_repl_dev.h"
 #include "replication/repl_dev/common.h"
+#include 
 #include 
 #include 
 #include 
@@ -10,6 +11,7 @@ namespace homestore {
 SoloReplDev::SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existing) :
         m_rd_sb{std::move(rd_sb)}, m_group_id{m_rd_sb->group_id} {
     if (load_existing) {
+        m_logdev_id = m_rd_sb->logdev_id;
        logstore_service().open_logdev(m_rd_sb->logdev_id, flush_mode_t::TIMER);
        logstore_service()
            .open_log_store(m_rd_sb->logdev_id, m_rd_sb->logstore_id, true /* append_mode */)
@@ -17,6 +19,7 @@ SoloReplDev::SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, 
bool load_existi m_data_journal = std::move(log_store); m_rd_sb->logstore_id = m_data_journal->get_store_id(); m_data_journal->register_log_found_cb(bind_this(SoloReplDev::on_log_found, 3)); + m_is_recovered = true; }); } else { m_logdev_id = logstore_service().create_new_logdev(flush_mode_t::TIMER); @@ -24,6 +27,7 @@ SoloReplDev::SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existi m_rd_sb->logstore_id = m_data_journal->get_store_id(); m_rd_sb->logdev_id = m_logdev_id; m_rd_sb.write(); + m_is_recovered = true; } } @@ -48,6 +52,17 @@ void SoloReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& } } +// destroy is only called in worker thread; +void SoloReplDev::destroy() { + HS_REL_ASSERT(iomanager.am_i_worker_reactor(), "Destroy should be called in worker thread"); + while (!m_is_recovered) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + + hs()->logstore_service().remove_log_store(m_logdev_id, m_data_journal->get_store_id()); + hs()->logstore_service().destroy_log_dev(m_logdev_id); +} + void SoloReplDev::write_journal(repl_req_ptr_t rreq) { rreq->create_journal_entry(false /* raft_buf */, 1); @@ -211,6 +226,7 @@ void SoloReplDev::cp_flush(CP*) { m_rd_sb.write(); } -void SoloReplDev::cp_cleanup(CP*) { /* m_data_journal->truncate(m_rd_sb->checkpoint_lsn); */ } +void SoloReplDev::cp_cleanup(CP*) { /* m_data_journal->truncate(m_rd_sb->checkpoint_lsn); */ +} } // namespace homestore diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index 35f089ec5..a690c4bc0 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -30,10 +30,11 @@ class CP; class SoloReplDev : public ReplDev { private: logdev_id_t m_logdev_id; - std::shared_ptr< HomeLogStore > m_data_journal; + std::shared_ptr< HomeLogStore > m_data_journal{nullptr}; superblk< repl_dev_superblk > m_rd_sb; uuid_t m_group_id; std::atomic< logstore_seq_num_t > m_commit_upto{-1}; + std::atomic< bool > m_is_recovered{false}; public: SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existing); @@ -95,6 +96,8 @@ class SoloReplDev : public ReplDev { void cp_flush(CP* cp); void cp_cleanup(CP* cp); + void destroy(); + private: void write_journal(repl_req_ptr_t rreq); void on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx); diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index 5ac65981a..aeaa35f02 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -117,8 +117,8 @@ void SoloReplService::stop() { hs()->data_service().stop(); } -AsyncReplResult< shared< ReplDev > > -SoloReplService::create_repl_dev(group_id_t group_id, std::set< replica_id_t > const& members) { +AsyncReplResult< shared< ReplDev > > SoloReplService::create_repl_dev(group_id_t group_id, + std::set< replica_id_t > const& members) { superblk< repl_dev_superblk > rd_sb{get_meta_blk_name()}; rd_sb.create(); rd_sb->group_id = group_id; @@ -154,17 +154,20 @@ folly::SemiFuture< ReplServiceError > SoloReplService::remove_repl_dev(group_id_ // 1. Firstly stop the repl dev which waits for any outstanding requests to finish rdev_ptr->stop(); - // 2. detaches both ways: + // 2. Destroy the repl dev which will remove the logstore and free the memory; + dp_cast< SoloReplDev >(rdev_ptr)->destroy(); + + // 3. 
detaches both ways: // detach rdev from its listener and listener from rdev; rdev_ptr->detach_listener(); { - // 3. remove from rd map which finally call SoloReplDev's destructor because this is the last one holding ref to + // 4. remove from rd map which finally call SoloReplDev's destructor because this is the last one holding ref to // this instance; std::unique_lock lg(m_rd_map_mtx); m_rd_map.erase(group_id); } - // 4. now destroy the upper layer's listener instance; + // 5. now destroy the upper layer's listener instance; m_repl_app->destroy_repl_dev_listener(group_id); return folly::makeSemiFuture(ReplServiceError::OK); @@ -202,14 +205,14 @@ std::unique_ptr< CPContext > SoloReplServiceCPHandler::on_switchover_cp(CP* cur_ folly::Future< bool > SoloReplServiceCPHandler::cp_flush(CP* cp) { repl_service().iterate_repl_devs([cp](cshared< ReplDev >& repl_dev) { - if (repl_dev) { std::dynamic_pointer_cast< SoloReplDev >(repl_dev)->cp_flush(cp); } + if (repl_dev) { dp_cast< SoloReplDev >(repl_dev)->cp_flush(cp); } }); return folly::makeFuture< bool >(true); } void SoloReplServiceCPHandler::cp_cleanup(CP* cp) { repl_service().iterate_repl_devs([cp](cshared< ReplDev >& repl_dev) { - if (repl_dev) { std::dynamic_pointer_cast< SoloReplDev >(repl_dev)->cp_cleanup(cp); } + if (repl_dev) { dp_cast< SoloReplDev >(repl_dev)->cp_cleanup(cp); } }); } From 9ef938c7f794f7220b248ef85849f935402ba096 Mon Sep 17 00:00:00 2001 From: Yaming Kuang <1477567+yamingk@users.noreply.github.com> Date: Mon, 12 May 2025 14:34:36 -0700 Subject: [PATCH 120/170] Issue 717: expose data service drive type (#718) --- conanfile.py | 2 +- src/include/homestore/blkdata_service.hpp | 23 +++++++++++++++++++ src/lib/blkdata_svc/blkdata_service.cpp | 3 +++ src/lib/device/virtual_dev.hpp | 1 + src/lib/logstore/log_dev.cpp | 3 ++- src/lib/logstore/log_store_service.cpp | 4 ++++ .../replication/repl_dev/solo_repl_dev.cpp | 2 ++ 7 files changed, 36 insertions(+), 2 deletions(-) diff --git a/conanfile.py b/conanfile.py index 786603d39..e184b8342 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.13.9" + version = "6.13.10" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/blkdata_service.hpp b/src/include/homestore/blkdata_service.hpp index e1992b983..87aed9e01 100644 --- a/src/include/homestore/blkdata_service.hpp +++ b/src/include/homestore/blkdata_service.hpp @@ -220,10 +220,33 @@ class BlkDataService { */ void start(); + /** + * @brief Gets the total capacity of the block data service. + * + * This function returns the total capacity of the block data service, in bytes. + * + * @return The total capacity of the block data service, in bytes. + */ uint64_t get_total_capacity() const; + /** + * @brief Gets the used capacity of the block data service. + * + * This function returns the used capacity of the block data service, in bytes. + * + * @return The used capacity of the block data service, in bytes. + */ uint64_t get_used_capacity() const; + /** + * @brief Gets the drive type of the data service. + * + * Data Service doesn't support mixed drive types. + * + * @return The drive type of the data service, HDD or NVME. 
+ */ + HSDevType get_dev_type() const; + void stop(); private: diff --git a/src/lib/blkdata_svc/blkdata_service.cpp b/src/lib/blkdata_svc/blkdata_service.cpp index 1219ed00e..d17462ac2 100644 --- a/src/lib/blkdata_svc/blkdata_service.cpp +++ b/src/lib/blkdata_svc/blkdata_service.cpp @@ -34,6 +34,7 @@ BlkDataService::BlkDataService(shared< ChunkSelector > chunk_selector) : m_custom_chunk_selector{std::move(chunk_selector)} { m_blk_read_tracker = std::make_unique< BlkReadTracker >(); } + BlkDataService::~BlkDataService() = default; // first-time boot path @@ -311,6 +312,8 @@ uint64_t BlkDataService::get_total_capacity() const { return m_vdev->size(); } uint64_t BlkDataService::get_used_capacity() const { return m_vdev->used_size(); } +HSDevType BlkDataService::get_dev_type() const { return static_cast< HSDevType >(m_vdev->get_dev_type()); } + uint32_t BlkDataService::get_align_size() const { return m_vdev->align_size(); } } // namespace homestore diff --git a/src/lib/device/virtual_dev.hpp b/src/lib/device/virtual_dev.hpp index c004e3c03..409aa167a 100644 --- a/src/lib/device/virtual_dev.hpp +++ b/src/lib/device/virtual_dev.hpp @@ -290,6 +290,7 @@ class VirtualDev { virtual nlohmann::json get_status(int log_level) const; virtual uint64_t get_total_chunk_num() const { return m_total_chunk_num; } + uint8_t get_dev_type() const { return m_vdev_info.hs_dev_type; } uint32_t align_size() const; uint32_t optimal_page_size() const; uint32_t atomic_page_size() const; diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp index 355c59490..c1ea83839 100644 --- a/src/lib/logstore/log_dev.cpp +++ b/src/lib/logstore/log_dev.cpp @@ -67,7 +67,8 @@ void LogDev::start(bool format, std::shared_ptr< JournalVirtualDev > vdev) { m_logdev_meta.create(m_logdev_id, m_flush_mode); m_vdev_jd->update_data_start_offset(0); } else { - HS_LOG_ASSERT(!m_logdev_meta.is_empty(), "Expected meta data to be read already before loading"); + HS_LOG_ASSERT(!m_logdev_meta.is_empty(), + "Expected meta data to be read already before loading this log dev id: {}", m_logdev_id); auto const store_list = m_logdev_meta.load(); // Notify to the caller that a new log store was reserved earlier and it is being loaded, with its meta info diff --git a/src/lib/logstore/log_store_service.cpp b/src/lib/logstore/log_store_service.cpp index 542204386..9f656d5c9 100644 --- a/src/lib/logstore/log_store_service.cpp +++ b/src/lib/logstore/log_store_service.cpp @@ -157,6 +157,7 @@ logdev_id_t LogStoreService::create_new_logdev(flush_mode_t flush_mode) { void LogStoreService::destroy_log_dev(logdev_id_t logdev_id) { if (is_stopping()) return; + HS_LOG(INFO, logstore, "Destroying logdev {}", logdev_id); incr_pending_request_num(); folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); const auto it = m_id_logdev_map.find(logdev_id); @@ -194,6 +195,7 @@ std::shared_ptr< LogDev > LogStoreService::create_new_logdev_internal(logdev_id_ const auto it = m_id_logdev_map.find(logdev_id); HS_REL_ASSERT((it == m_id_logdev_map.end()), "logdev id {} already exists", logdev_id); m_id_logdev_map.insert(std::make_pair<>(logdev_id, logdev)); + LOGINFO("Created logdev {}", logdev_id); return logdev; } @@ -302,6 +304,7 @@ folly::Future< shared< HomeLogStore > > LogStoreService::open_log_store(logdev_i void LogStoreService::remove_log_store(logdev_id_t logdev_id, logstore_id_t store_id) { if (is_stopping()) return; + HS_LOG(INFO, logstore, "Removing logstore {} from logdev {}", store_id, logdev_id); incr_pending_request_num(); 
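+    // The incr/decr_pending_request_num() pair brackets this operation, presumably so that a concurrent
+    // shutdown can drain in-flight requests; the is_stopping() check above keeps new requests out.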
folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); COUNTER_INCREMENT(m_metrics, logstores_count, 1); @@ -311,6 +314,7 @@ void LogStoreService::remove_log_store(logdev_id_t logdev_id, logstore_id_t stor return; } it->second->remove_log_store(store_id); + HS_LOG(INFO, logstore, "Successfully removed logstore {} from logdev {}", store_id, logdev_id); decr_pending_request_num(); COUNTER_DECREMENT(m_metrics, logstores_count, 1); } diff --git a/src/lib/replication/repl_dev/solo_repl_dev.cpp b/src/lib/replication/repl_dev/solo_repl_dev.cpp index 2379068d1..bb96a3fb0 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.cpp +++ b/src/lib/replication/repl_dev/solo_repl_dev.cpp @@ -61,6 +61,8 @@ void SoloReplDev::destroy() { hs()->logstore_service().remove_log_store(m_logdev_id, m_data_journal->get_store_id()); hs()->logstore_service().destroy_log_dev(m_logdev_id); + + m_rd_sb.destroy(); } void SoloReplDev::write_journal(repl_req_ptr_t rreq) { From 9acb18e6c5c50946a259a253b7e1d7896b62b7ab Mon Sep 17 00:00:00 2001 From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com> Date: Tue, 13 May 2025 13:25:26 -0700 Subject: [PATCH 121/170] Fix occupied_size for prefix (#719) --- conanfile.py | 2 +- src/include/homestore/btree/detail/prefix_node.hpp | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/conanfile.py b/conanfile.py index e184b8342..2661d521a 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.13.10" + version = "6.13.11" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/btree/detail/prefix_node.hpp b/src/include/homestore/btree/detail/prefix_node.hpp index cbcdc5257..7faa42b09 100644 --- a/src/include/homestore/btree/detail/prefix_node.hpp +++ b/src/include/homestore/btree/detail/prefix_node.hpp @@ -344,6 +344,11 @@ class FixedPrefixNode : public VariantNode< K, V > { } } + uint32_t occupied_size() const override { + return (this->node_data_size() - sizeof(prefix_node_header) - (prefix_bitset_.size() / 8) - + this->available_size()); + } + bool has_room_for_put(btree_put_type, uint32_t, uint32_t) const override { return has_room(1u); } uint32_t get_nth_key_size(uint32_t) const override { return dummy_key< K >.serialized_size(); } @@ -789,7 +794,8 @@ class FixedPrefixNode : public VariantNode< K, V > { //////////////////////// All Helper methods section //////////////////////// static uint32_t reqd_bitset_size(BtreeConfig const& cfg) { - return sisl::round_up(cfg.node_data_size() / (prefix_entry::key_size() + prefix_entry::value_size()) / 8, + return sisl::round_up((cfg.node_data_size() - sizeof(prefix_node_header)) / + (prefix_entry::key_size() + prefix_entry::value_size()) / 8, sisl::CompactBitSet::size_multiples()); } From b932189d646a6c370af80220ed184ccabc22b98b Mon Sep 17 00:00:00 2001 From: Ravi Nagarjun Akella Date: Wed, 7 May 2025 16:05:34 -0700 Subject: [PATCH 122/170] add long running test with put and remove --- conanfile.py | 2 +- src/tests/test_index_crash_recovery.cpp | 546 +++++++++++------------- src/tests/test_scripts/index_test.py | 15 + 3 files changed, 277 insertions(+), 286 deletions(-) diff --git a/conanfile.py b/conanfile.py index 2661d521a..c79c13e8d 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.13.11" + version = "6.13.12" homepage = "https://github.com/eBay/Homestore" description = 
"HomeStore Storage Engine" diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp index 9121f7240..c7e196254 100644 --- a/src/tests/test_index_crash_recovery.cpp +++ b/src/tests/test_index_crash_recovery.cpp @@ -112,6 +112,17 @@ class SequenceGenerator { OperationList generateOperations(size_t numOperations, bool reset = false) { std::vector< Operation > operations; if (reset) { this->reset(); } + if(putFreq_ == 100 && end_range_ - start_range_ + 1 - in_use_key_cnt_.load() < numOperations) { + LOGDEBUG("All keys are in use, skipping operation generation. end_range_ {} start_range_ {} in_use_key_cnt_ {}, numOperations {}", + end_range_, start_range_, in_use_key_cnt_.load(), numOperations); + return operations; + } + if(removeFreq_ == 100 && in_use_key_cnt_.load() < numOperations) { + LOGDEBUG("Not enough keys are in use, skipping operation generation. in_use_key_cnt_ {} numOperations {}", + in_use_key_cnt_.load(), numOperations); + return operations; + } + while (operations.size() < numOperations) { uint32_t key = keyDist_(g_re); auto [it, inserted] = keyStates.try_emplace(key, false); @@ -122,9 +133,11 @@ class SequenceGenerator { if (operation == OperationType::Put && !inUse) { operations.emplace_back(key, OperationType::Put); inUse = true; + in_use_key_cnt_.fetch_add(1); } else if (operation == OperationType::Remove && inUse) { operations.emplace_back(key, OperationType::Remove); inUse = false; + in_use_key_cnt_.fetch_sub(1); } } @@ -225,6 +238,7 @@ class SequenceGenerator { std::uniform_int_distribution<> keyDist_; std::discrete_distribution<> opTypeDist_; std::map< uint64_t, bool > keyStates; + std::atomic< uint64_t > in_use_key_cnt_{0}; void updateOperationTypeDistribution() { opTypeDist_ = @@ -233,6 +247,19 @@ class SequenceGenerator { }; #ifdef _PRERELEASE + +struct long_running_crash_options { + uint32_t put_freq; + std::vector< std::string > put_flips{}; + std::vector< std::string > remove_flips{}; + uint32_t num_entries{SISL_OPTIONS["num_entries"].as< uint32_t >()}; + uint32_t preload_size{SISL_OPTIONS["preload_size"].as< uint32_t >()}; + uint32_t rounds{SISL_OPTIONS["num_rounds"].as< uint32_t >()}; + uint32_t num_entries_per_rounds{SISL_OPTIONS["num_entries_per_rounds"].as< uint32_t >()}; + bool load_mode{SISL_OPTIONS.count("load_from_file") > 0}; + bool save_mode{SISL_OPTIONS.count("save_to_file") > 0}; +}; + template < typename TestType > struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestType >, public ::testing::Test { using T = TestType; @@ -452,8 +479,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT LOGINFO("Sanity check passed for {} keys!", count); } - void crash_and_recover(std::string& flip, OperationList& operations, std::string filename = "") { - this->remove_flip(flip); + void crash_and_recover_common(OperationList& operations, std::string filename = "") { // this->print_keys("Btree prior to CP and susbsequent simulated crash: "); LOGINFO("Before Crash: {} keys in shadow map and it is actually {} keys in tree - operations size {}", this->m_shadow_map.size(), tree_key_count(), operations.size()); @@ -494,8 +520,218 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT << "shadow map size and tree size mismatch"; } + void crash_and_recover(std::string& flip, OperationList& operations, std::string filename = "") { + this->remove_flip(flip); + this->crash_and_recover_common(operations, filename); + } + + void 
crash_and_recover(std::vector< std::string >& flips, OperationList& operations, std::string filename = "") { + for (auto const& flip : flips) { + this->remove_flip(flip); + } + this->crash_and_recover_common(operations, filename); + } + uint32_t tree_key_count() { return this->m_bt->count_keys(this->m_bt->root_node_id()); } + void long_running_crash(long_running_crash_options const& crash_test_options) { + // set putFreq 100 for the initial load + SequenceGenerator generator(100 /*putFreq*/, 0 /* removeFreq*/, 0 /*start_range*/, crash_test_options.num_entries - 1 /*end_range*/); + + std::vector< std::string > flips; + OperationList operations; + auto m_start_time = Clock::now(); + auto time_to_stop = [this, m_start_time]() { return (get_elapsed_time_sec(m_start_time) > this->m_run_time); }; + double elapsed_time, progress_percent, last_progress_time = 0; + bool renew_btree_after_crash = false; + auto cur_put_flip_idx = 0; + auto cur_remove_flip_idx = 0; + std::uniform_int_distribution<> dis(1, 100); + int flip_percentage = 90; // Set the desired percentage here + bool normal_execution = true; + bool clean_shutdown = true; + // if it is safe then delete all previous save files + if (crash_test_options.save_mode) { + std::filesystem::remove_all("/tmp/operations_*.txt"); + std::filesystem::remove_all("/tmp/flips_history.txt"); + } + // init tree + LOGINFO("Step 0: Fill up the tree with {} entries", crash_test_options.preload_size); + if (crash_test_options.load_mode) { + operations = SequenceGenerator::load_from_file(fmt::format("/tmp/operations_0.txt")); + } else { + operations = generator.generateOperations(crash_test_options.preload_size, true /* reset */); + if (crash_test_options.save_mode) { SequenceGenerator::save_to_file(fmt::format("/tmp/operations_0.txt"), operations); } + } + + LOGDEBUG("Lets before crash print operations\n{}", SequenceGenerator::printOperations(operations)); + uint32_t num_keys{0}; + + for (auto [k, _] : operations) { + this->put(k, btree_put_type::INSERT, true /* expect_success */); + num_keys++; + } + + generator.setPutFrequency(crash_test_options.put_freq); + generator.setRemoveFrequency(100 - crash_test_options.put_freq); + + // Trigger the cp to make sure middle part is successful + LOGINFO("Step 0-1: Flush all the entries so far"); + test_common::HSTestHelper::trigger_cp(true); + this->get_all(); + this->m_shadow_map.save(this->m_shadow_filename); + // this->print_keys("reapply: after preload"); + this->visualize_keys("tree_after_preload.dot"); + + for (uint32_t round = 1; + round <= crash_test_options.rounds && !time_to_stop(); round++) { + LOGINFO("\n\n\n\n\n\nRound {} of {}\n\n\n\n\n\n", round, crash_test_options.rounds); + bool print_time = false; + elapsed_time = get_elapsed_time_sec(m_start_time); + + if (crash_test_options.load_mode) { + operations = SequenceGenerator::load_from_file(fmt::format("/tmp/operations_{}.txt", round)); + } else { + operations = generator.generateOperations(crash_test_options.num_entries_per_rounds, renew_btree_after_crash /* reset */); + if (crash_test_options.save_mode) { + SequenceGenerator::save_to_file(fmt::format("/tmp/operations_{}.txt", round), operations); + } + } + if(operations.empty()) { + LOGDEBUG("No operations generated, skipping round {}", round); + continue; + } + + flips.clear(); + if (crash_test_options.load_mode) { + std::ifstream file("/tmp/flips_history.txt"); + std::string line; + bool found = false; + for (uint32_t i = 0; i < round && std::getline(file, line); i++) { + if (i == round - 1) { + found = 
true; + break; + } + } + if (found && !line.empty()) { + if (line == "normal") { + normal_execution = true; + } else { + normal_execution = false; + std::istringstream iss(line); + std::string flip; + while (iss >> flip) { + flips.emplace_back(flip); + } + auto log_str = fmt::format("Step 1-{}: Set flag", round); + for(auto const& f : flips) { + log_str += fmt::format(" {}", f); + this->set_basic_flip(f, 1, 100); + } + LOGINFO("{}", log_str); + } + } + file.close(); + } else { + if (dis(g_re) <= flip_percentage) { + if(!crash_test_options.put_flips.empty()) { + flips.emplace_back(crash_test_options.put_flips[cur_put_flip_idx++ % crash_test_options.put_flips.size()]); + } + if(!crash_test_options.remove_flips.empty()) { + flips.emplace_back(crash_test_options.remove_flips[cur_remove_flip_idx++ % crash_test_options.remove_flips.size()]); + } + auto log_str = fmt::format("Step 1-{}: Set flag", round); + for(auto const& f : flips) { + log_str += fmt::format(" {}", f); + this->set_basic_flip(f, 1, 100); + } + LOGINFO("{}", log_str); + normal_execution = false; + } else { + normal_execution = true; + LOGINFO("Step 1-{}: No flip set", round); + } + if (crash_test_options.save_mode) { + // save the filp name to a file for later use + std::ofstream file("/tmp/flips_history.txt", std::ios::app); + if (file.is_open()) { + std::string out_line{"normal"}; + if (!normal_execution) { + out_line = flips[0]; + for (size_t i = 1; i < flips.size(); i++) { + out_line += " " + flips[i]; + } + } + file << out_line << "\n"; + } + file.close(); + } + } + + LOGDEBUG("Lets before crash print operations\n{}", SequenceGenerator::printOperations(operations)); + + for (auto [k, op] : operations) { + if (op == OperationType::Remove) { + if(num_keys < 1) { + // remove flips and continue + for (auto const& flip : flips) { + this->remove_flip(flip); + } + continue; + } + LOGDEBUG("Removing key {}", k); + this->remove_one(k, true /* expect_success */); + num_keys--; + } else { + if (num_keys >= crash_test_options.num_entries) { + // remove flips and continue + for (auto const& flip : flips) { + this->remove_flip(flip); + } + continue; + } + LOGDEBUG("Inserting key {}", k); + this->put(k, btree_put_type::INSERT, true /* expect_success */); + num_keys++; + } + if (!time_to_stop()) { + static bool print_alert = false; + if (print_alert) { + LOGINFO("It is time to stop but let's finish this round and then stop!"); + print_alert = false; + } + } + } + if (normal_execution) { + if (clean_shutdown) { + this->m_shadow_map.save(this->m_shadow_filename); + this->restart_homestore(); + } else { + test_common::HSTestHelper::trigger_cp(true); + this->get_all(); + } + } else { + // remove the flips so that they do not get triggered erroneously + this->crash_and_recover(flips, operations, fmt::format("long_tree_{}", round)); + } + if (elapsed_time - last_progress_time > 30) { + last_progress_time = elapsed_time; + print_time = true; + } + if (print_time) { + LOGINFO("\n\n\n\t\t\tProgress: {} rounds of total {} ({:.2f}%) completed - Elapsed time: {:.0f} seconds of " + "total {} ({:.2f}%) - {} keys of maximum {} keys ({:.2f}%) inserted\n\n\n", + round, crash_test_options.rounds, round * 100.0 / crash_test_options.rounds, elapsed_time, this->m_run_time, + elapsed_time * 100.0 / this->m_run_time, this->tree_key_count(), crash_test_options.num_entries, + this->tree_key_count() * 100.0 / crash_test_options.num_entries); + } + // this->print_keys(fmt::format("reapply: after round {}", round)); + if (renew_btree_after_crash) { 
this->reset_btree(); }; + } + this->destroy_btree(); + log_obj_life_counter(); + } + protected: const std::string m_shadow_filename = "/tmp/shadow_map_index_recovery.txt"; }; @@ -598,293 +834,33 @@ TYPED_TEST(IndexCrashTest, SplitCrash1) { } TYPED_TEST(IndexCrashTest, long_running_put_crash) { - - // Define the lambda function - auto const num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); - auto const preload_size = SISL_OPTIONS["preload_size"].as< uint32_t >(); - auto const rounds = SISL_OPTIONS["num_rounds"].as< uint32_t >(); - auto const num_entries_per_rounds = SISL_OPTIONS["num_entries_per_rounds"].as< uint32_t >(); - bool load_mode = SISL_OPTIONS.count("load_from_file"); - bool save_mode = SISL_OPTIONS.count("save_to_file"); - SequenceGenerator generator(100 /*putFreq*/, 0 /* removeFreq*/, 0 /*start_range*/, num_entries - 1 /*end_range*/); - vector< std::string > flips = {"crash_flush_on_split_at_parent", "crash_flush_on_split_at_left_child", - "crash_flush_on_split_at_right_child"}; - - std::string flip = ""; - OperationList operations; - auto m_start_time = Clock::now(); - auto time_to_stop = [this, m_start_time]() { return (get_elapsed_time_sec(m_start_time) > this->m_run_time); }; - double elapsed_time, progress_percent, last_progress_time = 0; - bool renew_btree_after_crash = false; - auto cur_flip_idx = 0; - std::uniform_int_distribution<> dis(1, 100); - int flip_percentage = 90; // Set the desired percentage here - bool normal_execution = true; - bool clean_shutdown = true; - // if it is safe then delete all previous save files - if (save_mode) { - std::filesystem::remove_all("/tmp/operations_*.txt"); - std::filesystem::remove_all("/tmp/flips_history.txt"); - } - // init tree - LOGINFO("Step 0: Fill up the tree with {} entries", preload_size); - if (load_mode) { - operations = SequenceGenerator::load_from_file(fmt::format("/tmp/operations_0.txt")); - } else { - operations = generator.generateOperations(preload_size, true /* reset */); - if (save_mode) { SequenceGenerator::save_to_file(fmt::format("/tmp/operations_0.txt"), operations); } - } - auto opstr = SequenceGenerator::printOperations(operations); - LOGINFO("Lets before crash print operations\n{}", opstr); - - for (auto [k, _] : operations) { - this->put(k, btree_put_type::INSERT, true /* expect_success */); - } - - // Trigger the cp to make sure middle part is successful - LOGINFO("Step 0-1: Flush all the entries so far"); - test_common::HSTestHelper::trigger_cp(true); - this->get_all(); - this->m_shadow_map.save(this->m_shadow_filename); - // this->print_keys("reapply: after preload"); - this->visualize_keys("tree_after_preload.dot"); - - for (uint32_t round = 1; - round <= rounds && !time_to_stop() && this->tree_key_count() < num_entries - num_entries_per_rounds; round++) { - LOGINFO("\n\n\n\n\n\nRound {} of {}\n\n\n\n\n\n", round, rounds); - bool print_time = false; - elapsed_time = get_elapsed_time_sec(m_start_time); - if (load_mode) { - std::ifstream file("/tmp/flips_history.txt"); - std::string line; - bool found = false; - for (uint32_t i = 0; i < round && std::getline(file, line); i++) { - if (i == round - 1) { - found = true; - break; - } - } - if (found && !line.empty()) { - if (line == "normal") { - normal_execution = true; - } else { - normal_execution = false; - flip = line; - LOGINFO("Step 1-{}: Set flag {}", round, flip); - this->set_basic_flip(flip, 1, 100); - } - } - file.close(); - } else { - if (dis(g_re) <= flip_percentage) { - flip = flips[cur_flip_idx++ % flips.size()]; - LOGINFO("Step 
1-{}: Set flag {}", round, flip); - this->set_basic_flip(flip, 1, 100); - normal_execution = false; - } else { - normal_execution = true; - LOGINFO("Step 1-{}: No flip set", round); - } - if (save_mode) { - // save the filp name to a file for later use - std::ofstream file("/tmp/flips_history.txt", std::ios::app); - if (file.is_open()) { file << (normal_execution ? "normal" : flip) << "\n"; } - file.close(); - } - } - if (load_mode) { - operations = SequenceGenerator::load_from_file(fmt::format("/tmp/operations_{}.txt", round)); - } else { - operations = generator.generateOperations(num_entries_per_rounds, renew_btree_after_crash /* reset */); - if (save_mode) { - SequenceGenerator::save_to_file(fmt::format("/tmp/operations_{}.txt", round), operations); - } - } - LOGINFO("Lets before crash print operations\n{}", SequenceGenerator::printOperations(operations)); - for (auto [k, _] : operations) { - this->put(k, btree_put_type::INSERT, true /* expect_success */); - if (!time_to_stop()) { - static bool print_alert = false; - if (print_alert) { - LOGINFO("It is time to stop but let's finish this round and then stop!"); - print_alert = false; - } - } - } - if (normal_execution) { - if (clean_shutdown) { - this->m_shadow_map.save(this->m_shadow_filename); - this->restart_homestore(); - } else { - test_common::HSTestHelper::trigger_cp(true); - this->get_all(); - } - } else { - // remove the flips so that they do not get triggered erroneously - this->crash_and_recover(flip, operations, fmt::format("long_tree_{}", round)); - } - if (elapsed_time - last_progress_time > 30) { - last_progress_time = elapsed_time; - print_time = true; - } - if (print_time) { - LOGINFO("\n\n\n\t\t\tProgress: {} rounds of total {} ({:.2f}%) completed - Elapsed time: {:.0f} seconds of " - "total {} ({:.2f}%) - {} keys of maximum {} keys ({:.2f}%) inserted\n\n\n", - round, rounds, round * 100.0 / rounds, elapsed_time, this->m_run_time, - elapsed_time * 100.0 / this->m_run_time, this->tree_key_count(), num_entries, - this->tree_key_count() * 100.0 / num_entries); - } - // this->print_keys(fmt::format("reapply: after round {}", round)); - if (renew_btree_after_crash) { this->reset_btree(); }; - } - this->destroy_btree(); - log_obj_life_counter(); + long_running_crash_options crash_test_options{ + .put_freq = 100, + .put_flips = {"crash_flush_on_split_at_parent", "crash_flush_on_split_at_left_child", + "crash_flush_on_split_at_right_child"}, + }; + this->long_running_crash(crash_test_options); } TYPED_TEST(IndexCrashTest, long_running_remove_crash) { + long_running_crash_options crash_test_options{ + .put_freq = 0, + .remove_flips = {"crash_flush_on_merge_at_parent", "crash_flush_on_merge_at_left_child" + /*, "crash_flush_on_freed_child"*/}, + .preload_size = SISL_OPTIONS["num_entries"].as< uint32_t >(), + }; + this->long_running_crash(crash_test_options); +} - // Define the lambda function - auto const num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); - auto const rounds = SISL_OPTIONS["num_rounds"].as< uint32_t >(); - auto const num_entries_per_rounds = SISL_OPTIONS["num_entries_per_rounds"].as< uint32_t >(); - bool load_mode = SISL_OPTIONS.count("load_from_file"); - bool save_mode = SISL_OPTIONS.count("save_to_file"); - SequenceGenerator generator(100 /*putFreq*/, 0 /* removeFreq*/, 0 /*start_range*/, num_entries - 1 /*end_range*/); - vector< std::string > flips = {"crash_flush_on_merge_at_parent", "crash_flush_on_merge_at_left_child"/*, - "crash_flush_on_freed_child"*/}; - - std::string flip = ""; - OperationList 
operations; - auto m_start_time = Clock::now(); - auto time_to_stop = [this, m_start_time]() { return (get_elapsed_time_sec(m_start_time) > this->m_run_time); }; - double elapsed_time, progress_percent, last_progress_time = 0; - bool renew_btree_after_crash = false; - auto cur_flip_idx = 0; - std::uniform_int_distribution<> dis(1, 100); - int flip_percentage = 90; // Set the desired percentage here - bool normal_execution = true; - bool clean_shutdown = true; - // if it is safe then delete all previous save files - if (save_mode) { - std::filesystem::remove_all("/tmp/operations_*.txt"); - std::filesystem::remove_all("/tmp/flips_history.txt"); - } - // init tree - LOGINFO("Step 0: Fill up the tree with {} entries", num_entries); - if (load_mode) { - operations = SequenceGenerator::load_from_file(fmt::format("/tmp/operations_0.txt")); - } else { - operations = generator.generateOperations(num_entries, true /* reset */); - if (save_mode) { SequenceGenerator::save_to_file(fmt::format("/tmp/operations_0.txt"), operations); } - } - // auto opstr = SequenceGenerator::printOperations(operations); - // LOGINFO("Lets before crash print operations\n{}", opstr); - - for (auto [k, _] : operations) { - this->put(k, btree_put_type::INSERT, true /* expect_success */); - } - generator.setPutFrequency(0); - generator.setRemoveFrequency(100); - - // Trigger the cp to make sure middle part is successful - LOGINFO("Step 0-1: Flush all the entries so far"); - test_common::HSTestHelper::trigger_cp(true); - this->get_all(); - this->m_shadow_map.save(this->m_shadow_filename); - // this->print_keys("reapply: after preload"); - this->visualize_keys("tree_after_preload.dot"); - - for (uint32_t round = 1; round <= rounds && !time_to_stop() && this->tree_key_count() >= num_entries_per_rounds; - round++) { - LOGINFO("\n\n\n\n\n\nRound {} of {}\n\n\n\n\n\n", round, rounds); - // this->print_keys(fmt::format("before round {}",round)); - bool print_time = false; - elapsed_time = get_elapsed_time_sec(m_start_time); - if (load_mode) { - std::ifstream file("/tmp/flips_history.txt"); - std::string line; - bool found = false; - for (uint32_t i = 0; i < round && std::getline(file, line); i++) { - if (i == round - 1) { - found = true; - break; - } - } - if (found && !line.empty()) { - if (line == "normal") { - normal_execution = true; - } else { - normal_execution = false; - flip = line; - LOGINFO("Step 1-{}: Set flag {}", round, flip); - this->set_basic_flip(flip, 1, 100); - } - } - file.close(); - } else { - if (dis(g_re) <= flip_percentage) { - flip = flips[cur_flip_idx++ % flips.size()]; - LOGINFO("Step 1-{}: Set flag {}", round, flip); - this->set_basic_flip(flip, 1, 100); - normal_execution = false; - } else { - normal_execution = true; - LOGINFO("Step 1-{}: No flip set", round); - } - if (save_mode) { - // save the filp name to a file for later use - std::ofstream file("/tmp/flips_history.txt", std::ios::app); - if (file.is_open()) { file << (normal_execution ? 
"normal" : flip) << "\n"; } - file.close(); - } - } - if (load_mode) { - operations = SequenceGenerator::load_from_file(fmt::format("/tmp/operations_{}.txt", round)); - } else { - operations = generator.generateOperations(num_entries_per_rounds, renew_btree_after_crash /* reset */); - if (save_mode) { - SequenceGenerator::save_to_file(fmt::format("/tmp/operations_{}.txt", round), operations); - } - } - // LOGINFO("Lets before crash print operations\n{}", SequenceGenerator::printOperations(operations)); - for (auto [k, _] : operations) { - this->remove_one(k, true /* expect_success */); - if (!time_to_stop()) { - static bool print_alert = false; - if (print_alert) { - LOGINFO("It is time to stop but let's finish this round and then stop!"); - print_alert = false; - } - } - } - if (normal_execution) { - if (clean_shutdown) { - this->m_shadow_map.save(this->m_shadow_filename); - this->restart_homestore(); - } else { - test_common::HSTestHelper::trigger_cp(true); - this->get_all(); - } - } else { - this->crash_and_recover(flip, operations, fmt::format("long_tree_{}", round)); - } - if (elapsed_time - last_progress_time > 30) { - last_progress_time = elapsed_time; - print_time = true; - } - if (print_time) { - LOGINFO("\n\n\n\t\t\tProgress: {} rounds of total {} ({:.2f}%) completed - Elapsed time: {:.0f} seconds of " - "total {} ({:.2f}%) - {} keys of maximum {} keys ({:.2f}%) inserted\n\n\n", - round, rounds, round * 100.0 / rounds, elapsed_time, this->m_run_time, - elapsed_time * 100.0 / this->m_run_time, this->tree_key_count(), num_entries, - this->tree_key_count() * 100.0 / num_entries); - } - // this->print_keys(fmt::format("reapply: after round {}", round)); - if (renew_btree_after_crash) { this->reset_btree(); }; - } - this->print_keys(fmt::format("tree at end")); - this->destroy_btree(); - log_obj_life_counter(); +TYPED_TEST(IndexCrashTest, long_running_put_remove_crash) { + long_running_crash_options crash_test_options{ + .put_freq = 50, + .put_flips = {"crash_flush_on_split_at_parent", "crash_flush_on_split_at_left_child", + "crash_flush_on_split_at_right_child"}, + .remove_flips = {"crash_flush_on_merge_at_parent", "crash_flush_on_merge_at_left_child" + /*, "crash_flush_on_freed_child"*/}, + }; + this->long_running_crash(crash_test_options); } // Basic reverse and forward order remove with different flip points diff --git a/src/tests/test_scripts/index_test.py b/src/tests/test_scripts/index_test.py index 54059cf0a..bf2098fd4 100755 --- a/src/tests/test_scripts/index_test.py +++ b/src/tests/test_scripts/index_test.py @@ -114,6 +114,18 @@ def long_running_crash_remove(options): run_crash_test(options, 'remove', 0) print("Long running crash put completed") +def long_running_crash_put_remove(options): + print("Long running crash put_remove started") + options['num_entries'] = 2000 # 1280K + options['init_device'] = True + options['run_time'] = 14400 # 4 hours + options['preload_size'] = 1024 + options['min_keys_in_node'] = 3 + options['max_keys_in_node'] = 10 + print(f"options: {options}") + run_crash_test(options, 'put_remove', 0) + print("Long running crash put_remove completed") + def main(): options = parse_arguments() @@ -132,6 +144,9 @@ def main(): def long_running(*args): options = parse_arguments() + for i in range(20): + print(f"Iteration {i + 1}") + long_running_crash_put_remove(options) for i in range(50): print(f"Iteration {i + 1}") long_running_crash_remove(options) From 3c1b26b1420973823d9a7662887da4c4c1ca7870 Mon Sep 17 00:00:00 2001 From: Mehdi Hosseini 
<116847813+shosseinimotlagh@users.noreply.github.com> Date: Fri, 16 May 2025 11:15:59 -0700 Subject: [PATCH 123/170] Fix prefix - reload compactbitset after updating node phys buffer (#724) --- conanfile.py | 3 +-- .../homestore/btree/detail/btree_node.hpp | 8 +++++++- .../homestore/btree/detail/prefix_node.hpp | 5 ++++- .../homestore/btree/detail/variant_node.hpp | 1 + src/tests/CMakeLists.txt | 2 +- src/tests/test_index_btree.cpp | 2 +- src/tests/test_mem_btree.cpp | 18 +++++++----------- 7 files changed, 22 insertions(+), 17 deletions(-) diff --git a/conanfile.py b/conanfile.py index c79c13e8d..43d3aafc8 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,8 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.13.12" - + version = "6.13.13" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" topics = ("ebay", "nublox") diff --git a/src/include/homestore/btree/detail/btree_node.hpp b/src/include/homestore/btree/detail/btree_node.hpp index 1c45501aa..5cdaa94c5 100644 --- a/src/include/homestore/btree/detail/btree_node.hpp +++ b/src/include/homestore/btree/detail/btree_node.hpp @@ -472,7 +472,13 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { } public: - void update_phys_buf(uint8_t* buf) { m_phys_node_buf = buf; } + void update_phys_buf(uint8_t* buf) { + m_phys_node_buf = buf; + on_update_phys_buf(); + } + // This method is called when the physical buffer is updated. + // Derived classes can override this method to perform additional actions. + virtual void on_update_phys_buf() = 0; persistent_hdr_t* get_persistent_header() { return r_cast< persistent_hdr_t* >(m_phys_node_buf); } const persistent_hdr_t* get_persistent_header_const() const { return r_cast< const persistent_hdr_t* >(m_phys_node_buf); diff --git a/src/include/homestore/btree/detail/prefix_node.hpp b/src/include/homestore/btree/detail/prefix_node.hpp index 7faa42b09..7525729b0 100644 --- a/src/include/homestore/btree/detail/prefix_node.hpp +++ b/src/include/homestore/btree/detail/prefix_node.hpp @@ -160,7 +160,10 @@ class FixedPrefixNode : public VariantNode< K, V > { } virtual ~FixedPrefixNode() = default; - + virtual void on_update_phys_buf() override { + // Update the prefix bitset with the new buffer + prefix_bitset_ = sisl::CompactBitSet{sisl::blob{bitset_area(), prefix_bitset_.size() / 8}, false}; + } ///////////////////////////// All overrides of BtreeIntervalNode /////////////////////////////////// /// @brief Upserts a batch of entries into a prefix node. 
/// diff --git a/src/include/homestore/btree/detail/variant_node.hpp b/src/include/homestore/btree/detail/variant_node.hpp index 283ca114a..b9ee4dd35 100644 --- a/src/include/homestore/btree/detail/variant_node.hpp +++ b/src/include/homestore/btree/detail/variant_node.hpp @@ -313,5 +313,6 @@ class VariantNode : public StoreSpecificBtreeNode { } return ret; } + virtual void on_update_phys_buf() override {}; }; } // namespace homestore \ No newline at end of file diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index f255ea81b..106a1afeb 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -71,7 +71,7 @@ if (${io_tests}) set(TEST_INDEXBTREE_SOURCE_FILES test_index_btree.cpp) add_executable(test_index_btree ${TEST_INDEXBTREE_SOURCE_FILES}) target_link_libraries(test_index_btree homestore ${COMMON_TEST_DEPS} GTest::gtest) - add_test(NAME IndexBtree COMMAND test_index_btree --gtest_filter=*/0.*) + add_test(NAME IndexBtree COMMAND test_index_btree) set_property(TEST IndexBtree PROPERTY ENVIRONMENT "ASAN_OPTIONS=detect_stack_use_after_return=true") set_tests_properties(IndexBtree PROPERTIES TIMEOUT 1200) diff --git a/src/tests/test_index_btree.cpp b/src/tests/test_index_btree.cpp index c62ce2f84..2c08b7d5c 100644 --- a/src/tests/test_index_btree.cpp +++ b/src/tests/test_index_btree.cpp @@ -133,7 +133,7 @@ struct BtreeTest : public BtreeTestHelper< TestType >, public ::testing::Test { test_common::HSTestHelper m_helper; }; -using BtreeTypes = testing::Types< FixedLenBtree, VarKeySizeBtree, VarValueSizeBtree, VarObjSizeBtree >; +using BtreeTypes = testing::Types< FixedLenBtree, PrefixIntervalBtree, VarKeySizeBtree, VarValueSizeBtree, VarObjSizeBtree >; TYPED_TEST_SUITE(BtreeTest, BtreeTypes); diff --git a/src/tests/test_mem_btree.cpp b/src/tests/test_mem_btree.cpp index 50f8df9cd..4625167fd 100644 --- a/src/tests/test_mem_btree.cpp +++ b/src/tests/test_mem_btree.cpp @@ -50,8 +50,8 @@ SISL_OPTION_GROUP( ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), (preload_size, "", "preload_size", "number of entries to preload tree with", ::cxxopts::value< uint32_t >()->default_value("1000"), "number"), - (max_keys_in_node, "", "max_keys_in_node", "max_keys_in_node", - ::cxxopts::value< uint32_t >()->default_value("0"), ""), + (max_keys_in_node, "", "max_keys_in_node", "max_keys_in_node", ::cxxopts::value< uint32_t >()->default_value("0"), + ""), (seed, "", "seed", "random engine seed, use random if not defined", ::cxxopts::value< uint64_t >()->default_value("0"), "number"), (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), "seconds")) @@ -111,16 +111,14 @@ struct BtreeTest : public BtreeTestHelper< TestType >, public ::testing::Test { #endif this->m_cfg.m_max_merge_level = SISL_OPTIONS["max_merge_level"].as< uint8_t >(); this->m_cfg.m_merge_turned_on = !SISL_OPTIONS["disable_merge"].as< bool >(); - //if TestType is PrefixIntervalBtreeTest print here something - if constexpr (std::is_same_v) { - this->m_cfg.m_merge_turned_on = false; - } + // if TestType is PrefixIntervalBtreeTest print here something + if constexpr (std::is_same_v< TestType, PrefixIntervalBtreeTest >) { this->m_cfg.m_merge_turned_on = false; } this->m_bt = std::make_shared< typename T::BtreeType >(this->m_cfg); } }; -using BtreeTypes = testing::Types< FixedLenBtreeTest, VarKeySizeBtreeTest, - VarValueSizeBtreeTest, VarObjSizeBtreeTest, PrefixIntervalBtreeTest >; +using BtreeTypes = testing::Types< FixedLenBtreeTest, 
PrefixIntervalBtreeTest, VarKeySizeBtreeTest, + VarValueSizeBtreeTest, VarObjSizeBtreeTest >; TYPED_TEST_SUITE(BtreeTest, BtreeTypes); TYPED_TEST(BtreeTest, SequentialInsert) { @@ -317,9 +315,7 @@ struct BtreeConcurrentTest : public BtreeTestHelper< TestType >, public ::testin #endif this->m_cfg.m_max_merge_level = SISL_OPTIONS["max_merge_level"].as< uint8_t >(); this->m_cfg.m_merge_turned_on = !SISL_OPTIONS["disable_merge"].as< bool >(); - if constexpr (std::is_same_v) { - this->m_cfg.m_merge_turned_on = false; - } + if constexpr (std::is_same_v< TestType, PrefixIntervalBtreeTest >) { this->m_cfg.m_merge_turned_on = false; } this->m_bt = std::make_shared< typename T::BtreeType >(this->m_cfg); } From 320687b72066d2c838827e56291b6e839057e12e Mon Sep 17 00:00:00 2001 From: Yaming Kuang <1477567+yamingk@users.noreply.github.com> Date: Sat, 17 May 2025 07:24:52 -0700 Subject: [PATCH 124/170] [Solo repl dev] Fix log dev flush timer cancel race (#723) * Wait on cancel_timer during stop logdev --- conanfile.py | 3 ++- src/lib/logstore/log_dev.cpp | 24 +++++++++++++++++------- src/lib/logstore/log_dev.hpp | 2 +- src/tests/test_meta_blk_mgr.cpp | 6 ++++-- src/tests/test_scripts/log_meta_test.py | 2 +- 5 files changed, 25 insertions(+), 12 deletions(-) diff --git a/conanfile.py b/conanfile.py index 43d3aafc8..38daaa660 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,8 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.13.13" + version = "6.13.14" + homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" topics = ("ebay", "nublox") diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp index c1ea83839..acdedc280 100644 --- a/src/lib/logstore/log_dev.cpp +++ b/src/lib/logstore/log_dev.cpp @@ -143,13 +143,17 @@ void LogDev::stop() { } folly::SharedMutexWritePriority::ReadHolder holder(m_store_map_mtx); - for (auto& [_, store] : m_id_logstore_map) + for (auto& [_, store] : m_id_logstore_map) { store.log_store->stop(); + } // after we call stop, we need to do any pending device truncations truncate(); m_id_logstore_map.clear(); - if (allow_timer_flush()) stop_timer(); + if (allow_timer_flush()) { + auto f = stop_timer(); + std::move(f).get(); + } } void LogDev::destroy() { @@ -167,13 +171,19 @@ void LogDev::start_timer() { }); } -void LogDev::stop_timer() { - if (m_flush_timer_hdl != iomgr::null_timer_handle) { - iomanager.run_on_forget(logstore_service().flush_thread(), [this]() { +folly::Future< int > LogDev::stop_timer() { + // return future to the caller; + // this future will be completed when the timer is stopped + auto p = std::make_shared< folly::Promise< int > >(); + auto f = p->getFuture(); + iomanager.run_on_forget(logstore_service().flush_thread(), [this, p]() mutable { + if (m_flush_timer_hdl != iomgr::null_timer_handle) { iomanager.cancel_timer(m_flush_timer_hdl, true); m_flush_timer_hdl = iomgr::null_timer_handle; - }); - } + } + p->setValue(0); + }); + return f; } void LogDev::do_load(off_t device_cursor) { diff --git a/src/lib/logstore/log_dev.hpp b/src/lib/logstore/log_dev.hpp index d43dab219..8a5954f67 100644 --- a/src/lib/logstore/log_dev.hpp +++ b/src/lib/logstore/log_dev.hpp @@ -721,7 +721,7 @@ class LogDev : public std::enable_shared_from_this< LogDev > { private: void start_timer(); - void stop_timer(); + folly::Future< int > stop_timer(); bool allow_inline_flush() const { return uint32_cast(m_flush_mode) & uint32_cast(flush_mode_t::INLINE); } bool allow_timer_flush() const { return 
uint32_cast(m_flush_mode) & uint32_cast(flush_mode_t::TIMER); } diff --git a/src/tests/test_meta_blk_mgr.cpp b/src/tests/test_meta_blk_mgr.cpp index af80e9da5..f087f81c0 100644 --- a/src/tests/test_meta_blk_mgr.cpp +++ b/src/tests/test_meta_blk_mgr.cpp @@ -187,7 +187,7 @@ class VMetaBlkMgrTest : public ::testing::Test { uint64_t total_size_written(const void* cookie) { return m_mbm->meta_size(cookie); } void do_write_to_full() { - static constexpr uint64_t blkstore_overhead = 4 * 1024ul * 1024ul; // 4MB + static constexpr uint64_t blkstore_overhead = 256 * 1024ul * 1024ul; // 256MB ssize_t free_size = uint64_cast(m_mbm->total_size() - m_mbm->used_size() - blkstore_overhead); HS_REL_ASSERT_GT(free_size, 0); @@ -195,8 +195,10 @@ class VMetaBlkMgrTest : public ::testing::Test { uint64_t size_written{0}; while (free_size > 0) { + LOGDEBUG("free size: {}, total size: {}, used size: {}, available blks: {}", free_size, m_mbm->total_size(), + m_mbm->used_size(), m_mbm->available_blks()); // if it is overflow, 2 extra blocks are needed for ovf blk header and meta blk; - if (free_size - 2 * m_mbm->block_size() >= gp.max_wrt_sz) { + if (free_size - 2 * m_mbm->block_size() >= gp.max_wrt_sz) { size_written = do_sb_write(do_overflow(), 0); } else { size_written = do_sb_write(false, m_mbm->meta_blk_context_sz()); diff --git a/src/tests/test_scripts/log_meta_test.py b/src/tests/test_scripts/log_meta_test.py index 5ffda0018..83c8f994f 100755 --- a/src/tests/test_scripts/log_meta_test.py +++ b/src/tests/test_scripts/log_meta_test.py @@ -85,7 +85,7 @@ def meta_nightly(options, addln_opts): subprocess.check_call(options.dirpath + "test_meta_blk_mgr " + cmd_opts + addln_opts, stderr=subprocess.STDOUT, shell=True) - cmd_opts = "--gtest_filter=VMetaBlkMgrTest.random_load_test --gtest_filter=VMetaBlkMgrTest.write_to_full_test --use_file=true" # write to file instead of real disk to save time; + cmd_opts = "--gtest_filter=VMetaBlkMgrTest.write_to_full_test --use_file=true" # write to file instead of real disk to save time; subprocess.check_call(options.dirpath + "test_meta_blk_mgr " + cmd_opts + addln_opts, stderr=subprocess.STDOUT, shell=True) From 635d7801e5e010b71d5fe829defcc7aa14d58946 Mon Sep 17 00:00:00 2001 From: Sanal Date: Tue, 20 May 2025 11:11:28 -0700 Subject: [PATCH 125/170] Add submit_io_batch api in repl dev. (#725) Use submit_io_batch when part_of_batch is set to true for read/write. --- conanfile.py | 2 +- src/include/homestore/blkdata_service.hpp | 7 +++++++ src/lib/blkdata_svc/blkdata_service.cpp | 2 ++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index 38daaa660..28682756e 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.13.14" + version = "6.14.0" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/blkdata_service.hpp b/src/include/homestore/blkdata_service.hpp index 87aed9e01..9671a3901 100644 --- a/src/include/homestore/blkdata_service.hpp +++ b/src/include/homestore/blkdata_service.hpp @@ -151,6 +151,13 @@ class BlkDataService { folly::Future< std::error_code > async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, bool part_of_batch = false); + /** + * @brief Submit the io batch, which is a mandatory method to be called if read/write are issued with part_of_batch + * is set to true. In those cases, without this method, IOs might not be even issued. 
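+ * A minimal caller sketch (illustrative only; the hs()->data_service() accessor and the
+ * bid/sgs variable names are assumptions, not part of this patch):
+ *   hs()->data_service().async_read(bid1, sgs1, size1, true);  // part_of_batch=true, IO is only queued
+ *   hs()->data_service().async_read(bid2, sgs2, size2, true);  // part_of_batch=true, IO is only queued
+ *   hs()->data_service().submit_io_batch();                    // queued IOs are actually dispatched
+ *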
No-op if previous io requests + * are not part of batch. + * */ + void submit_io_batch(); + /** * @brief Commits the block with the given MultiBlkId. * diff --git a/src/lib/blkdata_svc/blkdata_service.cpp b/src/lib/blkdata_svc/blkdata_service.cpp index d17462ac2..f327d7834 100644 --- a/src/lib/blkdata_svc/blkdata_service.cpp +++ b/src/lib/blkdata_svc/blkdata_service.cpp @@ -222,6 +222,8 @@ BlkDataService::async_write(sisl::sg_list const& sgs, std::vector< MultiBlkId > return collect_all_futures(s_futs); } +void BlkDataService::submit_io_batch() { m_vdev->submit_batch(); } + BlkAllocStatus BlkDataService::alloc_blks(uint32_t size, const blk_alloc_hints& hints, MultiBlkId& out_blkids) { if (is_stopping()) return BlkAllocStatus::FAILED; incr_pending_request_num(); From 052394fa4ce5d5e565ab6591d2ceeee95a922041 Mon Sep 17 00:00:00 2001 From: yuwmao Date: Thu, 15 May 2025 11:38:30 +0800 Subject: [PATCH 126/170] Redesign replacemember API --- conanfile.py | 2 +- .../homestore/replication/repl_decls.h | 20 +- src/include/homestore/replication/repl_dev.h | 12 +- src/include/homestore/replication_service.hpp | 12 +- src/lib/common/homestore_config.fbs | 9 +- src/lib/common/homestore_utils.hpp | 4 + src/lib/replication/repl_dev/common.cpp | 2 +- .../replication/repl_dev/raft_repl_dev.cpp | 499 ++++++++++++++---- src/lib/replication/repl_dev/raft_repl_dev.h | 23 +- src/lib/replication/repl_dev/solo_repl_dev.h | 9 +- .../replication/service/generic_repl_svc.cpp | 17 +- .../replication/service/generic_repl_svc.h | 12 +- .../replication/service/raft_repl_service.cpp | 55 +- .../replication/service/raft_repl_service.h | 13 +- src/tests/test_common/raft_repl_test_base.hpp | 41 +- src/tests/test_raft_repl_dev_dynamic.cpp | 111 +++- src/tests/test_solo_repl_dev.cpp | 3 +- 17 files changed, 672 insertions(+), 172 deletions(-) diff --git a/conanfile.py b/conanfile.py index 28682756e..7734cc43d 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.14.0" + version = "6.15.0" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/replication/repl_decls.h b/src/include/homestore/replication/repl_decls.h index 54435c1c7..7bbb87f40 100644 --- a/src/include/homestore/replication/repl_decls.h +++ b/src/include/homestore/replication/repl_decls.h @@ -38,6 +38,12 @@ VENUM(ReplServiceError, int32_t, DATA_DUPLICATED = -20002, QUIENCE_STATE = -20003, FAILED = -32768); + +VENUM(PeerRole, uint8_t, + UNKNOWN = 0, + LEADER = 1, + FOLLOWER = 2, + LEARNER = 3); // clang-format on template < typename V, typename E > @@ -76,15 +82,15 @@ struct peer_info { // Peer ID. replica_id_t id_; // The last replication index that the peer has, from this server's point of view. 
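+ // (The in-class initializers added below mean a default-constructed peer_info reports
+ // zeroed counters and an UNKNOWN role rather than indeterminate values.)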
- uint64_t replication_idx_; + uint64_t replication_idx_ = 0; // The elapsed time since the last successful response from this peer, set to 0 on leader - uint64_t last_succ_resp_us_; + uint64_t last_succ_resp_us_ = 0; // The priority for leader election - uint32_t priority_; - // The peer is learner or not - bool is_learner_; - // The peer is new joiner or not - bool is_new_joiner_; + uint32_t priority_ = 0; + // The peer role in replication group + PeerRole role_ = PeerRole::UNKNOWN; + // If this peer is myself + bool is_self_ = false; }; struct replica_member_info { diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index 9a4cba340..eedbebc44 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -46,7 +46,8 @@ VENUM(journal_type_t, uint16_t, HS_DATA_LINKED = 0, // Linked data where each entry will store physical blkid where data reside HS_DATA_INLINED = 1, // Data is inlined in the header of journal entry HS_CTRL_DESTROY = 2, // Control message to destroy the repl_dev - HS_CTRL_REPLACE = 3, // Control message to replace a member + HS_CTRL_START_REPLACE = 3, // Control message to start replace a member + HS_CTRL_COMPLETE_REPLACE = 4, // Control message to complete replace a member ) // magic num comes from the first 8 bytes of 'echo homestore_resync_data | md5sum' @@ -367,8 +368,13 @@ class ReplDevListener { /// after restart in case crash happened during the destroy. virtual void on_destroy(const group_id_t& group_id) = 0; - /// @brief Called when replace member is performed. - virtual void on_replace_member(const replica_member_info& member_out, const replica_member_info& member_in) = 0; + /// @brief Called when start replace member. + virtual void on_start_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, + trace_id_t tid) = 0; + + /// @brief Called when complete replace member. 
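+ /// In the two-phase flow this is expected to fire once the HS_CTRL_COMPLETE_REPLACE entry
+ /// commits, i.e. after the old member has been removed from the raft config, whereas
+ /// on_start_replace_member() fires for the earlier HS_CTRL_START_REPLACE entry before the
+ /// new member is added. The pair lets a listener persist and later clear its own
+ /// "replace in progress" bookkeeping (how it does so is up to the listener).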
+ virtual void on_complete_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, + trace_id_t tid) = 0; /// @brief Called when the snapshot is being created by nuraft virtual AsyncReplResult<> create_snapshot(shared< snapshot_context > context) = 0; diff --git a/src/include/homestore/replication_service.hpp b/src/include/homestore/replication_service.hpp index 23ee2422c..56154226b 100644 --- a/src/include/homestore/replication_service.hpp +++ b/src/include/homestore/replication_service.hpp @@ -41,10 +41,16 @@ class ReplicationService { /// @return A Future which gets called after schedule to release (before garbage collection is kicked in) virtual folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) = 0; - virtual AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, - const replica_member_info& member_in, - uint32_t commit_quorum = 0, uint64_t trace_id = 0) const = 0; + virtual AsyncReplResult<> start_replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum = 0, + uint64_t trace_id = 0) const = 0; + virtual AsyncReplResult<> complete_replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum = 0, + uint64_t trace_id = 0) const = 0; + + virtual AsyncReplResult<> flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, uint32_t commit_quorum, + bool wait_and_verify = true, uint64_t trace_id = 0) const = 0; /// @brief Get the repl dev for a given group id if it is already created or opened /// @param group_id Group id interested in /// @return ReplDev is opened or ReplServiceError::SERVER_NOT_FOUND if it doesn't exist diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs index a661da497..053022c0c 100644 --- a/src/lib/common/homestore_config.fbs +++ b/src/lib/common/homestore_config.fbs @@ -250,7 +250,8 @@ table Consensus { stale_log_gap_lo_threshold: int32 = 30; // Minimum log gap a replica has to be from leader before joining the replica set. - min_log_gap_to_join: int32 = 2147483647; + // 0 indicates the new member will join in cluster immediately. + min_log_gap_to_join: int32 = 0; // amount of time in millis to wait on data write before fetch data from remote; wait_data_write_timer_ms: uint64 = 1500 (hotswap); @@ -290,6 +291,12 @@ table Consensus { // then decay the target_priority and wait again until its priority >= target_priority. This setting helps us to set proper priority for peers. // 0 means all members have the same priority. 
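+ // (One concrete reading, stated as an assumption: a peer whose priority is still below the
+ // decaying target_priority sits out an election round, and after at most this many waiting
+ // rounds the target has decayed enough for the peer to stand for election.)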
max_wait_rounds_of_priority_election: uint32 = 2; + + // Maximum number of retries when raft is undergoing config changing + config_changing_error_retries: int32 = 3; + + // The time to wait for config change to be applied in ms + wait_for_config_change_ms: uint32 = 500; } table HomeStoreSettings { diff --git a/src/lib/common/homestore_utils.hpp b/src/lib/common/homestore_utils.hpp index 2ee51b03d..b6989ff48 100644 --- a/src/lib/common/homestore_utils.hpp +++ b/src/lib/common/homestore_utils.hpp @@ -53,4 +53,8 @@ class hs_utils { static bool topological_sort(std::unordered_map< std::string, std::vector< std::string > >& DAG, std::vector< std::string >& ordered_entries); }; + +static bool wait_and_check(const std::function< bool() >& check_func, uint32_t timeout_ms, + uint32_t interval_ms = 100); + } // namespace homestore diff --git a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp index 2782a36a5..3b44600ca 100644 --- a/src/lib/replication/repl_dev/common.cpp +++ b/src/lib/replication/repl_dev/common.cpp @@ -266,7 +266,7 @@ std::string repl_req_ctx::to_string() const { } std::string repl_req_ctx::to_compact_string() const { - if (m_op_code == journal_type_t::HS_CTRL_DESTROY || m_op_code == journal_type_t::HS_CTRL_REPLACE) { + if (m_op_code == journal_type_t::HS_CTRL_DESTROY || m_op_code == journal_type_t::HS_CTRL_START_REPLACE || m_op_code == journal_type_t::HS_CTRL_COMPLETE_REPLACE) { return fmt::format("term={} lsn={} op={}", m_rkey.term, m_lsn, enum_name(m_op_code)); } diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 88aa9d6c3..f77f8f513 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -15,6 +15,7 @@ #include "common/homestore_assert.hpp" #include "common/homestore_config.hpp" +#include "common/homestore_utils.hpp" #include "replication/service/raft_repl_service.h" #include "replication/repl_dev/raft_repl_dev.h" #include "device/chunk.h" @@ -136,16 +137,16 @@ bool RaftReplDev::join_group() { return true; } -AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum, - uint64_t trace_id) { +AsyncReplResult<> RaftReplDev::start_replace_member(const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum, + uint64_t trace_id) { if (is_stopping()) { - LOGINFO("repl dev is being shutdown! trace_id={}", trace_id); + RD_LOGI(trace_id, "repl dev is being shutdown!"); return make_async_error<>(ReplServiceError::STOPPING); } incr_pending_request_num(); - RD_LOGI(trace_id, "Replace member, member_out={} member_in={}", boost::uuids::to_string(member_out.id), + RD_LOGI(trace_id, "Start replace member, member_out={} member_in={}", boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); if (commit_quorum >= 1) { @@ -153,105 +154,375 @@ AsyncReplResult<> RaftReplDev::replace_member(const replica_member_info& member_ reset_quorum_size(commit_quorum, trace_id); } - // Step 1: Check if leader itself is requested to move out. + // Step1, validate request + auto out_srv_cfg = raft_server()->get_config()->get_server(nuraft_mesg::to_server_id(member_out.id)); + if (!out_srv_cfg) { + RD_LOGE(trace_id, "Step1. 
Replace member invalid parameter, out member is not found"); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); + } + // Check if leader itself is requested to move out. if (m_my_repl_id == member_out.id && m_my_repl_id == get_leader_id()) { - // If leader is the member requested to move out, then give up leadership and return error. - // Client will retry replace_member request to the new leader. + // If leader is the member requested to move out, then set priority to 0(or it will be elected as leader again) + // and give up leadership and return error. Client will retry start_replace_member request to the new leader. + RD_LOGI(trace_id, "Step1. Replace member, leader is the member_out, member_out={}", + boost::uuids::to_string(member_out.id)); + if (out_srv_cfg->get_priority() != 0) { + auto ret = set_priority(member_out, 0, trace_id); + if (ret != ReplServiceError::OK) { + // Actually this is the expected path, because nuraft will BROADCAST error if we are trying to set + // leader's priority=0 + RD_LOGE(trace_id, "Step1. Replace member, set leader's priority to 0, failed {}", ret); + } + } raft_server()->yield_leadership(true /* immediate */, -1 /* successor */); - RD_LOGI(trace_id, "Replace member leader is the member_out so yield leadership"); + RD_LOGI(trace_id, "Step1. Replace member, leader is the member_out so yield leadership"); reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_error<>(ReplServiceError::NOT_LEADER); } - // Step 2. Add the new member. - return m_msg_mgr.add_member(m_group_id, member_in.id) - .via(&folly::InlineExecutor::instance()) - .thenValue([this, member_in, member_out, commit_quorum, trace_id](auto&& e) -> AsyncReplResult<> { - // TODO Currently we ignore the cancelled, fix nuraft_mesg to not timeout - // when adding member. Member is added to cluster config until member syncs fully - // with atleast stop gap. This will take a lot of time for block or - // object storage. - if (e.hasError()) { - // Ignore the server already exists as server already added to the cluster. - // The pg member change requests from control path are idemepotent and request - // can be resend and one of the add or remove can failed and has to retried. - if (e.error() == nuraft::cmd_result_code::CANCELLED || - e.error() == nuraft::cmd_result_code::SERVER_ALREADY_EXISTS) { - RD_LOGI(trace_id, "Ignoring error returned from nuraft add_member {}", e.error()); - } else { - RD_LOGE(trace_id, "Replace member error in add member : {}", e.error()); - reset_quorum_size(0, trace_id); - decr_pending_request_num(); - return make_async_error<>(RaftReplService::to_repl_error(e.error())); + // Step 2: Handle out member. +#ifdef _PRERELEASE + if (iomgr_flip::instance()->test_flip("replace_member_set_learner_failure")) { + RD_LOGE(trace_id, "Simulating set member to learner failure"); + return make_async_error(ReplServiceError::FAILED); + } +#endif + RD_LOGI(trace_id, "Step2. Replace member flip member to learner"); + auto learner_ret = do_flip_learner(member_out, true, true, trace_id); + if (learner_ret != ReplServiceError::OK) { + RD_LOGE(trace_id, "Step2. Replace member set learner failed {}", learner_ret); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error(std::move(learner_ret)); + } + RD_LOGI(trace_id, "Step2. Replace member flip out member to learner and set priority to 0"); + + // Step 3. 
Append log entry to mark the old member is out and new member is added. + RD_LOGI(trace_id, "Step3. Replace member propose to raft for HS_CTRL_START_REPLACE req, group_id={}", + group_id_str()); + auto rreq = repl_req_ptr_t(new repl_req_ctx{}); + start_replace_members_ctx members; + members.replica_out = member_out; + members.replica_in = member_in; + + sisl::blob header(r_cast< uint8_t* >(&members), sizeof(start_replace_members_ctx)); + rreq->init(repl_key{.server_id = server_id(), + .term = raft_server()->get_term(), + .dsn = m_next_dsn.fetch_add(1), + .traceID = trace_id}, + journal_type_t::HS_CTRL_START_REPLACE, true, header, sisl::blob{}, 0, m_listener); + + auto err = m_state_machine->propose_to_raft(std::move(rreq)); + if (err != ReplServiceError::OK) { + RD_LOGE(trace_id, "Step3. Replace member propose to raft for HS_CTRL_START_REPLACE req failed {}", err); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(std::move(err)); + } + + // Step 4. Add the new member, new member will inherit the priority of the out member. +#ifdef _PRERELEASE + if (iomgr_flip::instance()->test_flip("replace_member_add_member_failure")) { + RD_LOGE(trace_id, "Simulating add member failure"); + return make_async_error(ReplServiceError::FAILED); + } +#endif + RD_LOGI(trace_id, "Step4. Replace member propose to raft to add new member, group_id={}", group_id_str()); + auto ret = do_add_member(member_in, trace_id); + if (ret != ReplServiceError::OK) { + RD_LOGE(trace_id, "Step4. Replace member, add member failed {}", ret); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(std::move(ret)); + } + RD_LOGI(trace_id, "Step4. Proposed to raft to add member, member={}", boost::uuids::to_string(member_in.id)); + + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_success<>(); +} + +AsyncReplResult<> RaftReplDev::complete_replace_member(const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum, + uint64_t trace_id) { + if (is_stopping()) { + RD_LOGI(trace_id, "repl dev is being shutdown!"); + return make_async_error<>(ReplServiceError::STOPPING); + } + incr_pending_request_num(); + + RD_LOGI(trace_id, "Complete replace member, member={}", boost::uuids::to_string(member_out.id), + boost::uuids::to_string(member_out.id)); + + if (commit_quorum >= 1) { + // Two members are down and leader cant form the quorum. Reduce the quorum size. + reset_quorum_size(commit_quorum, trace_id); + } + + // Step 5: Remove member + RD_LOGI(trace_id, "Step5. Replace member, remove old member, member={}", boost::uuids::to_string(member_out.id)); +#ifdef _PRERELEASE + if (iomgr_flip::instance()->test_flip("replace_member_remove_member_failure")) { + RD_LOGE(trace_id, "Simulating remove member failure"); + return make_async_error(ReplServiceError::FAILED); + } +#endif + auto ret = do_remove_member(member_out, trace_id); + if (ret != ReplServiceError::OK) { + RD_LOGE(trace_id, "Step5. Replace member, failed to remove member, member={}, err={}", + boost::uuids::to_string(member_out.id), ret); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(std::move(ret)); + } + RD_LOGI(trace_id, "Step5. 
Replace member, proposed to raft to remove member, member={}", + boost::uuids::to_string(member_out.id)); + auto timeout = HS_DYNAMIC_CONFIG(consensus.wait_for_config_change_ms); + // TODO Move wait logic to nuraft_mesg + if (!wait_and_check( + [&]() { + auto srv_conf = raft_server()->get_srv_config(nuraft_mesg::to_server_id(member_out.id)); + if (srv_conf) { + RD_LOGD(trace_id, "out member still exists in raft group, member={}", + boost::uuids::to_string(member_out.id)); + return false; } - } + return true; + }, + timeout)) { + RD_LOGD(trace_id, + "Step5. Replace member, wait for old member removed timed out, cancel the request, timeout: {}", + timeout); + // If the member_out is down, leader will force remove it after + // leave_timeout=leave_limit_(default=5)*heart_beat_interval_, it's better for client to retry it. + return make_async_error<>(ReplServiceError::CANCELLED); + } + RD_LOGD(trace_id, "Step5. Replace member, old member is removed, member={}", + boost::uuids::to_string(member_out.id)); + + // Step 6. Append log entry to complete replace member + RD_LOGI(trace_id, "Step6. Replace member, propose to raft for HS_CTRL_COMPLETE_REPLACE req, group_id={}", + group_id_str()); + auto rreq = repl_req_ptr_t(new repl_req_ctx{}); + start_replace_members_ctx members; + members.replica_out = member_out; + members.replica_in = member_in; + + sisl::blob header(r_cast< uint8_t* >(&members), sizeof(start_replace_members_ctx)); + rreq->init(repl_key{.server_id = server_id(), + .term = raft_server()->get_term(), + .dsn = m_next_dsn.fetch_add(1), + .traceID = trace_id}, + journal_type_t::HS_CTRL_COMPLETE_REPLACE, true, header, sisl::blob{}, 0, m_listener); + + auto err = m_state_machine->propose_to_raft(std::move(rreq)); + if (err != ReplServiceError::OK) { + RD_LOGE(trace_id, "Step6. Replace member, propose to raft for HS_CTRL_COMPLETE_REPLACE req failed, err={}", + err); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(std::move(err)); + } - RD_LOGI(trace_id, "Replace member added member={} to group_id={}", boost::uuids::to_string(member_in.id), - group_id_str()); - - // Step 3. Append log entry to mark the old member is out and new member is added. - auto rreq = repl_req_ptr_t(new repl_req_ctx{}); - replace_members_ctx members; - members.replica_out = member_out; - members.replica_in = member_in; - - sisl::blob header(r_cast< uint8_t* >(&members), sizeof(replace_members_ctx)); - auto status = init_req_ctx(rreq, - repl_key{.server_id = server_id(), - .term = raft_server()->get_term(), - .dsn = m_next_dsn.fetch_add(1), - .traceID = trace_id}, - journal_type_t::HS_CTRL_REPLACE, true, header, sisl::blob{}, 0, m_listener); - - if (status != ReplServiceError::OK) { - // Failed to initialize the repl_req_ctx for replace member. 
- RD_LOGE(trace_id, "Failed to initialize repl_req_ctx for replace member, error={}", status); - reset_quorum_size(0, trace_id); - decr_pending_request_num(); - return make_async_error<>(std::move(status)); - } + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + RD_LOGI(trace_id, "Complete replace member done, group_id={}, member_out={} member_in={}", group_id_str(), + boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); + return make_async_success<>(); +} - status = m_state_machine->propose_to_raft(std::move(rreq)); - if (status != ReplServiceError::OK) { - RD_LOGE(trace_id, "Replace member propose to raft failed {}", status); - reset_quorum_size(0, trace_id); - decr_pending_request_num(); - return make_async_error<>(std::move(status)); - } +ReplServiceError RaftReplDev::do_add_member(const replica_member_info& member, uint64_t trace_id) { + if (m_my_repl_id != get_leader_id()) { + RD_LOGI(trace_id, "Member to add failed, not leader"); + return ReplServiceError::BAD_REQUEST; + } + auto ret = retry_when_config_change( + [&] { + auto rem_ret = m_msg_mgr.add_member(m_group_id, member.id) + .via(&folly::InlineExecutor::instance()) + .thenValue([this, member, trace_id](auto&& e) -> nuraft::cmd_result_code { + return e.hasError() ? e.error() : nuraft::cmd_result_code::OK; + }); + return rem_ret.value(); + }, + trace_id); + if (ret == nuraft::cmd_result_code::SERVER_ALREADY_EXISTS) { + RD_LOGW(trace_id, "Ignoring error returned from nuraft add_member, member={}, err={}", + boost::uuids::to_string(member.id), ret); + } else if (ret != nuraft::cmd_result_code::OK) { + // Its ok to retry this request as the request + // of replace member is idempotent. + RD_LOGE(trace_id, "Add member failed, member={}, err={}", boost::uuids::to_string(member.id), ret); + return ReplServiceError::RETRY_REQUEST; + } + RD_LOGI(trace_id, "Proposed to raft to add member, member={}", boost::uuids::to_string(member.id)); + return ReplServiceError::OK; +} - RD_LOGI(trace_id, "Replace member proposed to raft group_id={}", group_id_str()); - - // Step 4. Remove the old member. Even if the old member is temporarily - // down and recovers, nuraft mesg see member remove from cluster log - // entry and call exit_group() and leave(). - return m_msg_mgr.rem_member(m_group_id, member_out.id) - .via(&folly::InlineExecutor::instance()) - .thenValue([this, member_out, commit_quorum, trace_id](auto&& e) -> AsyncReplResult<> { - if (e.hasError()) { - // Ignore the server not found as server removed from the cluster - // as requests are idempotent and can be resend. - if (e.error() == nuraft::cmd_result_code::SERVER_NOT_FOUND) { - RD_LOGW(trace_id, "Remove member not found in group error, ignoring"); - } else { - // Its ok to retry this request as the request - // of replace member is idempotent. - RD_LOGE(trace_id, "Replace member failed to remove member : {}", e.error()); - reset_quorum_size(0, trace_id); - decr_pending_request_num(); - return make_async_error<>(ReplServiceError::RETRY_REQUEST); - } - } else { - RD_LOGI(trace_id, "Replace member removed member={} from group_id={}", - boost::uuids::to_string(member_out.id), group_id_str()); - } +ReplServiceError RaftReplDev::do_remove_member(const replica_member_info& member, uint64_t trace_id) { + // The member should not be the leader. + if (m_my_repl_id == member.id && m_my_repl_id == get_leader_id()) { + // If leader is the member requested to move out, then give up leadership and return error. 
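+ // (yield_leadership(true /* immediate */, -1) resigns right away and lets raft pick any
+ // successor, rather than first waiting for a designated successor to catch up.)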
+        // Client will retry start_replace_member request to the new leader.
+        raft_server()->yield_leadership(true /* immediate */, -1 /* successor */);
+        RD_LOGI(trace_id, "Member to remove is the leader so yield leadership");
+        return ReplServiceError::NOT_LEADER;
+    }
+    auto ret = retry_when_config_change(
+        [&] {
+            auto rem_ret = m_msg_mgr.rem_member(m_group_id, member.id)
+                               .via(&folly::InlineExecutor::instance())
+                               .thenValue([this, member, trace_id](auto&& e) -> nuraft::cmd_result_code {
+                                   return e.hasError() ? e.error() : nuraft::cmd_result_code::OK;
+                               });
+            return rem_ret.value();
+        },
+        trace_id);
+    if (ret == nuraft::cmd_result_code::SERVER_NOT_FOUND) {
+        RD_LOGW(trace_id, "Remove member not found in group error, ignoring, member={}",
+                boost::uuids::to_string(member.id));
+    } else if (ret != nuraft::cmd_result_code::OK) {
+        // It's ok to retry this request as the replace member request is idempotent.
+        RD_LOGE(trace_id, "Replace member failed to remove member, member={}, err={}",
+                boost::uuids::to_string(member.id), ret);
+        return ReplServiceError::RETRY_REQUEST;
+    }
+    RD_LOGI(trace_id, "Proposed to raft to remove member, member={}", boost::uuids::to_string(member.id));
+    return ReplServiceError::OK;
+}

+AsyncReplResult<> RaftReplDev::flip_learner_flag(const replica_member_info& member, bool target, uint32_t commit_quorum,
+                                                 bool wait_and_verify, uint64_t trace_id) {
+    RD_LOGI(trace_id, "Flip learner flag to {}, member={}", target, boost::uuids::to_string(member.id));
+    if (is_stopping()) {
+        RD_LOGI(trace_id, "repl dev is being shut down!");
+        return make_async_error<>(ReplServiceError::STOPPING);
+    }
+    incr_pending_request_num();
+
+    if (commit_quorum >= 1) {
+        // Two members are down and leader can't form the quorum. Reduce the quorum size.
+        reset_quorum_size(commit_quorum, trace_id);
+    }
+    auto ret = do_flip_learner(member, target, wait_and_verify, trace_id);
+    if (ret != ReplServiceError::OK) {
+        RD_LOGE(trace_id, "Flip learner flag failed {}, member={}", ret, boost::uuids::to_string(member.id));
+        reset_quorum_size(0, trace_id);
+        decr_pending_request_num();
+        return make_async_error<>(std::move(ret));
+    }
+    RD_LOGI(trace_id, "Learner flag has been set to {}, member={}", target, boost::uuids::to_string(member.id));
+    // Revert any quorum override and release the pending request before returning success.
+    reset_quorum_size(0, trace_id);
+    decr_pending_request_num();
+    return make_async_success<>();
+}
+
+ReplServiceError RaftReplDev::do_flip_learner(const replica_member_info& member, bool target, bool wait_and_verify,
+                                              uint64_t trace_id) {
+    // 1. Prerequisite check
+    if (m_my_repl_id != get_leader_id()) {
+        RD_LOGI(trace_id, "flip learner flag failed, not leader");
+        return ReplServiceError::NOT_LEADER;
+    }
+    if (!target && member.priority == 0) {
+        RD_LOGI(trace_id, "clear learner flag failed, priority is 0, member={}", boost::uuids::to_string(member.id));
+        return ReplServiceError::BAD_REQUEST;
+    }
+
+    // 2. Flip learner
+    RD_LOGI(trace_id, "flip learner flag to {}, member={}", target, boost::uuids::to_string(member.id));
+    auto srv_cfg = raft_server()->get_config()->get_server(nuraft_mesg::to_server_id(member.id));
+    if (!srv_cfg) {
+        RD_LOGE(trace_id, "invalid parameter, member is not found, member={}", boost::uuids::to_string(member.id));
+        return ReplServiceError::SERVER_NOT_FOUND;
+    }
+    if (srv_cfg->is_learner() != target) {
+        auto ret = retry_when_config_change(
+            [&] {
+                auto learner_ret = raft_server()->flip_learner_flag(nuraft_mesg::to_server_id(member.id), target);
+                return learner_ret->get_result_code();
+            },
+            trace_id);
+        if (ret != nuraft::cmd_result_code::OK) {
+            RD_LOGE(trace_id, "Propose to raft to flip learner failed, err: {}", ret);
+            return ReplServiceError::RETRY_REQUEST;
+        }
+    } else {
+        RD_LOGD(trace_id, "learner flag has already been set to {}, skip, member={}", target,
+                boost::uuids::to_string(member.id));
+    }
+
+    // 3. Set priority
+    // Based on the current nuraft implementation, learner could be elected as leader, so we set priority to 0 to avoid
+    // it. And in turn, we need to revert the priority change if the member is going to become a normal member.
+    auto priority = target ? 0 : member.priority;
+    RD_LOGI(trace_id, "Set the priority of the member to {}, member={}", priority, boost::uuids::to_string(member.id));
+    if (srv_cfg->get_priority() != priority) {
+        auto priority_ret = set_priority(member, priority);
+        if (priority_ret != ReplServiceError::OK) { return ReplServiceError::NOT_LEADER; }
+    } else {
+        RD_LOGD(trace_id, "Priority has already been set to {}, skip, member={}", priority,
+                boost::uuids::to_string(member.id));
+    }
+
+    // 4. Verification
+    if (wait_and_verify) {
+        auto timeout = HS_DYNAMIC_CONFIG(consensus.wait_for_config_change_ms);
+        if (!wait_and_check(
+                [&]() {
+                    auto srv_conf = raft_server()->get_srv_config(nuraft_mesg::to_server_id(member.id));
+                    return srv_conf->is_learner() == target && srv_conf->get_priority() == priority;
+                },
+                timeout)) {
+            RD_LOGD(trace_id, "Wait for learner and priority config change timed out, cancel the request, timeout: {}",
+                    timeout);
+            return ReplServiceError::CANCELLED;
+        }
+    }
+
+    return ReplServiceError::OK;
+}
+
+nuraft::cmd_result_code RaftReplDev::retry_when_config_change(const std::function< nuraft::cmd_result_code() >& func,
+                                                              uint64_t trace_id) {
+    auto ret = nuraft::cmd_result_code::OK;
+    int32_t retries = HS_DYNAMIC_CONFIG(consensus.config_changing_error_retries);
+    for (auto i = 0; i < retries; i++) {
+        ret = func();
+        if (ret == nuraft::cmd_result_code::CONFIG_CHANGING) {
+            RD_LOGW(trace_id, "Propose to raft failed due to config_changing, attempt: {}", i);
+            std::this_thread::sleep_for(std::chrono::milliseconds(500));
+            continue;
+        }
+        break;
+    }
+    return ret;
+}
+
+bool RaftReplDev::wait_and_check(const std::function< bool() >& check_func, uint32_t timeout_ms, uint32_t interval_ms) {
+    auto times = timeout_ms / interval_ms;
+    if (times == 0) { times = 1; }
+    for (auto i = 0; i < static_cast< int32_t >(times); i++) {
+        if (check_func()) { return true; }
+        std::this_thread::sleep_for(std::chrono::milliseconds(interval_ms));
+    }
+    return false;
+}
+
+ReplServiceError RaftReplDev::set_priority(const replica_member_info& member_out, int32_t priority, uint64_t trace_id) {
+    auto priority_ret = raft_server()->set_priority(nuraft_mesg::to_server_id(member_out.id), priority);
+    // Set_priority should be handled by leader, but if the intent is to set the leader's priority to 0, it returns
+    // BROADCAST. In this case, return NOT_LEADER to let the client retry on the new leader.
+    // If there is an uncommitted config, nuraft set_priority will honor this uncommitted config and generate new
+    // config based on it and won't have config_changing error.
+    if (priority_ret != nuraft::raft_server::PrioritySetResult::SET) {
+        RD_LOGE(trace_id, "Propose to raft to set priority failed, result: {}",
+                priority_ret == nuraft::raft_server::PrioritySetResult::BROADCAST ? "BROADCAST" : "IGNORED");
+        return ReplServiceError::NOT_LEADER;
+    }
+    return ReplServiceError::OK;
 }

 void RaftReplDev::reset_quorum_size(uint32_t commit_quorum, uint64_t trace_id) {
@@ -1018,8 +1289,10 @@ void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) {
     RD_LOGD(rreq->traceID(), "Raft channel: Commit rreq=[{}]", rreq->to_compact_string());
     if (rreq->op_code() == journal_type_t::HS_CTRL_DESTROY) {
         leave();
-    } else if (rreq->op_code() == journal_type_t::HS_CTRL_REPLACE) {
-        replace_member(rreq);
+    } else if (rreq->op_code() == journal_type_t::HS_CTRL_START_REPLACE) {
+        start_replace_member(rreq);
+    } else if (rreq->op_code() == journal_type_t::HS_CTRL_COMPLETE_REPLACE) {
+        complete_replace_member(rreq);
     } else {
         m_listener->on_commit(rreq->lsn(), rreq->header(), rreq->key(), {rreq->local_blkid()}, rreq);
     }
@@ -1087,8 +1360,11 @@ void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err)
             });
         }
     } else if (rreq->op_code() == journal_type_t::HS_CTRL_DESTROY ||
-               rreq->op_code() == journal_type_t::HS_CTRL_REPLACE) {
-        if (rreq->is_proposer()) { m_destroy_promise.setValue(err); }
+               rreq->op_code() == journal_type_t::HS_CTRL_COMPLETE_REPLACE) {
+        if (rreq->is_proposer()) {
+            RD_LOGE(rreq->traceID(), "Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err);
+            m_destroy_promise.setValue(err);
+        }
     }

     // TODO: Validate if this is a correct assert or not.
Is it possible that the log is already flushed and we receive @@ -1103,13 +1379,22 @@ void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) rreq->clear(); } -void RaftReplDev::replace_member(repl_req_ptr_t rreq) { - auto members = r_cast< const replace_members_ctx* >(rreq->header().cbytes()); +void RaftReplDev::start_replace_member(repl_req_ptr_t rreq) { + auto members = r_cast< const start_replace_members_ctx* >(rreq->header().cbytes()); + + RD_LOGI(rreq->traceID(), "Raft repl start_replace_member commit member_out={} member_in={}", + boost::uuids::to_string(members->replica_out.id), boost::uuids::to_string(members->replica_in.id)); + + m_listener->on_start_replace_member(members->replica_out, members->replica_in, rreq->traceID()); +} + +void RaftReplDev::complete_replace_member(repl_req_ptr_t rreq) { + auto members = r_cast< const start_replace_members_ctx* >(rreq->header().cbytes()); - RD_LOGI(rreq->traceID(), "Raft repl replace_member commit member_out={} member_in={}", + RD_LOGI(rreq->traceID(), "Raft repl complete_replace_member commit member_out={} member_in={}", boost::uuids::to_string(members->replica_out.id), boost::uuids::to_string(members->replica_in.id)); - m_listener->on_replace_member(members->replica_out, members->replica_in); + m_listener->on_complete_replace_member(members->replica_out, members->replica_in, rreq->traceID()); } static bool blob_equals(sisl::blob const& a, sisl::blob const& b) { @@ -1174,12 +1459,14 @@ std::vector< peer_info > RaftReplDev::get_replication_status() const { std::vector< peer_info > pi; auto rep_status = m_repl_svc_ctx->get_raft_status(); for (auto const& pinfo : rep_status) { - pi.emplace_back(peer_info{.id_ = boost::lexical_cast< replica_id_t >(pinfo.id_), - .replication_idx_ = pinfo.last_log_idx_, - .last_succ_resp_us_ = pinfo.last_succ_resp_us_, - .priority_ = pinfo.priority_, - .is_learner_ = pinfo.is_learner_, - .is_new_joiner_ = pinfo.is_new_joiner_}); + auto peer = peer_info{.id_ = boost::lexical_cast< replica_id_t >(pinfo.id_), + .replication_idx_ = pinfo.last_log_idx_, + .last_succ_resp_us_ = pinfo.last_succ_resp_us_, + .priority_ = pinfo.priority_}; + peer.role_ = pinfo.is_learner_ ? 
PeerRole::LEARNER : PeerRole::FOLLOWER; + if (peer.id_ == get_leader_id()) { peer.role_ = PeerRole::LEADER; } + peer.is_self_ = (peer.id_ == m_my_repl_id); + pi.emplace_back(peer); } return pi; } diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 42d100ebb..6a790c017 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -36,7 +36,7 @@ using raft_cluster_config_ptr_t = nuraft::ptr< nuraft::cluster_config >; ENUM(repl_dev_stage_t, uint8_t, INIT, ACTIVE, DESTROYING, DESTROYED, PERMANENT_DESTROYED); -struct replace_members_ctx { +struct start_replace_members_ctx { replica_member_info replica_out; replica_member_info replica_in; }; @@ -224,8 +224,22 @@ class RaftReplDev : public ReplDev, bool bind_data_service(); bool join_group(); - AsyncReplResult<> replace_member(const replica_member_info& member_out, const replica_member_info& member_in, - uint32_t commit_quorum, uint64_t trace_id = 0); + AsyncReplResult<> start_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, + uint32_t commit_quorum = 0, uint64_t trace_id = 0); + AsyncReplResult<> complete_replace_member(const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum = 0, + uint64_t trace_id = 0); + AsyncReplResult<> flip_learner_flag(const replica_member_info& member, bool target, uint32_t commit_quorum, + bool wait_and_verify = true, uint64_t trace_id = 0); + ReplServiceError do_add_member(const replica_member_info& member, uint64_t trace_id = 0); + ReplServiceError do_remove_member(const replica_member_info& member, uint64_t trace_id = 0); + ReplServiceError do_flip_learner(const replica_member_info& member, bool target, bool wait_and_verify, + uint64_t trace_id = 0); + ReplServiceError set_priority(const replica_member_info& member, int32_t priority, uint64_t trace_id = 0); + nuraft::cmd_result_code retry_when_config_change(const std::function< nuraft::cmd_result_code() >& func, + uint64_t trace_id = 0); + bool wait_and_check(const std::function< bool() >& check_func, uint32_t timeout_ms, uint32_t interval_ms = 100); + folly::SemiFuture< ReplServiceError > destroy_group(); //////////////// All ReplDev overrides/implementation /////////////////////// @@ -419,7 +433,8 @@ class RaftReplDev : public ReplDev, void on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx); void set_log_store_last_durable_lsn(store_lsn_t lsn); void commit_blk(repl_req_ptr_t rreq); - void replace_member(repl_req_ptr_t rreq); + void start_replace_member(repl_req_ptr_t rreq); + void complete_replace_member(repl_req_ptr_t rreq); void reset_quorum_size(uint32_t commit_quorum, uint64_t trace_id); void create_snp_resync_data(raft_buf_ptr_t& data_out); bool save_snp_resync_data(nuraft::buffer& data, nuraft::snapshot& s); diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index a690c4bc0..25b0a5d8f 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -61,12 +62,8 @@ class SoloReplDev : public ReplDev { bool is_leader() const override { return true; } replica_id_t get_leader_id() const override { return m_group_id; } std::vector< peer_info > get_replication_status() const override { - return std::vector< peer_info >{peer_info{.id_ = m_group_id, - .replication_idx_ = 0, - .last_succ_resp_us_ = 
0, - .priority_ = 1, - .is_learner_ = false, - .is_new_joiner_ = false}}; + return std::vector< peer_info >{ + peer_info{.id_ = m_group_id, .replication_idx_ = 0, .last_succ_resp_us_ = 0, .priority_ = 1, .is_self_ = true}}; } bool is_ready_for_traffic() const override { return true; } void purge() override {} diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index aeaa35f02..d48801939 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -193,9 +193,20 @@ void SoloReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki } } -AsyncReplResult<> SoloReplService::replace_member(group_id_t group_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum, - uint64_t trace_id) const { +AsyncReplResult<> SoloReplService::start_replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum, + uint64_t trace_id) const { + return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); +} + +AsyncReplResult<> SoloReplService::complete_replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum, + uint64_t trace_id) const { + return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); +} + +AsyncReplResult<> SoloReplService::flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, + uint32_t commit_quorum, bool wait_and_verify, uint64_t trace_id) const { return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); } diff --git a/src/lib/replication/service/generic_repl_svc.h b/src/lib/replication/service/generic_repl_svc.h index ab2fd4bf4..3b3857d90 100644 --- a/src/lib/replication/service/generic_repl_svc.h +++ b/src/lib/replication/service/generic_repl_svc.h @@ -89,9 +89,15 @@ class SoloReplService : public GenericReplService { std::set< replica_id_t > const& members) override; folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) override; void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; - AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum = 0, - uint64_t trace_id = 0) const override; + AsyncReplResult<> start_replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum = 0, + uint64_t trace_id = 0) const override; + AsyncReplResult<> complete_replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum = 0, + uint64_t trace_id = 0) const override; + AsyncReplResult<> flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, + uint32_t commit_quorum, bool wait_and_verify = true, + uint64_t trace_id = 0) const override; }; class SoloReplServiceCPHandler : public CPCallbacks { diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index b1b4b9a89..ecd925dbb 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -123,7 +123,7 @@ void RaftReplService::start() { // new_joiner_type fully disabled log pack behavior. 
// There is no callback available for handling and localizing the log entries within the pack, which could // result in data corruption. - r_params.use_new_joiner_type_ = true; + r_params.use_new_joiner_type_ = false; r_params.use_bg_thread_for_snapshot_io_ = HS_DYNAMIC_CONFIG(consensus.use_bg_thread_for_snapshot_io); r_params.return_method_ = nuraft::raft_params::async_handler; m_msg_mgr->register_mgr_type(params.default_group_type_, r_params); @@ -476,9 +476,9 @@ void RaftReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki add_repl_dev(group_id, rdev); } -AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum, - uint64_t trace_id) const { +AsyncReplResult<> RaftReplService::start_replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum, + uint64_t trace_id) const { if (is_stopping()) return make_async_error<>(ReplServiceError::STOPPING); incr_pending_request_num(); auto rdev_result = get_repl_dev(group_id); @@ -488,7 +488,52 @@ AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, const rep } return std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value()) - ->replace_member(member_out, member_in, commit_quorum, trace_id) + ->start_replace_member(member_out, member_in, commit_quorum, trace_id) + .via(&folly::InlineExecutor::instance()) + .thenValue([this](auto&& e) mutable { + if (e.hasError()) { + decr_pending_request_num(); + return make_async_error<>(e.error()); + } + decr_pending_request_num(); + return make_async_success<>(); + }); +} + +AsyncReplResult<> RaftReplService::complete_replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum, + uint64_t trace_id) const { + if (is_stopping()) return make_async_error<>(ReplServiceError::STOPPING); + incr_pending_request_num(); + auto rdev_result = get_repl_dev(group_id); + if (!rdev_result) { + decr_pending_request_num(); + return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); + } + return std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value()) + ->complete_replace_member(member_out, member_in, commit_quorum, trace_id) + .via(&folly::InlineExecutor::instance()) + .thenValue([this](auto&& e) mutable { + if (e.hasError()) { + decr_pending_request_num(); + return make_async_error<>(e.error()); + } + decr_pending_request_num(); + return make_async_success<>(); + }); +} + +AsyncReplResult<> RaftReplService::flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, uint32_t commit_quorum, + bool wait_and_verify, uint64_t trace_id) const { + if (is_stopping()) return make_async_error<>(ReplServiceError::STOPPING); + incr_pending_request_num(); + auto rdev_result = get_repl_dev(group_id); + if (!rdev_result) { + decr_pending_request_num(); + return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); + } + return std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value()) + ->flip_learner_flag(member, target, commit_quorum, wait_and_verify, trace_id) .via(&folly::InlineExecutor::instance()) .thenValue([this](auto&& e) mutable { if (e.hasError()) { diff --git a/src/lib/replication/service/raft_repl_service.h b/src/lib/replication/service/raft_repl_service.h index 5f70efd0e..c739eeb9e 100644 --- a/src/lib/replication/service/raft_repl_service.h +++ 
b/src/lib/replication/service/raft_repl_service.h @@ -78,9 +78,16 @@ class RaftReplService : public GenericReplService, std::set< replica_id_t > const& members) override; folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) override; void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; - AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum = 0, - uint64_t trace_id = 0) const override; + AsyncReplResult<> start_replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum = 0, + uint64_t trace_id = 0) const override; + AsyncReplResult<> complete_replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum, + uint64_t trace_id = 0) const override; + + AsyncReplResult<> flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, + uint32_t commit_quorum, bool wait_and_verify = true, + uint64_t trace_id = 0) const override; private: RaftReplDev* raft_group_config_found(sisl::byte_view const& buf, void* meta_cookie); diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 0dbd539e3..2d4519b94 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -48,7 +48,7 @@ using namespace homestore; SISL_LOGGING_DEF(test_raft_repl_dev) -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS, nuraft_mesg) +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS, nuraft_mesg, nuraft) SISL_OPTION_GROUP(test_raft_repl_dev, (block_size, "", "block_size", "block size to io", @@ -344,8 +344,13 @@ class TestReplicatedDB : public homestore::ReplDevListener { } return blk_alloc_hints{}; } - void on_replace_member(const replica_member_info& member_out, const replica_member_info& member_in) override { - LOGINFO("[Replica={}] replace member out {} in {}", g_helper->replica_num(), + void on_start_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) override { + LOGINFO("[Replica={}] start replace member out {} in {}", g_helper->replica_num(), + boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); + } + + void on_complete_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) override { + LOGINFO("[Replica={}] complete replace member out {} in {}", g_helper->replica_num(), boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); } @@ -737,19 +742,39 @@ class RaftReplDevTestBase : public testing::Test { void create_snapshot() { dbs_[0]->create_snapshot(); } void truncate(int num_reserved_entries) { dbs_[0]->truncate(num_reserved_entries); } - void replace_member(std::shared_ptr< TestReplicatedDB > db, replica_id_t member_out, replica_id_t member_in, + void start_replace_member(std::shared_ptr< TestReplicatedDB > db, replica_id_t member_out, replica_id_t member_in, uint32_t commit_quorum = 0, ReplServiceError error = ReplServiceError::OK) { this->run_on_leader(db, [this, error, db, member_out, member_in, commit_quorum]() { - LOGINFO("Replace member out={} in={}", boost::uuids::to_string(member_out), + LOGINFO("Start replace member out={} in={}", boost::uuids::to_string(member_out), + boost::uuids::to_string(member_in)); + + replica_member_info out{member_out, ""}; + 
replica_member_info in{member_in, ""};
+            auto result = hs()->repl_service().start_replace_member(db->repl_dev()->group_id(), out, in, commit_quorum).get();
+            if (error == ReplServiceError::OK) {
+                ASSERT_EQ(result.hasError(), false) << "Error in replacing member, err=" << result.error();
+            } else {
+                ASSERT_EQ(result.hasError(), true) << "Expected replace member to fail with err=" << error;
+                ASSERT_EQ(result.error(), error);
+            }
+        });
+    }
+
+    void complete_replace_member(std::shared_ptr< TestReplicatedDB > db, replica_id_t member_out,
+                                 replica_id_t member_in, uint32_t commit_quorum = 0,
+                                 ReplServiceError error = ReplServiceError::OK) {
+        this->run_on_leader(db, [this, error, db, member_out, member_in, commit_quorum]() {
+            LOGINFO("Complete replace member out={} in={}", boost::uuids::to_string(member_out),
                     boost::uuids::to_string(member_in));

             replica_member_info out{member_out, ""};
             replica_member_info in{member_in, ""};
-            auto result = hs()->repl_service().replace_member(db->repl_dev()->group_id(), out, in, commit_quorum).get();
+            auto result =
+                hs()->repl_service().complete_replace_member(db->repl_dev()->group_id(), out, in, commit_quorum).get();
             if (error == ReplServiceError::OK) {
-                ASSERT_EQ(result.hasError(), false) << "Error in replacing member";
+                ASSERT_EQ(result.hasError(), false) << "Error in replacing member, err=" << result.error();
             } else {
-                ASSERT_EQ(result.hasError(), true) << "Error in replacing member";
+                ASSERT_EQ(result.hasError(), true) << "Expected replace member to fail with err=" << error;
                 ASSERT_EQ(result.error(), error);
             }
         });
diff --git a/src/tests/test_raft_repl_dev_dynamic.cpp b/src/tests/test_raft_repl_dev_dynamic.cpp
index 5a6095959..0897a5201 100644
--- a/src/tests/test_raft_repl_dev_dynamic.cpp
+++ b/src/tests/test_raft_repl_dev_dynamic.cpp
@@ -38,11 +38,11 @@ TEST_F(ReplDevDynamicTest, ReplaceMember) {
     g_helper->sync_for_test_start(num_members);

     if (g_helper->replica_num() < num_replicas) {
-        // With existing raft repl dev group, write IO's, validate and call replace_member on leader.
+        // With existing raft repl dev group, write IO's, validate and call start_replace_member on leader.
         LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num());
         this->write_on_leader(num_io_entries, true /* wait_for_commit */);

-        replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in));
+        start_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in));
         std::this_thread::sleep_for(std::chrono::seconds(3));
     } else if (g_helper->replica_num() == member_in) {
         LOGINFO("Wait for commits replica={}", g_helper->replica_num());
@@ -55,7 +55,15 @@ TEST_F(ReplDevDynamicTest, ReplaceMember) {
         // Skip the member which is going to be replaced. Validate data on all other replica's.
         LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num());
         this->validate_data();
-    } else if (g_helper->replica_num() == member_out) {
+    }
+
+    g_helper->sync_for_test_start(num_members);
+    LOGINFO("sync for completing replace member, replica={}", g_helper->replica_num());
+    complete_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in));
+
+    g_helper->sync_for_verify_start(num_members);
+    LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num());
+    if (g_helper->replica_num() == member_out) {
         // The out member will have the repl dev destroyed.
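        // The dev is only reaped after the replication service's gc observes the destroyed
        // state, which is why the loop below keeps driving RaftReplService::gc_repl_devs()
        // while polling is_destroyed().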
        auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev());
        while (repl_dev && !repl_dev->is_destroyed()) {
@@ -106,7 +114,7 @@ TEST_F(ReplDevDynamicTest, TwoMemberDown) {
         // Replace down replica 2 with spare replica 3 with commit quorum 1
         // so that leader can go ahead with replacing member.
         LOGINFO("Replace member started");
-        replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 1 /* commit quorum*/);
+        start_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 1 /* commit quorum */);
         this->write_on_leader(num_io_entries, true /* wait_for_commit */);
         LOGINFO("Leader completed num_io={}", num_io_entries);
     }
@@ -154,16 +162,18 @@ TEST_F(ReplDevDynamicTest, OneMemberDown) {
     g_helper->sync_for_test_start(num_members);

-    this->shutdown_replica(2);
-    LOGINFO("Shutdown replica 2");
-    std::this_thread::sleep_for(std::chrono::seconds(3));
     if (g_helper->replica_num() == 0) {
-        // With existing raft repl dev group, write IO's, validate and call replace_member on leader.
+        // With existing raft repl dev group, write IO's, validate and call start_replace_member on leader.
         LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num());
         this->write_on_leader(num_io_entries, true /* wait_for_commit */);
+    }
+    // Shut down before replace member
+    this->shutdown_replica(2);
+    LOGINFO("Shutdown replica 2");

-        replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in));
+    if (g_helper->replica_num() == 0) {
+        start_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in));
         std::this_thread::sleep_for(std::chrono::seconds(3));
     } else if (g_helper->replica_num() == member_in) {
         LOGINFO("Wait for commits replica={}", g_helper->replica_num());
@@ -178,14 +188,46 @@ TEST_F(ReplDevDynamicTest, OneMemberDown) {
         this->validate_data();
     }

-    g_helper->sync_for_cleanup_start(num_members);
+    // Shutdown after becoming learner
+    // this->shutdown_replica(2);
+    // LOGINFO("Shutdown replica 2");
+    // std::this_thread::sleep_for(std::chrono::seconds(2));
+
+    g_helper->sync_for_test_start(num_members);
+    LOGINFO("sync for completing replace member, replica={}", g_helper->replica_num());
+    this->run_on_leader(db, [this, db, member_out, member_in]() {
+        replica_member_info out{g_helper->replica_id(member_out), ""};
+        replica_member_info in{g_helper->replica_id(member_in), ""};
+        auto result = hs()->repl_service().complete_replace_member(db->repl_dev()->group_id(), out, in).get();
+        if (result.hasError()) {
+            ASSERT_EQ(result.error(), ReplServiceError::CANCELLED)
+                << "Unexpected error in replacing member, err=" << result.error();
+            LOGWARN("Error in completing replace member, err={}, will retry after 2s", result.error());
+            std::this_thread::sleep_for(std::chrono::seconds(2));
+            complete_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in));
+        }
+    });
+    LOGINFO("Replace member old leader done");
+
+    g_helper->sync_for_verify_start(num_members);
+    LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num());
     if (g_helper->replica_num() == 2) {
         LOGINFO("Start replica 2");
-        db->set_zombie();
         this->start_replica(2);
+        // The out member will have the repl dev destroyed.
+ auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); + while (repl_dev && !repl_dev->is_destroyed()) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + auto& raft_repl_svc = dynamic_cast< RaftReplService& >(hs()->repl_service()); + raft_repl_svc.gc_repl_devs(); + LOGINFO("Waiting for repl dev to get destroyed on out member replica={}", g_helper->replica_num()); + } + LOGINFO("Repl dev destroyed on out member replica={}", g_helper->replica_num()); + db->set_zombie(); } + g_helper->sync_for_cleanup_start(num_members); LOGINFO("OneMemberDown test done replica={}", g_helper->replica_num()); } @@ -209,18 +251,18 @@ TEST_F(ReplDevDynamicTest, LeaderReplace) { if (g_helper->replica_num() != member_in) { LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); - // With existing raft repl dev group, write IO's, validate and call replace_member on leader. + // With existing raft repl dev group, write IO's, validate and call start_replace_member on leader. this->write_on_leader(num_io_entries, true /* wait_for_commit */); // Leader will return error NOT_LEADER and yield leadership, sleep and connect again // to the new leader. LOGINFO("Replace old leader"); - replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 0, + start_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 0, ReplServiceError::NOT_LEADER); LOGINFO("Replace member leader yield done"); std::this_thread::sleep_for(std::chrono::seconds(3)); - replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + start_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); LOGINFO("Replace member old leader done"); } @@ -236,7 +278,24 @@ TEST_F(ReplDevDynamicTest, LeaderReplace) { this->validate_data(); } - if (g_helper->replica_num() == member_out) { db->set_zombie(); } + g_helper->sync_for_test_start(num_members); + LOGINFO("sync for completing replace member, replica={}", g_helper->replica_num()); + complete_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + + g_helper->sync_for_verify_start(num_members); + LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num()); + if (g_helper->replica_num() == member_out) { + // The out member will have the repl dev destroyed. + auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); + while (repl_dev && !repl_dev->is_destroyed()) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + auto& raft_repl_svc = dynamic_cast< RaftReplService& >(hs()->repl_service()); + raft_repl_svc.gc_repl_devs(); + LOGINFO("Waiting for repl dev to get destroyed on out member replica={}", g_helper->replica_num()); + } + LOGINFO("Repl dev destroyed on out member replica={}", g_helper->replica_num()); + db->set_zombie(); + } g_helper->sync_for_cleanup_start(num_members); LOGINFO("LeaderReplace test done replica={}", g_helper->replica_num()); @@ -264,11 +323,11 @@ TEST_F(ReplDevDynamicTest, OneMemberRestart) { } if (g_helper->replica_num() == 0) { - // With existing raft repl dev group, write IO's, validate and call replace_member on leader. + // With existing raft repl dev group, write IO's, validate and call start_replace_member on leader. 
LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); this->write_on_leader(num_io_entries, true /* wait_for_commit */); - replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + start_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); std::this_thread::sleep_for(std::chrono::seconds(3)); } else if (g_helper->replica_num() == member_in) { LOGINFO("Wait for commits replica={}", g_helper->replica_num()); @@ -283,6 +342,24 @@ TEST_F(ReplDevDynamicTest, OneMemberRestart) { this->validate_data(); } + g_helper->sync_for_test_start(num_members); + LOGINFO("sync for completing replace member, replica={}", g_helper->replica_num()); + complete_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + + g_helper->sync_for_verify_start(num_members); + LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num()); + if (g_helper->replica_num() == member_out) { + // The out member will have the repl dev destroyed. + auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); + while (repl_dev && !repl_dev->is_destroyed()) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + auto& raft_repl_svc = dynamic_cast< RaftReplService& >(hs()->repl_service()); + raft_repl_svc.gc_repl_devs(); + LOGINFO("Waiting for repl dev to get destroyed on out member replica={}", g_helper->replica_num()); + } + LOGINFO("Repl dev destroyed on out member replica={}", g_helper->replica_num()); + } + g_helper->sync_for_cleanup_start(num_members); LOGINFO("OneMemberRestart test done replica={}", g_helper->replica_num()); } diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index 4d271efcb..23a429722 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -131,7 +131,8 @@ class SoloReplDevTest : public testing::Test { cintrusive< repl_req_ctx >& ctx) override { LOGINFO("Received error={} on repl_dev", enum_name(error)); } - void on_replace_member(const replica_member_info& member_out, const replica_member_info& member_in) override {} + void on_start_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) override {} + void on_complete_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) override {} void on_destroy(const group_id_t& group_id) override {} void notify_committed_lsn(int64_t lsn) override {} void on_config_rollback(int64_t lsn) override {} From 6904f1d0721a7063581e7f52eb7a19034e18f8af Mon Sep 17 00:00:00 2001 From: yuwmao Date: Wed, 28 May 2025 11:14:45 +0800 Subject: [PATCH 127/170] Add a reaper thread to check and complete replace member --- .../homestore/replication/repl_decls.h | 15 +- src/include/homestore/replication_service.hpp | 12 +- src/lib/common/homestore_config.fbs | 3 + src/lib/replication/repl_dev/common.cpp | 2 +- src/lib/replication/repl_dev/common.h | 9 +- .../replication/repl_dev/raft_repl_dev.cpp | 197 ++++++++++++++---- src/lib/replication/repl_dev/raft_repl_dev.h | 16 +- src/lib/replication/repl_dev/solo_repl_dev.h | 2 +- .../replication/service/generic_repl_svc.cpp | 8 +- .../replication/service/generic_repl_svc.h | 5 +- .../replication/service/raft_repl_service.cpp | 50 +++-- .../replication/service/raft_repl_service.h | 7 +- src/tests/CMakeLists.txt | 2 +- src/tests/test_common/hs_repl_test_common.hpp | 3 +- src/tests/test_common/raft_repl_test_base.hpp | 28 +-- 
 src/tests/test_raft_repl_dev_dynamic.cpp | 134 +++++++-----
 16 files changed, 308 insertions(+), 185 deletions(-)

diff --git a/src/include/homestore/replication/repl_decls.h b/src/include/homestore/replication/repl_decls.h
index 7bbb87f40..6094d0ada 100644
--- a/src/include/homestore/replication/repl_decls.h
+++ b/src/include/homestore/replication/repl_decls.h
@@ -36,14 +36,9 @@ VENUM(ReplServiceError, int32_t,
     NO_SPACE_LEFT = -20000,
     DRIVE_WRITE_ERROR = -20001,
     DATA_DUPLICATED = -20002,
-    QUIENCE_STATE = -20003,
+    QUIENCE_STATE = -20003,
+    QUORUM_NOT_MET = -20004,
     FAILED = -32768);
-
-VENUM(PeerRole, uint8_t,
-    UNKNOWN = 0,
-    LEADER = 1,
-    FOLLOWER = 2,
-    LEARNER = 3);
 // clang-format on

 template < typename V, typename E >
@@ -87,10 +82,8 @@ struct peer_info {
     uint64_t last_succ_resp_us_ = 0;
     // The priority for leader election
     uint32_t priority_ = 0;
-    // The peer role in replication group
-    PeerRole role_ = PeerRole::UNKNOWN;
-    // If this peer is myself
-    bool is_self_ = false;
+    // Whether the peer can vote. If a peer is a learner, this will be false. This hides the raft details.
+    bool can_vote = true;
 };

 struct replica_member_info {
diff --git a/src/include/homestore/replication_service.hpp b/src/include/homestore/replication_service.hpp
index 56154226b..f28704546 100644
--- a/src/include/homestore/replication_service.hpp
+++ b/src/include/homestore/replication_service.hpp
@@ -41,14 +41,16 @@ class ReplicationService {
     /// @return A Future which gets called after schedule to release (before garbage collection is kicked in)
     virtual folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) = 0;

-    virtual AsyncReplResult<> start_replace_member(group_id_t group_id, const replica_member_info& member_out,
+    /// @brief Replace one of the members with a new one.
+    /// @param group_id Group where the replace member happens
+    /// @param member_out The member which is going to be replaced
+    /// @param member_in The member which is going to be added in place of member_out
+    /// @param commit_quorum Commit quorum to be used for this operation. If 0, it will use the default commit quorum.
+    /// @return A Future that completes once the replace member request is accepted, or a ReplServiceError upon failure
+    virtual AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out,
                                              const replica_member_info& member_in, uint32_t commit_quorum = 0,
                                              uint64_t trace_id = 0) const = 0;

-    virtual AsyncReplResult<> complete_replace_member(group_id_t group_id, const replica_member_info& member_out,
-                                                      const replica_member_info& member_in, uint32_t commit_quorum = 0,
-                                                      uint64_t trace_id = 0) const = 0;
-
     virtual AsyncReplResult<> flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target,
                                                 uint32_t commit_quorum, bool wait_and_verify = true,
                                                 uint64_t trace_id = 0) const = 0;

     /// @brief Get the repl dev for a given group id if it is already created or opened
diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs
index 053022c0c..b012a8bed 100644
--- a/src/lib/common/homestore_config.fbs
+++ b/src/lib/common/homestore_config.fbs
@@ -297,6 +297,9 @@ table Consensus {
     // The time to wait for config change to be applied in ms
     wait_for_config_change_ms: uint32 = 500;
+
+    // The interval in ms to check if the new member in replace_member is fully synced and ready to take over
+    replace_member_sync_check_interval_ms: uint64 = 60000;
 }

 table HomeStoreSettings {
diff --git a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp
index 3b44600ca..6b8ce122b 100644
--- a/src/lib/replication/repl_dev/common.cpp
+++ b/src/lib/replication/repl_dev/common.cpp
@@ -266,7 +266,7 @@ std::string repl_req_ctx::to_string() const {
 }

 std::string repl_req_ctx::to_compact_string() const {
-    if (m_op_code == journal_type_t::HS_CTRL_DESTROY || m_op_code == journal_type_t::HS_CTRL_START_REPLACE || m_op_code == journal_type_t::HS_CTRL_START_REPLACE) {
+    if (m_op_code == journal_type_t::HS_CTRL_DESTROY || m_op_code == journal_type_t::HS_CTRL_START_REPLACE || m_op_code == journal_type_t::HS_CTRL_COMPLETE_REPLACE) {
         return fmt::format("term={} lsn={} op={}", m_rkey.term, m_lsn, enum_name(m_op_code));
     }

diff --git a/src/lib/replication/repl_dev/common.h b/src/lib/replication/repl_dev/common.h
index 43bbb7cbf..c3433083f 100644
--- a/src/lib/replication/repl_dev/common.h
+++ b/src/lib/replication/repl_dev/common.h
@@ -15,7 +15,7 @@
 #pragma once

 #include
-
+#include <random>
 #include
 #include
 #include
@@ -95,4 +95,11 @@
 auto make_async_success() {
     return folly::makeSemiFuture< ReplResult< folly::Unit > >(folly::Unit{});
 }

+inline uint64_t generateRandomTraceId() {
+    std::random_device rd;
+    std::mt19937_64 gen(rd());
+    std::uniform_int_distribution< uint64_t > dis;
+    return dis(gen);
+}
+
 } // namespace homestore
diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp
index f77f8f513..795a2db55 100644
--- a/src/lib/replication/repl_dev/raft_repl_dev.cpp
+++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp
@@ -137,6 +137,7 @@ bool RaftReplDev::join_group() {
     return true;
 }

+// All the steps in the implementation should be idempotent and retryable.
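Because each step is individually idempotent, a caller can drive the whole flow with a plain
retry loop. A minimal caller-side sketch, assuming the ReplicationService API declared above
(the helper name and retry policy are illustrative, not part of the patch):

    #include <chrono>
    #include <thread>

    #include <homestore/replication_service.hpp>

    using namespace homestore;

    // Keep re-issuing replace_member until it sticks. NOT_LEADER is retryable
    // because the leader yields on purpose when it is itself the member being
    // moved out; RETRY_REQUEST covers the transient nuraft failures.
    ReplServiceError replace_member_with_retries(ReplicationService& svc, group_id_t gid,
                                                 const replica_member_info& out,
                                                 const replica_member_info& in,
                                                 uint32_t max_attempts = 5) {
        for (uint32_t attempt = 0; attempt < max_attempts; ++attempt) {
            auto result = svc.replace_member(gid, out, in).get();
            if (!result.hasError()) { return ReplServiceError::OK; }
            auto err = result.error();
            if (err != ReplServiceError::RETRY_REQUEST && err != ReplServiceError::NOT_LEADER) {
                return err; // non-retryable, e.g. SERVER_NOT_FOUND or QUORUM_NOT_MET
            }
            std::this_thread::sleep_for(std::chrono::seconds(1));
        }
        return ReplServiceError::TIMEOUT;
    }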
 AsyncReplResult<> RaftReplDev::start_replace_member(const replica_member_info& member_out,
                                                     const replica_member_info& member_in, uint32_t commit_quorum,
                                                     uint64_t trace_id) {
@@ -157,31 +158,59 @@ AsyncReplResult<> RaftReplDev::start_replace_member(const replica_member_info& m
     // Step1, validate request
     auto out_srv_cfg = raft_server()->get_config()->get_server(nuraft_mesg::to_server_id(member_out.id));
     if (!out_srv_cfg) {
+        auto in_srv_cfg = raft_server()->get_config()->get_server(nuraft_mesg::to_server_id(member_in.id));
+        if (in_srv_cfg) {
+            RD_LOGI(
+                trace_id,
+                "Step1. Replace member, the intent has already been fulfilled, ignore it, member_out={} member_in={}",
+                boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id));
+            reset_quorum_size(0, trace_id);
+            decr_pending_request_num();
+            return make_async_success<>();
+        }
         RD_LOGE(trace_id, "Step1. Replace member invalid parameter, out member is not found");
         reset_quorum_size(0, trace_id);
         decr_pending_request_num();
         return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND);
     }
+    if (m_my_repl_id != get_leader_id()) {
+        reset_quorum_size(0, trace_id);
+        decr_pending_request_num();
+        return make_async_error<>(ReplServiceError::NOT_LEADER);
+    }
     // Check if leader itself is requested to move out.
-    if (m_my_repl_id == member_out.id && m_my_repl_id == get_leader_id()) {
-        // If leader is the member requested to move out, then set priority to 0(or it will be elected as leader again)
-        // and give up leadership and return error. Client will retry start_replace_member request to the new leader.
-        RD_LOGI(trace_id, "Step1. Replace member, leader is the member_out, member_out={}",
-                boost::uuids::to_string(member_out.id));
-        if (out_srv_cfg->get_priority() != 0) {
-            auto ret = set_priority(member_out, 0, trace_id);
-            if (ret != ReplServiceError::OK) {
-                // Actually this is the expected path, because nuraft will BROADCAST error if we are trying to set
-                // leader's priority=0
-                RD_LOGE(trace_id, "Step1. Replace member, set leader's priority to 0, failed {}", ret);
-            }
-        }
-        raft_server()->yield_leadership(true /* immediate */, -1 /* successor */);
+    if (m_my_repl_id == member_out.id) {
+        // immediate=false successor=-1, nuraft will choose an alive peer with highest priority as successor, and wait
+        // until the successor finishes the catch-up of the latest log, and then resign. Return NOT_LEADER and let
+        // client retry.
+        raft_server()->yield_leadership(false /* immediate */, -1 /* successor */);
         RD_LOGI(trace_id, "Step1. Replace member, leader is the member_out so yield leadership");
         reset_quorum_size(0, trace_id);
         decr_pending_request_num();
         return make_async_error<>(ReplServiceError::NOT_LEADER);
     }
+    // Quorum safety check. TODO: currently this only considers lsn; it also needs to check last response time.
+    auto active_peers = get_active_peers();
+    // active_peers doesn't include leader itself.
+    auto quorum = active_peers.size() + 1;
+    for (const auto& p : active_peers) {
+        quorum = p == member_out.id ? quorum - 1 : quorum;
+        quorum = p == member_in.id ? quorum - 1 : quorum;
+    }
+    RD_LOGD(trace_id,
+            "Step1. Replace member, quorum safety check, active_peers={}, active_peers_exclude_out/in_member={}, "
+            "commit_quorum={}",
+            active_peers.size(), quorum, commit_quorum);
+    // commit_quorum=0 means actual commit quorum is the majority. In this case, active normal member count should be
+    // greater than 1. To be more specific, if we have S1(leader), S2, S3(out), S4(in), we don't allow
+    // replace_member(S3, S4) if S2 is down or laggy. Recover S2 first or retry with commit_quorum=1 (see the worked
+    // sketch below).
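The check is easy to verify by hand; a self-contained sketch of the same arithmetic, purely
illustrative, with replica ids reduced to plain ints:

    #include <cassert>
    #include <set>

    // Mirrors the quorum safety arithmetic above: active_peers excludes the
    // leader, so the leader is counted back in before the out/in members are
    // discounted.
    int effective_quorum(const std::set< int >& active_peers, int member_out, int member_in) {
        int quorum = static_cast< int >(active_peers.size()) + 1; // +1 for the leader itself
        for (int p : active_peers) {
            if (p == member_out) { --quorum; }
            if (p == member_in) { --quorum; }
        }
        return quorum;
    }

    int main() {
        // S1(leader), S2, S3(out), S4(in): with S2 down or laggy, active_peers={S3,S4}
        // and the effective quorum drops to 1, so the request is rejected with
        // QUORUM_NOT_MET unless the caller passes commit_quorum=1.
        assert(effective_quorum({3, 4}, 3 /* out */, 4 /* in */) == 1);
        // With S2 healthy, active_peers={S2,S3,S4} and the effective quorum is 2.
        assert(effective_quorum({2, 3, 4}, 3, 4) == 2);
        return 0;
    }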
+    if (quorum <= 1 && commit_quorum == 0) {
+        RD_LOGE(trace_id,
+                "Step1. Replace member, quorum safety check failed, active_peers={}, "
+                "active_peers_exclude_out/in_member={}, commit_quorum={}",
+                active_peers.size(), quorum, commit_quorum);
+        reset_quorum_size(0, trace_id);
+        decr_pending_request_num();
+        return make_async_error<>(ReplServiceError::QUORUM_NOT_MET);
+    }

     // Step 2: Handle out member.
 #ifdef _PRERELEASE
@@ -190,25 +219,25 @@ AsyncReplResult<> RaftReplDev::start_replace_member(const replica_member_info& m
         return make_async_error(ReplServiceError::FAILED);
     }
 #endif
-    RD_LOGI(trace_id, "Step2. Replace member flip member to learner");
+    RD_LOGI(trace_id, "Step2. Replace member, flip out member to learner");
     auto learner_ret = do_flip_learner(member_out, true, true, trace_id);
     if (learner_ret != ReplServiceError::OK) {
-        RD_LOGE(trace_id, "Step2. Replace member set learner failed {}", learner_ret);
+        RD_LOGE(trace_id, "Step2. Replace member, failed to flip out member to learner {}", learner_ret);
         reset_quorum_size(0, trace_id);
         decr_pending_request_num();
         return make_async_error(std::move(learner_ret));
     }
-    RD_LOGI(trace_id, "Step2. Replace member flip out member to learner and set priority to 0");
+    RD_LOGI(trace_id, "Step2. Replace member, flip out member to learner and set priority to 0");

     // Step 3. Append log entry to mark the old member is out and new member is added.
-    RD_LOGI(trace_id, "Step3. Replace member propose to raft for HS_CTRL_START_REPLACE req, group_id={}",
+    RD_LOGI(trace_id, "Step3. Replace member, propose to raft for HS_CTRL_START_REPLACE req, group_id={}",
             group_id_str());
     auto rreq = repl_req_ptr_t(new repl_req_ctx{});
-    start_replace_members_ctx members;
+    replace_member_ctx members;
     members.replica_out = member_out;
     members.replica_in = member_in;

-    sisl::blob header(r_cast< uint8_t* >(&members), sizeof(start_replace_members_ctx));
+    sisl::blob header(r_cast< uint8_t* >(&members), sizeof(replace_member_ctx));
     rreq->init(repl_key{.server_id = server_id(),
                         .term = raft_server()->get_term(),
                         .dsn = m_next_dsn.fetch_add(1),
@@ -217,7 +246,7 @@ AsyncReplResult<> RaftReplDev::start_replace_member(const replica_member_info& m
     auto err = m_state_machine->propose_to_raft(std::move(rreq));
     if (err != ReplServiceError::OK) {
-        RD_LOGE(trace_id, "Step3. Replace member propose to raft for HS_CTRL_START_REPLACE req failed {}", err);
+        RD_LOGE(trace_id, "Step3. Replace member, propose to raft for HS_CTRL_START_REPLACE req failed {}", err);
         reset_quorum_size(0, trace_id);
         decr_pending_request_num();
         return make_async_error<>(std::move(err));
@@ -230,7 +259,7 @@ AsyncReplResult<> RaftReplDev::start_replace_member(const replica_member_info& m
         return make_async_error(ReplServiceError::FAILED);
     }
 #endif
-    RD_LOGI(trace_id, "Step4. Replace member propose to raft to add new member, group_id={}", group_id_str());
+    RD_LOGI(trace_id, "Step4. Replace member, propose to raft to add new member, group_id={}", group_id_str());
     auto ret = do_add_member(member_in, trace_id);
     if (ret != ReplServiceError::OK) {
         RD_LOGE(trace_id, "Step4. Replace member, add member failed {}", ret);
@@ -238,8 +267,7 @@ AsyncReplResult<> RaftReplDev::start_replace_member(const replica_member_info& m
         decr_pending_request_num();
         return make_async_error<>(std::move(ret));
     }
-    RD_LOGI(trace_id, "Step4.
Proposed to raft to add member, member={}", boost::uuids::to_string(member_in.id));
-
+    RD_LOGI(trace_id, "Step4. Replace member, proposed to raft to add member, member={}",
+            boost::uuids::to_string(member_in.id));
     reset_quorum_size(0, trace_id);
     decr_pending_request_num();
     return make_async_success<>();
@@ -307,11 +335,11 @@ AsyncReplResult<> RaftReplDev::complete_replace_member(const replica_member_info
     RD_LOGI(trace_id, "Step6. Replace member, propose to raft for HS_CTRL_COMPLETE_REPLACE req, group_id={}",
             group_id_str());
     auto rreq = repl_req_ptr_t(new repl_req_ctx{});
-    start_replace_members_ctx members;
+    replace_member_ctx members;
     members.replica_out = member_out;
     members.replica_in = member_in;

-    sisl::blob header(r_cast< uint8_t* >(&members), sizeof(start_replace_members_ctx));
+    sisl::blob header(r_cast< uint8_t* >(&members), sizeof(replace_member_ctx));
     rreq->init(repl_key{.server_id = server_id(),
                         .term = raft_server()->get_term(),
                         .dsn = m_next_dsn.fetch_add(1),
@@ -339,7 +367,7 @@ ReplServiceError RaftReplDev::do_add_member(const replica_member_info& member, u
         RD_LOGI(trace_id, "Add member failed, not leader");
         return ReplServiceError::BAD_REQUEST;
     }
-    auto ret = retry_when_config_change(
+    auto ret = retry_when_config_changing(
         [&] {
             auto add_ret = m_msg_mgr.add_member(m_group_id, member.id)
                                .via(&folly::InlineExecutor::instance())
                                .thenValue([this, member, trace_id](auto&& e) -> nuraft::cmd_result_code {
@@ -366,12 +394,12 @@ ReplServiceError RaftReplDev::do_remove_member(const replica_member_info& member
     // The member should not be the leader.
     if (m_my_repl_id == member.id && m_my_repl_id == get_leader_id()) {
         // If leader is the member requested to move out, then give up leadership and return error.
-        // Client will retry start_replace_member request to the new leader.
-        raft_server()->yield_leadership(true /* immediate */, -1 /* successor */);
+        // Client will retry replace_member request to the new leader.
+        raft_server()->yield_leadership(false /* immediate */, -1 /* successor */);
         RD_LOGI(trace_id, "Member to remove is the leader so yield leadership");
         return ReplServiceError::NOT_LEADER;
     }
-    auto ret = retry_when_config_change(
+    auto ret = retry_when_config_changing(
         [&] {
             auto rem_ret = m_msg_mgr.rem_member(m_group_id, member.id)
                                .via(&folly::InlineExecutor::instance())
                                .thenValue([this, member, trace_id](auto&& e) -> nuraft::cmd_result_code {
@@ -427,6 +455,9 @@ ReplServiceError RaftReplDev::do_flip_learner(const replica_member_info& member,
     if (!target && member.priority == 0) {
+        // If the intent is to take the learner back to a normal member, then its priority should not be 0 (a member
+        // with priority 0 never has a chance to become leader). The client needs to track the peers' priorities and
+        // give a meaningful value; the current default priorities of the quorum are: leader=100, follower=66.
         RD_LOGI(trace_id, "clear learner flag failed, priority is 0, member={}", boost::uuids::to_string(member.id));
         return ReplServiceError::BAD_REQUEST;
     }
@@ -439,7 +470,7 @@ ReplServiceError RaftReplDev::do_flip_learner(const replica_member_info& member,
         return ReplServiceError::NOT_LEADER;
     }
     if (srv_cfg->is_learner() != target) {
-        auto ret = retry_when_config_change(
+        auto ret = retry_when_config_changing(
             [&] {
                 auto learner_ret = raft_server()->flip_learner_flag(nuraft_mesg::to_server_id(member.id), target);
                 return learner_ret->get_result_code();
             },
             trace_id);
@@ -457,10 +488,11 @@ ReplServiceError RaftReplDev::do_flip_learner(const replica_member_info& member,
     // 3. Set priority
     // Based on the current nuraft implementation, learner could be elected as leader, so we set priority to 0 to avoid
     // it.
And in turn, we need to revert the priority change if the member is going to become a normal member.
+    // FIXME after nuraft fixes the bug, we can remove this logic.
     auto priority = target ? 0 : member.priority;
     RD_LOGI(trace_id, "Set the priority of the member to {}, member={}", priority, boost::uuids::to_string(member.id));
     if (srv_cfg->get_priority() != priority) {
-        auto priority_ret = set_priority(member, priority);
+        auto priority_ret = set_priority(member.id, priority);
         if (priority_ret != ReplServiceError::OK) { return ReplServiceError::NOT_LEADER; }
     } else {
         RD_LOGD(trace_id, "Priority has already been set to {}, skip, member={}", priority,
                 boost::uuids::to_string(member.id));
     }
@@ -485,7 +517,7 @@ ReplServiceError RaftReplDev::do_flip_learner(const replica_member_info& member,
     return ReplServiceError::OK;
 }

-nuraft::cmd_result_code RaftReplDev::retry_when_config_change(const std::function< nuraft::cmd_result_code() >& func,
+nuraft::cmd_result_code RaftReplDev::retry_when_config_changing(const std::function< nuraft::cmd_result_code() >& func,
                                                               uint64_t trace_id) {
     auto ret = nuraft::cmd_result_code::OK;
     int32_t retries = HS_DYNAMIC_CONFIG(consensus.config_changing_error_retries);
@@ -511,8 +543,8 @@
     return false;
 }

-ReplServiceError RaftReplDev::set_priority(const replica_member_info& member_out, int32_t priority, uint64_t trace_id) {
-    auto priority_ret = raft_server()->set_priority(nuraft_mesg::to_server_id(member_out.id), priority);
+ReplServiceError RaftReplDev::set_priority(const replica_id_t& member, int32_t priority, uint64_t trace_id) {
+    auto priority_ret = raft_server()->set_priority(nuraft_mesg::to_server_id(member), priority);
     // Set_priority should be handled by leader, but if the intent is to set the leader's priority to 0, it returns
     // BROADCAST. In this case, return NOT_LEADER to let the client retry on the new leader.
     // If there is an uncommitted config, nuraft set_priority will honor this uncommitted config and generate new
@@ -1380,21 +1412,32 @@ void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err)
 }

 void RaftReplDev::start_replace_member(repl_req_ptr_t rreq) {
-    auto members = r_cast< const start_replace_members_ctx* >(rreq->header().cbytes());
+    auto members = r_cast< const replace_member_ctx* >(rreq->header().cbytes());

     RD_LOGI(rreq->traceID(), "Raft repl start_replace_member commit member_out={} member_in={}",
             boost::uuids::to_string(members->replica_out.id), boost::uuids::to_string(members->replica_in.id));

     m_listener->on_start_replace_member(members->replica_out, members->replica_in, rreq->traceID());
+    // record the replace_member intent
+    std::unique_lock lg{m_sb_mtx};
+    m_rd_sb->replace_member_ctx.replica_in = members->replica_in.id;
+    m_rd_sb->replace_member_ctx.replica_out = members->replica_out.id;
+    m_rd_sb.write();
 }

 void RaftReplDev::complete_replace_member(repl_req_ptr_t rreq) {
-    auto members = r_cast< const start_replace_members_ctx* >(rreq->header().cbytes());
+    auto members = r_cast< const replace_member_ctx* >(rreq->header().cbytes());

     RD_LOGI(rreq->traceID(), "Raft repl complete_replace_member commit member_out={} member_in={}",
             boost::uuids::to_string(members->replica_out.id), boost::uuids::to_string(members->replica_in.id));

     m_listener->on_complete_replace_member(members->replica_out, members->replica_in, rreq->traceID());
+
+    // clear the replace_member intent
+    std::unique_lock lg{m_sb_mtx};
+    m_rd_sb->replace_member_ctx = replace_member_ctx_superblk{};
+    m_rd_sb.write();
+    RD_LOGI(rreq->traceID(), "Raft repl replace_member_ctx has been cleared.");
 }

 static bool blob_equals(sisl::blob const& a, sisl::blob const& b) {
@@ -1459,14 +1502,13 @@ std::vector< peer_info > RaftReplDev::get_replication_status() const {
     std::vector< peer_info > pi;
     auto rep_status = m_repl_svc_ctx->get_raft_status();
     for (auto const& pinfo : rep_status) {
-        auto peer = peer_info{.id_ = boost::lexical_cast< replica_id_t >(pinfo.id_),
-                              .replication_idx_ = pinfo.last_log_idx_,
-                              .last_succ_resp_us_ = pinfo.last_succ_resp_us_,
-                              .priority_ = pinfo.priority_};
-        peer.role_ = pinfo.is_learner_ ? PeerRole::LEARNER : PeerRole::FOLLOWER;
-        if (peer.id_ == get_leader_id()) { peer.role_ = PeerRole::LEADER; }
-        peer.is_self_ = (peer.id_ == m_my_repl_id);
-        pi.emplace_back(peer);
+        pi.emplace_back(peer_info{.id_ = boost::lexical_cast< replica_id_t >(pinfo.id_),
+                                  .replication_idx_ = pinfo.last_log_idx_,
+                                  .last_succ_resp_us_ = pinfo.last_succ_resp_us_,
+                                  .priority_ = pinfo.priority_,
+                                  .can_vote = !pinfo.is_learner_});
     }
     return pi;
 }
@@ -1475,6 +1517,7 @@ std::set< replica_id_t > RaftReplDev::get_active_peers() const {
     auto repl_status = get_replication_status();
     std::set< replica_id_t > res;
     auto my_committed_idx = m_commit_upto_lsn.load();
+    auto laggy = HS_DYNAMIC_CONFIG(consensus.laggy_threshold);
     uint64_t least_active_repl_idx = my_committed_idx > HS_DYNAMIC_CONFIG(consensus.laggy_threshold) ?
my_committed_idx - HS_DYNAMIC_CONFIG(consensus.laggy_threshold) : 0; @@ -1486,6 +1529,10 @@ std::set< replica_id_t > RaftReplDev::get_active_peers() const { if (p.id_ == m_my_repl_id) { continue; } if (p.replication_idx_ >= least_active_repl_idx) { res.insert(p.id_); + RD_LOGW(NO_TRACE_ID, + "Found active peer {}, lag {}, my lsn {}, peer lsn {}, least_active_repl_idx {}, laggy={}", p.id_, + my_committed_idx - p.replication_idx_, my_committed_idx, p.replication_idx_, least_active_repl_idx, + laggy); } else { RD_LOGW(NO_TRACE_ID, "Excluding peer {} from active_peers, lag {}, my lsn {}, peer lsn {}, least_active_repl_idx {}", @@ -1776,6 +1823,66 @@ void RaftReplDev::flush_durable_commit_lsn() { m_rd_sb.write(); } +void RaftReplDev::check_replace_member_status() { + if (is_destroyed()) { + RD_LOGI(NO_TRACE_ID, "Raft repl dev is destroyed, ignore check replace member status"); + return; + } + if (!m_repl_svc_ctx || !is_leader()) { return; } + if (m_rd_sb->replace_member_ctx.replica_in == boost::uuids::nil_uuid() || + m_rd_sb->replace_member_ctx.replica_out == boost::uuids::nil_uuid()) { + RD_LOGT(NO_TRACE_ID, "No replace member in progress, return"); + return; + } + + auto peers = get_replication_status(); + repl_lsn_t in_lsn = 0; + repl_lsn_t out_lsn = 0; + repl_lsn_t laggy = HS_DYNAMIC_CONFIG(consensus.laggy_threshold); + + for (auto& peer : peers) { + if (peer.id_ == m_rd_sb->replace_member_ctx.replica_out) { + out_lsn = peer.replication_idx_; + RD_LOGD(NO_TRACE_ID, "Replica out {} with lsn {}", boost::uuids::to_string(peer.id_), out_lsn); + } else if (peer.id_ == m_rd_sb->replace_member_ctx.replica_in) { + in_lsn = peer.replication_idx_; + RD_LOGD(NO_TRACE_ID, "Replica in {} with lsn {}", boost::uuids::to_string(peer.id_), in_lsn); + } + } + // TODO optimize the condition + bool catch_up = in_lsn + laggy >= out_lsn; + + if (!catch_up) { + RD_LOGD(NO_TRACE_ID, "Checking replace member status, replica_in={} with lsn={}, replica_out={} with lsn={}", + boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_in), in_lsn, + boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_out), out_lsn); + return; + } + + RD_LOGD( + NO_TRACE_ID, + "Checking replace member status, new member has caught up, replica_in={} with lsn={}, replica_out={} with lsn={}", + boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_in), in_lsn, + boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_out), out_lsn); + + trace_id_t trace_id = generateRandomTraceId(); + + RD_LOGD(trace_id, "Trigger complete_replace_member, replica_in={}, replica_out={}", + boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_in), + boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_out)); + + replica_member_info out{m_rd_sb->replace_member_ctx.replica_in, ""}; + replica_member_info in{m_rd_sb->replace_member_ctx.replica_out, ""}; + auto ret = complete_replace_member(out, in, 0, trace_id).get(); + if (ret.hasError()) { + RD_LOGE(trace_id, "Failed to complete replace member, next time will retry it, error={}", ret.error()); + return; + } + RD_LOGI(trace_id, "Complete replace member, next time will retry it, replica_in={}, replica_out={}", + boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_in), + boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_out)) +} + /////////////////////////////////// Private metohds //////////////////////////////////// void RaftReplDev::cp_flush(CP* cp, cshared< ReplDevCPContext > ctx) { if (is_destroyed()) { diff --git 
a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h
index 6a790c017..abede36bf 100644
--- a/src/lib/replication/repl_dev/raft_repl_dev.h
+++ b/src/lib/replication/repl_dev/raft_repl_dev.h
@@ -15,6 +15,10 @@
 #include "replication/log_store/repl_log_store.h"

 namespace homestore {
+struct replace_member_ctx_superblk {
+ replica_id_t replica_out;
+ replica_id_t replica_in;
+};

 #pragma pack(1)
 struct raft_repl_dev_superblk : public repl_dev_superblk {
@@ -26,6 +30,7 @@ struct raft_repl_dev_superblk : public repl_dev_superblk {
 uint64_t last_applied_dsn; // Last applied data sequence number
 uint8_t destroy_pending; // Flag to indicate whether the group is in destroy pending state
 repl_lsn_t last_snapshot_lsn; // Last snapshot LSN follower received from leader
+ replace_member_ctx_superblk replace_member_ctx; // Replace member context, used to track the replace member status

 uint32_t get_raft_sb_version() const { return raft_sb_version; }
 };
@@ -36,7 +41,7 @@ using raft_cluster_config_ptr_t = nuraft::ptr< nuraft::cluster_config >;

 ENUM(repl_dev_stage_t, uint8_t, INIT, ACTIVE, DESTROYING, DESTROYED, PERMANENT_DESTROYED);

-struct start_replace_members_ctx {
+struct replace_member_ctx {
 replica_member_info replica_out;
 replica_member_info replica_in;
 };
@@ -235,8 +240,8 @@ class RaftReplDev : public ReplDev,
 ReplServiceError do_remove_member(const replica_member_info& member, uint64_t trace_id = 0);
 ReplServiceError do_flip_learner(const replica_member_info& member, bool target, bool wait_and_verify,
 uint64_t trace_id = 0);
- ReplServiceError set_priority(const replica_member_info& member, int32_t priority, uint64_t trace_id = 0);
- nuraft::cmd_result_code retry_when_config_change(const std::function< nuraft::cmd_result_code() >& func,
+ ReplServiceError set_priority(const replica_id_t& member, int32_t priority, uint64_t trace_id = 0);
+ nuraft::cmd_result_code retry_when_config_changing(const std::function< nuraft::cmd_result_code() >& func,
 uint64_t trace_id = 0);
 bool wait_and_check(const std::function< bool() >& check_func, uint32_t timeout_ms, uint32_t interval_ms = 100);

@@ -366,6 +371,11 @@ class RaftReplDev : public ReplDev,
 */
 void flush_durable_commit_lsn();

+ /**
+ * Check the replace_member status; if the new member is fully synced up and ready to take over, remove the old member.
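+ *
+ * Editor's illustration of the catch-up condition as implemented in this patch:
+ *   caught_up = (in_lsn + consensus.laggy_threshold >= out_lsn)
+ * e.g. with laggy_threshold = 100, in_lsn = 950 and out_lsn = 1000 the new member is
+ * considered caught up, and complete_replace_member will be triggered.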
+ */ + void check_replace_member_status(); + /** * \brief This method is called during restart to notify the upper layer */ diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index 25b0a5d8f..9cf41dcce 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -63,7 +63,7 @@ class SoloReplDev : public ReplDev { replica_id_t get_leader_id() const override { return m_group_id; } std::vector< peer_info > get_replication_status() const override { return std::vector< peer_info >{ - peer_info{.id_ = m_group_id, .replication_idx_ = 0, .last_succ_resp_us_ = 0, .priority_ = 1, .is_self_ = true}}; + peer_info{.id_ = m_group_id, .replication_idx_ = 0, .last_succ_resp_us_ = 0, .priority_ = 1}}; } bool is_ready_for_traffic() const override { return true; } void purge() override {} diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index d48801939..2debd1ae5 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -193,18 +193,12 @@ void SoloReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki } } -AsyncReplResult<> SoloReplService::start_replace_member(group_id_t group_id, const replica_member_info& member_out, +AsyncReplResult<> SoloReplService::replace_member(group_id_t group_id, const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum, uint64_t trace_id) const { return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); } -AsyncReplResult<> SoloReplService::complete_replace_member(group_id_t group_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum, - uint64_t trace_id) const { - return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); -} - AsyncReplResult<> SoloReplService::flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, uint32_t commit_quorum, bool wait_and_verify, uint64_t trace_id) const { return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); diff --git a/src/lib/replication/service/generic_repl_svc.h b/src/lib/replication/service/generic_repl_svc.h index 3b3857d90..d7f332d0c 100644 --- a/src/lib/replication/service/generic_repl_svc.h +++ b/src/lib/replication/service/generic_repl_svc.h @@ -89,12 +89,9 @@ class SoloReplService : public GenericReplService { std::set< replica_id_t > const& members) override; folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) override; void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; - AsyncReplResult<> start_replace_member(group_id_t group_id, const replica_member_info& member_out, + AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum = 0, uint64_t trace_id = 0) const override; - AsyncReplResult<> complete_replace_member(group_id_t group_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum = 0, - uint64_t trace_id = 0) const override; AsyncReplResult<> flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, uint32_t commit_quorum, bool wait_and_verify = true, uint64_t trace_id = 0) const override; diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index ecd925dbb..75d1a766d 
100644
--- a/src/lib/replication/service/raft_repl_service.cpp
+++ b/src/lib/replication/service/raft_repl_service.cpp
@@ -476,9 +476,15 @@ void RaftReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki
 add_repl_dev(group_id, rdev);
 }

-AsyncReplResult<> RaftReplService::start_replace_member(group_id_t group_id, const replica_member_info& member_out,
- const replica_member_info& member_in, uint32_t commit_quorum,
- uint64_t trace_id) const {
+// replace_member actually has two phases:
+// 1. start_replace_member: flip member_out to learner and add member_in.
+// 2. complete_replace_member: remove member_out.
+// This function only invokes the repl dev's start_replace_member. A background reaper thread
+// periodically checks the member_in replication status; once member_in has caught up, it
+// triggers the repl dev's complete_replace_member.
+AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, const replica_member_info& member_out,
+ const replica_member_info& member_in, uint32_t commit_quorum,
+ uint64_t trace_id) const {
 if (is_stopping()) return make_async_error<>(ReplServiceError::STOPPING);
 incr_pending_request_num();
 auto rdev_result = get_repl_dev(group_id);
@@ -500,29 +506,6 @@ AsyncReplResult<> RaftReplService::start_replace_member(group_id_t group_id, con
 });
 }

-AsyncReplResult<> RaftReplService::complete_replace_member(group_id_t group_id, const replica_member_info& member_out,
- const replica_member_info& member_in, uint32_t commit_quorum,
- uint64_t trace_id) const {
- if (is_stopping()) return make_async_error<>(ReplServiceError::STOPPING);
- incr_pending_request_num();
- auto rdev_result = get_repl_dev(group_id);
- if (!rdev_result) {
- decr_pending_request_num();
- return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND);
- }
- return std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value())
- ->complete_replace_member(member_out, member_in, commit_quorum, trace_id)
- .via(&folly::InlineExecutor::instance())
- .thenValue([this](auto&& e) mutable {
- if (e.hasError()) {
- decr_pending_request_num();
- return make_async_error<>(e.error());
- }
- decr_pending_request_num();
- return make_async_success<>();
- });
-}
-
 AsyncReplResult<> RaftReplService::flip_learner_flag(group_id_t group_id, const replica_member_info& member,
 bool target, uint32_t commit_quorum, bool wait_and_verify,
 uint64_t trace_id) const {
 if (is_stopping()) return make_async_error<>(ReplServiceError::STOPPING);
@@ -576,12 +559,19 @@ void RaftReplService::start_reaper_thread() {
 HS_DYNAMIC_CONFIG(consensus.flush_durable_commit_interval_ms) * 1000 * 1000, true /* recurring */,
 nullptr, [this](void*) { flush_durable_commit_lsn(); });

+ // Check replace_member sync status to see if the new member is fully synced up and the old member is ready to be removed
+ m_replace_member_sync_check_timer_hdl = iomanager.schedule_thread_timer(
+ HS_DYNAMIC_CONFIG(consensus.replace_member_sync_check_interval_ms) * 1000 * 1000, true /* recurring */,
+ nullptr, [this](void*) { check_replace_member_status(); });
+
+ p.setValue();
 } else {
 // Cancel all recurring timers started
 iomanager.cancel_timer(m_rdev_gc_timer_hdl, true /* wait */);
 iomanager.cancel_timer(m_rdev_fetch_timer_hdl, true /* wait */);
 iomanager.cancel_timer(m_flush_durable_commit_timer_hdl, true /* wait */);
+ iomanager.cancel_timer(m_replace_member_sync_check_timer_hdl, true /* wait */);
 }
 });
 std::move(f).get();
@@ -670,6 +660,14 @@ void RaftReplService::flush_durable_commit_lsn() {
 }
 }

+void
RaftReplService::check_replace_member_status() { + std::unique_lock lg(m_rd_map_mtx); + for (auto& rdev_parent : m_rd_map) { + auto rdev = std::dynamic_pointer_cast< RaftReplDev >(rdev_parent.second); + rdev->check_replace_member_status(); + } +} + ///////////////////// RaftReplService CP Callbacks ///////////////////////////// int ReplSvcCPContext::add_repl_dev_ctx(ReplDev* dev, cshared< ReplDevCPContext > dev_ctx) { m_cp_ctx_map.emplace(dev, dev_ctx); diff --git a/src/lib/replication/service/raft_repl_service.h b/src/lib/replication/service/raft_repl_service.h index c739eeb9e..429ccb295 100644 --- a/src/lib/replication/service/raft_repl_service.h +++ b/src/lib/replication/service/raft_repl_service.h @@ -52,6 +52,7 @@ class RaftReplService : public GenericReplService, iomgr::timer_handle_t m_rdev_fetch_timer_hdl; iomgr::timer_handle_t m_rdev_gc_timer_hdl; iomgr::timer_handle_t m_flush_durable_commit_timer_hdl; + iomgr::timer_handle_t m_replace_member_sync_check_timer_hdl; iomgr::io_fiber_t m_reaper_fiber; std::mutex raft_restart_mutex; @@ -78,12 +79,9 @@ class RaftReplService : public GenericReplService, std::set< replica_id_t > const& members) override; folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) override; void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; - AsyncReplResult<> start_replace_member(group_id_t group_id, const replica_member_info& member_out, + AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum = 0, uint64_t trace_id = 0) const override; - AsyncReplResult<> complete_replace_member(group_id_t group_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum, - uint64_t trace_id = 0) const override; AsyncReplResult<> flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, uint32_t commit_quorum, bool wait_and_verify = true, @@ -97,6 +95,7 @@ class RaftReplService : public GenericReplService, void gc_repl_devs(); void gc_repl_reqs(); void flush_durable_commit_lsn(); + void check_replace_member_status(); void monitor_cert_changes(); void restart_raft_svc(const std::string filepath, const bool deleted); bool wait_for_cert(const std::string& filepath); diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index 106a1afeb..0ceaf090b 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -130,7 +130,7 @@ if (${io_tests}) add_test(NAME MetaBlkMgr-Epoll COMMAND test_meta_blk_mgr) add_test(NAME DataService-Epoll COMMAND test_data_service) add_test(NAME RaftReplDev-Epoll COMMAND test_raft_repl_dev) - # add_test(NAME RaftReplDevDynamic-Epoll COMMAND test_raft_repl_dev_dynamic) + add_test(NAME RaftReplDevDynamic-Epoll COMMAND test_raft_repl_dev_dynamic --override_config homestore_config.consensus.replace_member_sync_check_interval_ms=1000) add_test(NAME SoloReplDev-Epoll COMMAND test_solo_repl_dev) endif() diff --git a/src/tests/test_common/hs_repl_test_common.hpp b/src/tests/test_common/hs_repl_test_common.hpp index 92ff45a69..c00788127 100644 --- a/src/tests/test_common/hs_repl_test_common.hpp +++ b/src/tests/test_common/hs_repl_test_common.hpp @@ -301,7 +301,8 @@ class HSReplTestHelper : public HSTestHelper { auto v = hs()->repl_service().create_repl_dev(repl_group_id, members).get(); ASSERT_EQ(v.hasValue(), true) - << "Error in creating repl dev for group_id=" << boost::uuids::to_string(repl_group_id).c_str(); + << "Error in 
creating repl dev for group_id=" << boost::uuids::to_string(repl_group_id).c_str() + << ", err=" << v.error(); auto& raftService = dynamic_cast< RaftReplService& >(hs()->repl_service()); auto follower_priority = raftService.compute_raft_follower_priority(); auto repl_dev = v.value(); diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 2d4519b94..80eeb1573 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -742,7 +742,7 @@ class RaftReplDevTestBase : public testing::Test { void create_snapshot() { dbs_[0]->create_snapshot(); } void truncate(int num_reserved_entries) { dbs_[0]->truncate(num_reserved_entries); } - void start_replace_member(std::shared_ptr< TestReplicatedDB > db, replica_id_t member_out, replica_id_t member_in, + void replace_member(std::shared_ptr< TestReplicatedDB > db, replica_id_t member_out, replica_id_t member_in, uint32_t commit_quorum = 0, ReplServiceError error = ReplServiceError::OK) { this->run_on_leader(db, [this, error, db, member_out, member_in, commit_quorum]() { LOGINFO("Start replace member out={} in={}", boost::uuids::to_string(member_out), @@ -750,32 +750,12 @@ class RaftReplDevTestBase : public testing::Test { replica_member_info out{member_out, ""}; replica_member_info in{member_in, ""}; - auto result = hs()->repl_service().start_replace_member(db->repl_dev()->group_id(), out, in, commit_quorum).get(); + auto result = hs()->repl_service().replace_member(db->repl_dev()->group_id(), out, in, commit_quorum).get(); if (error == ReplServiceError::OK) { ASSERT_EQ(result.hasError(), false) << "Error in replacing member, err=" << result.error(); } else { - ASSERT_EQ(result.hasError(), true) << "Error in replacing member, err="<< result.error(); - ASSERT_EQ(result.error(), error); - } - }); - } - - void complete_replace_member(std::shared_ptr< TestReplicatedDB > db, replica_id_t member_out, - replica_id_t member_in, uint32_t commit_quorum = 0, - ReplServiceError error = ReplServiceError::OK) { - this->run_on_leader(db, [this, error, db, member_out, member_in, commit_quorum]() { - LOGINFO("Complete replace member out={} in={}", boost::uuids::to_string(member_out), - boost::uuids::to_string(member_in)); - - replica_member_info out{member_out, ""}; - replica_member_info in{member_in, ""}; - auto result = - hs()->repl_service().complete_replace_member(db->repl_dev()->group_id(), out, in, commit_quorum).get(); - if (error == ReplServiceError::OK) { - ASSERT_EQ(result.hasError(), false) << "Error in replacing member, err=" << result.error(); - } else { - ASSERT_EQ(result.hasError(), true) << "Error in replacing member, err=" << result.error(); - ASSERT_EQ(result.error(), error); + ASSERT_EQ(result.hasError(), true); + ASSERT_EQ(result.error(), error) << "Error in replacing member, err=" << result.error(); } }); } diff --git a/src/tests/test_raft_repl_dev_dynamic.cpp b/src/tests/test_raft_repl_dev_dynamic.cpp index 0897a5201..4ae56a9c3 100644 --- a/src/tests/test_raft_repl_dev_dynamic.cpp +++ b/src/tests/test_raft_repl_dev_dynamic.cpp @@ -13,6 +13,8 @@ * *********************************************************************************/ #include "test_common/raft_repl_test_base.hpp" +#include +#include "common/homestore_config.hpp" // Dynamic tests spawn spare replica's also which can be used to add and remove from a repl dev. 
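// Editor's note: the tests below rely on the background reaper thread (running every
// replace_member_sync_check_interval_ms) to finish the replace, and wait for it with fixed
// sleeps. A minimal polling helper like the following (hypothetical, not part of this patch;
// assumes <functional>, <chrono> and <thread> are pulled in by the test headers) could make
// that wait deterministic:

[[maybe_unused]] static bool wait_until(const std::function< bool() >& pred, std::chrono::seconds timeout) {
    // Poll the predicate every 100ms until it becomes true or the timeout expires.
    auto const deadline = std::chrono::steady_clock::now() + timeout;
    while (std::chrono::steady_clock::now() < deadline) {
        if (pred()) { return true; }
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
    }
    return false;
}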
class ReplDevDynamicTest : public RaftReplDevTestBase { @@ -38,11 +40,11 @@ TEST_F(ReplDevDynamicTest, ReplaceMember) { g_helper->sync_for_test_start(num_members); if (g_helper->replica_num() < num_replicas) { - // With existing raft repl dev group, write IO's, validate and call start_replace_member on leader. + // With existing raft repl dev group, write IO's, validate and call replace_member on leader. LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); this->write_on_leader(num_io_entries, true /* wait_for_commit */); - start_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); std::this_thread::sleep_for(std::chrono::seconds(3)); } else if (g_helper->replica_num() == member_in) { LOGINFO("Wait for commits replica={}", g_helper->replica_num()); @@ -56,13 +58,10 @@ TEST_F(ReplDevDynamicTest, ReplaceMember) { LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num()); this->validate_data(); } - - g_helper->sync_for_test_start(num_members); - LOGINFO("sync for completing replace member, replica={}", g_helper->replica_num()); - complete_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); - g_helper->sync_for_verify_start(num_members); - LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num()); + LOGINFO("data synced, sync_for_verify_state replica={} ", g_helper->replica_num()); + + //wait for background reaper thread to trigger complete_replace_member if (g_helper->replica_num() == member_out) { // The out member will have the repl dev destroyed. auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); @@ -114,7 +113,7 @@ TEST_F(ReplDevDynamicTest, TwoMemberDown) { // Replace down replica 2 with spare replica 3 with commit quorum 1 // so that leader can go ahead with replacing member. LOGINFO("Replace member started"); - start_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 1 /* commit quorum*/); + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 1 /* commit quorum*/); this->write_on_leader(num_io_entries, true /* wait_for_commit */); LOGINFO("Leader completed num_io={}", num_io_entries); } @@ -145,11 +144,11 @@ TEST_F(ReplDevDynamicTest, TwoMemberDown) { LOGINFO("TwoMemberDown test done replica={}", g_helper->replica_num()); } -TEST_F(ReplDevDynamicTest, OneMemberDown) { +TEST_F(ReplDevDynamicTest, OutMemberDown) { // replica0(leader) and replica1 up, replica2 is down. Replace replica2 with replica3. // replica0 should be able to baseline resync to replica4(new member). // Write some IO's, replace a member, validate all members data except which is out. - LOGINFO("OneMemberDown test started replica={}", g_helper->replica_num()); + LOGINFO("OutMemberDown test started replica={}", g_helper->replica_num()); auto db = dbs_.back(); auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >(); auto num_members = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >(); @@ -164,7 +163,7 @@ TEST_F(ReplDevDynamicTest, OneMemberDown) { std::this_thread::sleep_for(std::chrono::seconds(3)); if (g_helper->replica_num() == 0) { - // With existing raft repl dev group, write IO's, validate and call start_replace_member on leader. + // With existing raft repl dev group, write IO's, validate and call replace_member on leader. 
LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); this->write_on_leader(num_io_entries, true /* wait_for_commit */); } @@ -173,7 +172,7 @@ TEST_F(ReplDevDynamicTest, OneMemberDown) { LOGINFO("Shutdown replica 2"); if (g_helper->replica_num() == 0) { - start_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); std::this_thread::sleep_for(std::chrono::seconds(3)); } else if (g_helper->replica_num() == member_in) { LOGINFO("Wait for commits replica={}", g_helper->replica_num()); @@ -188,30 +187,16 @@ TEST_F(ReplDevDynamicTest, OneMemberDown) { this->validate_data(); } - //shutdown after becoming learner + // shutdown after becoming learner, in this case, the member_out won't remove replDev after restart. // this->shutdown_replica(2); // LOGINFO("Shutdown replica 2"); // std::this_thread::sleep_for(std::chrono::seconds(2)); - g_helper->sync_for_test_start(num_members); - LOGINFO("sync for completing replace member, replica={}", g_helper->replica_num()); - this->run_on_leader(db, [this, db, member_out, member_in]() { - replica_member_info out{g_helper->replica_id(member_out), ""}; - replica_member_info in{g_helper->replica_id(member_in), ""}; - auto result = hs()->repl_service().complete_replace_member(db->repl_dev()->group_id(), out, in).get(); - if (result.hasError()) { - ASSERT_EQ(result.error(), ReplServiceError::CANCELLED) - << "Unexpected error in replacing member, err=" << result.error(); - LOGWARN("Error in completing replace member, err={}, will retry after 2s", result.error()); - std::this_thread::sleep_for(std::chrono::seconds(2)); - complete_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); - } - }); - - LOGINFO("Replace member old leader done"); - + // data synced, waiting for removing learner + LOGINFO("data synced, sync for completing replace member, replica={}", g_helper->replica_num()); g_helper->sync_for_verify_start(num_members); - LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num()); + // Since the out_member stopped, it cannot response to remove_srv req, as a result the first time will get CANCELLED + // error, so waiting time is longer than other tests. if (g_helper->replica_num() == 2) { LOGINFO("Start replica 2"); this->start_replica(2); @@ -249,20 +234,21 @@ TEST_F(ReplDevDynamicTest, LeaderReplace) { g_helper->sync_for_test_start(num_members); - if (g_helper->replica_num() != member_in) { + if (g_helper->replica_num() == member_out) { LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); - // With existing raft repl dev group, write IO's, validate and call start_replace_member on leader. + // With existing raft repl dev group, write IO's, validate and call replace_member on leader. this->write_on_leader(num_io_entries, true /* wait_for_commit */); // Leader will return error NOT_LEADER and yield leadership, sleep and connect again // to the new leader. 
LOGINFO("Replace old leader"); - start_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 0, + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 0, ReplServiceError::NOT_LEADER); LOGINFO("Replace member leader yield done"); - - std::this_thread::sleep_for(std::chrono::seconds(3)); - start_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + } + std::this_thread::sleep_for(std::chrono::seconds(3)); + if (g_helper->replica_num() != member_in) { + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); LOGINFO("Replace member old leader done"); } @@ -278,12 +264,8 @@ TEST_F(ReplDevDynamicTest, LeaderReplace) { this->validate_data(); } - g_helper->sync_for_test_start(num_members); - LOGINFO("sync for completing replace member, replica={}", g_helper->replica_num()); - complete_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); - g_helper->sync_for_verify_start(num_members); - LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num()); + LOGINFO("data synced, sync_for_verify_state replica={} ", g_helper->replica_num()); if (g_helper->replica_num() == member_out) { // The out member will have the repl dev destroyed. auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); @@ -318,16 +300,16 @@ TEST_F(ReplDevDynamicTest, OneMemberRestart) { g_helper->sync_for_test_start(num_members); if (g_helper->replica_num() == 1) { - LOGINFO("Restart replica 1"); + LOGINFO("Restart replica 1, "); this->restart_replica(15); } if (g_helper->replica_num() == 0) { - // With existing raft repl dev group, write IO's, validate and call start_replace_member on leader. + // With existing raft repl dev group, write IO's, validate and call replace_member on leader. LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); this->write_on_leader(num_io_entries, true /* wait_for_commit */); - start_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); std::this_thread::sleep_for(std::chrono::seconds(3)); } else if (g_helper->replica_num() == member_in) { LOGINFO("Wait for commits replica={}", g_helper->replica_num()); @@ -342,12 +324,8 @@ TEST_F(ReplDevDynamicTest, OneMemberRestart) { this->validate_data(); } - g_helper->sync_for_test_start(num_members); - LOGINFO("sync for completing replace member, replica={}", g_helper->replica_num()); - complete_replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); - g_helper->sync_for_verify_start(num_members); - LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num()); + LOGINFO("data synced, sync_for_verify_state replica={} ", g_helper->replica_num()); if (g_helper->replica_num() == member_out) { // The out member will have the repl dev destroyed. 
auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev());
@@ -364,6 +342,60 @@
 LOGINFO("OneMemberRestart test done replica={}", g_helper->replica_num());
 }

+TEST_F(ReplDevDynamicTest, ValidateRequest) {
+ LOGINFO("ValidateRequest test started replica={}", g_helper->replica_num());
+ HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) {
+ s.consensus.laggy_threshold = 0;
+ LOGINFO("setup consensus.laggy_threshold to {}", 0);
+ HS_SETTINGS_FACTORY().save();
+ });
+
+ auto db = dbs_.back();
+ auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >();
+ auto num_members = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >();
+ uint64_t num_io_entries = SISL_OPTIONS["num_io"].as< uint64_t >();
+
+ // Replace the last member in the group with index (num_replicas - 1) with a spare
+ // replica with index (num_replicas). Member ids are 0,...,num_replicas-1, num_replicas,...,N
+ uint32_t member_out = num_replicas - 1;
+ uint32_t member_in = num_replicas;
+
+ g_helper->sync_for_test_start(num_members);
+
+ // Shut down replica 1 before replace_member
+ this->shutdown_replica(1);
+ LOGINFO("Shutdown replica 1");
+
+ // Wait for the shutdown to complete
+ std::this_thread::sleep_for(std::chrono::seconds(3));
+ g_helper->sync_for_verify_start(num_members);
+ if (g_helper->replica_num() == 0) {
+ // With existing raft repl dev group, write IO's, validate and call replace_member on leader.
+ LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num());
+ this->write_on_leader(num_io_entries, true /* wait_for_commit */);
+ }
+ g_helper->sync_for_verify_start(num_members);
+ if (g_helper->replica_num() == 0) {
+ // Generate random uuids that are not members of the group
+ replica_id_t fake_member_out = boost::uuids::random_generator()();
+ replica_id_t fake_member_in = boost::uuids::random_generator()();
+ LOGINFO("test SERVER_NOT_FOUND");
+ replace_member(db, fake_member_out, fake_member_in, 0, ReplServiceError::SERVER_NOT_FOUND);
+ LOGINFO("test replace_member already complete");
+ replace_member(db, fake_member_out, g_helper->replica_id(0));
+ LOGINFO("test QUORUM_NOT_MET");
+ replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 0,
+ ReplServiceError::QUORUM_NOT_MET);
+ }
+
+ if (g_helper->replica_num() == 1) {
+ LOGINFO("Start replica 1");
+ this->start_replica(1);
+ }
+ g_helper->sync_for_cleanup_start(num_members);
+ LOGINFO("ValidateRequest test done replica={}", g_helper->replica_num());
+}
+
 int main(int argc, char* argv[]) {
 int parsed_argc = argc;
 char** orig_argv = argv;

From 5a8aee721a42845d19590dc83397e7071fbeb22b Mon Sep 17 00:00:00 2001
From: yuwmao
Date: Thu, 29 May 2025 15:23:33 +0800
Subject: [PATCH 128/170] Fix replace_member

---
 conanfile.py | 2 +-
 .../replication/repl_dev/raft_repl_dev.cpp | 36 +++++++++----------
 2 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/conanfile.py b/conanfile.py
index 7734cc43d..e3359cc82 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@ class HomestoreConan(ConanFile):
 name = "homestore"
- version = "6.15.0"
+ version = "6.15.1"
 homepage = "https://github.com/eBay/Homestore"
 description = "HomeStore Storage Engine"

diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp
index 795a2db55..b5f684b5c 100644
--- a/src/lib/replication/repl_dev/raft_repl_dev.cpp
+++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp
@@ -1420,8 +1420,8 @@ void
RaftReplDev::start_replace_member(repl_req_ptr_t rreq) { m_listener->on_start_replace_member(members->replica_out, members->replica_in, rreq->traceID()); // record the replace_member intent std::unique_lock lg{m_sb_mtx}; - m_rd_sb->replace_member_ctx.replica_in = members->replica_out.id; - m_rd_sb->replace_member_ctx.replica_out = members->replica_in.id; + m_rd_sb->replace_member_ctx.replica_in = members->replica_in.id; + m_rd_sb->replace_member_ctx.replica_out = members->replica_out.id; m_rd_sb.write(); } @@ -1529,7 +1529,7 @@ std::set< replica_id_t > RaftReplDev::get_active_peers() const { if (p.id_ == m_my_repl_id) { continue; } if (p.replication_idx_ >= least_active_repl_idx) { res.insert(p.id_); - RD_LOGW(NO_TRACE_ID, + RD_LOGT(NO_TRACE_ID, "Found active peer {}, lag {}, my lsn {}, peer lsn {}, least_active_repl_idx {}, laggy={}", p.id_, my_committed_idx - p.replication_idx_, my_committed_idx, p.replication_idx_, least_active_repl_idx, laggy); @@ -1836,15 +1836,17 @@ void RaftReplDev::check_replace_member_status() { } auto peers = get_replication_status(); + auto replica_in = m_rd_sb->replace_member_ctx.replica_in; + auto replica_out = m_rd_sb->replace_member_ctx.replica_out; repl_lsn_t in_lsn = 0; repl_lsn_t out_lsn = 0; repl_lsn_t laggy = HS_DYNAMIC_CONFIG(consensus.laggy_threshold); for (auto& peer : peers) { - if (peer.id_ == m_rd_sb->replace_member_ctx.replica_out) { + if (peer.id_ == replica_out) { out_lsn = peer.replication_idx_; RD_LOGD(NO_TRACE_ID, "Replica out {} with lsn {}", boost::uuids::to_string(peer.id_), out_lsn); - } else if (peer.id_ == m_rd_sb->replace_member_ctx.replica_in) { + } else if (peer.id_ == replica_in) { in_lsn = peer.replication_idx_; RD_LOGD(NO_TRACE_ID, "Replica in {} with lsn {}", boost::uuids::to_string(peer.id_), in_lsn); } @@ -1854,33 +1856,29 @@ void RaftReplDev::check_replace_member_status() { if (!catch_up) { RD_LOGD(NO_TRACE_ID, "Checking replace member status, replica_in={} with lsn={}, replica_out={} with lsn={}", - boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_in), in_lsn, - boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_out), out_lsn); + boost::uuids::to_string(replica_in), in_lsn, boost::uuids::to_string(replica_out), out_lsn); return; } - RD_LOGD( - NO_TRACE_ID, - "Checking replace member status, new member has caught up, replica_in={} with lsn={}, replica_out={} with lsn={}", - boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_in), in_lsn, - boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_out), out_lsn); + RD_LOGD(NO_TRACE_ID, + "Checking replace member status, new member has caught up, replica_in={} with lsn={}, replica_out={} with " + "lsn={}", + boost::uuids::to_string(replica_in), in_lsn, boost::uuids::to_string(replica_out), out_lsn); trace_id_t trace_id = generateRandomTraceId(); RD_LOGD(trace_id, "Trigger complete_replace_member, replica_in={}, replica_out={}", - boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_in), - boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_out)); + boost::uuids::to_string(replica_in), boost::uuids::to_string(replica_out)); - replica_member_info out{m_rd_sb->replace_member_ctx.replica_in, ""}; - replica_member_info in{m_rd_sb->replace_member_ctx.replica_out, ""}; + replica_member_info out{replica_out, ""}; + replica_member_info in{replica_in, ""}; auto ret = complete_replace_member(out, in, 0, trace_id).get(); if (ret.hasError()) { RD_LOGE(trace_id, "Failed to complete replace member, next time will retry it, error={}", ret.error()); 
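 // Editor's note: returning here without clearing replace_member_ctx in the superblock is
 // what makes this retryable; the next reaper tick re-evaluates and re-triggers completion.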
return; } - RD_LOGI(trace_id, "Complete replace member, next time will retry it, replica_in={}, replica_out={}", - boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_in), - boost::uuids::to_string(m_rd_sb->replace_member_ctx.replica_out)) + RD_LOGI(trace_id, "Complete replace member, replica_in={}, replica_out={}", + boost::uuids::to_string(replica_in), boost::uuids::to_string(replica_out)) } /////////////////////////////////// Private metohds //////////////////////////////////// From 7af9d961114ef1dd303799bd94a9ff391c915457 Mon Sep 17 00:00:00 2001 From: yuwmao Date: Fri, 30 May 2025 19:55:20 +0800 Subject: [PATCH 129/170] fix bug in get_replication_status --- conanfile.py | 2 +- src/lib/replication/repl_dev/raft_repl_dev.cpp | 12 +++++------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/conanfile.py b/conanfile.py index e3359cc82..388374cb4 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.15.1" + version = "6.15.2" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index b5f684b5c..de7f9aaca 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1502,13 +1502,11 @@ std::vector< peer_info > RaftReplDev::get_replication_status() const { std::vector< peer_info > pi; auto rep_status = m_repl_svc_ctx->get_raft_status(); for (auto const& pinfo : rep_status) { - for (auto const& pinfo : rep_status) { - pi.emplace_back(peer_info{.id_ = boost::lexical_cast< replica_id_t >(pinfo.id_), - .replication_idx_ = pinfo.last_log_idx_, - .last_succ_resp_us_ = pinfo.last_succ_resp_us_, - .priority_ = pinfo.priority_, - .can_vote = !pinfo.is_learner_}); - } + pi.emplace_back(peer_info{.id_ = boost::lexical_cast< replica_id_t >(pinfo.id_), + .replication_idx_ = pinfo.last_log_idx_, + .last_succ_resp_us_ = pinfo.last_succ_resp_us_, + .priority_ = pinfo.priority_, + .can_vote = !pinfo.is_learner_}); } return pi; } From 472f7941e0fa4ca223aa39daf5f388636aca8c59 Mon Sep 17 00:00:00 2001 From: Brian Szmyd Date: Mon, 2 Jun 2025 09:11:24 -0600 Subject: [PATCH 130/170] The usage of EVP_DigestInit_ex2 in meta_blk test requires openssl3.x (#732) --- conanfile.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/conanfile.py b/conanfile.py index 388374cb4..8438490e3 100644 --- a/conanfile.py +++ b/conanfile.py @@ -60,6 +60,9 @@ def requirements(self): if self.settings.arch in ['x86', 'x86_64']: self.requires("isa-l/2.30.0", transitive_headers=True) + # Tests require OpenSSL 3.x + self.requires("openssl/[^3.1]", override=True) + def imports(self): self.copy(root_package="sisl", pattern="*", dst="bin/scripts/python/flip/", src="bindings/flip/python/", keep_path=False) From f076579d178afc071b79b927ee8e36821fd53167 Mon Sep 17 00:00:00 2001 From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com> Date: Mon, 2 Jun 2025 18:23:56 -0700 Subject: [PATCH 131/170] Fix prefix merge and enable long running (#729) --- .jenkins/jenkinsfile_nightly | 8 +- conanfile.py | 2 +- .../btree/detail/btree_remove_impl.ipp | 10 ++ .../homestore/btree/detail/prefix_node.hpp | 110 +++++++++++++----- src/tests/btree_helpers/btree_test_helper.hpp | 2 +- src/tests/test_btree_node.cpp | 34 +++++- .../test_common/homestore_test_common.hpp | 2 +- src/tests/test_index_btree.cpp | 11 +- src/tests/test_mem_btree.cpp | 7 +- 
src/tests/test_scripts/CMakeLists.txt | 19 +--
 src/tests/test_scripts/index_test.py | 4 +
 11 files changed, 153 insertions(+), 56 deletions(-)

diff --git a/.jenkins/jenkinsfile_nightly b/.jenkins/jenkinsfile_nightly
index 7100a0230..8083c816b 100644
--- a/.jenkins/jenkinsfile_nightly
+++ b/.jenkins/jenkinsfile_nightly
@@ -55,10 +55,10 @@ pipeline {
 find /home/jenkins -type f -wholename '*/test_data_service' -exec cp {} .jenkins/test_data_service \\;
 find /home/jenkins -type f -wholename '*/test_raft_repl_dev' -exec cp {} .jenkins/test_raft_repl_dev \\;
 find /home/jenkins -type f -wholename '*/test_solo_repl_dev' -exec cp {} .jenkins/test_solo_repl_dev \\;
- find /home/jenkins -type f -wholename '*/scripts/index_test.py' -exec install -Dm755 {} .jenkins/index_test.py \\;
- find /home/jenkins -type f -wholename '*/scripts/log_meta_test.py' -exec install -Dm755 {} .jenkins/log_meta_test.py \\;
- find /home/jenkins -type f -wholename '*/scripts/data_test.py' -exec install -Dm755 {} .jenkins/data_test.py \\;
- find /home/jenkins -type f -wholename '*/scripts/long_running.py' -exec install -Dm755 {} .jenkins/long_running.py \\;
+ find /home/jenkins -type f -wholename '*/test_scripts/index_test.py' -exec install -Dm755 {} .jenkins/index_test.py \\;
+ find /home/jenkins -type f -wholename '*/test_scripts/log_meta_test.py' -exec install -Dm755 {} .jenkins/log_meta_test.py \\;
+ find /home/jenkins -type f -wholename '*/test_scripts/data_test.py' -exec install -Dm755 {} .jenkins/data_test.py \\;
+ find /home/jenkins -type f -wholename '*/test_scripts/long_running.py' -exec install -Dm755 {} .jenkins/long_running.py \\;
 '''
 }
 post {
diff --git a/conanfile.py b/conanfile.py
index 8438490e3..07ddcecf4 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@ class HomestoreConan(ConanFile):
 name = "homestore"
- version = "6.15.2"
+ version = "6.15.3"
 homepage = "https://github.com/eBay/Homestore"
 description = "HomeStore Storage Engine"
diff --git a/src/include/homestore/btree/detail/btree_remove_impl.ipp b/src/include/homestore/btree/detail/btree_remove_impl.ipp
index 66955b6c7..de991edba 100644
--- a/src/include/homestore/btree/detail/btree_remove_impl.ipp
+++ b/src/include/homestore/btree/detail/btree_remove_impl.ipp
@@ -267,6 +267,9 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const
 BT_NODE_LOG_ASSERT_EQ(child->is_node_deleted(), false, child);
 old_nodes.push_back(child);

+ // TODO: need a more precise calculation that considers the compacted size of prefix nodes, because when a merge
+ // happens, compaction will occur for both the leftmost and the new nodes. The current calculation leaves the
+ // available size unbalanced between the leftmost node and the new nodes.
 total_size += child->occupied_size();
 }

@@ -323,6 +326,13 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const
 if ((old_nodes[i]->total_entries() - nentries) == 0) { // Entire node goes in
 available_size -= old_nodes[i]->occupied_size();
+ // For prefix nodes, compaction makes the size smaller, so we can add the compaction saving back to the
+ // available size; hence it cannot go negative.
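+ // Editor's illustration: compact_saving() == num_prefix_holes() * prefix_entry::size() (see
+ // prefix_node.hpp below), so a node with e.g. 3 prefix holes returns 3 * prefix_entry::size()
+ // bytes to available_size here.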
+ if (old_nodes[i]->get_node_type() == btree_node_type::PREFIX) { + auto cur_node = static_cast< FixedPrefixNode< K, V >* >(old_nodes[i].get()); + available_size += cur_node->compact_saving(); + } + BT_NODE_DBG_ASSERT_EQ(available_size >= 0, true, leftmost_node, "negative available size"); if (i >= old_nodes.size() - 1) { src_cursor.ith_node = i + 1; src_cursor.nth_entry = std::numeric_limits< uint32_t >::max(); diff --git a/src/include/homestore/btree/detail/prefix_node.hpp b/src/include/homestore/btree/detail/prefix_node.hpp index 7525729b0..ce2e922b2 100644 --- a/src/include/homestore/btree/detail/prefix_node.hpp +++ b/src/include/homestore/btree/detail/prefix_node.hpp @@ -43,7 +43,7 @@ class FixedPrefixNode : public VariantNode< K, V > { #pragma pack(1) struct prefix_node_header { uint16_t used_slots; // Number of slots actually used. TODO: We can deduce from set_bit_count of bitset - uint16_t tail_slot; // What is the tail slot number being used + uint16_t tail_slot; // The tail slot number being used. Address will point to the beginning of tail prefix std::string to_string() const { return fmt::format("slots_used={} tail_slot={} ", used_slots, tail_slot); } @@ -152,6 +152,7 @@ class FixedPrefixNode : public VariantNode< K, V > { FixedPrefixNode(uint8_t* node_buf, bnodeid_t id, bool init, bool is_leaf, const BtreeConfig& cfg) : VariantNode< K, V >(node_buf, id, init, is_leaf, cfg), prefix_bitset_{sisl::blob{bitset_area(), reqd_bitset_size(cfg)}, init} { + this->set_node_type(btree_node_type::PREFIX); if (init) { auto phdr = prefix_header(); phdr->used_slots = 0; @@ -305,7 +306,6 @@ class FixedPrefixNode : public VariantNode< K, V > { } } if (num_removed) { this->inc_gen(); } - #ifndef NDEBUG validate_sanity(); #endif @@ -338,10 +338,18 @@ class FixedPrefixNode : public VariantNode< K, V > { } } + uint16_t get_nth_suffix_slot_num(uint32_t idx) const { return get_suffix_entry_c(idx)->prefix_slot; } + + uint16_t get_nth_prefix_ref_count(uint32_t idx) const { + return get_prefix_entry_c(get_suffix_entry_c(idx)->prefix_slot)->ref_count; + } + + uint32_t compact_saving() const { return num_prefix_holes() * prefix_entry::size(); } + uint32_t available_size() const override { auto num_holes = num_prefix_holes(); if (num_holes > prefix_node_header::min_holes_to_compact) { - return available_size_without_compaction() + (num_holes * prefix_entry::size()); + return available_size_with_compaction(); } else { return available_size_without_compaction(); } @@ -430,7 +438,6 @@ class FixedPrefixNode : public VariantNode< K, V > { // part of Step 1, except generation count this->inc_gen(); dst_node.inc_gen(); - auto new_phdr = dst_node.prefix_header(); if (!this->is_leaf() && (dst_node.total_entries() != 0)) { // Incase this node is an edge node, move the stick to the right hand side node @@ -527,7 +534,9 @@ class FixedPrefixNode : public VariantNode< K, V > { this->invalidate_edge(); this->inc_gen(); prefix_bitset_ = sisl::CompactBitSet{sisl::blob{bitset_area(), reqd_bitset_size(cfg)}, true}; - + auto phdr = prefix_header(); + phdr->used_slots = 0; + phdr->tail_slot = 0; #ifndef NDEBUG validate_sanity(); #endif @@ -634,22 +643,25 @@ class FixedPrefixNode : public VariantNode< K, V > { } std::string to_string(bool print_friendly = false) const override { - auto str = fmt::format("{}id={} level={} nEntries={} {} next_node={} ", - (print_friendly ? "------------------------------------------------------------\n" : ""), - this->node_id(), this->level(), this->total_entries(), - (this->is_leaf() ? 
"LEAF" : "INTERIOR"), this->next_bnode()); + auto str = + fmt::format("{}id={} level={} nEntries={} {} next_node={} available_size={} occupied_size={} ", + (print_friendly ? "------------------------------------------------------------\n" : ""), + this->node_id(), this->level(), this->total_entries(), (this->is_leaf() ? "LEAF" : "INTERIOR"), + this->next_bnode(), this->available_size(), this->occupied_size()); if (!this->is_leaf() && (this->has_valid_edge())) { fmt::format_to(std::back_inserter(str), "edge_id={}.{}", this->edge_info().m_bnodeid, this->edge_info().m_link_version); } - fmt::format_to(std::back_inserter(str), "{}Prefix_Hdr={}, Prefix_Bitmap=[{}]\n", - (print_friendly ? "\n\t" : " "), cprefix_header()->to_string(), prefix_bitset_.to_string()); + fmt::format_to(std::back_inserter(str), "{}Prefix_Hdr=[{}], Prefix_Bitmap = [{}] # of holes = {}\n", + (print_friendly ? "\n\t" : " "), cprefix_header()->to_string(), this->compact_bitset(), + this->num_prefix_holes()); for (uint32_t i{0}; i < this->total_entries(); ++i) { - fmt::format_to(std::back_inserter(str), "{}Entry{} [Key={} Val={}]", (print_friendly ? "\n\t" : " "), i + 1, - BtreeNode::get_nth_key< K >(i, false).to_string(), - this->get_nth_value(i, false).to_string()); + fmt::format_to(std::back_inserter(str), "{}Entry{} [Key={} Val={} slot#={} ref_count={}]", + (print_friendly ? "\n\t" : " "), i + 1, BtreeNode::get_nth_key< K >(i, false).to_string(), + this->get_nth_value(i, false).to_string(), this->get_nth_suffix_slot_num(i), + this->get_nth_prefix_ref_count(i)); } return str; } @@ -678,7 +690,9 @@ class FixedPrefixNode : public VariantNode< K, V > { auto phdr = prefix_header(); ++phdr->used_slots; - if (slot_num > phdr->tail_slot) { phdr->tail_slot = slot_num; } + if (slot_num + 1u > phdr->tail_slot) { phdr->tail_slot = slot_num + 1u; } + DEBUG_ASSERT_LE(phdr->used_slots, phdr->tail_slot, "Prefix slot number {} is not less than tail slot number {}", + slot_num, phdr->tail_slot); return slot_num; } @@ -693,9 +707,9 @@ class FixedPrefixNode : public VariantNode< K, V > { if (--pentry->ref_count == 0) { --phdr->used_slots; prefix_bitset_.reset_bit(slot_num); - if ((slot_num != 0) && (slot_num == phdr->tail_slot)) { + if (slot_num + 1u == phdr->tail_slot) { uint16_t prev_slot = prefix_bitset_.get_prev_set_bit(slot_num); - if (prev_slot != std::numeric_limits< uint16_t >::max()) { phdr->tail_slot = prev_slot; } + phdr->tail_slot = prev_slot + 1u; } } } @@ -711,17 +725,16 @@ class FixedPrefixNode : public VariantNode< K, V > { uint8_t const* suffix = r_cast< uint8_t const* >(get_suffix_entry_c(this->total_entries())); uint8_t const* prefix = r_cast< uint8_t const* >(get_prefix_entry_c(cprefix_header()->tail_slot)); - if (suffix <= prefix) { - return prefix - suffix; + if (suffix <= prefix + prefix_entry::size()) { + return prefix - suffix + prefix_entry::size(); } else { - DEBUG_ASSERT(false, "Node data is corrupted, suffix area is overlapping prefix area"); + DEBUG_ASSERT(false, "Node data is corrupted, suffix area is overlapping prefix area {}", + int64_t(suffix - prefix)); return 0; } } - uint32_t available_size_with_compaction() const { - return available_size_without_compaction() + (num_prefix_holes() * prefix_entry::size()); - } + uint32_t available_size_with_compaction() const { return available_size_without_compaction() + compact_saving(); } bool has_room(uint16_t for_nentries) const { return (available_size_without_compaction() >= (prefix_entry::size() + (for_nentries * suffix_entry::size()))); @@ -733,7 +746,9 @@ 
class FixedPrefixNode : public VariantNode< K, V > { uint32_t num_prefix_holes() const { auto phdr = cprefix_header(); - return (phdr->tail_slot + 1 - phdr->used_slots); + DEBUG_ASSERT_LE(phdr->used_slots, phdr->tail_slot, "Prefix slot number {} is not less than tail slot number {}", + phdr->used_slots, phdr->tail_slot); + return (phdr->tail_slot - phdr->used_slots); } bool is_compaction_suggested() const { return (num_prefix_holes() > prefix_node_header::min_holes_to_compact); } @@ -776,6 +791,9 @@ class FixedPrefixNode : public VariantNode< K, V > { // Finally adjust the tail offset to the compacted area. auto phdr = prefix_header(); phdr->tail_slot = phdr->used_slots; + DEBUG_ASSERT_EQ(phdr->tail_slot, prefix_bitset_.get_next_reset_bit(0u), + "Tail slot is not equal to the next reset bit, not expected"); + DEBUG_ASSERT_EQ(this->num_prefix_holes(), 0, "Shouldn't be any hole after compression, not expected"); } #ifndef NDEBUG @@ -814,13 +832,15 @@ class FixedPrefixNode : public VariantNode< K, V > { uint8_t const* csuffix_kv_area() const { return cbitset_area() + (prefix_bitset_.size() / 8); } prefix_entry* get_prefix_entry(uint16_t slot_num) { - return r_cast< prefix_entry* >(this->node_data_area() + - (this->node_data_size() - ((slot_num + 1) * prefix_entry::size()))); + return r_cast< prefix_entry* >( + this->node_data_area() + + (this->node_data_size() - (static_cast< uint16_t >(slot_num + 1) * prefix_entry::size()))); } prefix_entry const* get_prefix_entry_c(uint16_t slot_num) const { - return r_cast< prefix_entry const* >(this->node_data_area_const() + - (this->node_data_size() - ((slot_num + 1) * prefix_entry::size()))); + return r_cast< prefix_entry const* >( + this->node_data_area_const() + + (this->node_data_size() - (static_cast< uint16_t >(slot_num + 1) * prefix_entry::size()))); } suffix_entry* get_suffix_entry(uint16_t idx) { @@ -832,5 +852,39 @@ class FixedPrefixNode : public VariantNode< K, V > { static constexpr uint32_t get_key_size() { return prefix_entry::key_size() + suffix_entry::key_size(); } static constexpr uint32_t get_value_size() { return prefix_entry::value_size() + suffix_entry::value_size(); } + + std::string compact_bitset() const { + auto x = prefix_bitset_.to_string(); + std::ostringstream result; + std::vector< size_t > indices; + for (size_t i = 0; i < x.size(); ++i) { + if (x[i] == '1') { indices.push_back(i); } + } + + if (indices.empty()) { return result.str(); } + + size_t start = indices[0]; + size_t end = start; + result << "size = " << indices.size() << " : "; + for (size_t i = 1; i < indices.size(); ++i) { + if (indices[i] == end + 1) { + end = indices[i]; + } else { + if (start == end) { + result << start << ", "; + } else { + result << start << "-" << end << ", "; + } + start = end = indices[i]; + } + } + if (start == end) { + result << start; + } else { + result << start << "-" << end; + } + + return result.str(); + } }; } // namespace homestore diff --git a/src/tests/btree_helpers/btree_test_helper.hpp b/src/tests/btree_helpers/btree_test_helper.hpp index 0ff207f0d..0f709291a 100644 --- a/src/tests/btree_helpers/btree_test_helper.hpp +++ b/src/tests/btree_helpers/btree_test_helper.hpp @@ -250,7 +250,7 @@ struct BtreeTestHelper { } void range_remove_existing_random() { - static std::uniform_int_distribution< uint32_t > s_rand_range_generator{2, 5}; + static std::uniform_int_distribution< uint32_t > s_rand_range_generator{2, 50}; auto const [start_k, end_k] = m_shadow_map.pick_random_existing_keys(s_rand_range_generator(m_re)); 
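 // Editor's note: the width distribution above was widened from [2, 5] to [2, 50], so each
 // random range-remove now spans far more keys and exercises node merges much harder.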
do_range_remove(start_k, end_k, true /* only_existing */); diff --git a/src/tests/test_btree_node.cpp b/src/tests/test_btree_node.cpp index 3046a45bd..275a47caa 100644 --- a/src/tests/test_btree_node.cpp +++ b/src/tests/test_btree_node.cpp @@ -344,6 +344,39 @@ TYPED_TEST(NodeTest, SequentialInsert) { this->validate_get_any(98, 102); } +TYPED_TEST(NodeTest, SimpleInsert) { + auto oc = this->m_node1->occupied_size(); + this->put(1, btree_put_type::INSERT); + this->put(2, btree_put_type::INSERT); + this->put(3, btree_put_type::INSERT); + this->remove(2); + this->remove(1); + this->remove(3); + auto oc2 = this->m_node1->occupied_size(); + ASSERT_EQ(oc, oc2) << "Occupied size cannot be more than original size"; + this->put(1, btree_put_type::INSERT); + this->put(2, btree_put_type::INSERT); + this->put(3, btree_put_type::INSERT); + this->remove(3); + this->remove(2); + this->remove(1); + ASSERT_EQ(oc, oc2) << "Occupied size must be the same as original size"; + + this->put(2, btree_put_type::INSERT); + this->put(1, btree_put_type::INSERT); + this->put(4, btree_put_type::INSERT); + this->put(3, btree_put_type::INSERT); + for (uint32_t i = 5; i <= 50; ++i) { + this->put(i, btree_put_type::INSERT); + } + LOGDEBUG("Creating a hole with size of 11 for prefix compaction usecase"); + for (uint32_t i = 10; i <= 20; ++i) { + this->remove(i); + } + this->m_node1->move_out_to_right_by_entries(this->m_cfg, *this->m_node2, 20); + this->m_node1->copy_by_entries(this->m_cfg, *this->m_node2, 0, std::numeric_limits< uint32_t >::max()); +} + TYPED_TEST(NodeTest, ReverseInsert) { for (uint32_t i{100}; (i > 0 && this->has_room()); --i) { this->put(i - 1, btree_put_type::INSERT); @@ -451,7 +484,6 @@ TYPED_TEST(NodeTest, Move) { ASSERT_EQ(this->m_node2->total_entries(), 0u) << "Remove all on right has failed"; ASSERT_EQ(this->m_node1->total_entries(), list.size()) << "Move in from right has failed"; this->validate_get_all(); - this->m_node1->move_out_to_right_by_entries(this->m_cfg, *this->m_node2, list.size() / 2); ASSERT_EQ(this->m_node1->total_entries(), list.size() / 2) << "Move out half entries to right has failed"; ASSERT_EQ(this->m_node2->total_entries(), list.size() - list.size() / 2) diff --git a/src/tests/test_common/homestore_test_common.hpp b/src/tests/test_common/homestore_test_common.hpp index c4979e203..ee7faeb7e 100644 --- a/src/tests/test_common/homestore_test_common.hpp +++ b/src/tests/test_common/homestore_test_common.hpp @@ -353,7 +353,7 @@ class HSTestHelper { auto fut = homestore::hs()->cp_mgr().trigger_cp_flush(true /* force */); auto on_complete = [&](auto success) { HS_REL_ASSERT_EQ(success, true, "CP Flush failed"); - LOGINFO("CP Flush completed"); + LOGDEBUG("CP Flush completed"); }; if (wait) { diff --git a/src/tests/test_index_btree.cpp b/src/tests/test_index_btree.cpp index 2c08b7d5c..47fbd8f21 100644 --- a/src/tests/test_index_btree.cpp +++ b/src/tests/test_index_btree.cpp @@ -133,7 +133,8 @@ struct BtreeTest : public BtreeTestHelper< TestType >, public ::testing::Test { test_common::HSTestHelper m_helper; }; -using BtreeTypes = testing::Types< FixedLenBtree, PrefixIntervalBtree, VarKeySizeBtree, VarValueSizeBtree, VarObjSizeBtree >; +using BtreeTypes = + testing::Types< FixedLenBtree, PrefixIntervalBtree, VarKeySizeBtree, VarValueSizeBtree, VarObjSizeBtree >; TYPED_TEST_SUITE(BtreeTest, BtreeTypes); @@ -200,7 +201,7 @@ TYPED_TEST(BtreeTest, TriggerCacheEviction) { s.resource_limits.cache_size_percent = 1u; HS_SETTINGS_FACTORY().save(); }); - + this->restart_homestore(); 
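+ // Editor's note: the restart is presumably needed so the lowered cache_size_percent (1%)
+ // takes effect before the eviction workload runs (assumption: the limit is applied at startup).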
LOGINFO("TriggerCacheEviction test start"); @@ -532,6 +533,8 @@ struct BtreeConcurrentTest : public BtreeTestHelper< TestType >, public ::testin this->m_bt->count_keys(this->m_bt->root_node_id())); BtreeTestHelper< TestType >::TearDown(); m_helper.shutdown_homestore(false); + this->m_bt.reset(); + log_obj_life_counter(); } private: @@ -562,6 +565,10 @@ int main(int argc, char* argv[]) { auto seed = SISL_OPTIONS["seed"].as< uint64_t >(); LOGINFO("Using seed {} to sow the random generation", seed); g_re.seed(seed); + } else { + auto seed = std::chrono::system_clock::now().time_since_epoch().count(); + LOGINFO("No seed provided. Using randomly generated seed: {}", seed); + g_re.seed(seed); } auto ret = RUN_ALL_TESTS(); return ret; diff --git a/src/tests/test_mem_btree.cpp b/src/tests/test_mem_btree.cpp index 4625167fd..14e81a2d9 100644 --- a/src/tests/test_mem_btree.cpp +++ b/src/tests/test_mem_btree.cpp @@ -111,8 +111,6 @@ struct BtreeTest : public BtreeTestHelper< TestType >, public ::testing::Test { #endif this->m_cfg.m_max_merge_level = SISL_OPTIONS["max_merge_level"].as< uint8_t >(); this->m_cfg.m_merge_turned_on = !SISL_OPTIONS["disable_merge"].as< bool >(); - // if TestType is PrefixIntervalBtreeTest print here something - if constexpr (std::is_same_v< TestType, PrefixIntervalBtreeTest >) { this->m_cfg.m_merge_turned_on = false; } this->m_bt = std::make_shared< typename T::BtreeType >(this->m_cfg); } }; @@ -315,7 +313,6 @@ struct BtreeConcurrentTest : public BtreeTestHelper< TestType >, public ::testin #endif this->m_cfg.m_max_merge_level = SISL_OPTIONS["max_merge_level"].as< uint8_t >(); this->m_cfg.m_merge_turned_on = !SISL_OPTIONS["disable_merge"].as< bool >(); - if constexpr (std::is_same_v< TestType, PrefixIntervalBtreeTest >) { this->m_cfg.m_merge_turned_on = false; } this->m_bt = std::make_shared< typename T::BtreeType >(this->m_cfg); } @@ -348,6 +345,10 @@ int main(int argc, char* argv[]) { auto seed = SISL_OPTIONS["seed"].as< uint64_t >(); LOGINFO("Using seed {} to sow the random generation", seed); g_re.seed(seed); + } else { + auto seed = std::chrono::system_clock::now().time_since_epoch().count(); + LOGINFO("No seed provided. 
Using randomly generated seed: {}", seed); + g_re.seed(seed); } auto ret = RUN_ALL_TESTS(); return ret; diff --git a/src/tests/test_scripts/CMakeLists.txt b/src/tests/test_scripts/CMakeLists.txt index e1b5ff78c..4bb54bad5 100644 --- a/src/tests/test_scripts/CMakeLists.txt +++ b/src/tests/test_scripts/CMakeLists.txt @@ -1,15 +1,4 @@ -file(COPY vol_test.py DESTINATION ${CMAKE_BINARY_DIR}/bin/scripts) -file(COPY home_blk_flip.py DESTINATION ${CMAKE_BINARY_DIR}/bin/scripts) -file(COPY home_blk_test.py DESTINATION ${CMAKE_BINARY_DIR}/bin/scripts) -file(COPY index_test.py DESTINATION ${CMAKE_BINARY_DIR}/bin/scripts) -file(COPY log_meta_test.py DESTINATION ${CMAKE_BINARY_DIR}/bin/scripts) -file(COPY data_test.py DESTINATION ${CMAKE_BINARY_DIR}/bin/scripts) -file(COPY long_running.py DESTINATION ${CMAKE_BINARY_DIR}/bin/scripts) - -#add_test(NAME TestVolRecovery COMMAND ${CMAKE_BINARY_DIR}/bin/scripts/vol_test.py --test_suits=recovery --dirpath=${CMAKE_BINARY_DIR}/bin/) -#SET_TESTS_PROPERTIES(TestVolRecovery PROPERTIES DEPENDS TestVol) - -#add_test(NAME PerfTestVol COMMAND perf_test_volume) -#add_test(NAME RecoveryVol COMMAND python vol_test.py) -#add_test(NAME CheckBtree COMMAND check_btree) - +file(COPY index_test.py DESTINATION ../test_scripts) +file(COPY log_meta_test.py DESTINATION ../test_scripts) +file(COPY data_test.py DESTINATION ../test_scripts) +file(COPY long_running.py DESTINATION ../test_scripts) diff --git a/src/tests/test_scripts/index_test.py b/src/tests/test_scripts/index_test.py index bf2098fd4..df55a30b9 100755 --- a/src/tests/test_scripts/index_test.py +++ b/src/tests/test_scripts/index_test.py @@ -144,6 +144,10 @@ def main(): def long_running(*args): options = parse_arguments() + long_runnig_index(options, 0) + long_running_clean_shutdown(options, 0) + long_runnig_index(options, 1) + long_running_clean_shutdown(options, 1) for i in range(20): print(f"Iteration {i + 1}") long_running_crash_put_remove(options) From ad43af49ae3ce4c1a00a1422ee99fc44651819fc Mon Sep 17 00:00:00 2001 From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com> Date: Mon, 2 Jun 2025 23:47:57 -0700 Subject: [PATCH 132/170] Fix overlapping range and enable index crash recovery for prefix (#735) --- conanfile.py | 2 +- .../homestore/btree/detail/prefix_node.hpp | 40 +++++++++++- src/tests/btree_helpers/btree_test_kvs.hpp | 33 +++++++--- src/tests/test_btree_node.cpp | 8 ++- src/tests/test_index_crash_recovery.cpp | 63 ++++++++++--------- 5 files changed, 106 insertions(+), 40 deletions(-) diff --git a/conanfile.py b/conanfile.py index 07ddcecf4..e3292bfaf 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.15.3" + version = "6.15.4" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/btree/detail/prefix_node.hpp b/src/include/homestore/btree/detail/prefix_node.hpp index ce2e922b2..7ba617fb2 100644 --- a/src/include/homestore/btree/detail/prefix_node.hpp +++ b/src/include/homestore/btree/detail/prefix_node.hpp @@ -89,6 +89,21 @@ class FixedPrefixNode : public VariantNode< K, V > { } } + int compare(BtreeKey const& key, BtreeValue const& val) const { + if constexpr (std::is_base_of_v< BtreeIntervalKey, K > && std::is_base_of_v< BtreeIntervalValue, V >) { + sisl::blob const kblob = s_cast< K const& >(key).serialize_prefix(); + sisl::blob const vblob = s_cast< V const& >(val).serialize_prefix(); + DEBUG_ASSERT_EQ(kblob.size(), key_size(), "Prefix 
key size mismatch with serialized prefix size"); + DEBUG_ASSERT_EQ(vblob.size(), value_size(), "Prefix value size mismatch with serialized prefix size"); + uint8_t const* cur_ptr = r_cast< uint8_t const* >(this) + sizeof(prefix_entry); + int cmp = std::memcmp(cur_ptr, kblob.cbytes(), kblob.size()); + if (cmp) { return cmp; } + cmp = std::memcmp(cur_ptr + kblob.size(), vblob.cbytes(), vblob.size()); + return cmp; + } + return 0; + } + sisl::blob key_buf() const { return sisl::blob{r_cast< uint8_t const* >(this) + sizeof(prefix_entry), key_size()}; } @@ -239,6 +254,10 @@ class FixedPrefixNode : public VariantNode< K, V > { } V new_val{s_cast< V const& >(val)}; new_val.shift(s_cast< K const& >(cur_key).distance(first_input_key), app_ctx); + if(get_prefix_entry_c(prefix_slot)->compare(cur_key, new_val)) { + LOGTRACEMOD(btree, "Adding new prefix entry for key={} val={}", cur_key.to_string(), new_val.to_string()); + prefix_slot = add_prefix(cur_key, new_val); + } write_suffix(idx, prefix_slot, cur_key, new_val); } @@ -317,6 +336,7 @@ class FixedPrefixNode : public VariantNode< K, V > { ///////////////////////////// All overrides of BtreeNode /////////////////////////////////// void get_nth_key_internal(uint32_t idx, BtreeKey& out_key, bool) const override { + DEBUG_ASSERT_LT(idx, this->total_entries(), "node={}", to_string()); suffix_entry const* sentry = get_suffix_entry_c(idx); prefix_entry const* pentry = get_prefix_entry_c(sentry->prefix_slot); DEBUG_ASSERT(prefix_bitset_.is_bit_set(sentry->prefix_slot), @@ -360,7 +380,13 @@ class FixedPrefixNode : public VariantNode< K, V > { this->available_size()); } - bool has_room_for_put(btree_put_type, uint32_t, uint32_t) const override { return has_room(1u); } + bool has_room_for_put(btree_put_type, uint32_t, uint32_t) const override { +#ifdef _PRERELEASE + auto max_keys = this->max_keys_in_node(); + if (max_keys && this->total_entries() > max_keys) { return false; } +#endif + return has_room(1u); + } uint32_t get_nth_key_size(uint32_t) const override { return dummy_key< K >.serialized_size(); } @@ -575,6 +601,14 @@ class FixedPrefixNode : public VariantNode< K, V > { uint32_t copy_internal(BtreeConfig const& cfg, BtreeNode const& o, uint32_t start_idx, bool by_size, uint32_t limit) { FixedPrefixNode const& src_node = s_cast< FixedPrefixNode const& >(o); +#ifdef _PRERELEASE + if (by_size) { + const uint32_t max_keys = this->max_keys_in_node(); + if (max_keys) { + if (this->total_entries() + limit > max_keys) { limit = max_keys - this->total_entries(); } + } + } +#endif // Adjust the size_to_move to cover the new node's reqd header space. 
uint32_t copied_size{0}; @@ -815,8 +849,8 @@ class FixedPrefixNode : public VariantNode< K, V > { //////////////////////// All Helper methods section //////////////////////// static uint32_t reqd_bitset_size(BtreeConfig const& cfg) { - return sisl::round_up((cfg.node_data_size() - sizeof(prefix_node_header)) / - (prefix_entry::key_size() + prefix_entry::value_size()) / 8, + return sisl::round_up((cfg.node_data_size() - sizeof(prefix_node_header) - suffix_entry::size()) / + prefix_entry::size() / 8, sisl::CompactBitSet::size_multiples()); } diff --git a/src/tests/btree_helpers/btree_test_kvs.hpp b/src/tests/btree_helpers/btree_test_kvs.hpp index 86d83a35c..fb87d1939 100644 --- a/src/tests/btree_helpers/btree_test_kvs.hpp +++ b/src/tests/btree_helpers/btree_test_kvs.hpp @@ -60,6 +60,17 @@ static std::string gen_random_string(size_t len, uint32_t preamble = std::numeri } return str; } +template < typename T > +static bool willAdditionOverflow(T a, int b) { + static_assert(std::is_integral< T >::value, "Template parameter must be an integral type."); + + if (b > 0) { + return a > std::numeric_limits< T >::max() - b; + } else if (b < 0) { + return a < std::numeric_limits< T >::min() - b; + } + return false; +} using namespace homestore; @@ -310,7 +321,7 @@ class TestIntervalKey : public BtreeIntervalKey { m_offset = other->m_offset; } - std::string to_string() const override { return fmt::format("{}.{}", m_base, m_offset); } + std::string to_string() const override { return fmt::format("{}", key()); } static uint32_t get_max_size() { return sizeof(TestIntervalKey); } @@ -319,13 +330,17 @@ class TestIntervalKey : public BtreeIntervalKey { static uint32_t get_fixed_size() { return sizeof(TestIntervalKey); } /////////////////// Overriding methods of BtreeIntervalKey ///////////////// - void shift(int n, void* app_ctx) override { m_offset += n; } + void shift(int n, void* app_ctx) override { + if (willAdditionOverflow< uint32_t >(m_offset, n)) { m_base++; } + m_offset += n; + } int distance(BtreeKey const& f) const override { TestIntervalKey const& from = s_cast< TestIntervalKey const& >(f); - DEBUG_ASSERT_EQ(m_base, from.m_base, "Invalid from key for distance"); - DEBUG_ASSERT_GE(m_offset, from.m_offset, "Invalid from key for distance"); - return m_offset - from.m_offset; + uint64_t this_val = (uint64_cast(m_base) << 32) | m_offset; + uint64_t from_val = (uint64_cast(from.m_base) << 32) | from.m_offset; + DEBUG_ASSERT_GE(this_val, from_val, "Invalid from key for distance"); + return static_cast< int >(this_val - from_val); } bool is_interval_key() const override { return true; } @@ -519,7 +534,8 @@ class TestIntervalValue : public BtreeIntervalValue { m_offset = other->m_offset; } - std::string to_string() const override { return fmt::format("{}.{}", m_base_val, m_offset); } + std::string to_string() const override { return fmt::format("{}", value()); } + uint64_t value() const { return (uint64_cast(m_base_val) << 16) | m_offset; } friend std::ostream& operator<<(std::ostream& os, const TestIntervalValue& v) { os << v.to_string(); @@ -536,7 +552,10 @@ class TestIntervalValue : public BtreeIntervalValue { } ///////////////////////////// Overriding methods of BtreeIntervalValue ////////////////////////// - void shift(int n, void* app_ctx) override { m_offset += n; } + void shift(int n, void* app_ctx) override { + if (willAdditionOverflow< uint32_t >(m_offset, n)) { m_base_val++; } + m_offset += n; + } sisl::blob serialize_prefix() const override { return sisl::blob{uintptr_cast(const_cast< uint32_t* 
>(&m_base_val)), uint32_cast(sizeof(uint32_t))}; diff --git a/src/tests/test_btree_node.cpp b/src/tests/test_btree_node.cpp index 275a47caa..af803bfd4 100644 --- a/src/tests/test_btree_node.cpp +++ b/src/tests/test_btree_node.cpp @@ -107,7 +107,7 @@ struct NodeTest : public testing::Test { } } - void put_range(uint32_t k, uint32_t count) { + void put_range(uint64_t k, uint32_t count) { btree_put_type put_type; if constexpr (!std::is_same_v< V, TestIntervalValue >) { // For non-interval values we support only update, so we need to first put the value @@ -377,6 +377,12 @@ TYPED_TEST(NodeTest, SimpleInsert) { this->m_node1->copy_by_entries(this->m_cfg, *this->m_node2, 0, std::numeric_limits< uint32_t >::max()); } +TYPED_TEST(NodeTest, RangeChangeInsert) { + if (this->m_node1->get_node_type() != btree_node_type::PREFIX) {return;} + this->put_range(0xFFFFFFFF - 10,20); + this->print(); +} + TYPED_TEST(NodeTest, ReverseInsert) { for (uint32_t i{100}; (i > 0 && this->has_room()); --i) { this->put(i - 1, btree_put_type::INSERT); diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp index c7e196254..254432cb5 100644 --- a/src/tests/test_index_crash_recovery.cpp +++ b/src/tests/test_index_crash_recovery.cpp @@ -112,15 +112,16 @@ class SequenceGenerator { OperationList generateOperations(size_t numOperations, bool reset = false) { std::vector< Operation > operations; if (reset) { this->reset(); } - if(putFreq_ == 100 && end_range_ - start_range_ + 1 - in_use_key_cnt_.load() < numOperations) { - LOGDEBUG("All keys are in use, skipping operation generation. end_range_ {} start_range_ {} in_use_key_cnt_ {}, numOperations {}", + if (putFreq_ == 100 && end_range_ - start_range_ + 1 - in_use_key_cnt_.load() < numOperations) { + LOGDEBUG("All keys are in use, skipping operation generation. end_range_ {} start_range_ {} " + "in_use_key_cnt_ {}, numOperations {}", end_range_, start_range_, in_use_key_cnt_.load(), numOperations); - return operations; + return operations; } - if(removeFreq_ == 100 && in_use_key_cnt_.load() < numOperations) { + if (removeFreq_ == 100 && in_use_key_cnt_.load() < numOperations) { LOGDEBUG("Not enough keys are in use, skipping operation generation. 
in_use_key_cnt_ {} numOperations {}", in_use_key_cnt_.load(), numOperations); - return operations; + return operations; } while (operations.size() < numOperations) { @@ -536,7 +537,8 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT void long_running_crash(long_running_crash_options const& crash_test_options) { // set putFreq 100 for the initial load - SequenceGenerator generator(100 /*putFreq*/, 0 /* removeFreq*/, 0 /*start_range*/, crash_test_options.num_entries - 1 /*end_range*/); + SequenceGenerator generator(100 /*putFreq*/, 0 /* removeFreq*/, 0 /*start_range*/, + crash_test_options.num_entries - 1 /*end_range*/); std::vector< std::string > flips; OperationList operations; @@ -561,9 +563,11 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT operations = SequenceGenerator::load_from_file(fmt::format("/tmp/operations_0.txt")); } else { operations = generator.generateOperations(crash_test_options.preload_size, true /* reset */); - if (crash_test_options.save_mode) { SequenceGenerator::save_to_file(fmt::format("/tmp/operations_0.txt"), operations); } + if (crash_test_options.save_mode) { + SequenceGenerator::save_to_file(fmt::format("/tmp/operations_0.txt"), operations); + } } - + LOGDEBUG("Lets before crash print operations\n{}", SequenceGenerator::printOperations(operations)); uint32_t num_keys{0}; @@ -583,8 +587,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT // this->print_keys("reapply: after preload"); this->visualize_keys("tree_after_preload.dot"); - for (uint32_t round = 1; - round <= crash_test_options.rounds && !time_to_stop(); round++) { + for (uint32_t round = 1; round <= crash_test_options.rounds && !time_to_stop(); round++) { LOGINFO("\n\n\n\n\n\nRound {} of {}\n\n\n\n\n\n", round, crash_test_options.rounds); bool print_time = false; elapsed_time = get_elapsed_time_sec(m_start_time); @@ -592,12 +595,13 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT if (crash_test_options.load_mode) { operations = SequenceGenerator::load_from_file(fmt::format("/tmp/operations_{}.txt", round)); } else { - operations = generator.generateOperations(crash_test_options.num_entries_per_rounds, renew_btree_after_crash /* reset */); + operations = generator.generateOperations(crash_test_options.num_entries_per_rounds, + renew_btree_after_crash /* reset */); if (crash_test_options.save_mode) { SequenceGenerator::save_to_file(fmt::format("/tmp/operations_{}.txt", round), operations); } } - if(operations.empty()) { + if (operations.empty()) { LOGDEBUG("No operations generated, skipping round {}", round); continue; } @@ -624,7 +628,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT flips.emplace_back(flip); } auto log_str = fmt::format("Step 1-{}: Set flag", round); - for(auto const& f : flips) { + for (auto const& f : flips) { log_str += fmt::format(" {}", f); this->set_basic_flip(f, 1, 100); } @@ -634,14 +638,16 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT file.close(); } else { if (dis(g_re) <= flip_percentage) { - if(!crash_test_options.put_flips.empty()) { - flips.emplace_back(crash_test_options.put_flips[cur_put_flip_idx++ % crash_test_options.put_flips.size()]); + if (!crash_test_options.put_flips.empty()) { + flips.emplace_back( + crash_test_options.put_flips[cur_put_flip_idx++ % crash_test_options.put_flips.size()]); } - if(!crash_test_options.remove_flips.empty()) { - 
flips.emplace_back(crash_test_options.remove_flips[cur_remove_flip_idx++ % crash_test_options.remove_flips.size()]); + if (!crash_test_options.remove_flips.empty()) { + flips.emplace_back(crash_test_options.remove_flips[cur_remove_flip_idx++ % + crash_test_options.remove_flips.size()]); } auto log_str = fmt::format("Step 1-{}: Set flag", round); - for(auto const& f : flips) { + for (auto const& f : flips) { log_str += fmt::format(" {}", f); this->set_basic_flip(f, 1, 100); } @@ -667,12 +673,12 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT file.close(); } } - + LOGDEBUG("Lets before crash print operations\n{}", SequenceGenerator::printOperations(operations)); - + for (auto [k, op] : operations) { if (op == OperationType::Remove) { - if(num_keys < 1) { + if (num_keys < 1) { // remove flips and continue for (auto const& flip : flips) { this->remove_flip(flip); @@ -719,11 +725,12 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT print_time = true; } if (print_time) { - LOGINFO("\n\n\n\t\t\tProgress: {} rounds of total {} ({:.2f}%) completed - Elapsed time: {:.0f} seconds of " - "total {} ({:.2f}%) - {} keys of maximum {} keys ({:.2f}%) inserted\n\n\n", - round, crash_test_options.rounds, round * 100.0 / crash_test_options.rounds, elapsed_time, this->m_run_time, - elapsed_time * 100.0 / this->m_run_time, this->tree_key_count(), crash_test_options.num_entries, - this->tree_key_count() * 100.0 / crash_test_options.num_entries); + LOGINFO( + "\n\n\n\t\t\tProgress: {} rounds of total {} ({:.2f}%) completed - Elapsed time: {:.0f} seconds of " + "total {} ({:.2f}%) - {} keys of maximum {} keys ({:.2f}%) inserted\n\n\n", + round, crash_test_options.rounds, round * 100.0 / crash_test_options.rounds, elapsed_time, + this->m_run_time, elapsed_time * 100.0 / this->m_run_time, this->tree_key_count(), + crash_test_options.num_entries, this->tree_key_count() * 100.0 / crash_test_options.num_entries); } // this->print_keys(fmt::format("reapply: after round {}", round)); if (renew_btree_after_crash) { this->reset_btree(); }; @@ -737,7 +744,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT }; // Crash recovery can test one simple btree, since focus is not on btree test itself, but index recovery -using BtreeTypes = testing::Types< FixedLenBtree >; +using BtreeTypes = testing::Types< FixedLenBtree, PrefixIntervalBtree >; TYPED_TEST_SUITE(IndexCrashTest, BtreeTypes); TYPED_TEST(IndexCrashTest, CrashBeforeFirstCp) { @@ -856,7 +863,7 @@ TYPED_TEST(IndexCrashTest, long_running_put_remove_crash) { long_running_crash_options crash_test_options{ .put_freq = 50, .put_flips = {"crash_flush_on_split_at_parent", "crash_flush_on_split_at_left_child", - "crash_flush_on_split_at_right_child"}, + "crash_flush_on_split_at_right_child"}, .remove_flips = {"crash_flush_on_merge_at_parent", "crash_flush_on_merge_at_left_child" /*, "crash_flush_on_freed_child"*/}, }; From 575d8c153f36fe235c93fa95b4e531713efd5849 Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Wed, 4 Jun 2025 08:06:59 +0800 Subject: [PATCH 133/170] add get_used_blk for vchunk (#733) --- conanfile.py | 2 +- src/include/homestore/vchunk.h | 1 + src/lib/device/vchunk.cpp | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index e3292bfaf..a76cb945b 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.15.4" + version = "6.16.0" homepage = 
"https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/vchunk.h b/src/include/homestore/vchunk.h index 4b69b1332..c3f020aa1 100644 --- a/src/include/homestore/vchunk.h +++ b/src/include/homestore/vchunk.h @@ -31,6 +31,7 @@ class VChunk { const uint8_t* get_user_private() const; blk_num_t get_total_blks() const; blk_num_t available_blks() const; + blk_num_t get_used_blks() const; blk_num_t get_defrag_nblks() const; uint32_t get_pdev_id() const; uint16_t get_chunk_id() const; diff --git a/src/lib/device/vchunk.cpp b/src/lib/device/vchunk.cpp index a809450d1..69d8b9579 100644 --- a/src/lib/device/vchunk.cpp +++ b/src/lib/device/vchunk.cpp @@ -25,6 +25,8 @@ const uint8_t* VChunk::get_user_private() const { return m_internal_chunk->user_ blk_num_t VChunk::get_total_blks() const { return m_internal_chunk->blk_allocator()->get_total_blks(); } +blk_num_t VChunk::get_used_blks() const { return m_internal_chunk->blk_allocator()->get_used_blks(); } + void VChunk::reset() { m_internal_chunk->blk_allocator_mutable()->reset(); } blk_num_t VChunk::available_blks() const { return m_internal_chunk->blk_allocator()->available_blks(); } From 3f79968c5cdaec88573fa79f544132e212b1f126 Mon Sep 17 00:00:00 2001 From: Sanal Date: Wed, 4 Jun 2025 15:12:15 -0700 Subject: [PATCH 134/170] Add truncation and recovery changes for solo repl dev. (#734) Add commit blk for recovery and truncation for solo repl dev. --- conanfile.py | 2 +- src/include/homestore/homestore_decl.hpp | 3 +- src/lib/logstore/log_store.cpp | 7 -- src/lib/replication/repl_dev/common.h | 1 + .../replication/repl_dev/solo_repl_dev.cpp | 40 ++++++++- src/lib/replication/repl_dev/solo_repl_dev.h | 17 +++- .../replication/service/generic_repl_svc.cpp | 6 +- src/tests/test_solo_repl_dev.cpp | 88 ++++++++++++++++++- 8 files changed, 143 insertions(+), 21 deletions(-) diff --git a/conanfile.py b/conanfile.py index a76cb945b..027c780bb 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.16.0" + version = "6.16.1" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/homestore_decl.hpp b/src/include/homestore/homestore_decl.hpp index b36317ea9..05d62ebb6 100644 --- a/src/include/homestore/homestore_decl.hpp +++ b/src/include/homestore/homestore_decl.hpp @@ -212,4 +212,5 @@ struct cap_attrs { } // namespace homestore ////////////// Misc /////////////////// -#define HOMESTORE_LOG_MODS btree, device, blkalloc, cp, metablk, wbcache, logstore, transient, replication, journalvdev +#define HOMESTORE_LOG_MODS \ + btree, device, blkalloc, cp, metablk, wbcache, logstore, transient, replication, journalvdev, solorepl diff --git a/src/lib/logstore/log_store.cpp b/src/lib/logstore/log_store.cpp index 26a5dba5f..f4eee9760 100644 --- a/src/lib/logstore/log_store.cpp +++ b/src/lib/logstore/log_store.cpp @@ -340,13 +340,6 @@ logstore_seq_num_t HomeLogStore::get_contiguous_completed_seq_num(logstore_seq_n bool HomeLogStore::flush(logstore_seq_num_t upto_lsn) { if (is_stopping()) return false; incr_pending_request_num(); - if (!m_logdev->allow_explicit_flush()) { - HS_LOG_ASSERT(false, - "Explicit flush is turned off or calling flush on wrong thread for this logdev, ignoring flush"); - decr_pending_request_num(); - return false; - } - m_logdev->flush_under_guard(); decr_pending_request_num(); return true; diff --git a/src/lib/replication/repl_dev/common.h 
b/src/lib/replication/repl_dev/common.h index c3433083f..3085a9d3c 100644 --- a/src/lib/replication/repl_dev/common.h +++ b/src/lib/replication/repl_dev/common.h @@ -78,6 +78,7 @@ struct repl_dev_superblk { rdev_name[max_name_len - 1] = '\0'; } }; + #pragma pack() template < class V = folly::Unit > diff --git a/src/lib/replication/repl_dev/solo_repl_dev.cpp b/src/lib/replication/repl_dev/solo_repl_dev.cpp index bb96a3fb0..2bb38a4df 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.cpp +++ b/src/lib/replication/repl_dev/solo_repl_dev.cpp @@ -1,3 +1,4 @@ +#include #include #include "replication/repl_dev/solo_repl_dev.h" #include "replication/repl_dev/common.h" @@ -6,9 +7,13 @@ #include #include #include "common/homestore_assert.hpp" +#include "common/homestore_config.hpp" +#include + +SISL_LOGGING_DECL(solorepl) namespace homestore { -SoloReplDev::SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existing) : +SoloReplDev::SoloReplDev(superblk< solo_repl_dev_superblk >&& rd_sb, bool load_existing) : m_rd_sb{std::move(rd_sb)}, m_group_id{m_rd_sb->group_id} { if (load_existing) { m_logdev_id = m_rd_sb->logdev_id; @@ -21,11 +26,13 @@ SoloReplDev::SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existi m_data_journal->register_log_found_cb(bind_this(SoloReplDev::on_log_found, 3)); m_is_recovered = true; }); + m_commit_upto = m_rd_sb->durable_commit_lsn; } else { m_logdev_id = logstore_service().create_new_logdev(flush_mode_t::TIMER); m_data_journal = logstore_service().create_new_log_store(m_logdev_id, true /* append_mode */); m_rd_sb->logstore_id = m_data_journal->get_store_id(); m_rd_sb->logdev_id = m_logdev_id; + m_rd_sb->checkpoint_lsn = -1; m_rd_sb.write(); m_is_recovered = true; } @@ -134,13 +141,13 @@ folly::Future< std::error_code > SoloReplDev::async_write(const std::vector< Mul } return folly::collectAllUnsafe(futs).thenValue([this](auto&& v_res) { + decr_pending_request_num(); for (const auto& err_c : v_res) { if (sisl_unlikely(err_c.value())) { return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::io_error)); } } - decr_pending_request_num(); return folly::makeFuture< std::error_code >(std::error_code{}); }); } @@ -195,6 +202,10 @@ void SoloReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx auto cur_lsn = m_commit_upto.load(); if (cur_lsn < lsn) { m_commit_upto.compare_exchange_strong(cur_lsn, lsn); } + for (const auto& blkid : blkids) { + data_service().commit_blk(blkid); + } + m_listener->on_commit(lsn, header, key, blkids, nullptr); } @@ -224,11 +235,34 @@ uint32_t SoloReplDev::get_blk_size() const { return data_service().get_blk_size( void SoloReplDev::cp_flush(CP*) { auto lsn = m_commit_upto.load(); m_rd_sb->durable_commit_lsn = lsn; + // Store the LSNs for the last 3 checkpoints + m_rd_sb->last_checkpoint_lsn_2 = m_rd_sb->last_checkpoint_lsn_1; + m_rd_sb->last_checkpoint_lsn_1 = m_rd_sb->checkpoint_lsn; m_rd_sb->checkpoint_lsn = lsn; + HS_LOG(TRACE, solorepl, "dev={} cp flush cp_lsn={} cp_lsn_1={} cp_lsn_2={}", boost::uuids::to_string(group_id()), + lsn, m_rd_sb->last_checkpoint_lsn_1, m_rd_sb->last_checkpoint_lsn_2); m_rd_sb.write(); } -void SoloReplDev::cp_cleanup(CP*) { /* m_data_journal->truncate(m_rd_sb->checkpoint_lsn); */ +void SoloReplDev::truncate() { + // Ignore truncate when HS is initializing, and we need at least 3 checkpoints to start truncating.
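+ // For example (illustrative values): after CP flushes at commit LSNs 10, 20 and 30, the superblk holds + // checkpoint_lsn=30, last_checkpoint_lsn_1=20 and last_checkpoint_lsn_2=10, so the journal below LSN 10 is truncatable.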
+ + if (homestore::hs()->is_initializing() || m_rd_sb->last_checkpoint_lsn_2 <= 0) { return; } + + // Truncating anything below last_checkpoint_lsn - 2 is safe, as all the blks freed + // before that point will have been flushed by the last checkpoint. + HS_LOG(TRACE, solorepl, "dev={} truncating at lsn={}", boost::uuids::to_string(group_id()), + m_rd_sb->last_checkpoint_lsn_2); + m_data_journal->truncate(m_rd_sb->last_checkpoint_lsn_2); +} + +void SoloReplDev::cp_cleanup(CP*) { +#ifdef _PRERELEASE + if (iomgr_flip::instance()->test_flip("solo_repl_dev_manual_truncate")) { return; } +#endif + // cp_cleanup is called after all components' CP flush is done. + // We call truncate during CP cleanup. + truncate(); } } // namespace homestore diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index 9cf41dcce..fc37b30b5 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -28,17 +29,28 @@ namespace homestore { class CP; +#pragma pack(1) + +struct solo_repl_dev_superblk : public repl_dev_superblk { + // Store the last two checkpoint LSNs, where + // last_checkpoint_lsn_2 < last_checkpoint_lsn_1 < checkpoint_lsn + repl_lsn_t last_checkpoint_lsn_1{-1}; // LSN at last_checkpoint - 1 + repl_lsn_t last_checkpoint_lsn_2{-1}; // LSN at last_checkpoint - 2 +}; + +#pragma pack() + class SoloReplDev : public ReplDev { private: logdev_id_t m_logdev_id; std::shared_ptr< HomeLogStore > m_data_journal{nullptr}; - superblk< repl_dev_superblk > m_rd_sb; + superblk< solo_repl_dev_superblk > m_rd_sb; uuid_t m_group_id; std::atomic< logstore_seq_num_t > m_commit_upto{-1}; std::atomic< bool > m_is_recovered{false}; public: - SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existing); + SoloReplDev(superblk< solo_repl_dev_superblk >&& rd_sb, bool load_existing); virtual ~SoloReplDev() = default; virtual std::error_code alloc_blks(uint32_t data_size, const blk_alloc_hints& hints, @@ -94,6 +106,7 @@ class SoloReplDev : public ReplDev { void cp_cleanup(CP* cp); void destroy(); + void truncate(); private: void write_journal(repl_req_ptr_t rreq); diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index 2debd1ae5..849c91729 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -79,7 +79,7 @@ hs_stats GenericReplService::get_cap_stats() const { ///////////////////// SoloReplService specializations and CP Callbacks ///////////////////////////// SoloReplService::SoloReplService(cshared< ReplApplication >& repl_app) : GenericReplService{repl_app} {} -SoloReplService::~SoloReplService(){}; +SoloReplService::~SoloReplService() {}; void SoloReplService::start() { for (auto const& [buf, mblk] : m_sb_bufs) { @@ -119,7 +119,7 @@ void SoloReplService::stop() { AsyncReplResult< shared< ReplDev > > SoloReplService::create_repl_dev(group_id_t group_id, std::set< replica_id_t > const& members) { - superblk< repl_dev_superblk > rd_sb{get_meta_blk_name()}; + superblk< solo_repl_dev_superblk > rd_sb{get_meta_blk_name()}; rd_sb.create(); rd_sb->group_id = group_id; auto rdev = std::make_shared< SoloReplDev >(std::move(rd_sb), false /* load_existing */); @@ -174,7 +174,7 @@ folly::SemiFuture< ReplServiceError > SoloReplService::remove_repl_dev(group_id_ } void SoloReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) { -
superblk< repl_dev_superblk > rd_sb{get_meta_blk_name()}; + superblk< solo_repl_dev_superblk > rd_sb{get_meta_blk_name()}; rd_sb.load(buf, meta_cookie); HS_DBG_ASSERT_EQ(rd_sb->get_magic(), repl_dev_superblk::REPL_DEV_SB_MAGIC, "Invalid rdev metablk, magic mismatch"); HS_DBG_ASSERT_EQ(rd_sb->get_version(), repl_dev_superblk::REPL_DEV_SB_VERSION, "Invalid version of rdev metablk"); diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index 23a429722..935f17230 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -36,6 +37,7 @@ #include "common/homestore_utils.hpp" #include "test_common/homestore_test_common.hpp" #include "replication/service/generic_repl_svc.h" +#define private public #include "replication/repl_dev/solo_repl_dev.h" //////////////////////////////////////////////////////////////////////////// @@ -131,8 +133,10 @@ class SoloReplDevTest : public testing::Test { cintrusive< repl_req_ctx >& ctx) override { LOGINFO("Received error={} on repl_dev", enum_name(error)); } - void on_start_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) override {} - void on_complete_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) override {} + void on_start_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, + trace_id_t tid) override {} + void on_complete_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, + trace_id_t tid) override {} void on_destroy(const group_id_t& group_id) override {} void notify_committed_lsn(int64_t lsn) override {} void on_config_rollback(int64_t lsn) override {} @@ -183,6 +187,9 @@ class SoloReplDevTest : public testing::Test { m_repl_dev2 = hs()->repl_service().create_repl_dev(m_uuid2, {}).get().value(); } + shared< ReplDev > repl_dev1() { return m_repl_dev1; } + shared< ReplDev > repl_dev2() { return m_repl_dev2; } + virtual void TearDown() override { m_repl_dev1.reset(); m_repl_dev2.reset(); @@ -224,7 +231,8 @@ class SoloReplDevTest : public testing::Test { rdev->async_alloc_write(*req->header, req->key ? *req->key : sisl::blob{}, req->write_sgs, req); } - void async_write_data_and_journal(uint32_t key_size, uint64_t data_size, uint32_t max_size_per_iov) { + intrusive< test_repl_req > async_write_data_and_journal(uint32_t key_size, uint64_t data_size, + uint32_t max_size_per_iov, bool rand_dev = true) { data_size = data_size == 0 ? g_block_size : data_size; auto req = intrusive< test_repl_req >(new test_repl_req()); req->header = sisl::make_byte_array(sizeof(test_repl_req::journal_header)); @@ -241,7 +249,8 @@ class SoloReplDevTest : public testing::Test { req->write_sgs = HSTestHelper::create_sgs(data_size, max_size_per_iov, hdr->data_pattern); - auto& rdev = (rand() % 2) ? m_repl_dev1 : m_repl_dev2; + auto rdev = m_repl_dev1; + if (rand_dev) { rdev = (rand() % 2) ? m_repl_dev1 : m_repl_dev2; } auto const cap = hs()->repl_service().get_cap_stats(); LOGDEBUG("Before write, cap stats: used={} total={}", cap.used_capacity, cap.total_capacity); @@ -256,6 +265,7 @@ class SoloReplDevTest : public testing::Test { RELEASE_ASSERT(!err, "Error during async_write"); rdev->async_write_journal(blkids, *req->header, req->key ? 
*req->key : sisl::blob{}, data_size, req); }); + return req; } void validate_replay(ReplDev& rdev, int64_t lsn, sisl::blob const& header, sisl::blob const& key, @@ -297,6 +307,22 @@ class SoloReplDevTest : public testing::Test { } } + void validate_sync(shared< ReplDev > rdev, intrusive< test_repl_req > req) { + auto const hdr = r_cast< test_repl_req::journal_header const* >(req->header->cbytes()); + for (const auto& blkid : req->written_blkids) { + uint32_t size = blkid.blk_count() * g_block_size; + auto read_sgs = HSTestHelper::create_sgs(size, size); + auto err = rdev->async_read(blkid, read_sgs, size).get(); + RELEASE_ASSERT(!err, "Error during async_read"); + for (auto const& iov : read_sgs.iovs) { + HSTestHelper::validate_data_buf(uintptr_cast(iov.iov_base), iov.iov_len, hdr->data_pattern); + iomanager.iobuf_free(uintptr_cast(iov.iov_base)); + } + LOGDEBUG("[{}] Data for blkid={} validated successfully", boost::uuids::to_string(rdev->group_id()), + blkid.to_string()); + } + } + void on_write_complete(ReplDev& rdev, intrusive< test_repl_req > req) { if (req->written_blkids.empty()) { m_io_runner.next_task(); @@ -336,6 +362,36 @@ class SoloReplDevTest : public testing::Test { } } } + + void trigger_cp_flush() { homestore::hs()->cp_mgr().trigger_cp_flush(true /* force */).get(); } + void truncate_and_verify(shared< ReplDev > repl_dev) { + auto solo_dev = std::dynamic_pointer_cast< SoloReplDev >(repl_dev); + // Truncate and verify the CP LSNs + solo_dev->truncate(); + + auto& sb = solo_dev->m_rd_sb; + RELEASE_ASSERT(sb->last_checkpoint_lsn_2 <= sb->last_checkpoint_lsn_1, "invalid cp lsn"); + RELEASE_ASSERT(sb->last_checkpoint_lsn_1 <= sb->checkpoint_lsn, "invalid cp lsn"); + + auto [last_trunc_lsn, trunc_ld_key, tail_lsn] = solo_dev->m_data_journal->truncate_info(); + RELEASE_ASSERT(sb->last_checkpoint_lsn_2 == last_trunc_lsn, "invalid trunc lsn"); + } + +#ifdef _PRERELEASE + void set_flip_point(const std::string flip_name) { + flip::FlipCondition null_cond; + flip::FlipFrequency freq; + freq.set_count(2); + freq.set_percent(100); + m_fc.inject_noreturn_flip(flip_name, {null_cond}, freq); + LOGINFO("Flip {} set", flip_name); + } +#endif + +private: +#ifdef _PRERELEASE + flip::FlipClient m_fc{iomgr_flip::instance()}; +#endif }; TEST_F(SoloReplDevTest, TestSingleDataBlock) { @@ -381,6 +437,23 @@ TEST_F(SoloReplDevTest, TestAsyncWriteJournal) { this->m_task_waiter.start([this]() { this->restart(); }).get(); } +#ifdef _PRERELEASE +TEST_F(SoloReplDevTest, TestTruncate) { + // Write and truncate on repl dev. + LOGINFO("Step 1: run on worker threads to schedule write and truncate"); + + set_flip_point("solo_repl_dev_manual_truncate"); + + m_io_runner.set_task([this]() mutable { + this->async_write_data_and_journal(0u, g_block_size, g_block_size, false /* rand_dev */); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + truncate_and_verify(repl_dev1()); + }); + m_io_runner.execute().get(); + std::this_thread::sleep_for(std::chrono::seconds(1)); +} +#endif + SISL_OPTION_GROUP(test_solo_repl_dev, (block_size, "", "block_size", "block size to io", ::cxxopts::value< uint32_t >()->default_value("4096"), "number")); @@ -392,6 +465,13 @@ int main(int argc, char* argv[]) { sisl::logging::SetLogger("test_solo_repl_dev"); spdlog::set_pattern("[%D %T%z] [%^%l%$] [%n] [%t] %v"); + // TODO make it part of the test case.
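+ // Assumption behind this setup: the short 1s CP interval keeps checkpoint LSNs advancing quickly, which the TestTruncate case above relies on to observe truncation progress.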
+ HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { + // Checkpoint taken every 1s + s.generic.cp_timer_us = 1000000; + }); + HS_SETTINGS_FACTORY().save(); + g_block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); return RUN_ALL_TESTS(); } From 85f56726452a600ed53cbfcf88bfdbd389286a9f Mon Sep 17 00:00:00 2001 From: yuwmao Date: Mon, 9 Jun 2025 15:15:36 +0800 Subject: [PATCH 135/170] Enhance quorum check in replace_member --- conanfile.py | 2 +- .../replication/repl_dev/raft_repl_dev.cpp | 26 +++++++++++++------ src/lib/replication/repl_dev/raft_repl_dev.h | 1 + 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/conanfile.py b/conanfile.py index 027c780bb..e4970c8dd 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.16.1" + version = "6.16.2" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index de7f9aaca..f6bccae66 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -192,21 +192,22 @@ AsyncReplResult<> RaftReplDev::start_replace_member(const replica_member_info& m // quorum safety check. TODO currently only consider lsn, need to check last response time. auto active_peers = get_active_peers(); // active_peers doesn't include leader itself. - auto quorum = active_peers.size() + 1; + auto active_num = active_peers.size() + 1; for (const auto& p : active_peers) { - quorum = p == member_out.id ? quorum - 1 : quorum; - quorum = p == member_in.id ? quorum - 1 : quorum; + active_num = p == member_out.id ? active_num - 1 : active_num; + active_num = p == member_in.id ? active_num - 1 : active_num; } RD_LOGD(trace_id, "Step1. Replace member, quorum safety check, active_peers={}, active_peers_exclude_out/in_member={}, " "commit_quorum={}", - active_peers.size(), quorum, commit_quorum); + active_peers.size(), active_num, commit_quorum); // commit_quorum=0 means actual commit quorum is the majority. In this case, active normal member count should be - // greater than 1. To be more specific, if we have S1(leader), S2, S3(out), S4(in), we don't allow + // >= majority. To be more specific, if we have S1(leader), S2, S3(out), S4(in), we don't allow // replace_member(S3, S4) if S2 is down or laggy. Needs to recover S2 first or retry with commit_quorum=1. - if (quorum <= 1 && commit_quorum == 0) { - RD_LOGE(trace_id, "Step1. Replace member, quorum safety check failed, active_peers={}, active_peers_exclude_out/in_member={}, commit_quorum={}", - active_peers.size(), quorum, commit_quorum); + auto quorum = get_quorum_for_commit(); + if (active_num < quorum && commit_quorum == 0) { + RD_LOGE(trace_id, "Step1. 
Replace member, quorum safety check failed, active_peers={}, active_peers_exclude_out/in_member={}, required_quorum={}, commit_quorum={}", + active_peers.size(), active_num, quorum, commit_quorum); reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_error<>(ReplServiceError::QUORUM_NOT_MET); @@ -1541,6 +1542,15 @@ std::set< replica_id_t > RaftReplDev::get_active_peers() const { return res; } +uint32_t RaftReplDev::get_quorum_for_commit() const { + auto peers = get_replication_status(); + auto quorum = 0; + for (auto& p : peers) { + if (p.can_vote) { quorum++; } + } + return quorum / 2 + 1; +} + uint32_t RaftReplDev::get_blk_size() const { return data_service().get_blk_size(); } nuraft_mesg::repl_service_ctx* RaftReplDev::group_msg_service() { return m_repl_svc_ctx.get(); } diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index abede36bf..2f4d9c877 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -293,6 +293,7 @@ class RaftReplDev : public ReplDev, repl_lsn_t get_last_append_lsn() override { return raft_server()->get_last_log_idx(); } bool is_destroy_pending() const; bool is_destroyed() const; + uint32_t get_quorum_for_commit() const; Clock::time_point destroyed_time() const { return m_destroyed_time; } bool is_ready_for_traffic() const override; From 5efea90c14f9ccb06c12d0fe0ebda9a8336633af Mon Sep 17 00:00:00 2001 From: yawzhang Date: Fri, 6 Jun 2025 16:26:10 +0800 Subject: [PATCH 136/170] determine proposer based on server_id --- conanfile.py | 2 +- src/lib/replication/repl_dev/raft_repl_dev.cpp | 2 +- src/lib/replication/repl_dev/raft_state_machine.cpp | 12 ++++++++---- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/conanfile.py b/conanfile.py index e4970c8dd..b68276733 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.16.2" + version = "6.16.3" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index f6bccae66..59e243254 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -882,7 +882,7 @@ repl_req_ptr_t RaftReplDev::applier_create_req(repl_key const& rkey, journal_typ } // rreq->init will allocate the block if it has linked data. 
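// Note: rkey.server_id records the original proposer of the entry, so comparing it with this node's raft server id is what determines the is_proposer flag passed below.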
- auto status = init_req_ctx(rreq, rkey, code, false /* is_proposer */, user_header, key, data_size, m_listener); + auto status = init_req_ctx(rreq, rkey, code, m_raft_server_id == rkey.server_id, user_header, key, data_size, m_listener); if (status != ReplServiceError::OK) { RD_LOGD(rkey.traceID, "For Repl_key=[{}] alloc hints returned error={}, failing this req", rkey.to_string(), diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index 3c68c07b5..3672cdff8 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -135,13 +135,17 @@ repl_req_ptr_t RaftStateMachine::localize_journal_entry_finish(nuraft::log_entry // If we are able to locate that req in the map for this entry, it could be one of // a) This is an inline data and don't need any localization // b) This is a proposer and thus don't need any localization - // c) This is an indirect data and we received raft entry append from leader and localized the journal entry. - // d) This is an indirect data and we received only on data channel, but no raft entry append from leader. This + // c) This is a proposer but the term has changed. This can happen if a leader re-election happens between + // saving the req and proposing it to raft. + // d) This is an indirect data and we received raft entry append from leader and localized the journal entry. + // e) This is an indirect data and we received only on data channel, but no raft entry append from leader. This // would mean _prepare is never called but directly finish is called. This can happen if that the leader is not // the original proposer (perhaps unsupported scenario at this time) // - // On case a), b), we return the rreq as is. For case c), we just need to localize the actual server_id as well (as - // finishing step). For case d), we prepare the localization of journal entry and then finish them + // In cases a) and b), we return the rreq as is. + // For case c), we localize the actual term and then finish it as proposer. + // For case d), we just need to localize the actual server_id as well (as finishing step).
+ // For case e), we prepare the localization of the journal entry and then finish it // // // If we are not able to locate that req in the map for this entry, it means that no entry from raft leader is From 75b6b54c23ea10124f45c608e6b151ca479446f7 Mon Sep 17 00:00:00 2001 From: ywz <649521587@qq.com> Date: Wed, 11 Jun 2025 18:55:22 +0800 Subject: [PATCH 137/170] remove rreq after listener commit && handle gc free blk error according to error code (#744) Co-authored-by: yawzhang --- conanfile.py | 2 +- .../replication/repl_dev/raft_repl_dev.cpp | 29 ++++++++++++++----- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/conanfile.py b/conanfile.py index b68276733..59d05a89e 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.16.3" + version = "6.16.4" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 59e243254..0f3bddc94 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1309,11 +1309,6 @@ void RaftReplDev::handle_rollback(repl_req_ptr_t rreq) { void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { if (!rreq->has_state(repl_req_state_t::DATA_COMMITTED)) { commit_blk(rreq); } - // Remove the request from repl_key map. - m_repl_key_req_map.erase(rreq->rkey()); - // Remove the request from lsn map. - m_state_machine->unlink_lsn_to_req(rreq->lsn(), rreq); - auto cur_dsn = m_next_dsn.load(std::memory_order_relaxed); while (cur_dsn <= rreq->dsn()) { m_next_dsn.compare_exchange_strong(cur_dsn, rreq->dsn() + 1); @@ -1337,6 +1332,16 @@ rreq->lsn(), prev_lsn); } + // Remove the request from repl_key map only after the listener operation is completed. + // This prevents unnecessary block allocation in the following scenario: + // 1. The follower processes a commit for LSN 100 and removes the rreq from the repl_key map before the listener commit + // 2. The follower receives a duplicate append request from the leader and attempts to localize it in the 'raft_event' step + // 3. Since the old rreq has been removed, the follower allocates new blks for LSN 100, resulting in unnecessary garbage + // By deferring the removal of the request until after the listener's commit, the listener can recognize that + // data already exists for duplicated requests, preventing the unnecessary allocation described in step 3. + m_repl_key_req_map.erase(rreq->rkey()); + // Remove the request from lsn map.
+ m_state_machine->unlink_lsn_to_req(rreq->lsn(), rreq); if (!rreq->is_proposer()) rreq->clear(); } @@ -1993,9 +1998,17 @@ void RaftReplDev::gc_repl_reqs() { if (removing_rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { auto blkid = removing_rreq->local_blkid(); data_service().async_free_blk(blkid).thenValue([this, blkid, removing_rreq](auto&& err) { - HS_LOG_ASSERT(!err, "freeing blkid={} upon error failed, potential to cause blk leak", - blkid.to_string()); - RD_LOGD(removing_rreq->traceID(), "GC rreq: Releasing blkid={} freed successfully", blkid.to_string()); + if (!err) { + RD_LOGD(removing_rreq->traceID(), "GC rreq: Releasing blkid={} freed successfully", blkid.to_string()); + } else if (err == std::make_error_code(std::errc::operation_canceled)) { + // The gc reaper thread stops after the data service has been stopped, + // leading to a scenario where it attempts to free the blkid while the data service is inactive. + // In this case, we ignore the error and simply log a warning. + RD_LOGW(removing_rreq->traceID(), "GC rreq: Releasing blkid={} canceled", blkid.to_string()); + } else { + HS_LOG_ASSERT(false, "[traceID={}] freeing blkid={} upon error failed, potential to cause blk leak", + removing_rreq->traceID(), blkid.to_string()); + } }); } // 2. remove from the m_repl_key_req_map From c752d357548881a6aa1a47e2e9b6c61406392909 Mon Sep 17 00:00:00 2001 From: yuwmao <148639999+yuwmao@users.noreply.github.com> Date: Tue, 17 Jun 2025 08:02:57 +0800 Subject: [PATCH 138/170] Add a get_replace_member_status API for upper layer to query status (#746) * Add a get_replace_member_status API for upper layer to query status - get_replace_member_status should always be called on the leader. - Add a task_id in replace member implementation to track the status.
* Pass task_id to ReplDevListener * Inherit priority in replace_member * Upgrade nuraft version and remove set priority logic --- conanfile.py | 4 +- .../homestore/replication/repl_decls.h | 9 + src/include/homestore/replication/repl_dev.h | 4 +- src/include/homestore/replication_service.hpp | 17 +- .../replication/repl_dev/raft_repl_dev.cpp | 345 ++++++++++++------ src/lib/replication/repl_dev/raft_repl_dev.h | 18 +- .../replication/service/generic_repl_svc.cpp | 10 +- .../replication/service/generic_repl_svc.h | 7 +- .../replication/service/raft_repl_service.cpp | 22 +- .../replication/service/raft_repl_service.h | 10 +- src/tests/test_common/raft_repl_test_base.hpp | 36 +- src/tests/test_raft_repl_dev_dynamic.cpp | 112 ++++-- src/tests/test_solo_repl_dev.cpp | 4 +- 13 files changed, 428 insertions(+), 170 deletions(-) diff --git a/conanfile.py b/conanfile.py index 59d05a89e..13933aca1 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.16.4" + version = "6.17.0" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" @@ -54,7 +54,7 @@ def build_requirements(self): def requirements(self): self.requires("iomgr/[^11.3]@oss/master", transitive_headers=True) self.requires("sisl/[^12.2]@oss/master", transitive_headers=True) - self.requires("nuraft_mesg/[~3.8.0]@oss/main", transitive_headers=True) + self.requires("nuraft_mesg/[~3.8.4]@oss/main", transitive_headers=True) self.requires("farmhash/cci.20190513@", transitive_headers=True) if self.settings.arch in ['x86', 'x86_64']: diff --git a/src/include/homestore/replication/repl_decls.h b/src/include/homestore/replication/repl_decls.h index 6094d0ada..50b690d4e 100644 --- a/src/include/homestore/replication/repl_decls.h +++ b/src/include/homestore/replication/repl_decls.h @@ -38,7 +38,16 @@ VENUM(ReplServiceError, int32_t, DATA_DUPLICATED = -20002, QUIENCE_STATE = -20003, QUORUM_NOT_MET = -20004, + REPLACE_MEMBER_TASK_MISMATCH = -20005, FAILED = -32768); + +VENUM(ReplaceMemberStatus, int32_t, + COMPLETED = 0, + IN_PROGRESS = 1, + NOT_LEADER = 2, + TASK_ID_MISMATCH = 3, + TASK_NOT_FOUND = 4, + UNKNOWN = 5); // clang-format on template < typename V, typename E > diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index eedbebc44..934ba0354 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -369,11 +369,11 @@ class ReplDevListener { virtual void on_destroy(const group_id_t& group_id) = 0; /// @brief Called when start replace member. - virtual void on_start_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, + virtual void on_start_replace_member(const uuid_t& task_id, const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) = 0; /// @brief Called when complete replace member. 
- virtual void on_complete_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, + virtual void on_complete_replace_member(const uuid_t& task_id, const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) = 0; /// @brief Called when the snapshot is being created by nuraft diff --git a/src/include/homestore/replication_service.hpp b/src/include/homestore/replication_service.hpp index f28704546..2adcc3584 100644 --- a/src/include/homestore/replication_service.hpp +++ b/src/include/homestore/replication_service.hpp @@ -43,16 +43,31 @@ class ReplicationService { /// @brief Replace one of the members with a new one. /// @param group_id Group where the replace member happens + /// @param task_id Id of the task which is going to be used for this operation. This is used to track the replace member. /// @param member_out The member which is going to be replaced /// @param member_in The member which is going to be added in place of member_out /// @param commit_quorum Commit quorum to be used for this operation. If 0, it will use the default commit quorum. /// @return A Future on replace the member accepted or Future ReplServiceError upon error - virtual AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, + virtual AsyncReplResult<> replace_member(group_id_t group_id, uuid_t task_id, const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum = 0, uint64_t trace_id = 0) const = 0; virtual AsyncReplResult<> flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, uint32_t commit_quorum, bool wait_and_verify = true, uint64_t trace_id = 0) const = 0; + + /// @brief Get status of member replacement. + /// @param group_id Group where the replace member happens + /// @param task_id Id of the replace member task. This is used to track the replace member operation. + /// @param member_out The member which is going to be replaced + /// @param member_in The member which is going to be added in place of member_out + /// @param others Other members excluding member_out, member_in + /// @return ReplaceMemberStatus + virtual ReplaceMemberStatus get_replace_member_status(group_id_t group_id, uuid_t task_id, + const replica_member_info& member_out, + const replica_member_info& member_in, + const std::vector< replica_member_info >& others, + uint64_t trace_id = 0) const = 0; + /// @brief Get the repl dev for a given group id if it is already created or opened /// @param group_id Group id interested in /// @return ReplDev is opened or ReplServiceError::SERVER_NOT_FOUND if it doesn't exist diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 0f3bddc94..99a4ad44b 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -138,7 +138,7 @@ bool RaftReplDev::join_group() { } // All the steps in the implementation should be idempotent and retryable.
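// In brief: Step 1 validates the task and quorum, Step 2 flips the outgoing member to a learner, Step 3 proposes the HS_CTRL_START_REPLACE log entry, and Step 4 proposes the config change that adds the new member.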
-AsyncReplResult<> RaftReplDev::start_replace_member(const replica_member_info& member_out, +AsyncReplResult<> RaftReplDev::start_replace_member(uuid_t task_id, const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum, uint64_t trace_id) { if (is_stopping()) { @@ -147,34 +147,37 @@ incr_pending_request_num(); - RD_LOGI(trace_id, "Start replace member, member_out={} member_in={}", boost::uuids::to_string(member_out.id), - boost::uuids::to_string(member_in.id)); + RD_LOGI(trace_id, "Start replace member, task_id={}, member_out={} member_in={}", boost::uuids::to_string(task_id), + boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); - if (commit_quorum >= 1) { - // Two members are down and leader cant form the quorum. Reduce the quorum size. - reset_quorum_size(commit_quorum, trace_id); - } - // Step1, validate request + // Step1, validate request + // TODO: support rollback. This could happen when the first task failed and we want to launch a new task to + // remediate it; the first task would then need to be rolled back. For the same task, the request is reentrant and idempotent. + if (!m_rd_sb->replace_member_task.task_id.is_nil() && m_rd_sb->replace_member_task.task_id != task_id) { + RD_LOGE(trace_id, "Step1. Replace member, task_id={} is not the same as existing task_id={}", + boost::uuids::to_string(task_id), boost::uuids::to_string(m_rd_sb->replace_member_task.task_id)); + decr_pending_request_num(); + return make_async_error<>(ReplServiceError::REPLACE_MEMBER_TASK_MISMATCH); } auto out_srv_cfg = raft_server()->get_config()->get_server(nuraft_mesg::to_server_id(member_out.id)); if (!out_srv_cfg) { auto in_srv_cfg = raft_server()->get_config()->get_server(nuraft_mesg::to_server_id(member_in.id)); if (in_srv_cfg) { - RD_LOGI( - trace_id, - "Step1. Replace member, the intent has already been fulfilled, ignore it, member_out={} member_in={}", - boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); - reset_quorum_size(0, trace_id); + RD_LOGI(trace_id, + "Step1. Replace member, the intent has already been fulfilled, ignore it, task_id={}, " + "member_out={} member_in={}", + boost::uuids::to_string(task_id), boost::uuids::to_string(member_out.id), + boost::uuids::to_string(member_in.id)); decr_pending_request_num(); return make_async_success<>(); } - RD_LOGE(trace_id, "Step1. Replace member invalid parameter, out member is not found"); - reset_quorum_size(0, trace_id); + RD_LOGE(trace_id, "Step1. Replace member invalid parameter, out member is not found, task_id={}", + boost::uuids::to_string(task_id)); decr_pending_request_num(); return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); } if (m_my_repl_id != get_leader_id()) { - reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_error<>(ReplServiceError::NOT_LEADER); } @@ -184,8 +187,8 @@ // until the successor finishes the catch-up of the latest log, and then resign. Return NOT_LEADER and let // client retry. raft_server()->yield_leadership(false /* immediate */, -1 /* successor */); - RD_LOGI(trace_id, "Step1. Replace member, leader is the member_out so yield leadership"); - reset_quorum_size(0, trace_id); + RD_LOGI(trace_id, "Step1. 
Replace member, leader is the member_out so yield leadership, task_id={}", + boost::uuids::to_string(task_id)); decr_pending_request_num(); return make_async_error<>(ReplServiceError::NOT_LEADER); } @@ -199,20 +202,26 @@ AsyncReplResult<> RaftReplDev::start_replace_member(const replica_member_info& m } RD_LOGD(trace_id, "Step1. Replace member, quorum safety check, active_peers={}, active_peers_exclude_out/in_member={}, " - "commit_quorum={}", - active_peers.size(), active_num, commit_quorum); + "commit_quorum={}, task_id={}", + active_peers.size(), active_num, commit_quorum, boost::uuids::to_string(task_id)); // commit_quorum=0 means actual commit quorum is the majority. In this case, active normal member count should be // >= majority. To be more specific, if we have S1(leader), S2, S3(out), S4(in), we don't allow // replace_member(S3, S4) if S2 is down or laggy. Needs to recover S2 first or retry with commit_quorum=1. auto quorum = get_quorum_for_commit(); if (active_num < quorum && commit_quorum == 0) { - RD_LOGE(trace_id, "Step1. Replace member, quorum safety check failed, active_peers={}, active_peers_exclude_out/in_member={}, required_quorum={}, commit_quorum={}", - active_peers.size(), active_num, quorum, commit_quorum); - reset_quorum_size(0, trace_id); + RD_LOGE(trace_id, + "Step1. Replace member, quorum safety check failed, active_peers={}, " + "active_peers_exclude_out/in_member={}, required_quorum={}, commit_quorum={}, task_id={}", + active_peers.size(), active_num, quorum, commit_quorum, boost::uuids::to_string(task_id)); decr_pending_request_num(); return make_async_error<>(ReplServiceError::QUORUM_NOT_MET); } + if (commit_quorum >= 1) { + // Two members are down and leader cant form the quorum. Reduce the quorum size. + reset_quorum_size(commit_quorum, trace_id); + } + // Step 2: Handle out member. #ifdef _PRERELEASE if (iomgr_flip::instance()->test_flip("replace_member_set_learner_failure")) { @@ -220,25 +229,29 @@ AsyncReplResult<> RaftReplDev::start_replace_member(const replica_member_info& m return make_async_error(ReplServiceError::FAILED); } #endif - RD_LOGI(trace_id, "Step2. Replace member, flip out member to learner"); + RD_LOGI(trace_id, "Step2. Replace member, flip out member to learner, task_id={}", + boost::uuids::to_string(task_id)); auto learner_ret = do_flip_learner(member_out, true, true, trace_id); if (learner_ret != ReplServiceError::OK) { - RD_LOGE(trace_id, "Step2. Replace member, failed to flip out member to learner {}", learner_ret); + RD_LOGE(trace_id, "Step2. Replace member, failed to flip out member to learner {}, task_id={}", learner_ret, + boost::uuids::to_string(task_id)); reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_error(std::move(learner_ret)); } - RD_LOGI(trace_id, "Step2. Replace member, flip out member to learner and set priority to 0"); + RD_LOGI(trace_id, "Step2. Replace member, flip out member to learner and set priority to 0, task_id={}", + boost::uuids::to_string(task_id)); // Step 3. Append log entry to mark the old member is out and new member is added. - RD_LOGI(trace_id, "Step3. Replace member, propose to raft for HS_CTRL_START_REPLACE req, group_id={}", - group_id_str()); + RD_LOGI(trace_id, "Step3. 
Replace member, propose to raft for HS_CTRL_START_REPLACE req, group_id={}, task_id={}", + boost::uuids::to_string(task_id), group_id_str()); auto rreq = repl_req_ptr_t(new repl_req_ctx{}); - replace_member_ctx members; - members.replica_out = member_out; - members.replica_in = member_in; + replace_member_ctx ctx; + ctx.task_id = task_id; + ctx.replica_out = member_out; + ctx.replica_in = member_in; - sisl::blob header(r_cast< uint8_t* >(&members), sizeof(replace_member_ctx)); + sisl::blob header(r_cast< uint8_t* >(&ctx), sizeof(replace_member_ctx)); rreq->init(repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1), @@ -247,7 +260,9 @@ AsyncReplResult<> RaftReplDev::start_replace_member(const replica_member_info& m auto err = m_state_machine->propose_to_raft(std::move(rreq)); if (err != ReplServiceError::OK) { - RD_LOGE(trace_id, "Step3. Replace member, propose to raft for HS_CTRL_START_REPLACE req failed {}", err); + RD_LOGE(trace_id, + "Step3. Replace member, propose to raft for HS_CTRL_START_REPLACE req failed, task_id={}, err={}", + boost::uuids::to_string(task_id), err); reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_error<>(std::move(err)); @@ -260,21 +275,26 @@ AsyncReplResult<> RaftReplDev::start_replace_member(const replica_member_info& m return make_async_error(ReplServiceError::FAILED); } #endif - RD_LOGI(trace_id, "Step4. Replace member, propose to raft to add new member, group_id={}", group_id_str()); - auto ret = do_add_member(member_in, trace_id); + RD_LOGI(trace_id, "Step4. Replace member, propose to raft to add new member, group_id={}, task_id={}", + group_id_str(), boost::uuids::to_string(task_id)); + replica_member_info member_to_add = member_in; + member_to_add.priority = out_srv_cfg.get()->get_priority(); + auto ret = do_add_member(member_to_add, trace_id); if (ret != ReplServiceError::OK) { - RD_LOGE(trace_id, "Step4. Replace member, add member failed {}", ret); + RD_LOGE(trace_id, "Step4. Replace member, add member failed, err={}, task_id={}", ret, + boost::uuids::to_string(task_id)); reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_error<>(std::move(ret)); } - RD_LOGI(trace_id, "Step4. Replace member, proposed to raft to add member, member={}", boost::uuids::to_string(member_in.id)); + RD_LOGI(trace_id, "Step4. Replace member, proposed to raft to add member, task_id={}, member={}", + boost::uuids::to_string(task_id), boost::uuids::to_string(member_in.id)); reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_success<>(); } -AsyncReplResult<> RaftReplDev::complete_replace_member(const replica_member_info& member_out, +AsyncReplResult<> RaftReplDev::complete_replace_member(uuid_t task_id, const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum, uint64_t trace_id) { if (is_stopping()) { @@ -283,8 +303,9 @@ AsyncReplResult<> RaftReplDev::complete_replace_member(const replica_member_info } incr_pending_request_num(); - RD_LOGI(trace_id, "Complete replace member, member={}", boost::uuids::to_string(member_out.id), - boost::uuids::to_string(member_out.id)); + RD_LOGI(trace_id, "Complete replace member, task_id={}, member_out={}, member_in={}", + boost::uuids::to_string(task_id), boost::uuids::to_string(member_out.id), + boost::uuids::to_string(member_in.id)); if (commit_quorum >= 1) { // Two members are down and leader cant form the quorum. Reduce the quorum size. 
@@ -292,7 +313,8 @@ AsyncReplResult<> RaftReplDev::complete_replace_member(const replica_member_info } // Step 5: Remove member - RD_LOGI(trace_id, "Step5. Replace member, remove old member, member={}", boost::uuids::to_string(member_out.id)); + RD_LOGI(trace_id, "Step5. Replace member, remove old member, task_id={}, member={}", + boost::uuids::to_string(task_id), boost::uuids::to_string(member_out.id)); #ifdef _PRERELEASE if (iomgr_flip::instance()->test_flip("replace_member_remove_member_failure")) { RD_LOGE(trace_id, "Simulating remove member failure"); @@ -301,14 +323,14 @@ AsyncReplResult<> RaftReplDev::complete_replace_member(const replica_member_info #endif auto ret = do_remove_member(member_out, trace_id); if (ret != ReplServiceError::OK) { - RD_LOGE(trace_id, "Step5. Replace member, failed to remove member, member={}, err={}", - boost::uuids::to_string(member_out.id), ret); + RD_LOGE(trace_id, "Step5. Replace member, failed to remove member, task_id={}, member={}, err={}", + boost::uuids::to_string(task_id), boost::uuids::to_string(member_out.id), ret); reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_error<>(std::move(ret)); } - RD_LOGI(trace_id, "Step5. Replace member, proposed to raft to remove member, member={}", - boost::uuids::to_string(member_out.id)); + RD_LOGI(trace_id, "Step5. Replace member, proposed to raft to remove member, task_id={}, member={}", + boost::uuids::to_string(task_id), boost::uuids::to_string(member_out.id)); auto timeout = HS_DYNAMIC_CONFIG(consensus.wait_for_config_change_ms); // TODO Move wait logic to nuraft_mesg if (!wait_and_check( @@ -327,20 +349,22 @@ AsyncReplResult<> RaftReplDev::complete_replace_member(const replica_member_info timeout); // If the member_out is down, leader will force remove it after // leave_timeout=leave_limit_(default=5)*heart_beat_interval_, it's better for client to retry it. - return make_async_error<>(ReplServiceError::CANCELLED); + return make_async_error<>(ReplServiceError::RETRY_REQUEST); } - RD_LOGD(trace_id, "Step5. Replace member, old member is removed, member={}", - boost::uuids::to_string(member_out.id)); + RD_LOGD(trace_id, "Step5. Replace member, old member is removed, task_id={}, member={}", + boost::uuids::to_string(task_id), boost::uuids::to_string(member_out.id)); // Step 6. Append log entry to complete replace member - RD_LOGI(trace_id, "Step6. Replace member, propose to raft for HS_CTRL_COMPLETE_REPLACE req, group_id={}", - group_id_str()); + RD_LOGI(trace_id, + "Step6. Replace member, propose to raft for HS_CTRL_COMPLETE_REPLACE req, group_id={}, task_id={}", + group_id_str(), boost::uuids::to_string(task_id)); auto rreq = repl_req_ptr_t(new repl_req_ctx{}); - replace_member_ctx members; - members.replica_out = member_out; - members.replica_in = member_in; + replace_member_ctx ctx; + ctx.task_id = task_id; + ctx.replica_out = member_out; + ctx.replica_in = member_in; - sisl::blob header(r_cast< uint8_t* >(&members), sizeof(replace_member_ctx)); + sisl::blob header(r_cast< uint8_t* >(&ctx), sizeof(replace_member_ctx)); rreq->init(repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1), @@ -349,8 +373,9 @@ AsyncReplResult<> RaftReplDev::complete_replace_member(const replica_member_info auto err = m_state_machine->propose_to_raft(std::move(rreq)); if (err != ReplServiceError::OK) { - RD_LOGE(trace_id, "Step6. Replace member, propose to raft for HS_CTRL_COMPLETE_REPLACE req failed , err={}", - err); + RD_LOGE(trace_id, + "Step6. 
Replace member, propose to raft for HS_CTRL_COMPLETE_REPLACE req failed, task_id={}, err={}", + boost::uuids::to_string(task_id), err); reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_error<>(std::move(err)); @@ -358,11 +383,92 @@ AsyncReplResult<> RaftReplDev::complete_replace_member(const replica_member_info reset_quorum_size(0, trace_id); decr_pending_request_num(); - RD_LOGI(trace_id, "Complete replace member done, group_id={}, member_out={} member_in={}", group_id_str(), - boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); + RD_LOGI(trace_id, "Complete replace member done, group_id={}, task_id={}, member_out={} member_in={}", + group_id_str(), boost::uuids::to_string(task_id), boost::uuids::to_string(member_out.id), + boost::uuids::to_string(member_in.id)); return make_async_success<>(); } +ReplaceMemberStatus RaftReplDev::get_replace_member_status(uuid_t task_id, const replica_member_info& member_out, + const replica_member_info& member_in, + const std::vector< replica_member_info >& others, + uint64_t trace_id) { + if (is_stopping()) { + RD_LOGI(trace_id, "repl dev is being shutdown!"); + return ReplaceMemberStatus::UNKNOWN; + } + incr_pending_request_num(); + + if (!m_repl_svc_ctx || !is_leader()) { + decr_pending_request_num(); + return ReplaceMemberStatus::NOT_LEADER; + } + + auto peers = get_replication_status(); + peer_info out_peer_info; + bool found_out = false; + bool found_in = false; + for (const auto& p : peers) { + if (p.id_ == member_out.id) { + out_peer_info = p; + found_out = true; + } else if (p.id_ == member_in.id) { + found_in = true; + } + } + + bool intent_completed = !found_out && found_in; + if (m_rd_sb->replace_member_task.task_id.is_nil()) { + if (intent_completed) { + // If the caller doesn't provide others, skip this check. + bool others_match = others.size() == 0 || others.size() + 1 == peers.size(); + auto detail = std::string{}; + for (const auto& other : others) { + if (!raft_server()->get_srv_config(nuraft_mesg::to_server_id(other.id))) { + others_match = false; + detail = fmt::format("member {} is not found in raft group", boost::uuids::to_string(other.id)); + break; + } + } + if (!others_match) { + RD_LOGE(trace_id, + "get_replace_member_status failed, other membership mismatch, task_id={}, detail={}, " + "others.size={}, " + "all_peers.size={}", + boost::uuids::to_string(task_id), detail, others.size(), peers.size()); + decr_pending_request_num(); + return ReplaceMemberStatus::UNKNOWN; + } + decr_pending_request_num(); + return ReplaceMemberStatus::COMPLETED; + } + decr_pending_request_num(); + return ReplaceMemberStatus::TASK_NOT_FOUND; + } + if (m_rd_sb->replace_member_task.task_id != task_id) { + RD_LOGE(trace_id, "get_replace_member_status failed, task_id mismatch, persisted={}, received={}", + boost::uuids::to_string(m_rd_sb->replace_member_task.task_id), boost::uuids::to_string(task_id)); + decr_pending_request_num(); + return ReplaceMemberStatus::TASK_ID_MISMATCH; + } + // If the first attempt to remove out_member fails because out_member is down, or the leader crashes between + // Step5 (remove member) and Step6 (HS_CTRL_COMPLETE_REPLACE mesg), the replace member intent might already be + // fulfilled while the replace_member_task sb still exists. In this case, we honor the task sb, return IN_PROGRESS, + // and wait for the reaper thread to trigger complete_replace_member again to clean up the sb. 
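+    // Annotation (not part of the original diff): the full status mapping implemented by this function is:
+    //   task sb empty,  intent fulfilled, others consistent -> COMPLETED
+    //   task sb empty,  intent fulfilled, others mismatch   -> UNKNOWN
+    //   task sb empty,  intent not fulfilled                -> TASK_NOT_FOUND
+    //   task sb holding a different task_id                 -> TASK_ID_MISMATCH
+    //   task sb matching this task_id                       -> IN_PROGRESS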
+ if (intent_completed) { + RD_LOGI(trace_id, + "Member replacement fulfilled, but the task still exists; waiting for the reaper thread to retry " + "complete_replace_member. task_id={}, out_member={}, in_member={}", + boost::uuids::to_string(m_rd_sb->replace_member_task.task_id), boost::uuids::to_string(member_out.id), + boost::uuids::to_string(member_in.id)); + } + RD_LOGD(trace_id, "Member replacement is in progress. task_id={}, out_member={}, in_member={}", + boost::uuids::to_string(task_id), boost::uuids::to_string(member_out.id), + boost::uuids::to_string(member_in.id)); + decr_pending_request_num(); + return ReplaceMemberStatus::IN_PROGRESS; +} + ReplServiceError RaftReplDev::do_add_member(const replica_member_info& member, uint64_t trace_id) { if (m_my_repl_id != get_leader_id()) { RD_LOGI(trace_id, "Member to add failed, not leader"); @@ -370,20 +476,27 @@ ReplServiceError RaftReplDev::do_add_member(const replica_member_info& member, u } auto ret = retry_when_config_changing( [&] { - auto rem_ret = m_msg_mgr.add_member(m_group_id, member.id) + auto srv_config = nuraft::srv_config(nuraft_mesg::to_server_id(member.id), 0, + boost::uuids::to_string(member.id), "", false, member.priority); + auto add_ret = m_msg_mgr.add_member(m_group_id, srv_config) .via(&folly::InlineExecutor::instance()) .thenValue([this, member, trace_id](auto&& e) -> nuraft::cmd_result_code { return e.hasError() ? e.error() : nuraft::cmd_result_code::OK; }); - return rem_ret.value(); + return add_ret.value(); }, trace_id); if (ret == nuraft::cmd_result_code::SERVER_ALREADY_EXISTS) { RD_LOGW(trace_id, "Ignoring error returned from nuraft add_member, member={}, err={}", boost::uuids::to_string(member.id), ret); + } else if (ret == nuraft::cmd_result_code::CANCELLED) { + // nuraft mesg will return cancelled if the change is not committed after waiting for + // raft_leader_change_timeout_ms (default 3200). + RD_LOGE(trace_id, "Add member failed, member={}, err={}", boost::uuids::to_string(member.id), ret); + return ReplServiceError::CANCELLED; } else if (ret != nuraft::cmd_result_code::OK) { - // Its ok to retry this request as the request - // of replace member is idempotent. + // It's ok to retry this request as the + // replace member request is idempotent. RD_LOGE(trace_id, "Add member failed, member={}, err={}", boost::uuids::to_string(member.id), ret); return ReplServiceError::RETRY_REQUEST; }
And in turn, we need to revert prioiry change if the member is going to become a normal member. - // FIXME after nuraft fixes the bug, we can remove this logic. - auto priority = target ? 0 : member.priority; - RD_LOGI(trace_id, "Set the priority of the member to {}, member={}", priority, boost::uuids::to_string(member.id)); - if (srv_cfg->get_priority() != priority) { - auto priority_ret = set_priority(member.id, priority); - if (priority_ret != ReplServiceError::OK) { return ReplServiceError::NOT_LEADER; } - } else { - RD_LOGD(trace_id, "Priority has already been set to {}, skip, member={}", priority, - boost::uuids::to_string(member.id)); - } - - // 4. Verification + // 3. Verification if (wait_and_verify) { auto timeout = HS_DYNAMIC_CONFIG(consensus.wait_for_config_change_ms); if (!wait_and_check( [&]() { auto srv_conf = raft_server()->get_srv_config(nuraft_mesg::to_server_id(member.id)); - return srv_conf->is_learner() && srv_conf->get_priority() == 0; + return srv_conf->is_learner(); }, timeout)) { - RD_LOGD(trace_id, "Wait for learner and priority config change timed out, cancel the request, timeout: {}", + RD_LOGD(trace_id, "Wait for flipping learner timed out, please retry, timeout: {}", timeout); - return ReplServiceError::CANCELLED; + return ReplServiceError::RETRY_REQUEST; } } @@ -519,7 +611,7 @@ ReplServiceError RaftReplDev::do_flip_learner(const replica_member_info& member, } nuraft::cmd_result_code RaftReplDev::retry_when_config_changing(const std::function< nuraft::cmd_result_code() >& func, - uint64_t trace_id) { + uint64_t trace_id) { auto ret = nuraft::cmd_result_code::OK; int32_t retries = HS_DYNAMIC_CONFIG(consensus.config_changing_error_retries); for (auto i = 0; i < retries; i++) { @@ -882,7 +974,8 @@ repl_req_ptr_t RaftReplDev::applier_create_req(repl_key const& rkey, journal_typ } // rreq->init will allocate the block if it has linked data. 
- auto status = init_req_ctx(rreq, rkey, code, m_raft_server_id == rkey.server_id, user_header, key, data_size, m_listener); + auto status = + init_req_ctx(rreq, rkey, code, m_raft_server_id == rkey.server_id, user_header, key, data_size, m_listener); if (status != ReplServiceError::OK) { RD_LOGD(rkey.traceID, "For Repl_key=[{}] alloc hints returned error={}, failing this req", rkey.to_string(), @@ -1418,32 +1511,40 @@ void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) } void RaftReplDev::start_replace_member(repl_req_ptr_t rreq) { - auto members = r_cast< const replace_member_ctx* >(rreq->header().cbytes()); + auto ctx = r_cast< const replace_member_ctx* >(rreq->header().cbytes()); - RD_LOGI(rreq->traceID(), "Raft repl start_replace_member commit member_out={} member_in={}", - boost::uuids::to_string(members->replica_out.id), boost::uuids::to_string(members->replica_in.id)); + RD_LOGI(rreq->traceID(), "Raft repl start_replace_member commit, task_id={} member_out={} member_in={}", + boost::uuids::to_string(ctx->task_id), boost::uuids::to_string(ctx->replica_out.id), + boost::uuids::to_string(ctx->replica_in.id)); - m_listener->on_start_replace_member(members->replica_out, members->replica_in, rreq->traceID()); + m_listener->on_start_replace_member(ctx->task_id, ctx->replica_out, ctx->replica_in, rreq->traceID()); // record the replace_member intent std::unique_lock lg{m_sb_mtx}; - m_rd_sb->replace_member_ctx.replica_in = members->replica_in.id; - m_rd_sb->replace_member_ctx.replica_out = members->replica_out.id; + m_rd_sb->replace_member_task.task_id = ctx->task_id; + m_rd_sb->replace_member_task.replica_in = ctx->replica_in.id; + m_rd_sb->replace_member_task.replica_out = ctx->replica_out.id; m_rd_sb.write(); } void RaftReplDev::complete_replace_member(repl_req_ptr_t rreq) { - auto members = r_cast< const replace_member_ctx* >(rreq->header().cbytes()); + auto ctx = r_cast< const replace_member_ctx* >(rreq->header().cbytes()); - RD_LOGI(rreq->traceID(), "Raft repl complete_replace_member commit member_out={} member_in={}", - boost::uuids::to_string(members->replica_out.id), boost::uuids::to_string(members->replica_in.id)); + RD_LOGI(rreq->traceID(), "Raft repl complete_replace_member commit, task_id={} member_out={} member_in={}", + boost::uuids::to_string(ctx->task_id), boost::uuids::to_string(ctx->replica_out.id), + boost::uuids::to_string(ctx->replica_in.id)); - m_listener->on_complete_replace_member(members->replica_out, members->replica_in, rreq->traceID()); + m_listener->on_complete_replace_member(ctx->task_id, ctx->replica_out, ctx->replica_in, rreq->traceID()); // clear the replace_member intent std::unique_lock lg{m_sb_mtx}; - m_rd_sb->replace_member_ctx = replace_member_ctx_superblk{}; - m_rd_sb.write(); - RD_LOGI(rreq->traceID(), "Raft repl replace_member_ctx has been cleared."); + if (!m_rd_sb->replace_member_task.task_id.is_nil()) { + RD_DBG_ASSERT(m_rd_sb->replace_member_task.task_id == ctx->task_id, + "Invalid task_id in complete_replace_member message, received {}, expected {}", ctx->task_id, + m_rd_sb->replace_member_task.task_id); + m_rd_sb->replace_member_task = replace_member_task_superblk{}; + m_rd_sb.write(); + } + RD_LOGI(rreq->traceID(), "Raft repl replace_member_task has been cleared."); } static bool blob_equals(sisl::blob const& a, sisl::blob const& b) { @@ -1521,7 +1622,7 @@ std::set< replica_id_t > RaftReplDev::get_active_peers() const { auto repl_status = get_replication_status(); std::set< replica_id_t > res; auto 
my_committed_idx = m_commit_upto_lsn.load(); - auto laggy=HS_DYNAMIC_CONFIG(consensus.laggy_threshold); + auto laggy = HS_DYNAMIC_CONFIG(consensus.laggy_threshold); uint64_t least_active_repl_idx = my_committed_idx > HS_DYNAMIC_CONFIG(consensus.laggy_threshold) ? my_committed_idx - HS_DYNAMIC_CONFIG(consensus.laggy_threshold) : 0; @@ -1836,21 +1937,22 @@ void RaftReplDev::flush_durable_commit_lsn() { m_rd_sb.write(); } -void RaftReplDev::check_replace_member_status() { +void RaftReplDev::monitor_replace_member_replication_status() { if (is_destroyed()) { RD_LOGI(NO_TRACE_ID, "Raft repl dev is destroyed, ignore check replace member status"); return; } if (!m_repl_svc_ctx || !is_leader()) { return; } - if (m_rd_sb->replace_member_ctx.replica_in == boost::uuids::nil_uuid() || - m_rd_sb->replace_member_ctx.replica_out == boost::uuids::nil_uuid()) { + if (m_rd_sb->replace_member_task.replica_in == boost::uuids::nil_uuid() || + m_rd_sb->replace_member_task.replica_out == boost::uuids::nil_uuid()) { RD_LOGT(NO_TRACE_ID, "No replace member in progress, return"); return; } auto peers = get_replication_status(); - auto replica_in = m_rd_sb->replace_member_ctx.replica_in; - auto replica_out = m_rd_sb->replace_member_ctx.replica_out; + auto task_id = m_rd_sb->replace_member_task.task_id; + auto replica_in = m_rd_sb->replace_member_task.replica_in; + auto replica_out = m_rd_sb->replace_member_task.replica_out; repl_lsn_t in_lsn = 0; repl_lsn_t out_lsn = 0; repl_lsn_t laggy = HS_DYNAMIC_CONFIG(consensus.laggy_threshold); @@ -1868,30 +1970,36 @@ void RaftReplDev::check_replace_member_status() { bool catch_up = in_lsn + laggy >= out_lsn; if (!catch_up) { - RD_LOGD(NO_TRACE_ID, "Checking replace member status, replica_in={} with lsn={}, replica_out={} with lsn={}", - boost::uuids::to_string(replica_in), in_lsn, boost::uuids::to_string(replica_out), out_lsn); + RD_LOGD(NO_TRACE_ID, + "Checking replace member status, task_id={}, replica_in={} with lsn={}, replica_out={} with lsn={}", + boost::uuids::to_string(task_id), boost::uuids::to_string(replica_in), in_lsn, + boost::uuids::to_string(replica_out), out_lsn); return; } RD_LOGD(NO_TRACE_ID, - "Checking replace member status, new member has caught up, replica_in={} with lsn={}, replica_out={} with " + "Checking replace member status, new member has caught up, task_id={}, replica_in={} with lsn={}, " + "replica_out={} with " "lsn={}", - boost::uuids::to_string(replica_in), in_lsn, boost::uuids::to_string(replica_out), out_lsn); + boost::uuids::to_string(task_id), boost::uuids::to_string(replica_in), in_lsn, + boost::uuids::to_string(replica_out), out_lsn); trace_id_t trace_id = generateRandomTraceId(); - RD_LOGD(trace_id, "Trigger complete_replace_member, replica_in={}, replica_out={}", - boost::uuids::to_string(replica_in), boost::uuids::to_string(replica_out)); + RD_LOGD(trace_id, "Trigger complete_replace_member, task_id={}, replica_in={}, replica_out={}", + boost::uuids::to_string(task_id), boost::uuids::to_string(replica_in), + boost::uuids::to_string(replica_out)); replica_member_info out{replica_out, ""}; replica_member_info in{replica_in, ""}; - auto ret = complete_replace_member(out, in, 0, trace_id).get(); + auto ret = complete_replace_member(task_id, out, in, 0, trace_id).get(); if (ret.hasError()) { - RD_LOGE(trace_id, "Failed to complete replace member, next time will retry it, error={}", ret.error()); + RD_LOGE(trace_id, "Failed to complete replace member, will retry next time, task_id={}, error={}", + 
boost::uuids::to_string(task_id), ret.error()); return; } - RD_LOGI(trace_id, "Complete replace member, replica_in={}, replica_out={}", - boost::uuids::to_string(replica_in), boost::uuids::to_string(replica_out)) + RD_LOGI(trace_id, "Complete replace member, task_id={}, replica_in={}, replica_out={}", + boost::uuids::to_string(task_id), boost::uuids::to_string(replica_in), boost::uuids::to_string(replica_out)); } /////////////////////////////////// Private methods //////////////////////////////////// @@ -1999,7 +2107,8 @@ void RaftReplDev::gc_repl_reqs() { auto blkid = removing_rreq->local_blkid(); data_service().async_free_blk(blkid).thenValue([this, blkid, removing_rreq](auto&& err) { if (!err) { - RD_LOGD(removing_rreq->traceID(), "GC rreq: Releasing blkid={} freed successfully", blkid.to_string()); + RD_LOGD(removing_rreq->traceID(), "GC rreq: Releasing blkid={} freed successfully", + blkid.to_string()); } else if (err == std::make_error_code(std::errc::operation_canceled)) { // The gc reaper thread stops after the data service has been stopped, // leading to a scenario where it attempts to free the blkid while the data service is inactive. diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 2f4d9c877..1280790f4 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -15,7 +15,8 @@ #include "replication/log_store/repl_log_store.h" namespace homestore { -struct replace_member_ctx_superblk { +struct replace_member_task_superblk { + uuid_t task_id; replica_id_t replica_out; replica_id_t replica_in; }; @@ -30,7 +31,7 @@ struct raft_repl_dev_superblk : public repl_dev_superblk { uint64_t last_applied_dsn; // Last applied data sequence number uint8_t destroy_pending; // Flag to indicate whether the group is in destroy pending state repl_lsn_t last_snapshot_lsn; // Last snapshot LSN follower received from leader - replace_member_ctx_superblk replace_member_ctx; // Replace members context, used to track the replace member status + replace_member_task_superblk replace_member_task; // Replace members task, used to track the replace member status uint32_t get_raft_sb_version() const { return raft_sb_version; } }; @@ -42,6 +43,7 @@ using raft_cluster_config_ptr_t = nuraft::ptr< nuraft::cluster_config >; ENUM(repl_dev_stage_t, uint8_t, INIT, ACTIVE, DESTROYING, DESTROYED, PERMANENT_DESTROYED); struct replace_member_ctx { + uuid_t task_id; replica_member_info replica_out; replica_member_info replica_in; }; @@ -229,11 +231,15 @@ class RaftReplDev : public ReplDev, bool bind_data_service(); bool join_group(); - AsyncReplResult<> start_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, + AsyncReplResult<> start_replace_member(uuid_t task_id, const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum = 0, uint64_t trace_id = 0); - AsyncReplResult<> complete_replace_member(const replica_member_info& member_out, + AsyncReplResult<> complete_replace_member(uuid_t task_id, const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum = 0, uint64_t trace_id = 0); + ReplaceMemberStatus get_replace_member_status(uuid_t task_id, const replica_member_info& member_out, + const replica_member_info& member_in, + const std::vector< replica_member_info >& others, + uint64_t trace_id = 0); AsyncReplResult<> flip_learner_flag(const replica_member_info& member, bool target, uint32_t 
commit_quorum, bool wait_and_verify = true, uint64_t trace_id = 0); ReplServiceError do_add_member(const replica_member_info& member, uint64_t trace_id = 0); @@ -373,9 +379,9 @@ class RaftReplDev : public ReplDev, void flush_durable_commit_lsn(); /** - * Check the replace_member status, if the new member is fully synced up and ready to take over, remove the old member. + * Monitor the replace_member replication status; once the new member is fully synced up and ready to take over, remove the old member. */ - void check_replace_member_status(); + void monitor_replace_member_replication_status(); /** * \brief This method is called during restart to notify the upper layer diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index 849c91729..a5f635e9b 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -193,7 +193,7 @@ void SoloReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki } } -AsyncReplResult<> SoloReplService::replace_member(group_id_t group_id, const replica_member_info& member_out, +AsyncReplResult<> SoloReplService::replace_member(group_id_t group_id, uuid_t task_id, const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum, uint64_t trace_id) const { return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); @@ -204,6 +204,14 @@ AsyncReplResult<> SoloReplService::flip_learner_flag(group_id_t group_id, const return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); } +ReplaceMemberStatus SoloReplService::get_replace_member_status(group_id_t group_id, uuid_t task_id, + const replica_member_info& member_out, + const replica_member_info& member_in, + const std::vector< replica_member_info >& others, + uint64_t trace_id) const { + return ReplaceMemberStatus::UNKNOWN; +} + std::unique_ptr< CPContext > SoloReplServiceCPHandler::on_switchover_cp(CP* cur_cp, CP* new_cp) { return std::make_unique< CPContext >(new_cp); } diff --git a/src/lib/replication/service/generic_repl_svc.h b/src/lib/replication/service/generic_repl_svc.h index d7f332d0c..fd4f3732c 100644 --- a/src/lib/replication/service/generic_repl_svc.h +++ b/src/lib/replication/service/generic_repl_svc.h @@ -89,12 +89,17 @@ class SoloReplService : public GenericReplService, std::set< replica_id_t > const& members) override; folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) override; void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; - AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, + AsyncReplResult<> replace_member(group_id_t group_id, uuid_t task_id, const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum = 0, uint64_t trace_id = 0) const override; AsyncReplResult<> flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, uint32_t commit_quorum, bool wait_and_verify = true, uint64_t trace_id = 0) const override; + ReplaceMemberStatus get_replace_member_status(group_id_t group_id, uuid_t task_id, + const replica_member_info& member_out, + const replica_member_info& member_in, + const std::vector< replica_member_info >& others, + uint64_t trace_id = 0) const override; }; class SoloReplServiceCPHandler : public CPCallbacks { diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index 75d1a766d..f994b1b14 100--- 
a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -482,7 +482,7 @@ void RaftReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki // This function only invokes replDev start_replace_member. A background reaper thread // periodically checks the member_in replication status; after in_member has caught up, // it will trigger replDev complete_replace_member. -AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, const replica_member_info& member_out, +AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, uuid_t task_id, const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum, uint64_t trace_id) const { if (is_stopping()) return make_async_error<>(ReplServiceError::STOPPING); @@ -494,7 +494,7 @@ AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, const rep } return std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value()) - ->start_replace_member(member_out, member_in, commit_quorum, trace_id) + ->start_replace_member(task_id, member_out, member_in, commit_quorum, trace_id) .via(&folly::InlineExecutor::instance()) .thenValue([this](auto&& e) mutable { if (e.hasError()) { @@ -528,6 +528,18 @@ AsyncReplResult<> RaftReplService::flip_learner_flag(group_id_t group_id, const }); } +// This query should always be called on the leader to avoid misleading results due to lagging status on some followers. +ReplaceMemberStatus RaftReplService::get_replace_member_status(group_id_t group_id, uuid_t task_id, + const replica_member_info& member_out, + const replica_member_info& member_in, + const std::vector< replica_member_info >& others, + uint64_t trace_id) const { + auto rdev_result = get_repl_dev(group_id); + if (!rdev_result) { return ReplaceMemberStatus::UNKNOWN; } + return std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value()) + ->get_replace_member_status(task_id, member_out, member_in, others, trace_id); +} + ////////////////////// Reaper Thread related ////////////////////////////////// void RaftReplService::start_reaper_thread() { folly::Promise< folly::Unit > p; @@ -562,7 +574,7 @@ void RaftReplService::start_reaper_thread() { // Check replace_member sync status to see if the new member is fully synced up and ready to remove the old member m_replace_member_sync_check_timer_hdl = iomanager.schedule_thread_timer( HS_DYNAMIC_CONFIG(consensus.replace_member_sync_check_interval_ms) * 1000 * 1000, true /* recurring */, - nullptr, [this](void*) { check_replace_member_status(); }); + nullptr, [this](void*) { monitor_replace_member_replication_status(); }); p.setValue(); @@ -660,11 +672,11 @@ void RaftReplService::flush_durable_commit_lsn() { } } -void RaftReplService::check_replace_member_status() { +void RaftReplService::monitor_replace_member_replication_status() { std::unique_lock lg(m_rd_map_mtx); for (auto& rdev_parent : m_rd_map) { auto rdev = std::dynamic_pointer_cast< RaftReplDev >(rdev_parent.second); - rdev->check_replace_member_status(); + rdev->monitor_replace_member_replication_status(); } } diff --git a/src/lib/replication/service/raft_repl_service.h b/src/lib/replication/service/raft_repl_service.h index 429ccb295..4108eaf35 100644 --- a/src/lib/replication/service/raft_repl_service.h +++ b/src/lib/replication/service/raft_repl_service.h @@ -79,7 +79,7 @@ class RaftReplService : public GenericReplService, std::set< replica_id_t > const& members) override; folly::SemiFuture< ReplServiceError > 
remove_repl_dev(group_id_t group_id) override; void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; - AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, + AsyncReplResult<> replace_member(group_id_t group_id, uuid_t task_id, const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum = 0, uint64_t trace_id = 0) const override; @@ -87,6 +87,12 @@ class RaftReplService : public GenericReplService, uint32_t commit_quorum, bool wait_and_verify = true, uint64_t trace_id = 0) const override; + ReplaceMemberStatus get_replace_member_status(group_id_t group_id, uuid_t task_id, + const replica_member_info& member_out, + const replica_member_info& member_in, + const std::vector< replica_member_info >& others, + uint64_t trace_id = 0) const override; + private: RaftReplDev* raft_group_config_found(sisl::byte_view const& buf, void* meta_cookie); void start_reaper_thread(); @@ -95,7 +101,7 @@ class RaftReplService : public GenericReplService, void gc_repl_devs(); void gc_repl_reqs(); void flush_durable_commit_lsn(); - void check_replace_member_status(); + void monitor_replace_member_replication_status(); void monitor_cert_changes(); void restart_raft_svc(const std::string filepath, const bool deleted); bool wait_for_cert(const std::string& filepath); diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 80eeb1573..0046bbd68 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -344,12 +344,14 @@ class TestReplicatedDB : public homestore::ReplDevListener { } return blk_alloc_hints{}; } - void on_start_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) override { + void on_start_replace_member(const uuid_t& task_id, const replica_member_info& member_out, const replica_member_info& member_in, + trace_id_t tid) override { LOGINFO("[Replica={}] start replace member out {} in {}", g_helper->replica_num(), boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); } - void on_complete_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) override { + void on_complete_replace_member(const uuid_t& task_id, const replica_member_info& member_out, const replica_member_info& member_in, + trace_id_t tid) override { LOGINFO("[Replica={}] complete replace member out {} in {}", g_helper->replica_num(), boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); } @@ -742,15 +744,17 @@ class RaftReplDevTestBase : public testing::Test { void create_snapshot() { dbs_[0]->create_snapshot(); } void truncate(int num_reserved_entries) { dbs_[0]->truncate(num_reserved_entries); } - void replace_member(std::shared_ptr< TestReplicatedDB > db, replica_id_t member_out, replica_id_t member_in, - uint32_t commit_quorum = 0, ReplServiceError error = ReplServiceError::OK) { - this->run_on_leader(db, [this, error, db, member_out, member_in, commit_quorum]() { - LOGINFO("Start replace member out={} in={}", boost::uuids::to_string(member_out), - boost::uuids::to_string(member_in)); + void replace_member(std::shared_ptr< TestReplicatedDB > db, uuid_t task_id, replica_id_t member_out, + replica_id_t member_in, uint32_t commit_quorum = 0, + ReplServiceError error = ReplServiceError::OK) { + this->run_on_leader(db, [this, error, db, task_id, member_out, member_in, 
commit_quorum]() { + LOGINFO("Start replace member task_id={}, out={}, in={}", boost::uuids::to_string(task_id), + boost::uuids::to_string(member_out), boost::uuids::to_string(member_in)); replica_member_info out{member_out, ""}; replica_member_info in{member_in, ""}; - auto result = hs()->repl_service().replace_member(db->repl_dev()->group_id(), out, in, commit_quorum).get(); + auto result = + hs()->repl_service().replace_member(db->repl_dev()->group_id(), task_id, out, in, commit_quorum).get(); if (error == ReplServiceError::OK) { ASSERT_EQ(result.hasError(), false) << "Error in replacing member, err=" << result.error(); } else { @@ -760,6 +764,22 @@ class RaftReplDevTestBase : public testing::Test { }); } + ReplaceMemberStatus check_replace_member_status(std::shared_ptr< TestReplicatedDB > db, uuid_t task_id, + replica_id_t member_out, replica_id_t member_in) { + LOGINFO("check replace member status, task_id={}, out={} in={}", boost::uuids::to_string(task_id), + boost::uuids::to_string(member_out), boost::uuids::to_string(member_in)); + + replica_member_info out{member_out, ""}; + replica_member_info in{member_in, ""}; + std::vector< replica_member_info > others; + for (auto m : g_helper->members_) { + if (m.first != member_out && m.first != member_in) { + others.emplace_back(replica_member_info{.id = m.first, .name = ""}); + } + } + return hs()->repl_service().get_replace_member_status(db->repl_dev()->group_id(), task_id, out, in, others); + } + protected: std::vector< std::shared_ptr< TestReplicatedDB > > dbs_; uint32_t written_entries_{0}; diff --git a/src/tests/test_raft_repl_dev_dynamic.cpp b/src/tests/test_raft_repl_dev_dynamic.cpp index 4ae56a9c3..bb56072bd 100644 --- a/src/tests/test_raft_repl_dev_dynamic.cpp +++ b/src/tests/test_raft_repl_dev_dynamic.cpp @@ -39,12 +39,17 @@ TEST_F(ReplDevDynamicTest, ReplaceMember) { uint32_t member_in = num_replicas; g_helper->sync_for_test_start(num_members); + auto task_id = boost::uuids::random_generator()(); + this->run_on_leader(db, [this, db, task_id, member_out, member_in] { + ASSERT_EQ( + check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)), + ReplaceMemberStatus::TASK_NOT_FOUND); + }); if (g_helper->replica_num() < num_replicas) { // With existing raft repl dev group, write IO's, validate and call replace_member on leader. 
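    // Annotation (not part of the diff; an assumption about the test harness): each replica runs this same test
    // body in its own process, g_helper->replica_num() selects the role a process plays, and the g_helper
    // sync_for_*() calls act as cross-process barriers that keep all replicas moving through the write/verify/
    // cleanup phases together.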
LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); this->write_on_leader(num_io_entries, true /* wait_for_commit */); - - replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + replace_member(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); std::this_thread::sleep_for(std::chrono::seconds(3)); } else if (g_helper->replica_num() == member_in) { LOGINFO("Wait for commits replica={}", g_helper->replica_num()); @@ -53,6 +58,14 @@ TEST_F(ReplDevDynamicTest, ReplaceMember) { g_helper->sync_for_verify_start(num_members); LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num()); + this->run_on_leader(db, [this, db, task_id, member_out, member_in] { + ASSERT_EQ( + check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)), + ReplaceMemberStatus::IN_PROGRESS); + auto new_task_id = boost::uuids::random_generator()(); + replace_member(db, new_task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 0, + ReplServiceError::REPLACE_MEMBER_TASK_MISMATCH); + }); if (is_replica_num_in({0, 1, member_in})) { // Skip the member which is going to be replaced. Validate data on all other replica's. LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num()); @@ -61,7 +74,7 @@ TEST_F(ReplDevDynamicTest, ReplaceMember) { g_helper->sync_for_verify_start(num_members); LOGINFO("data synced, sync_for_verify_state replica={} ", g_helper->replica_num()); - //wait for background reaper thread to trigger complete_replace_member + // wait for background reaper thread to trigger complete_replace_member if (g_helper->replica_num() == member_out) { // The out member will have the repl dev destroyed. auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); @@ -75,6 +88,11 @@ TEST_F(ReplDevDynamicTest, ReplaceMember) { } g_helper->sync_for_cleanup_start(num_members); + this->run_on_leader(db, [this, db, task_id, member_out, member_in] { + ASSERT_EQ( + check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)), + ReplaceMemberStatus::COMPLETED); + }); LOGINFO("ReplaceMember test done replica={}", g_helper->replica_num()); } @@ -109,11 +127,13 @@ TEST_F(ReplDevDynamicTest, TwoMemberDown) { LOGINFO("Shutdown replica 2"); } + auto task_id = boost::uuids::random_generator()(); if (g_helper->replica_num() == 0) { // Replace down replica 2 with spare replica 3 with commit quorum 1 // so that leader can go ahead with replacing member. 
- LOGINFO("Replace member started"); - replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 1 /* commit quorum*/); + LOGINFO("Replace member started, task_id={}", boost::uuids::to_string(task_id)); + replace_member(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in), + 1 /* commit quorum*/); this->write_on_leader(num_io_entries, true /* wait_for_commit */); LOGINFO("Leader completed num_io={}", num_io_entries); } @@ -128,6 +148,12 @@ TEST_F(ReplDevDynamicTest, TwoMemberDown) { LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num()); this->validate_data(); } + g_helper->sync_for_verify_start(num_members); + this->run_on_leader(db, [this, db, task_id, member_out, member_in] { + ASSERT_EQ( + check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)), + ReplaceMemberStatus::IN_PROGRESS); + }); if (g_helper->replica_num() == 1) { LOGINFO("Start replica 1"); @@ -167,12 +193,13 @@ TEST_F(ReplDevDynamicTest, OutMemberDown) { LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); this->write_on_leader(num_io_entries, true /* wait_for_commit */); } - //shut down before replace member + // shut down before replace member this->shutdown_replica(2); LOGINFO("Shutdown replica 2"); + auto task_id = boost::uuids::random_generator()(); if (g_helper->replica_num() == 0) { - replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + replace_member(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); std::this_thread::sleep_for(std::chrono::seconds(3)); } else if (g_helper->replica_num() == member_in) { LOGINFO("Wait for commits replica={}", g_helper->replica_num()); @@ -195,6 +222,11 @@ TEST_F(ReplDevDynamicTest, OutMemberDown) { // data synced, waiting for removing learner LOGINFO("data synced, sync for completing replace member, replica={}", g_helper->replica_num()); g_helper->sync_for_verify_start(num_members); + this->run_on_leader(db, [this, db, task_id, member_out, member_in] { + ASSERT_EQ( + check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)), + ReplaceMemberStatus::IN_PROGRESS); + }); // Since the out_member stopped, it cannot response to remove_srv req, as a result the first time will get CANCELLED // error, so waiting time is longer than other tests. if (g_helper->replica_num() == 2) { @@ -211,9 +243,26 @@ TEST_F(ReplDevDynamicTest, OutMemberDown) { LOGINFO("Repl dev destroyed on out member replica={}", g_helper->replica_num()); db->set_zombie(); } - + g_helper->sync_for_test_start(num_members); + if (g_helper->replica_num() != 2) { + this->run_on_leader(db, [this, db, task_id, member_out, member_in] { + auto status = check_replace_member_status(db, task_id, g_helper->replica_id(member_out), + g_helper->replica_id(member_in)); + // out_member is down, so it can not response to remove req. Based on nuraft logic, leader will wait for + // timeout and remove it automatically. Simulate next complete_replace_member retry. 
+ if (status == ReplaceMemberStatus::IN_PROGRESS) { + auto& raft_repl_svc = dynamic_cast< RaftReplService& >(hs()->repl_service()); + raft_repl_svc.monitor_replace_member_replication_status(); + LOGINFO("Simulate reaper thread to complete_replace_member"); + std::this_thread::sleep_for(std::chrono::seconds(1)); + } + ASSERT_EQ(check_replace_member_status(db, task_id, g_helper->replica_id(member_out), + g_helper->replica_id(member_in)), + ReplaceMemberStatus::COMPLETED); + }); + } g_helper->sync_for_cleanup_start(num_members); - LOGINFO("OneMemberDown test done replica={}", g_helper->replica_num()); + LOGINFO("OutMemberDown test done replica={}", g_helper->replica_num()); } TEST_F(ReplDevDynamicTest, LeaderReplace) { @@ -233,7 +282,7 @@ TEST_F(ReplDevDynamicTest, LeaderReplace) { uint32_t member_in = num_replicas; g_helper->sync_for_test_start(num_members); - + auto task_id = boost::uuids::random_generator()(); if (g_helper->replica_num() == member_out) { LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); // With existing raft repl dev group, write IO's, validate and call replace_member on leader. @@ -242,13 +291,13 @@ TEST_F(ReplDevDynamicTest, LeaderReplace) { // Leader will return error NOT_LEADER and yield leadership, sleep and connect again // to the new leader. LOGINFO("Replace old leader"); - replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 0, + replace_member(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 0, ReplServiceError::NOT_LEADER); LOGINFO("Replace member leader yield done"); } std::this_thread::sleep_for(std::chrono::seconds(3)); if (g_helper->replica_num() != member_in) { - replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + replace_member(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); LOGINFO("Replace member old leader done"); } @@ -264,8 +313,12 @@ TEST_F(ReplDevDynamicTest, LeaderReplace) { this->validate_data(); } - g_helper->sync_for_verify_start(num_members); LOGINFO("data synced, sync_for_verify_state replica={} ", g_helper->replica_num()); + this->run_on_leader(db, [this, db, task_id, member_out, member_in] { + ASSERT_EQ( + check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)), + ReplaceMemberStatus::IN_PROGRESS); + }); if (g_helper->replica_num() == member_out) { // The out member will have the repl dev destroyed. auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); @@ -280,6 +333,11 @@ TEST_F(ReplDevDynamicTest, LeaderReplace) { } g_helper->sync_for_cleanup_start(num_members); + this->run_on_leader(db, [this, db, task_id, member_out, member_in] { + ASSERT_EQ( + check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)), + ReplaceMemberStatus::COMPLETED); + }); LOGINFO("LeaderReplace test done replica={}", g_helper->replica_num()); } @@ -303,13 +361,13 @@ TEST_F(ReplDevDynamicTest, OneMemberRestart) { LOGINFO("Restart replica 1, "); this->restart_replica(15); } - + auto task_id = boost::uuids::random_generator()(); if (g_helper->replica_num() == 0) { // With existing raft repl dev group, write IO's, validate and call replace_member on leader. 
LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); this->write_on_leader(num_io_entries, true /* wait_for_commit */); - replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + replace_member(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); std::this_thread::sleep_for(std::chrono::seconds(3)); } else if (g_helper->replica_num() == member_in) { LOGINFO("Wait for commits replica={}", g_helper->replica_num()); @@ -324,8 +382,12 @@ TEST_F(ReplDevDynamicTest, OneMemberRestart) { this->validate_data(); } - g_helper->sync_for_verify_start(num_members); LOGINFO("data synced, sync_for_verify_state replica={} ", g_helper->replica_num()); + this->run_on_leader(db, [this, db, task_id, member_out, member_in] { + ASSERT_EQ( + check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)), + ReplaceMemberStatus::IN_PROGRESS); + }); if (g_helper->replica_num() == member_out) { // The out member will have the repl dev destroyed. auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); @@ -339,6 +401,11 @@ TEST_F(ReplDevDynamicTest, OneMemberRestart) { } g_helper->sync_for_cleanup_start(num_members); + this->run_on_leader(db, [this, db, task_id, member_out, member_in] { + ASSERT_EQ( + check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)), + ReplaceMemberStatus::COMPLETED); + }); LOGINFO("OneMemberRestart test done replica={}", g_helper->replica_num()); } @@ -362,11 +429,11 @@ TEST_F(ReplDevDynamicTest, ValidateRequest) { g_helper->sync_for_test_start(num_members); - //shut down before replace member + // shut down before replace member this->shutdown_replica(1); LOGINFO("Shutdown replica 1"); - //wait for shutdown + // wait for shutdown std::this_thread::sleep_for(std::chrono::seconds(3)); g_helper->sync_for_verify_start(num_members); if (g_helper->replica_num() == 0) { @@ -374,17 +441,18 @@ TEST_F(ReplDevDynamicTest, ValidateRequest) { LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); this->write_on_leader(num_io_entries, true /* wait_for_commit */); } - g_helper->sync_for_verify_start(num_members); + + auto task_id = boost::uuids::random_generator()(); if (g_helper->replica_num() == 0) { // generate uuid replica_id_t fake_member_out = boost::uuids::random_generator()(); replica_id_t fake_member_in = boost::uuids::random_generator()(); LOGINFO("test SERVER_NOT_FOUND"); - replace_member(db, fake_member_out, fake_member_in, 0, ReplServiceError::SERVER_NOT_FOUND); + replace_member(db, task_id, fake_member_out, fake_member_in, 0, ReplServiceError::SERVER_NOT_FOUND); LOGINFO("test replace_member already complete"); - replace_member(db, fake_member_out, g_helper->replica_id(0)); + replace_member(db, task_id, fake_member_out, g_helper->replica_id(0)); LOGINFO("test QUORUM_NOT_MET", num_io_entries, g_helper->replica_num()); - replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 0, + replace_member(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 0, ReplServiceError::QUORUM_NOT_MET); } diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index 935f17230..6d8cac9e3 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -133,9 +133,9 @@ class SoloReplDevTest : public testing::Test { cintrusive< repl_req_ctx >& ctx) override { 
LOGINFO("Received error={} on repl_dev", enum_name(error)); } - void on_start_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, + void on_start_replace_member(const uuid_t& task_id, const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) override {} - void on_complete_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, + void on_complete_replace_member(const uuid_t& task_id, const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) override {} void on_destroy(const group_id_t& group_id) override {} void notify_committed_lsn(int64_t lsn) override {} From 8d9bfdc786b14cdb3b31c87efd30edb76514c2e9 Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Wed, 18 Jun 2025 05:01:03 +0800 Subject: [PATCH 139/170] change on_no_space_left signature (#748) --- conanfile.py | 2 +- src/include/homestore/replication/repl_dev.h | 20 +++++++++---------- src/lib/blkalloc/append_blk_allocator.cpp | 3 --- src/lib/replication/repl_dev/common.cpp | 4 ++-- .../replication/repl_dev/raft_repl_dev.cpp | 10 +++------- src/tests/test_common/raft_repl_test_base.hpp | 19 +++++++++--------- src/tests/test_solo_repl_dev.cpp | 10 +++++----- 7 files changed, 30 insertions(+), 38 deletions(-) diff --git a/conanfile.py b/conanfile.py index 13933aca1..e3f07b98a 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.17.0" + version = "6.17.1" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index 934ba0354..ce348eac0 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -43,10 +43,10 @@ VENUM(repl_req_state_t, uint32_t, ) VENUM(journal_type_t, uint16_t, - HS_DATA_LINKED = 0, // Linked data where each entry will store physical blkid where data reside - HS_DATA_INLINED = 1, // Data is inlined in the header of journal entry - HS_CTRL_DESTROY = 2, // Control message to destroy the repl_dev - HS_CTRL_START_REPLACE = 3, // Control message to start replace a member + HS_DATA_LINKED = 0, // Linked data where each entry will store physical blkid where data reside + HS_DATA_INLINED = 1, // Data is inlined in the header of journal entry + HS_CTRL_DESTROY = 2, // Control message to destroy the repl_dev + HS_CTRL_START_REPLACE = 3, // Control message to start replace a member HS_CTRL_COMPLETE_REPLACE = 4, // Control message to complete replace a member ) @@ -369,12 +369,12 @@ class ReplDevListener { virtual void on_destroy(const group_id_t& group_id) = 0; /// @brief Called when start replace member. - virtual void on_start_replace_member(const uuid_t& task_id, const replica_member_info& member_out, const replica_member_info& member_in, - trace_id_t tid) = 0; + virtual void on_start_replace_member(const uuid_t& task_id, const replica_member_info& member_out, + const replica_member_info& member_in, trace_id_t tid) = 0; /// @brief Called when complete replace member. 
- virtual void on_complete_replace_member(const uuid_t& task_id, const replica_member_info& member_out, const replica_member_info& member_in, - trace_id_t tid) = 0; + virtual void on_complete_replace_member(const uuid_t& task_id, const replica_member_info& member_out, + const replica_member_info& member_in, trace_id_t tid) = 0; /// @brief Called when the snapshot is being created by nuraft virtual AsyncReplResult<> create_snapshot(shared< snapshot_context > context) = 0; @@ -414,8 +414,8 @@ class ReplDevListener { /// @brief ask upper layer to handle no_space_left event // @param lsn - on which repl_lsn no_space_left happened - // @param chunk_id - on which chunk no_space_left happened - virtual void on_no_space_left(repl_lsn_t lsn, chunk_num_t chunk_id) = 0; + // @param header - on which header no_space_left happened when trying to allocate blk + virtual void on_no_space_left(repl_lsn_t lsn, sisl::blob const& header) = 0; /// @brief when restart, after all the logs are replayed and before joining raft group, notify the upper layer virtual void on_log_replay_done(const group_id_t& group_id) {}; diff --git a/src/lib/blkalloc/append_blk_allocator.cpp b/src/lib/blkalloc/append_blk_allocator.cpp index 2f6cec25c..05464d825 100644 --- a/src/lib/blkalloc/append_blk_allocator.cpp +++ b/src/lib/blkalloc/append_blk_allocator.cpp @@ -76,14 +76,11 @@ BlkAllocStatus AppendBlkAllocator::alloc(blk_count_t nblks, const blk_alloc_hint LOGERROR("No space left to serve request nblks: {}, available_blks: {}, actual available_blks(exclude reserved " "blks): {}", nblks, available_blks(), avail_blks); - // the caller can know in which chunk no_space_left happened; - out_bid = BlkId{0, 0, m_chunk_id}; return BlkAllocStatus::SPACE_FULL; } else if (nblks > max_blks_per_blkid()) { // consumer(vdev) already handles this case. 
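With the chunk id no longer returned through out_bid on SPACE_FULL, the upper layer now learns placement context from the user header handed to on_no_space_left. A minimal listener sketch, assuming an application-defined my_journal_header type (not a HomeStore API):

void on_no_space_left(repl_lsn_t lsn, sisl::blob const& header) override {
    // decode the application's own journal header from the raw blob
    auto jhdr = r_cast< my_journal_header const* >(header.cbytes());
    // use jhdr fields to pick/expand a chunk, then resume the stuck lsn
}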
// COUNTER_INCREMENT(m_metrics, num_alloc_failure, 1); LOGERROR("Can't serve request nblks: {} larger than max_blks_in_op: {}", nblks, max_blks_per_blkid()); - out_bid = BlkId{0, 0, m_chunk_id}; return BlkAllocStatus::FAILED; } diff --git a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp index 6b8ce122b..fbc732775 100644 --- a/src/lib/replication/repl_dev/common.cpp +++ b/src/lib/replication/repl_dev/common.cpp @@ -155,7 +155,6 @@ ReplServiceError repl_req_ctx::alloc_local_blks(cshared< ReplDevListener >& list if (status != BlkAllocStatus::SUCCESS) { LOGWARNMOD(replication, "[traceID={}] block allocation failure, repl_key=[{}], status=[{}]", rkey().traceID, rkey(), status); - DEBUG_ASSERT_EQ(status, BlkAllocStatus::SUCCESS, "Unable to allocate blks"); return ReplServiceError::NO_SPACE_LEFT; } @@ -266,7 +265,8 @@ std::string repl_req_ctx::to_string() const { } std::string repl_req_ctx::to_compact_string() const { - if (m_op_code == journal_type_t::HS_CTRL_DESTROY || m_op_code == journal_type_t::HS_CTRL_START_REPLACE || m_op_code == journal_type_t::HS_CTRL_COMPLETE_REPLACE) { + if (m_op_code == journal_type_t::HS_CTRL_DESTROY || m_op_code == journal_type_t::HS_CTRL_START_REPLACE || + m_op_code == journal_type_t::HS_CTRL_COMPLETE_REPLACE) { return fmt::format("term={} lsn={} op={}", m_rkey.term, m_lsn, enum_name(m_op_code)); } diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 99a4ad44b..c1fa31ada 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -601,8 +601,7 @@ ReplServiceError RaftReplDev::do_flip_learner(const replica_member_info& member, return srv_conf->is_learner(); }, timeout)) { - RD_LOGD(trace_id, "Wait for flipping learner timed out, please retry, timeout: {}", - timeout); + RD_LOGD(trace_id, "Wait for flipping learner timed out, please retry, timeout: {}", timeout); return ReplServiceError::RETRY_REQUEST; } } @@ -981,11 +980,8 @@ repl_req_ptr_t RaftReplDev::applier_create_req(repl_key const& rkey, journal_typ RD_LOGD(rkey.traceID, "For Repl_key=[{}] alloc hints returned error={}, failing this req", rkey.to_string(), status); if (status == ReplServiceError::NO_SPACE_LEFT && !is_data_channel && !rreq->is_proposer()) { - const auto& chunk_id = rreq->local_blkid().chunk_num(); - RD_LOGD(rkey.traceID, - "For Repl_key=[{}] alloc hints returned error={} when trying to allocate blk on chunk={}", - rkey.to_string(), status, chunk_id); - m_listener->on_no_space_left(lsn, chunk_id); + RD_LOGD(rkey.traceID, "Repl_key=[{}] got no_space_left error on follower as lsn={}", rkey.to_string(), lsn); + m_listener->on_no_space_left(lsn, user_header); } else { RD_LOGD( rkey.traceID, diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 0046bbd68..934256594 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -183,9 +183,8 @@ class TestReplicatedDB : public homestore::ReplDevListener { void on_config_rollback(int64_t lsn) override { LOGINFOMOD(replication, "[Replica={}] Received config rollback at lsn={}", g_helper->replica_num(), lsn); } - void on_no_space_left(repl_lsn_t lsn, chunk_num_t chunk_id) override { - LOGINFOMOD(replication, "[Replica={}] Received no_space_left at lsn={}, chunk_id={}", g_helper->replica_num(), - lsn, chunk_id); + void on_no_space_left(repl_lsn_t lsn, sisl::blob const& header) override { + 
LOGINFOMOD(replication, "[Replica={}] Received no_space_left at lsn={}", g_helper->replica_num(), lsn); } AsyncReplResult<> create_snapshot(shared< snapshot_context > context) override { @@ -336,22 +335,22 @@ class TestReplicatedDB : public homestore::ReplDevListener { auto jheader = r_cast< test_req::journal_header const* >(header.cbytes()); Key k{.id_ = jheader->key_id}; auto iter = inmem_db_.find(k); + auto hints = blk_alloc_hints{}; if (iter != inmem_db_.end()) { LOGDEBUG("data already exists in mem db, key={}", k.id_); - auto hints = blk_alloc_hints{}; hints.committed_blk_id = iter->second.blkid_; - return hints; } - return blk_alloc_hints{}; + return hints; } - void on_start_replace_member(const uuid_t& task_id, const replica_member_info& member_out, const replica_member_info& member_in, - trace_id_t tid) override { + + void on_start_replace_member(const uuid_t& task_id, const replica_member_info& member_out, + const replica_member_info& member_in, trace_id_t tid) override { LOGINFO("[Replica={}] start replace member out {} in {}", g_helper->replica_num(), boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); } - void on_complete_replace_member(const uuid_t& task_id, const replica_member_info& member_out, const replica_member_info& member_in, - trace_id_t tid) override { + void on_complete_replace_member(const uuid_t& task_id, const replica_member_info& member_out, + const replica_member_info& member_in, trace_id_t tid) override { LOGINFO("[Replica={}] complete replace member out {} in {}", g_helper->replica_num(), boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); } diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index 6d8cac9e3..ffdf00e1d 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -133,14 +133,14 @@ class SoloReplDevTest : public testing::Test { cintrusive< repl_req_ctx >& ctx) override { LOGINFO("Received error={} on repl_dev", enum_name(error)); } - void on_start_replace_member(const uuid_t& task_id, const replica_member_info& member_out, const replica_member_info& member_in, - trace_id_t tid) override {} - void on_complete_replace_member(const uuid_t& task_id, const replica_member_info& member_out, const replica_member_info& member_in, - trace_id_t tid) override {} + void on_start_replace_member(const uuid_t& task_id, const replica_member_info& member_out, + const replica_member_info& member_in, trace_id_t tid) override {} + void on_complete_replace_member(const uuid_t& task_id, const replica_member_info& member_out, + const replica_member_info& member_in, trace_id_t tid) override {} void on_destroy(const group_id_t& group_id) override {} void notify_committed_lsn(int64_t lsn) override {} void on_config_rollback(int64_t lsn) override {} - void on_no_space_left(repl_lsn_t lsn, chunk_num_t chunk_id) override {} + void on_no_space_left(repl_lsn_t lsn, sisl::blob const& header) override {} }; class Application : public ReplApplication { From 582237251a00fcb218ae6bf926c261fb19f276ee Mon Sep 17 00:00:00 2001 From: yuwmao Date: Wed, 18 Jun 2025 14:18:06 +0800 Subject: [PATCH 140/170] Support setting stage of ReplDev --- conanfile.py | 2 +- src/include/homestore/replication/repl_dev.h | 10 ++++++++++ src/lib/replication/repl_dev/raft_repl_dev.cpp | 6 ++++++ src/lib/replication/repl_dev/raft_repl_dev.h | 4 ++-- src/lib/replication/repl_dev/solo_repl_dev.h | 4 ++++ 5 files changed, 23 insertions(+), 3 deletions(-) diff --git a/conanfile.py b/conanfile.py 
index e3f07b98a..a1d2d0122 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.17.1" + version = "6.17.2" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index ce348eac0..f864e9137 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -50,6 +50,8 @@ VENUM(journal_type_t, uint16_t, HS_CTRL_COMPLETE_REPLACE = 4, // Control message to complete replace a member ) +ENUM(repl_dev_stage_t, uint8_t, INIT, ACTIVE, UNREADY, DESTROYING, DESTROYED, PERMANENT_DESTROYED); + // magic num comes from the first 8 bytes of 'echo homestore_resync_data | md5sum' static constexpr uint64_t HOMESTORE_RESYNC_DATA_MAGIC = 0xa65dbd27c213f327; static constexpr uint32_t HOMESTORE_RESYNC_DATA_PROTOCOL_VERSION_V1 = 0x01; @@ -541,6 +543,14 @@ class ReplDev { /// @return true if ready, false otherwise virtual bool is_ready_for_traffic() const = 0; + /// @brief Set the stage of this repl dev. This lets the user set the unready state when a condition is not met + /// (e.g. the disk is unhealthy) and vice versa, which supports running in degraded mode. + virtual void set_stage(repl_dev_stage_t stage) = 0; + + /// @brief Get the stage of this repl dev. + /// @return current stage of this repl dev. + virtual repl_dev_stage_t get_stage() const = 0; + /// @brief Clean up resources on this repl dev. virtual void purge() = 0; diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index c1fa31ada..27e12c8b3 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -1779,6 +1779,12 @@ int32_t RaftReplDev::server_id() { return m_raft_server_id; } bool RaftReplDev::is_destroy_pending() const { return (*m_stage.access().get() == repl_dev_stage_t::DESTROYED); } bool RaftReplDev::is_destroyed() const { return (*m_stage.access().get() == repl_dev_stage_t::PERMANENT_DESTROYED); } +repl_dev_stage_t RaftReplDev::get_stage() const { return *m_stage.access().get(); } + +void RaftReplDev::set_stage(repl_dev_stage_t stage) { + m_stage.update([stage](auto* s) { *s = stage; }); +} + /////////////////////////////////// nuraft_mesg::mesg_state_mgr overrides //////////////////////////////////// void RaftReplDev::become_ready() { m_stage.update([](auto* stage) { *stage = repl_dev_stage_t::ACTIVE; }); diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 1280790f4..e3e8f20c5 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -40,8 +40,6 @@ struct raft_repl_dev_superblk : public repl_dev_superblk { using raft_buf_ptr_t = nuraft::ptr< nuraft::buffer >; using raft_cluster_config_ptr_t = nuraft::ptr< nuraft::cluster_config >; -ENUM(repl_dev_stage_t, uint8_t, INIT, ACTIVE, DESTROYING, DESTROYED, PERMANENT_DESTROYED); - struct replace_member_ctx { uuid_t task_id; replica_member_info replica_out; @@ -299,6 +297,8 @@ class RaftReplDev : public ReplDev, repl_lsn_t get_last_append_lsn() override { return raft_server()->get_last_log_idx(); } bool is_destroy_pending() const; bool is_destroyed() const; + void set_stage(repl_dev_stage_t stage); + repl_dev_stage_t get_stage() const; uint32_t get_quorum_for_commit() const; Clock::time_point destroyed_time() const { return
m_destroyed_time; } diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index fc37b30b5..2ad78e4da 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -78,6 +78,10 @@ class SoloReplDev : public ReplDev { peer_info{.id_ = m_group_id, .replication_idx_ = 0, .last_succ_resp_us_ = 0, .priority_ = 1}}; } bool is_ready_for_traffic() const override { return true; } + void set_stage(repl_dev_stage_t stage) override {} + repl_dev_stage_t get_stage() const override { + return repl_dev_stage_t::ACTIVE; + } void purge() override {} std::shared_ptr< snapshot_context > deserialize_snapshot_context(sisl::io_blob_safe& snp_ctx) override { From 761113a4a6d1a54ee1253b883977bcd405e3882f Mon Sep 17 00:00:00 2001 From: Yaming Kuang <1477567+yamingk@users.noreply.github.com> Date: Wed, 25 Jun 2025 17:46:07 -0700 Subject: [PATCH 141/170] Issue 753 Make CP number Fibers dynamic config (#754) * Issue 753 Make CP number Fibers dynamic config * fix cast issue * fix a race --- conanfile.py | 2 +- src/lib/checkpoint/cp_mgr.cpp | 4 +++- src/lib/common/homestore_config.fbs | 3 +++ src/lib/common/resource_mgr.cpp | 22 +++++++++++++++---- src/lib/common/resource_mgr.hpp | 1 + .../replication/service/generic_repl_svc.cpp | 14 +++++++----- .../replication/service/generic_repl_svc.h | 6 +++-- 7 files changed, 38 insertions(+), 14 deletions(-) diff --git a/conanfile.py b/conanfile.py index a1d2d0122..f7df38b36 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.17.2" + version = "6.17.3" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/checkpoint/cp_mgr.cpp b/src/lib/checkpoint/cp_mgr.cpp index cf89d1adf..42b19485e 100644 --- a/src/lib/checkpoint/cp_mgr.cpp +++ b/src/lib/checkpoint/cp_mgr.cpp @@ -302,7 +302,9 @@ void CPManager::start_cp_thread() { // Multiple sync_io fibers may acquire a thread-level mutex and perform synchronous I/O using io_uring. // This can block the fiber and allow other fibers to be scheduled. // If another fiber tries to acquire the same mutex, a deadlock can occur. 
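The deadlock the comment above warns about is easy to see in miniature; a sketch assuming two fibers scheduled on the same reactor thread and a hypothetical do_sync_io() helper (not HomeStore code):

std::mutex m; // thread-level mutex, not fiber-aware
void fiber_a() {
    std::lock_guard< std::mutex > lk{m}; // fiber A (on thread T) takes m
    do_sync_io();                        // blocks A; T switches to fiber B
}
void fiber_b() {
    std::lock_guard< std::mutex > lk{m}; // thread T already owns m -> deadlock
}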
- iomanager.create_reactor("cp_io", iomgr::INTERRUPT_LOOP, 2u, [this, ctx](bool is_started) { + auto const num_fibers = HS_DYNAMIC_CONFIG(generic.cp_io_fibers); // default: 2 + LOGINFO("Starting CP IO fibers with count: {}", num_fibers); + iomanager.create_reactor("cp_io", iomgr::INTERRUPT_LOOP, num_fibers, [this, ctx](bool is_started) { if (is_started) { { std::unique_lock< std::mutex > lk{ctx->mtx}; diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs index b012a8bed..5c6a779b8 100644 --- a/src/lib/common/homestore_config.fbs +++ b/src/lib/common/homestore_config.fbs @@ -123,6 +123,9 @@ table Generic { // cp timer in us cp_timer_us: uint64 = 60000000 (hotswap); + // number of fibers for the cp_io thread; + cp_io_fibers: uint32 = 2; + // writeback cache flush threads cache_flush_threads : int32 = 1; diff --git a/src/lib/common/resource_mgr.cpp b/src/lib/common/resource_mgr.cpp index 36b2a0a17..7dcb6190f 100644 --- a/src/lib/common/resource_mgr.cpp +++ b/src/lib/common/resource_mgr.cpp @@ -20,6 +20,7 @@ #include "resource_mgr.hpp" #include "homestore_assert.hpp" #include "replication/repl_dev/raft_repl_dev.h" +#include "replication/service/generic_repl_svc.h" namespace homestore { ResourceMgr& resource_mgr() { return hs()->resource_mgr(); } @@ -31,6 +32,7 @@ void ResourceMgr::start(uint64_t total_cap) { void ResourceMgr::stop() { LOGINFO("Cancel resource manager timer."); + m_is_stopped_ = true; if (m_res_audit_timer_hdl != iomgr::null_timer_handle) { iomanager.cancel_timer(m_res_audit_timer_hdl); } m_res_audit_timer_hdl = iomgr::null_timer_handle; } @@ -46,7 +48,21 @@ void ResourceMgr::stop() { // writes on this descriptor; // void ResourceMgr::trigger_truncate() { + if (m_is_stopped_.load()) { + // when we are here, it means HomeStore is shutting down; since this API is called in the timer thread, the timer + // thread might already have been triggered while RM is trying to cancel it, + // and since shutdown and the timer thread run in parallel, by the time we are here, shutdown might already have + // cleaned up all replication service instances, which would cause a heap-use-after-free; + LOGINFO("Resource manager is stopped, so not triggering truncate"); + return; + } + if (hs()->has_repl_data_service()) { + auto& repl_svc = dynamic_cast< GenericReplService& >(hs()->repl_service()); + if (repl_svc.get_impl_type() == repl_impl_type::solo) { + // skip truncation from RM for solo repl dev; + return; + } /* * DO NOT NEED : raft will truncate logs. 
* // first make sure all repl dev's underlying raft log store make corresponding reservation during @@ -96,10 +112,8 @@ void ResourceMgr::dec_dirty_buf_size(const uint32_t size) { HS_REL_ASSERT_GT(size, 0); const int64_t dirty_buf_cnt = m_hs_dirty_buf_cnt.fetch_sub(size, std::memory_order_relaxed); COUNTER_DECREMENT(m_metrics, dirty_buf_cnt, size); - if (dirty_buf_cnt < size) { - LOGERROR("dirty_buf_cnt {} of now is less then size {}", dirty_buf_cnt, size); - } - //HS_REL_ASSERT_GE(dirty_buf_cnt, size); + if (dirty_buf_cnt < size) { LOGERROR("dirty_buf_cnt {} of now is less then size {}", dirty_buf_cnt, size); } + // HS_REL_ASSERT_GE(dirty_buf_cnt, size); } void ResourceMgr::register_dirty_buf_exceed_cb(exceed_limit_cb_t cb) { m_dirty_buf_exceed_cb = std::move(cb); } diff --git a/src/lib/common/resource_mgr.hpp b/src/lib/common/resource_mgr.hpp index 4bdcd4478..6a9023627 100644 --- a/src/lib/common/resource_mgr.hpp +++ b/src/lib/common/resource_mgr.hpp @@ -143,6 +143,7 @@ class ResourceMgr { std::atomic< int64_t > m_hs_ab_cnt; // alloc count std::atomic< int64_t > m_memory_used_in_recovery; std::atomic< uint32_t > m_flush_dirty_buf_q_depth{64}; + std::atomic< bool > m_is_stopped_{false}; uint64_t m_total_cap; // TODO: make it event_cb diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index a5f635e9b..0c547bab4 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -79,7 +79,7 @@ hs_stats GenericReplService::get_cap_stats() const { ///////////////////// SoloReplService specializations and CP Callbacks ///////////////////////////// SoloReplService::SoloReplService(cshared< ReplApplication >& repl_app) : GenericReplService{repl_app} {} -SoloReplService::~SoloReplService() {}; +SoloReplService::~SoloReplService(){}; void SoloReplService::start() { for (auto const& [buf, mblk] : m_sb_bufs) { @@ -193,14 +193,16 @@ void SoloReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki } } -AsyncReplResult<> SoloReplService::replace_member(group_id_t group_id, uuid_t task_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum, - uint64_t trace_id) const { +AsyncReplResult<> SoloReplService::replace_member(group_id_t group_id, uuid_t task_id, + const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum, + uint64_t trace_id) const { return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); } -AsyncReplResult<> SoloReplService::flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, - uint32_t commit_quorum, bool wait_and_verify, uint64_t trace_id) const { +AsyncReplResult<> SoloReplService::flip_learner_flag(group_id_t group_id, const replica_member_info& member, + bool target, uint32_t commit_quorum, bool wait_and_verify, + uint64_t trace_id) const { return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); } diff --git a/src/lib/replication/service/generic_repl_svc.h b/src/lib/replication/service/generic_repl_svc.h index fd4f3732c..9f7261a18 100644 --- a/src/lib/replication/service/generic_repl_svc.h +++ b/src/lib/replication/service/generic_repl_svc.h @@ -59,6 +59,8 @@ class GenericReplService : public ReplicationService { // void resource_audit() override; virtual void stop() = 0; + repl_impl_type get_impl_type() const { return m_repl_app->get_impl_type(); } + protected: virtual void add_repl_dev(group_id_t group_id, shared< 
ReplDev > rdev); virtual void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) = 0; @@ -90,8 +92,8 @@ class SoloReplService : public GenericReplService { folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) override; void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; AsyncReplResult<> replace_member(group_id_t group_id, uuid_t task_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum = 0, - uint64_t trace_id = 0) const override; + const replica_member_info& member_in, uint32_t commit_quorum = 0, + uint64_t trace_id = 0) const override; AsyncReplResult<> flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, uint32_t commit_quorum, bool wait_and_verify = true, uint64_t trace_id = 0) const override; From 00606a0182e51119f0b5bb8e2a234be7fa688753 Mon Sep 17 00:00:00 2001 From: yuwmao <148639999+yuwmao@users.noreply.github.com> Date: Thu, 26 Jun 2025 09:37:15 +0800 Subject: [PATCH 142/170] Support replDev UNREADY stage (#755) * Support replDev UNREADY stage * check m_repl_svc_ctx --- .../homestore/replication/repl_decls.h | 1 + .../replication/repl_dev/raft_repl_dev.cpp | 36 ++++++++++++++++--- .../replication/service/raft_repl_service.cpp | 11 ++++++ 3 files changed, 43 insertions(+), 5 deletions(-) diff --git a/src/include/homestore/replication/repl_decls.h b/src/include/homestore/replication/repl_decls.h index 50b690d4e..c52466779 100644 --- a/src/include/homestore/replication/repl_decls.h +++ b/src/include/homestore/replication/repl_decls.h @@ -39,6 +39,7 @@ VENUM(ReplServiceError, int32_t, QUIENCE_STATE = -20003, QUORUM_NOT_MET = -20004, REPLACE_MEMBER_TASK_MISMATCH = -20005, + UNREADY_STATE = -20006, FAILED = -32768); VENUM(ReplaceMemberStatus, int32_t, diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 27e12c8b3..04248a3b5 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -145,6 +145,10 @@ AsyncReplResult<> RaftReplDev::start_replace_member(uuid_t task_id, const replic RD_LOGI(trace_id, "repl dev is being shutdown!"); return make_async_error<>(ReplServiceError::STOPPING); } + if (get_stage() != repl_dev_stage_t::ACTIVE) { + RD_LOGE(trace_id, "repl dev is not ready, stage={}", static_cast< int >(get_stage())); + return make_async_error<>(ReplServiceError::UNREADY_STATE); + } incr_pending_request_num(); RD_LOGI(trace_id, "Start replace member, task_id={}, member_out={} member_in={}", boost::uuids::to_string(task_id), @@ -301,6 +305,10 @@ AsyncReplResult<> RaftReplDev::complete_replace_member(uuid_t task_id, const rep RD_LOGI(trace_id, "repl dev is being shutdown!"); return make_async_error<>(ReplServiceError::STOPPING); } + if (get_stage() != repl_dev_stage_t::ACTIVE) { + RD_LOGE(trace_id, "repl dev is not ready, stage={}", static_cast< int >(get_stage())); + return make_async_error<>(ReplServiceError::UNREADY_STATE); + } incr_pending_request_num(); RD_LOGI(trace_id, "Complete replace member, task_id={}, member_out={}, member_in={}", @@ -726,8 +734,9 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& if (auto const stage = *guard.get(); stage != repl_dev_stage_t::ACTIVE) { RD_LOGW(tid, "Raft channel: Not ready to accept writes, stage={}", enum_name(stage)); handle_error(rreq, - (stage == repl_dev_stage_t::INIT) ? 
ReplServiceError::SERVER_IS_JOINING - : ReplServiceError::SERVER_IS_LEAVING); + (stage == repl_dev_stage_t::INIT) ? ReplServiceError::SERVER_IS_JOINING + : (stage == repl_dev_stage_t::UNREADY) ? ReplServiceError::UNREADY_STATE + : ReplServiceError::SERVER_IS_LEAVING); return; } } @@ -1562,6 +1571,10 @@ folly::Future< std::error_code > RaftReplDev::async_read(MultiBlkId const& bid, LOGINFO("repl dev is being shutdown!"); return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); } + if (get_stage() != repl_dev_stage_t::ACTIVE) { + LOGINFO("repl dev is not active!"); + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); + } return data_service().async_read(bid, sgs, size, part_of_batch); } @@ -1572,6 +1585,10 @@ folly::Future< std::error_code > RaftReplDev::async_free_blks(int64_t, MultiBlkI LOGINFO("repl dev is being shutdown!"); return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); } + if (get_stage() != repl_dev_stage_t::ACTIVE) { + LOGINFO("repl dev is not active!"); + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); + } return data_service().async_free_blk(bid); } @@ -1593,16 +1610,21 @@ AsyncReplResult<> RaftReplDev::become_leader() { }); } -bool RaftReplDev::is_leader() const { return m_repl_svc_ctx->is_raft_leader(); } +bool RaftReplDev::is_leader() const { return m_repl_svc_ctx && m_repl_svc_ctx->is_raft_leader(); } replica_id_t RaftReplDev::get_leader_id() const { static replica_id_t empty_uuid = boost::uuids::nil_uuid(); + if (!m_repl_svc_ctx) { return empty_uuid; } auto leader = m_repl_svc_ctx->raft_leader_id(); return leader.empty() ? empty_uuid : boost::lexical_cast< replica_id_t >(leader); } std::vector< peer_info > RaftReplDev::get_replication_status() const { std::vector< peer_info > pi; + if (!m_repl_svc_ctx) { + RD_LOGD(NO_TRACE_ID, "m_repl_svc_ctx doesn't exist, returning empty peer info"); + return pi; + } auto rep_status = m_repl_svc_ctx->get_raft_status(); for (auto const& pinfo : rep_status) { pi.emplace_back(peer_info{.id_ = boost::lexical_cast< replica_id_t >(pinfo.id_), @@ -1940,8 +1962,8 @@ void RaftReplDev::flush_durable_commit_lsn() { } void RaftReplDev::monitor_replace_member_replication_status() { - if (is_destroyed()) { - RD_LOGI(NO_TRACE_ID, "Raft repl dev is destroyed, ignore check replace member status"); + if (is_destroyed() || get_stage() == repl_dev_stage_t::UNREADY) { + RD_LOGI(NO_TRACE_ID, "Raft repl dev is destroyed or unready, ignore check replace member status"); return; } if (!m_repl_svc_ctx || !is_leader()) { return; } @@ -2132,6 +2154,10 @@ void RaftReplDev::gc_repl_reqs() { void RaftReplDev::set_log_store_last_durable_lsn(store_lsn_t lsn) { m_data_journal->set_last_durable_lsn(lsn); } void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx) { + if (get_stage() == repl_dev_stage_t::UNREADY) { + RD_LOGI(NO_TRACE_ID, "Raft Channel: repl dev is in UNREADY stage, skip log replay."); + return; + } auto repl_lsn = to_repl_lsn(lsn); if (need_skip_processing(repl_lsn)) { RD_LOGI(NO_TRACE_ID, diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index f994b1b14..483098c6a 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -148,6 +148,8 @@ void RaftReplService::start() { } m_config_sb_bufs.clear(); LOGINFO("Repl devs load 
completed, calling upper layer on_repl_devs_init_completed"); + // The upper layer(m_repl_app) can leverage this cb to initiate and recover its data. + // If some errors occurs, m_repl_app can set back the stage of repl_dev to repl_dev_stage_t::UNREADY. m_repl_app->on_repl_devs_init_completed(); // Step 5: Start the data and logstore service now. This step is essential before we can ask Raft to join groups etc @@ -172,6 +174,10 @@ void RaftReplService::start() { // Step 6: Iterate all the repl devs and ask each one of them to join the raft group concurrently. std::vector< std::future< bool > > join_group_futures; for (const auto& [_, repl_dev] : m_rd_map) { + if (repl_dev->get_stage() == repl_dev_stage_t::UNREADY) { + LOGINFO("Repl dev is unready, skip join group, group_id={}", boost::uuids::to_string(repl_dev->group_id())); + continue; + } join_group_futures.emplace_back(std::async(std::launch::async, [&repl_dev]() { auto rdev = std::dynamic_pointer_cast< RaftReplDev >(repl_dev); rdev->wait_for_logstore_ready(); @@ -668,6 +674,11 @@ void RaftReplService::flush_durable_commit_lsn() { for (auto& rdev_parent : m_rd_map) { // FIXUP: is it safe to access rdev_parent here? auto rdev = std::dynamic_pointer_cast< RaftReplDev >(rdev_parent.second); + if (rdev->get_stage() == repl_dev_stage_t::UNREADY) { + LOGINFOMOD(replication, "ReplDev group_id={} is UNREADY, skip flushing durable commit lsn", + boost::uuids::to_string(rdev->group_id())); + continue; + } rdev->flush_durable_commit_lsn(); } } From 4841880195f4c91131373153c8e7fe857716fdfa Mon Sep 17 00:00:00 2001 From: yawzhang Date: Thu, 26 Jun 2025 11:51:12 +0800 Subject: [PATCH 143/170] Enable log to console, this will help sherlock collect nuraft logs more easily --- conanfile.py | 4 ++-- src/lib/common/homestore_config.fbs | 3 +++ src/lib/replication/service/raft_repl_service.cpp | 3 ++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/conanfile.py b/conanfile.py index f7df38b36..0c1d6fbed 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.17.3" + version = "6.17.4" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" @@ -54,7 +54,7 @@ def build_requirements(self): def requirements(self): self.requires("iomgr/[^11.3]@oss/master", transitive_headers=True) self.requires("sisl/[^12.2]@oss/master", transitive_headers=True) - self.requires("nuraft_mesg/[~3.8.4]@oss/main", transitive_headers=True) + self.requires("nuraft_mesg/[~3.8.5]@oss/main", transitive_headers=True) self.requires("farmhash/cci.20190513@", transitive_headers=True) if self.settings.arch in ['x86', 'x86_64']: diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs index 5c6a779b8..ec103c32e 100644 --- a/src/lib/common/homestore_config.fbs +++ b/src/lib/common/homestore_config.fbs @@ -303,6 +303,9 @@ table Consensus { // The interval in ms to check if the new member in replace_member is fully synced and ready to take over replace_member_sync_check_interval_ms: uint64 = 60000; + + // Enable tee logs to console, this is helpful for sherlock to collect logs + enable_console_log: bool = true; } table HomeStoreSettings { diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index 483098c6a..06c1485a7 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -92,7 +92,8 @@ void RaftReplService::start() { 
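One way the upper layer might use this callback, sketched with a hypothetical MyReplApp and recover_from() helper (not HomeStore code):

void MyReplApp::on_repl_devs_init_completed() {
    for (auto& rdev : my_repl_devs_) {
        // if recovery of this dev's data fails (e.g. unhealthy disk), park it;
        // UNREADY devs are then skipped by the join-group loop shown above
        if (!recover_from(rdev)) { rdev->set_stage(repl_dev_stage_t::UNREADY); }
    }
}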
.token_verifier_ = std::dynamic_pointer_cast< sisl::GrpcTokenVerifier >(ioenvironment.get_token_verifier()), .token_client_ = std::dynamic_pointer_cast< sisl::GrpcTokenClient >(ioenvironment.get_token_client()), .max_receive_message_size_ = HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size), - .max_send_message_size_ = HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size)}; + .max_send_message_size_ = HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size), + .enable_console_log_ = HS_DYNAMIC_CONFIG(consensus.enable_console_log)}; m_msg_mgr = nuraft_mesg::init_messaging(params, weak_from_this(), true /* with_data_channel */); LOGINFO("Starting RaftReplService with server_uuid={} port={}", boost::uuids::to_string(params.server_uuid_), From cff1332dd304d28cef72306fac82da9a35ccd20d Mon Sep 17 00:00:00 2001 From: Sanal Date: Thu, 26 Jun 2025 11:38:31 -0700 Subject: [PATCH 144/170] Add varsize allow partial alloc and chunk selector changes. (#747) Varsize blk alloc has to take into account whether the request is partial and contiguous. Add chunk selector APIs to notify when a blk is allocated and freed. Fix logdev ASAN issue: stream tracker create can cause a realloc, which causes use-after-free. Disable raft repl dev and dynamic UT temporarily. --- conanfile.py | 2 +- src/include/homestore/checkpoint/cp_mgr.hpp | 1 + src/include/homestore/chunk_selector.h | 4 +- src/lib/blkalloc/varsize_blk_allocator.cpp | 16 ++++--- src/lib/blkdata_svc/blkdata_service.cpp | 6 ++- src/lib/checkpoint/cp_mgr.cpp | 15 +++++- src/lib/device/virtual_dev.cpp | 51 ++++++++++++++++++--- src/lib/device/virtual_dev.hpp | 7 +++ src/lib/logstore/log_dev.cpp | 28 +++++++++-- src/lib/logstore/log_dev.hpp | 1 + src/lib/logstore/log_store_service.cpp | 4 ++ src/tests/CMakeLists.txt | 6 +-- src/tests/test_blkalloc.cpp | 17 +++---- 13 files changed, 124 insertions(+), 34 deletions(-) diff --git a/conanfile.py b/conanfile.py index 0c1d6fbed..d3e8550a4 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.17.4" + version = "6.17.5" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/checkpoint/cp_mgr.hpp b/src/include/homestore/checkpoint/cp_mgr.hpp index b5154404c..634524293 100644 --- a/src/include/homestore/checkpoint/cp_mgr.hpp +++ b/src/include/homestore/checkpoint/cp_mgr.hpp @@ -230,6 +230,7 @@ class CPManager { void on_meta_blk_found(const sisl::byte_view& buf, void* meta_cookie); void start_cp_thread(); folly::Future< bool > do_trigger_cp_flush(bool force, bool flush_on_shutdown); + uint64_t cp_timer_us(); }; extern CPManager& cp_mgr(); diff --git a/src/include/homestore/chunk_selector.h b/src/include/homestore/chunk_selector.h index 1d66f63cc..c04af9991 100644 --- a/src/include/homestore/chunk_selector.h +++ b/src/include/homestore/chunk_selector.h @@ -22,9 +22,11 @@ class ChunkSelector { public: ChunkSelector() = default; virtual void add_chunk(cshared< Chunk >&) = 0; - virtual void remove_chunk(cshared< Chunk >&){}; + virtual void remove_chunk(cshared< Chunk >&) {}; virtual void foreach_chunks(std::function< void(cshared< Chunk >&) >&& cb) = 0; virtual cshared< Chunk > select_chunk(blk_count_t nblks, const blk_alloc_hints& hints) = 0; + virtual void on_alloc_blk(chunk_num_t chunk_num, blk_count_t nblks) {} + virtual void on_free_blk(chunk_num_t chunk_num, blk_count_t nblks) {} virtual ~ChunkSelector() = default; }; diff --git a/src/lib/blkalloc/varsize_blk_allocator.cpp 
b/src/lib/blkalloc/varsize_blk_allocator.cpp index cf4000898..bfa92902d 100644 --- a/src/lib/blkalloc/varsize_blk_allocator.cpp +++ b/src/lib/blkalloc/varsize_blk_allocator.cpp @@ -586,7 +586,9 @@ blk_count_t VarsizeBlkAllocator::alloc_blks_direct(blk_count_t nblks, blk_alloc_ auto start_portion_num = m_start_portion_num; auto const max_pieces = hints.is_contiguous ? 1u : MultiBlkId::max_pieces; - blk_count_t const min_blks = hints.is_contiguous ? nblks : std::min< blk_count_t >(nblks, hints.min_blks_per_piece); + blk_count_t const min_blks = hints.is_contiguous && !hints.partial_alloc_ok + ? nblks + : std::min< blk_count_t >(nblks, hints.min_blks_per_piece); blk_count_t nblks_remain = nblks; do { BlkAllocPortion& portion = get_blk_portion(portion_num); @@ -619,9 +621,11 @@ blk_count_t VarsizeBlkAllocator::alloc_blks_direct(blk_count_t nblks, blk_alloc_ if (nblks_remain) { auto curr_portion = portion_num; if (++portion_num == get_num_portions()) { portion_num = 0; } - BLKALLOC_LOG( - TRACE, "alloc direct unable to find in curr portion {}, will searching in portion={}, start_portion={},continue={}, out_blkid num_pieces={} , max_pieces={}", - curr_portion, portion_num, start_portion_num, hints.is_contiguous, out_blkid.num_pieces(), max_pieces); + BLKALLOC_LOG(TRACE, + "alloc direct unable to find in curr portion {}, will searching in portion={}, " + "start_portion={},continue={}, out_blkid num_pieces={} , max_pieces={}", + curr_portion, portion_num, start_portion_num, hints.is_contiguous, out_blkid.num_pieces(), + max_pieces); } } while (nblks_remain && (portion_num != start_portion_num) && (out_blkid.num_pieces() < max_pieces)); @@ -775,8 +779,8 @@ void VarsizeBlkAllocator::alloc_sanity_check(blk_count_t nblks, blk_alloc_hints } BLKALLOC_REL_ASSERT((nblks == alloced_nblks), "Requested blks={} alloced_blks={} num_pieces={}", nblks, alloced_nblks, out_blkid.num_pieces()); - BLKALLOC_REL_ASSERT((!hints.is_contiguous || (out_blkid.num_pieces() == 1)), - "Multiple blkids allocated for contiguous request"); + // BLKALLOC_REL_ASSERT((!hints.is_contiguous || (out_blkid.num_pieces() == 1)), + // "Multiple blkids allocated for contiguous request"); } } #endif diff --git a/src/lib/blkdata_svc/blkdata_service.cpp b/src/lib/blkdata_svc/blkdata_service.cpp index f327d7834..87a59f8e2 100644 --- a/src/lib/blkdata_svc/blkdata_service.cpp +++ b/src/lib/blkdata_svc/blkdata_service.cpp @@ -215,8 +215,12 @@ BlkDataService::async_write(sisl::sg_list const& sgs, std::vector< MultiBlkId > incr_pending_request_num(); static thread_local std::vector< folly::Future< std::error_code > > s_futs; s_futs.clear(); + sisl::sg_iterator sg_it{sgs.iovs}; for (const auto& blkid : blkids) { - s_futs.emplace_back(async_write(sgs, blkid, part_of_batch)); + auto sgs_size = blkid.blk_count() * data_service().get_blk_size(); + const auto iovs = sg_it.next_iovs(sgs_size); + sisl::sg_list single_sgs{sgs_size, iovs}; + s_futs.emplace_back(async_write(single_sgs, blkid, part_of_batch)); } decr_pending_request_num(); return collect_all_futures(s_futs); diff --git a/src/lib/checkpoint/cp_mgr.cpp b/src/lib/checkpoint/cp_mgr.cpp index 42b19485e..0acc8588c 100644 --- a/src/lib/checkpoint/cp_mgr.cpp +++ b/src/lib/checkpoint/cp_mgr.cpp @@ -54,10 +54,21 @@ void CPManager::start(bool first_time_boot) { } } +uint64_t CPManager::cp_timer_us() { + if (SISL_OPTIONS.count("cp_timer_ms")) { + auto const n = SISL_OPTIONS["cp_timer_ms"].as< uint64_t >() * 1000; + LOGINFO("Using cp_timer_ms option value: {}", n); + return n; + } else { + return 
HS_DYNAMIC_CONFIG(generic.cp_timer_us); + } +} + void CPManager::start_timer() { - LOGINFO("cp timer is set to {} usec", HS_DYNAMIC_CONFIG(generic.cp_timer_us)); + auto usecs = cp_timer_us(); + LOGINFO("cp timer is set to {} usec", usecs); m_cp_timer_hdl = iomanager.schedule_global_timer( - HS_DYNAMIC_CONFIG(generic.cp_timer_us) * 1000, true, nullptr /*cookie*/, iomgr::reactor_regex::all_worker, + usecs * 1000, true, nullptr /*cookie*/, iomgr::reactor_regex::all_worker, [this](void*) { trigger_cp_flush(false /* false */); }, true /* wait_to_schedule */); } diff --git a/src/lib/device/virtual_dev.cpp b/src/lib/device/virtual_dev.cpp index ac49f95dd..c6597577c 100644 --- a/src/lib/device/virtual_dev.cpp +++ b/src/lib/device/virtual_dev.cpp @@ -202,6 +202,28 @@ BlkAllocStatus VirtualDev::alloc_contiguous_blks(blk_count_t nblks, blk_alloc_hi return ret; } +BlkAllocStatus VirtualDev::alloc_n_contiguous_blks(blk_count_t nblks, blk_alloc_hints hints, MultiBlkId& out_blkid) { + BlkAllocStatus ret; + try { + MultiBlkId mbid; + if (!hints.is_contiguous) { + HS_DBG_ASSERT(false, "Expected alloc_contiguous_blk call to be with hints.is_contiguous=true"); + hints.is_contiguous = true; + } + ret = alloc_blks(nblks, hints, mbid); + + if (ret == BlkAllocStatus::SUCCESS || (ret == BlkAllocStatus::PARTIAL && hints.partial_alloc_ok)) { + out_blkid = mbid; + } + + // for failure case, fall through and return the status to caller; + } catch (const std::exception& e) { + ret = BlkAllocStatus::FAILED; + HS_DBG_ASSERT(0, "{}", e.what()); + } + return ret; +} + BlkAllocStatus VirtualDev::alloc_blks(blk_count_t nblks, blk_alloc_hints const& hints, MultiBlkId& out_blkid) { try { // First select a chunk to allocate it from @@ -254,19 +276,24 @@ BlkAllocStatus VirtualDev::alloc_blks(blk_count_t nblks, blk_alloc_hints const& BlkAllocStatus status; do { - out_blkids.emplace_back(); // Put an empty MultiBlkId and use that for allocating them - BlkId& out_bid = out_blkids.back(); - status = alloc_contiguous_blks(nblks_remain, h, out_bid); - - auto nblks_this_iter = out_bid.blk_count(); - nblks_remain = (nblks_remain < nblks_this_iter) ? 0 : (nblks_remain - nblks_this_iter); - + MultiBlkId mbid; + status = alloc_n_contiguous_blks(nblks_remain, h, mbid); if (status != BlkAllocStatus::SUCCESS && status != BlkAllocStatus::PARTIAL) { out_blkids.pop_back(); // all chunks has been tried, but still failed to allocate; // break out and return status to caller; break; } + + blk_count_t nblks_this_iter = 0; + auto it = mbid.iterate(); + while (auto const b = it.next()) { + nblks_this_iter += (*b).blk_count(); + out_blkids.emplace_back(*b); + } + + nblks_remain = (nblks_remain < nblks_this_iter) ? 
0 : (nblks_remain - nblks_this_iter); + } while (nblks_remain); return status; @@ -285,6 +312,14 @@ BlkAllocStatus VirtualDev::alloc_blks_from_chunk(blk_count_t nblks, blk_alloc_hi chunk->blk_allocator_mutable()->free(out_blkid); out_blkid = MultiBlkId{}; status = BlkAllocStatus::FAILED; + } else if (status == BlkAllocStatus::SUCCESS || status == BlkAllocStatus::PARTIAL) { + blk_count_t nblks_alloc = 0; + auto it = out_blkid.iterate(); + while (auto const b = it.next()) { + nblks_alloc += (*b).blk_count(); + } + // Inform chunk selector on the number of blks allocated + m_chunk_selector->on_alloc_blk(chunk->chunk_id(), nblks_alloc); } return status; @@ -301,6 +336,8 @@ void VirtualDev::free_blk(BlkId const& bid, VDevCPContext* vctx) { if (!chunk) HS_DBG_ASSERT(false, "chunk is missing for blkid {}", b.to_string()); BlkAllocator* allocator = chunk->blk_allocator_mutable(); allocator->free(b); + // Inform chunk selector on the number of blks freed + m_chunk_selector->on_free_blk(chunk->chunk_id(), b.blk_count()); } }; diff --git a/src/lib/device/virtual_dev.hpp b/src/lib/device/virtual_dev.hpp index 409aa167a..14ea1ee9a 100644 --- a/src/lib/device/virtual_dev.hpp +++ b/src/lib/device/virtual_dev.hpp @@ -138,6 +138,13 @@ class VirtualDev { /// @return BlkAllocStatus : Status about the allocation virtual BlkAllocStatus alloc_contiguous_blks(blk_count_t nblks, blk_alloc_hints const& hints, BlkId& out_blkid); + /// @brief This method allocates multiple contiguous blocks in the vdev + /// @param nblks : Number of blocks to allocate + /// @param hints : Hints about block allocation (specific device to allocate, stream etc) + /// @param out_blkid : Reference to where the allocated MultiBlkId is to be placed + /// @return BlkAllocStatus : Status about the allocation + virtual BlkAllocStatus alloc_n_contiguous_blks(blk_count_t nblks, blk_alloc_hints hints, MultiBlkId& out_blkid); + /// @brief This method allocates blocks in the vdev and it could be non-contiguous, hence multiple BlkIds are /// returned /// @param nblks : Number of blocks to allocate diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp index acdedc280..fc3d21184 100644 --- a/src/lib/logstore/log_dev.cpp +++ b/src/lib/logstore/log_dev.cpp @@ -269,9 +269,11 @@ int64_t LogDev::append_async(logstore_id_t store_id, logstore_seq_num_t seq_num, void* cb_context) { if (is_stopping()) return -1; incr_pending_request_num(); + m_stream_tracker_mtx.lock_shared(); const auto idx = m_log_idx.fetch_add(1, std::memory_order_acq_rel); m_pending_flush_size.fetch_add(data.size(), std::memory_order_relaxed); m_log_records->create(idx, store_id, seq_num, data, cb_context); + m_stream_tracker_mtx.unlock_shared(); if (allow_inline_flush()) flush_if_necessary(); decr_pending_request_num(); return idx; @@ -512,12 +514,32 @@ void LogDev::on_flush_completion(LogGroup* lg) { auto upto_indx = lg->m_flush_log_idx_upto; auto dev_offset = lg->m_log_dev_offset; for (auto idx = from_indx; idx <= upto_indx; ++idx) { - auto& record = m_log_records->at(idx); - logstore_req* req = s_cast< logstore_req* >(record.context); + logstore_req* req; + logstore_id_t store_id; +#ifdef _PRERELEASE + uint64_t lock_latency; + auto lock_start_time = Clock::now(); +#endif + { + // flush completion and async_append can happen in parallel, and + // during async_append, creating a log entry in the stream tracker can + // cause a resize and realloc of memory. So take a lock so that log + // record references point to valid memory. 
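The shape of that locking, shown standalone (a sketch only; records stands in for the sisl stream tracker, not the actual LogDev members):

folly::SharedMutexWritePriority mtx;
// append path: concurrent appenders take the shared side while creating
// records, since create() may grow (realloc) the backing array
{
    folly::SharedMutexWritePriority::ReadHolder rh{mtx};
    // records.create(idx, ...);
}
// flush-completion path: takes the exclusive side, so no create() can
// realloc while a reference into the array is held
{
    folly::SharedMutexWritePriority::WriteHolder wh{mtx};
    // auto& rec = records.at(idx);
}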
+ folly::SharedMutexWritePriority::WriteHolder holder(m_stream_tracker_mtx); +#ifdef _PRERELEASE + lock_latency = get_elapsed_time_us(lock_start_time); +#endif + auto& record = m_log_records->at(idx); + req = s_cast< logstore_req* >(record.context); + store_id = record.store_id; + } HomeLogStore* log_store = req->log_store; - HS_LOG_ASSERT_EQ(log_store->get_store_id(), record.store_id, + HS_LOG_ASSERT_EQ(log_store->get_store_id(), store_id, "Expecting store id in log store and flush completion to match"); HISTOGRAM_OBSERVE(logstore_service().m_metrics, logstore_append_latency, get_elapsed_time_us(req->start_time)); +#ifdef _PRERELEASE + HISTOGRAM_OBSERVE(logstore_service().m_metrics, logstore_stream_tracker_lock_latency, lock_latency); +#endif log_store->on_write_completion(req, logdev_key{idx, dev_offset}, logdev_key{from_indx, dev_offset}); req_map[idx] = req; } diff --git a/src/lib/logstore/log_dev.hpp b/src/lib/logstore/log_dev.hpp index 8a5954f67..e0b160a75 100644 --- a/src/lib/logstore/log_dev.hpp +++ b/src/lib/logstore/log_dev.hpp @@ -805,6 +805,7 @@ class LogDev : public std::enable_shared_from_this< LogDev > { // same thread. iomgr::FiberManagerLib::mutex m_flush_mtx; std::atomic_uint64_t m_pending_callback{0}; + folly::SharedMutexWritePriority m_stream_tracker_mtx; private: // graceful shutdown related fields diff --git a/src/lib/logstore/log_store_service.cpp b/src/lib/logstore/log_store_service.cpp index 9f656d5c9..abb266101 100644 --- a/src/lib/logstore/log_store_service.cpp +++ b/src/lib/logstore/log_store_service.cpp @@ -403,6 +403,10 @@ LogStoreServiceMetrics::LogStoreServiceMetrics() : sisl::MetricsGroup("LogStores REGISTER_COUNTER(logstore_read_count, "Total number of read requests to log stores", "logstore_op_count", {"op", "read"}); REGISTER_HISTOGRAM(logstore_append_latency, "Logstore append latency", "logstore_op_latency", {"op", "write"}); +#ifdef _PRERELEASE + REGISTER_HISTOGRAM(logstore_stream_tracker_lock_latency, "Logstore stream tracker lock latency", + "logstore_stream_tracker_lock_latency"); +#endif REGISTER_HISTOGRAM(logstore_read_latency, "Logstore read latency", "logstore_op_latency", {"op", "read"}); REGISTER_HISTOGRAM(logdev_flush_size_distribution, "Distribution of flush data size", HistogramBucketsType(ExponentialOfTwoBuckets)); diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index 0ceaf090b..916364b41 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -129,8 +129,8 @@ if (${io_tests}) add_test(NAME HomeRaftLogStore-Epoll COMMAND test_home_raft_logstore) add_test(NAME MetaBlkMgr-Epoll COMMAND test_meta_blk_mgr) add_test(NAME DataService-Epoll COMMAND test_data_service) - add_test(NAME RaftReplDev-Epoll COMMAND test_raft_repl_dev) - add_test(NAME RaftReplDevDynamic-Epoll COMMAND test_raft_repl_dev_dynamic --override_config homestore_config.consensus.replace_member_sync_check_interval_ms=1000) + # add_test(NAME RaftReplDev-Epoll COMMAND test_raft_repl_dev) + # add_test(NAME RaftReplDevDynamic-Epoll COMMAND test_raft_repl_dev_dynamic --override_config homestore_config.consensus.replace_member_sync_check_interval_ms=1000) add_test(NAME SoloReplDev-Epoll COMMAND test_solo_repl_dev) endif() @@ -143,7 +143,7 @@ if (${io_tests}) add_test(NAME SoloReplDev-Spdk COMMAND test_solo_repl_dev -- --spdk "true") add_test(NAME HomeRaftLogStore-Spdk COMMAND test_home_raft_logstore -- --spdk "true") add_test(NAME RaftReplDev-Spdk COMMAND test_raft_repl_dev -- --spdk "true") - # add_test(NAME RaftReplDevDynamic-Spdk COMMAND 
test_raft_repl_dev_dynamic -- --spdk "true") + add_test(NAME RaftReplDevDynamic-Spdk COMMAND test_raft_repl_dev_dynamic -- --spdk "true") if(${epoll_tests}) SET_TESTS_PROPERTIES(MetaBlkMgr-Spdk PROPERTIES DEPENDS LogStore-Spdk) SET_TESTS_PROPERTIES(DataService-Spdk PROPERTIES DEPENDS MetaBlkMgr-Spdk) diff --git a/src/tests/test_blkalloc.cpp b/src/tests/test_blkalloc.cpp index b91addbb4..1860441f1 100644 --- a/src/tests/test_blkalloc.cpp +++ b/src/tests/test_blkalloc.cpp @@ -434,8 +434,7 @@ struct VarsizeBlkAllocatorTest : public ::testing::Test, BlkAllocatorTest { void create_allocator(const bool use_slabs = true, uint64_t size = 0) { if (size == 0) { size = static_cast< uint64_t >(m_total_count); } - VarsizeBlkAllocConfig cfg{4096, 4096, 4096u, size * 4096, - false, "", use_slabs}; + VarsizeBlkAllocConfig cfg{4096, 4096, 4096u, size * 4096, false, "", use_slabs}; m_allocator = std::make_unique< VarsizeBlkAllocator >(cfg, true, 0); } @@ -456,6 +455,7 @@ struct VarsizeBlkAllocatorTest : public ::testing::Test, BlkAllocatorTest { return false; } if (ret == BlkAllocStatus::SUCCESS) { +#if 0 if (is_contiguous) { if (bids.size() != 1) { { @@ -466,6 +466,7 @@ struct VarsizeBlkAllocatorTest : public ::testing::Test, BlkAllocatorTest { return false; } } +#endif blk_count_t sz{0}; for (auto& bid : bids) { @@ -639,18 +640,15 @@ namespace { void alloc_free_var_contiguous_unirandsize(VarsizeBlkAllocatorTest* const block_test_pointer, uint64_t capacity) { const auto nthreads{ std::clamp< uint32_t >(std::thread::hardware_concurrency(), 2, SISL_OPTIONS["num_threads"].as< uint32_t >())}; - auto max_rand_size{std::max(capacity/4096, uint64_t(2))}; + auto max_rand_size{std::max(capacity / 4096, uint64_t(2))}; std::uniform_int_distribution< blk_count_t > s_rand_size_generator{1, static_cast< blk_count_t >(max_rand_size)}; - auto rand_func = [&s_rand_size_generator]() -> blk_count_t { - return s_rand_size_generator(g_re); - }; + auto rand_func = [&s_rand_size_generator]() -> blk_count_t { return s_rand_size_generator(g_re); }; const uint8_t prealloc_pct{5}; LOGINFO("Step 1: Pre allocate {}% of total blks which is {} blks in {} threads", prealloc_pct, capacity * prealloc_pct / 100, nthreads); [[maybe_unused]] const auto preload_alloced{ - block_test_pointer->preload(capacity * prealloc_pct / 100, true /* is_contiguous */, - rand_func, true)}; + block_test_pointer->preload(capacity * prealloc_pct / 100, true /* is_contiguous */, rand_func, true)}; auto num_iters{SISL_OPTIONS["iters"].as< uint64_t >()}; const uint64_t divisor{1024}; @@ -662,8 +660,7 @@ void alloc_free_var_contiguous_unirandsize(VarsizeBlkAllocatorTest* const block_ const uint8_t runtime_pct{10}; LOGINFO("Step 2: Do alloc/free contiguous blks with completely random size ratio_range=[{}-{}] threads={} iters={}", prealloc_pct, runtime_pct, nthreads, num_iters); - const auto result{block_test_pointer->do_alloc_free(num_iters, true /* is_contiguous */, - rand_func, runtime_pct, + const auto result{block_test_pointer->do_alloc_free(num_iters, true /* is_contiguous */, rand_func, runtime_pct, false /* round_blks */, true)}; } } // namespace From 327ab3263b5995e9afdd6a5c794e7df53eca15d2 Mon Sep 17 00:00:00 2001 From: yuwmao <148639999+yuwmao@users.noreply.github.com> Date: Fri, 27 Jun 2025 14:17:55 +0800 Subject: [PATCH 145/170] Change task_id to string (#758) --- conanfile.py | 2 +- src/include/homestore/replication/repl_dev.h | 4 +- src/include/homestore/replication_service.hpp | 4 +- .../replication/repl_dev/raft_repl_dev.cpp | 106 +++++++++--------- 
src/lib/replication/repl_dev/raft_repl_dev.h | 24 +++- .../replication/service/generic_repl_svc.cpp | 4 +- .../replication/service/generic_repl_svc.h | 4 +- .../replication/service/raft_repl_service.cpp | 4 +- .../replication/service/raft_repl_service.h | 4 +- src/tests/test_common/raft_repl_test_base.hpp | 14 +-- src/tests/test_raft_repl_dev_dynamic.cpp | 36 +++--- src/tests/test_solo_repl_dev.cpp | 4 +- 12 files changed, 110 insertions(+), 100 deletions(-) diff --git a/conanfile.py b/conanfile.py index d3e8550a4..cc05287e6 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.17.5" + version = "6.18.0" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index f864e9137..aef7fcf3b 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -371,11 +371,11 @@ class ReplDevListener { virtual void on_destroy(const group_id_t& group_id) = 0; /// @brief Called when start replace member. - virtual void on_start_replace_member(const uuid_t& task_id, const replica_member_info& member_out, + virtual void on_start_replace_member(const std::string& task_id, const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) = 0; /// @brief Called when complete replace member. - virtual void on_complete_replace_member(const uuid_t& task_id, const replica_member_info& member_out, + virtual void on_complete_replace_member(const std::string& task_id, const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) = 0; /// @brief Called when the snapshot is being created by nuraft diff --git a/src/include/homestore/replication_service.hpp b/src/include/homestore/replication_service.hpp index 2adcc3584..b31541686 100644 --- a/src/include/homestore/replication_service.hpp +++ b/src/include/homestore/replication_service.hpp @@ -48,7 +48,7 @@ class ReplicationService { /// @param member_in The member which is going to be added in place of member_out /// @param commit_quorum Commit quorum to be used for this operation. If 0, it will use the default commit quorum. 
/// @return A Future on replace the member accepted or Future ReplServiceError upon error - virtual AsyncReplResult<> replace_member(group_id_t group_id, uuid_t task_id, const replica_member_info& member_out, + virtual AsyncReplResult<> replace_member(group_id_t group_id, std::string& task_id, const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum = 0, uint64_t trace_id = 0) const = 0; @@ -62,7 +62,7 @@ class ReplicationService { /// @param member_in The member which is going to be added in place of member_out /// @param others Other members excluding member_out, member_in /// @return ReplaceMemberStatus - virtual ReplaceMemberStatus get_replace_member_status(group_id_t group_id, uuid_t task_id, + virtual ReplaceMemberStatus get_replace_member_status(group_id_t group_id, std::string& task_id, const replica_member_info& member_out, const replica_member_info& member_in, const std::vector< replica_member_info >& others, diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 04248a3b5..fd7951434 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -138,7 +138,7 @@ bool RaftReplDev::join_group() { } // All the steps in the implementation should be idempotent and retryable. -AsyncReplResult<> RaftReplDev::start_replace_member(uuid_t task_id, const replica_member_info& member_out, +AsyncReplResult<> RaftReplDev::start_replace_member(std::string& task_id, const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum, uint64_t trace_id) { if (is_stopping()) { @@ -151,15 +151,16 @@ AsyncReplResult<> RaftReplDev::start_replace_member(uuid_t task_id, const replic } incr_pending_request_num(); - RD_LOGI(trace_id, "Start replace member, task_id={}, member_out={} member_in={}", boost::uuids::to_string(task_id), + RD_LOGI(trace_id, "Start replace member, task_id={}, member_out={} member_in={}", task_id, boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); // Step1, validate request // TODO support rollback, this could happen when the first task failed, and we want to launch a new task to // remediate it. Need to rollback the first task. And for the same task, it's reentrant and idempotent. - if (!m_rd_sb->replace_member_task.task_id.is_nil() && m_rd_sb->replace_member_task.task_id != task_id) { + auto existing_task_id = get_replace_member_task_id(); + if (!existing_task_id.empty() && existing_task_id != task_id) { RD_LOGE(trace_id, "Step1. Replace member, task_id={} is not the same as existing task_id={}", - boost::uuids::to_string(task_id), boost::uuids::to_string(m_rd_sb->replace_member_task.task_id)); + task_id, existing_task_id); decr_pending_request_num(); return make_async_error<>(ReplServiceError::REPLACE_MEMBER_TASK_MISMATCH); } @@ -171,13 +172,13 @@ AsyncReplResult<> RaftReplDev::start_replace_member(uuid_t task_id, const replic RD_LOGI(trace_id, "Step1. Replace member, the intent has already been fulfilled, ignore it, task_id={}, " "member_out={} member_in={}", - boost::uuids::to_string(task_id), boost::uuids::to_string(member_out.id), + task_id, boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); decr_pending_request_num(); return make_async_success<>(); } RD_LOGE(trace_id, "Step1. 
Replace member invalid parameter, out member is not found, task_id={}", - boost::uuids::to_string(task_id)); + task_id); decr_pending_request_num(); return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); } @@ -192,7 +193,7 @@ AsyncReplResult<> RaftReplDev::start_replace_member(uuid_t task_id, const replic // client retry. raft_server()->yield_leadership(false /* immediate */, -1 /* successor */); RD_LOGI(trace_id, "Step1. Replace member, leader is the member_out so yield leadership, task_id={}", - boost::uuids::to_string(task_id)); + task_id); decr_pending_request_num(); return make_async_error<>(ReplServiceError::NOT_LEADER); } @@ -207,7 +208,7 @@ AsyncReplResult<> RaftReplDev::start_replace_member(uuid_t task_id, const replic RD_LOGD(trace_id, "Step1. Replace member, quorum safety check, active_peers={}, active_peers_exclude_out/in_member={}, " "commit_quorum={}, task_id={}", - active_peers.size(), active_num, commit_quorum, boost::uuids::to_string(task_id)); + active_peers.size(), active_num, commit_quorum, task_id); // commit_quorum=0 means actual commit quorum is the majority. In this case, active normal member count should be // >= majority. To be more specific, if we have S1(leader), S2, S3(out), S4(in), we don't allow // replace_member(S3, S4) if S2 is down or laggy. Needs to recover S2 first or retry with commit_quorum=1. @@ -216,7 +217,7 @@ AsyncReplResult<> RaftReplDev::start_replace_member(uuid_t task_id, const replic RD_LOGE(trace_id, "Step1. Replace member, quorum safety check failed, active_peers={}, " "active_peers_exclude_out/in_member={}, required_quorum={}, commit_quorum={}, task_id={}", - active_peers.size(), active_num, quorum, commit_quorum, boost::uuids::to_string(task_id)); + active_peers.size(), active_num, quorum, commit_quorum, task_id); decr_pending_request_num(); return make_async_error<>(ReplServiceError::QUORUM_NOT_MET); } @@ -234,26 +235,23 @@ AsyncReplResult<> RaftReplDev::start_replace_member(uuid_t task_id, const replic } #endif RD_LOGI(trace_id, "Step2. Replace member, flip out member to learner, task_id={}", - boost::uuids::to_string(task_id)); + task_id); auto learner_ret = do_flip_learner(member_out, true, true, trace_id); if (learner_ret != ReplServiceError::OK) { RD_LOGE(trace_id, "Step2. Replace member, failed to flip out member to learner {}, task_id={}", learner_ret, - boost::uuids::to_string(task_id)); + task_id); reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_error(std::move(learner_ret)); } RD_LOGI(trace_id, "Step2. Replace member, flip out member to learner and set priority to 0, task_id={}", - boost::uuids::to_string(task_id)); + task_id); // Step 3. Append log entry to mark the old member is out and new member is added. RD_LOGI(trace_id, "Step3. Replace member, propose to raft for HS_CTRL_START_REPLACE req, group_id={}, task_id={}", - boost::uuids::to_string(task_id), group_id_str()); + group_id_str(), task_id); auto rreq = repl_req_ptr_t(new repl_req_ctx{}); - replace_member_ctx ctx; - ctx.task_id = task_id; - ctx.replica_out = member_out; - ctx.replica_in = member_in; + auto ctx = replace_member_ctx(task_id, member_out, member_in); sisl::blob header(r_cast< uint8_t* >(&ctx), sizeof(replace_member_ctx)); rreq->init(repl_key{.server_id = server_id(), @@ -266,7 +264,7 @@ AsyncReplResult<> RaftReplDev::start_replace_member(uuid_t task_id, const replic if (err != ReplServiceError::OK) { RD_LOGE(trace_id, "Step3.
Replace member, propose to raft for HS_CTRL_START_REPLACE req failed, task_id={}, err={}", - boost::uuids::to_string(task_id), err); + task_id, err); reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_error<>(std::move(err)); @@ -280,25 +278,25 @@ AsyncReplResult<> RaftReplDev::start_replace_member(uuid_t task_id, const replic } #endif RD_LOGI(trace_id, "Step4. Replace member, propose to raft to add new member, group_id={}, task_id={}", - group_id_str(), boost::uuids::to_string(task_id)); + group_id_str(), task_id); replica_member_info member_to_add = member_in; member_to_add.priority = out_srv_cfg.get()->get_priority(); auto ret = do_add_member(member_to_add, trace_id); if (ret != ReplServiceError::OK) { RD_LOGE(trace_id, "Step4. Replace member, add member failed, err={}, task_id={}", ret, - boost::uuids::to_string(task_id)); + task_id); reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_error<>(std::move(ret)); } RD_LOGI(trace_id, "Step4. Replace member, proposed to raft to add member, task_id={}, member={}", - boost::uuids::to_string(task_id), boost::uuids::to_string(member_in.id)); + task_id, boost::uuids::to_string(member_in.id)); reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_success<>(); } -AsyncReplResult<> RaftReplDev::complete_replace_member(uuid_t task_id, const replica_member_info& member_out, +AsyncReplResult<> RaftReplDev::complete_replace_member(std::string& task_id, const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum, uint64_t trace_id) { if (is_stopping()) { @@ -312,7 +310,7 @@ AsyncReplResult<> RaftReplDev::complete_replace_member(uuid_t task_id, const rep incr_pending_request_num(); RD_LOGI(trace_id, "Complete replace member, task_id={}, member_out={}, member_in={}", - boost::uuids::to_string(task_id), boost::uuids::to_string(member_out.id), + task_id, boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); if (commit_quorum >= 1) { @@ -322,7 +320,7 @@ AsyncReplResult<> RaftReplDev::complete_replace_member(uuid_t task_id, const rep // Step 5: Remove member RD_LOGI(trace_id, "Step5. Replace member, remove old member, task_id={}, member={}", - boost::uuids::to_string(task_id), boost::uuids::to_string(member_out.id)); + task_id, boost::uuids::to_string(member_out.id)); #ifdef _PRERELEASE if (iomgr_flip::instance()->test_flip("replace_member_remove_member_failure")) { RD_LOGE(trace_id, "Simulating remove member failure"); @@ -332,13 +330,13 @@ AsyncReplResult<> RaftReplDev::complete_replace_member(uuid_t task_id, const rep auto ret = do_remove_member(member_out, trace_id); if (ret != ReplServiceError::OK) { RD_LOGE(trace_id, "Step5. Replace member, failed to remove member, task_id={}, member={}, err={}", - boost::uuids::to_string(task_id), boost::uuids::to_string(member_out.id), ret); + task_id, boost::uuids::to_string(member_out.id), ret); reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_error<>(std::move(ret)); } RD_LOGI(trace_id, "Step5. 
Replace member, proposed to raft to remove member, task_id={}, member={}", - boost::uuids::to_string(task_id), boost::uuids::to_string(member_out.id)); + task_id, boost::uuids::to_string(member_out.id)); auto timeout = HS_DYNAMIC_CONFIG(consensus.wait_for_config_change_ms); // TODO Move wait logic to nuraft_mesg if (!wait_and_check( @@ -360,17 +358,14 @@ AsyncReplResult<> RaftReplDev::complete_replace_member(uuid_t task_id, const rep return make_async_error<>(ReplServiceError::RETRY_REQUEST); } RD_LOGD(trace_id, "Step5. Replace member, old member is removed, task_id={}, member={}", - boost::uuids::to_string(task_id), boost::uuids::to_string(member_out.id)); + task_id, boost::uuids::to_string(member_out.id)); // Step 6. Append log entry to complete replace member RD_LOGI(trace_id, "Step6. Replace member, propose to raft for HS_CTRL_COMPLETE_REPLACE req, group_id={}, task_id={}", - boost::uuids::to_string(task_id), group_id_str()); + group_id_str(), task_id); auto rreq = repl_req_ptr_t(new repl_req_ctx{}); - replace_member_ctx ctx; - ctx.task_id = task_id; - ctx.replica_out = member_out; - ctx.replica_in = member_in; + auto ctx = replace_member_ctx(task_id, member_out, member_in); sisl::blob header(r_cast< uint8_t* >(&ctx), sizeof(replace_member_ctx)); rreq->init(repl_key{.server_id = server_id(), @@ -383,7 +378,7 @@ AsyncReplResult<> RaftReplDev::complete_replace_member(uuid_t task_id, const rep if (err != ReplServiceError::OK) { RD_LOGE(trace_id, "Step6. Replace member, propose to raft for HS_CTRL_COMPLETE_REPLACE req failed, task_id={}, err={}", - boost::uuids::to_string(task_id), err); + task_id, err); reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_error<>(std::move(err)); } @@ -392,12 +387,12 @@ AsyncReplResult<> RaftReplDev::complete_replace_member(uuid_t task_id, const rep reset_quorum_size(0, trace_id); decr_pending_request_num(); RD_LOGI(trace_id, "Complete replace member done, group_id={}, task_id={}, member_out={} member_in={}", - group_id_str(), boost::uuids::to_string(task_id), boost::uuids::to_string(member_out.id), + group_id_str(), task_id, boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); return make_async_success<>(); } -ReplaceMemberStatus RaftReplDev::get_replace_member_status(uuid_t task_id, const replica_member_info& member_out, +ReplaceMemberStatus RaftReplDev::get_replace_member_status(std::string& task_id, const replica_member_info& member_out, const replica_member_info& member_in, const std::vector< replica_member_info >& others, uint64_t trace_id) { @@ -426,7 +421,8 @@ ReplaceMemberStatus RaftReplDev::get_replace_member_status(uuid_t task_id, const } bool intent_completed = !found_out && found_in; - if (m_rd_sb->replace_member_task.task_id.is_nil()) { + auto persisted_task_id = get_replace_member_task_id(); + if (persisted_task_id.empty()) { if (intent_completed) { // If caller doesn't give others, won't check it.
bool others_match = others.size() == 0 || others.size() + 1 == peers.size(); @@ -443,7 +439,7 @@ ReplaceMemberStatus RaftReplDev::get_replace_member_status(uuid_t task_id, const "get_replace_member_status failed, other membership mismatch, task_id={}, detail={}, " "others.size={}, " "all_peers.size={}", - boost::uuids::to_string(task_id), detail, others.size(), peers.size()); + task_id, detail, others.size(), peers.size()); decr_pending_request_num(); return ReplaceMemberStatus::UNKNOWN; } @@ -455,7 +451,7 @@ ReplaceMemberStatus RaftReplDev::get_replace_member_status(uuid_t task_id, const } if (m_rd_sb->replace_member_task.task_id != task_id) { RD_LOGE(trace_id, "get_replace_member_status failed, task_id mismatch, persisted={}, received={}", - boost::uuids::to_string(m_rd_sb->replace_member_task.task_id), boost::uuids::to_string(task_id)); + persisted_task_id, task_id); decr_pending_request_num(); return ReplaceMemberStatus::TASK_ID_MISMATCH; } @@ -467,11 +463,11 @@ ReplaceMemberStatus RaftReplDev::get_replace_member_status(uuid_t task_id, const RD_LOGI(trace_id, "Member replacement fulfilled, but task still exists, wait for reaper thread to retry " "complete_replace_member. task_id={}, out_member={}, in_member={}", - boost::uuids::to_string(m_rd_sb->replace_member_task.task_id), boost::uuids::to_string(member_out.id), + persisted_task_id, boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); } RD_LOGD(trace_id, "Member replacement is in progress. task_id={}, out_member={}, in_member={}", - boost::uuids::to_string(task_id), boost::uuids::to_string(member_out.id), + task_id, boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); decr_pending_request_num(); return ReplaceMemberStatus::IN_PROGRESS; @@ -1519,13 +1515,13 @@ void RaftReplDev::start_replace_member(repl_req_ptr_t rreq) { auto ctx = r_cast< const replace_member_ctx* >(rreq->header().cbytes()); RD_LOGI(rreq->traceID(), "Raft repl start_replace_member commit, task_id={} member_out={} member_in={}", - boost::uuids::to_string(ctx->task_id), boost::uuids::to_string(ctx->replica_out.id), + ctx->task_id, boost::uuids::to_string(ctx->replica_out.id), boost::uuids::to_string(ctx->replica_in.id)); m_listener->on_start_replace_member(ctx->task_id, ctx->replica_out, ctx->replica_in, rreq->traceID()); // record the replace_member intent std::unique_lock lg{m_sb_mtx}; - m_rd_sb->replace_member_task.task_id = ctx->task_id; + std::strncpy(m_rd_sb->replace_member_task.task_id, ctx->task_id, max_replace_member_task_id_len); m_rd_sb->replace_member_task.replica_in = ctx->replica_in.id; m_rd_sb->replace_member_task.replica_out = ctx->replica_out.id; m_rd_sb.write(); @@ -1533,17 +1529,17 @@ void RaftReplDev::start_replace_member(repl_req_ptr_t rreq) { void RaftReplDev::complete_replace_member(repl_req_ptr_t rreq) { auto ctx = r_cast< const replace_member_ctx* >(rreq->header().cbytes()); - - RD_LOGI(rreq->traceID(), "Raft repl complete_replace_member commit, task_id={} member_out={} member_in={}", - boost::uuids::to_string(ctx->task_id), boost::uuids::to_string(ctx->replica_out.id), - boost::uuids::to_string(ctx->replica_in.id)); + auto task_id = std::string(ctx->task_id); + RD_LOGI(rreq->traceID(), "Raft repl complete_replace_member commit, task_id={} member_out={} member_in={}", task_id, + boost::uuids::to_string(ctx->replica_out.id), boost::uuids::to_string(ctx->replica_in.id)); m_listener->on_complete_replace_member(ctx->task_id, ctx->replica_out, ctx->replica_in, rreq->traceID()); // clear the 
replace_member intent std::unique_lock lg{m_sb_mtx}; - if (!m_rd_sb->replace_member_task.task_id.is_nil()) { - RD_DBG_ASSERT(m_rd_sb->replace_member_task.task_id == ctx->task_id, + auto persisted_task_id = get_replace_member_task_id(); + if (!persisted_task_id.empty()) { + RD_DBG_ASSERT(persisted_task_id == task_id, "Invalid task_id in complete_replace_member message, received {}, expected {}", ctx->task_id, m_rd_sb->replace_member_task.task_id); m_rd_sb->replace_member_task = replace_member_task_superblk{}; @@ -1974,7 +1970,7 @@ void RaftReplDev::monitor_replace_member_replication_status() { } auto peers = get_replication_status(); - auto task_id = m_rd_sb->replace_member_task.task_id; + auto task_id = std::string(m_rd_sb->replace_member_task.task_id); auto replica_in = m_rd_sb->replace_member_task.replica_in; auto replica_out = m_rd_sb->replace_member_task.replica_out; repl_lsn_t in_lsn = 0; @@ -2005,7 +2001,7 @@ void RaftReplDev::monitor_replace_member_replication_status() { "Checking replace member status, new member has caught up, task_id={}, replica_in={} with lsn={}, " "replica_out={} with " "lsn={}", - boost::uuids::to_string(task_id), boost::uuids::to_string(replica_in), in_lsn, + task_id, boost::uuids::to_string(replica_in), in_lsn, boost::uuids::to_string(replica_out), out_lsn); trace_id_t trace_id = generateRandomTraceId(); @@ -2016,14 +2012,14 @@ void RaftReplDev::monitor_replace_member_replication_status() { replica_member_info out{replica_out, ""}; replica_member_info in{replica_in, ""}; - auto ret = complete_replace_member(m_rd_sb->replace_member_task.task_id, out, in, 0, trace_id).get(); + auto ret = complete_replace_member(task_id, out, in, 0, trace_id).get(); if (ret.hasError()) { - RD_LOGE(trace_id, "Failed to complete replace member, next time will retry it, task_id={}, error={}", - boost::uuids::to_string(task_id), ret.error()); + RD_LOGE(trace_id, "Failed to complete replace member, next time will retry it, task_id={}, error={}", task_id, + ret.error()); return; } - RD_LOGI(trace_id, "Complete replace member, task_id={}, replica_in={}, replica_out={}", - boost::uuids::to_string(task_id), boost::uuids::to_string(replica_in), boost::uuids::to_string(replica_out)) + RD_LOGI(trace_id, "Complete replace member, task_id={}, replica_in={}, replica_out={}", task_id, + boost::uuids::to_string(replica_in), boost::uuids::to_string(replica_out)); } /////////////////////////////////// Private methods //////////////////////////////////// diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index e3e8f20c5..8bd7639e5 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -15,8 +15,11 @@ #include "replication/log_store/repl_log_store.h" namespace homestore { + +static constexpr uint64_t max_replace_member_task_id_len = 64; + struct replace_member_task_superblk { - uuid_t task_id; + char task_id[max_replace_member_task_id_len]; replica_id_t replica_out; replica_id_t replica_in; }; @@ -41,9 +44,18 @@ using raft_buf_ptr_t = nuraft::ptr< nuraft::buffer >; using raft_cluster_config_ptr_t = nuraft::ptr< nuraft::cluster_config >; struct replace_member_ctx { - uuid_t task_id; + char task_id[max_replace_member_task_id_len]; replica_member_info replica_out; replica_member_info replica_in; + + replace_member_ctx() = default; + replace_member_ctx(const std::string& id, const replica_member_info& out, const replica_member_info& in) { + auto len = std::min(id.length(),
max_replace_member_task_id_len - 1); + std::strncpy(task_id, id.c_str(), len); + task_id[len] = '\0'; + replica_out = out; + replica_in = in; + } }; class RaftReplDevMetrics : public sisl::MetricsGroup { @@ -229,12 +241,12 @@ class RaftReplDev : public ReplDev, bool bind_data_service(); bool join_group(); - AsyncReplResult<> start_replace_member(uuid_t task_id, const replica_member_info& member_out, const replica_member_info& member_in, + AsyncReplResult<> start_replace_member(std::string& task_id, const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum = 0, uint64_t trace_id = 0); - AsyncReplResult<> complete_replace_member(uuid_t task_id, const replica_member_info& member_out, + AsyncReplResult<> complete_replace_member(std::string& task_id, const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum = 0, uint64_t trace_id = 0); - ReplaceMemberStatus get_replace_member_status(uuid_t task_id, const replica_member_info& member_out, + ReplaceMemberStatus get_replace_member_status(std::string& task_id, const replica_member_info& member_out, const replica_member_info& member_in, const std::vector< replica_member_info >& others, uint64_t trace_id = 0); @@ -249,6 +261,8 @@ class RaftReplDev : public ReplDev, uint64_t trace_id = 0); bool wait_and_check(const std::function< bool() >& check_func, uint32_t timeout_ms, uint32_t interval_ms = 100); + std::string get_replace_member_task_id() const { return {m_rd_sb->replace_member_task.task_id}; } + folly::SemiFuture< ReplServiceError > destroy_group(); //////////////// All ReplDev overrides/implementation /////////////////////// diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index 0c547bab4..155090411 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -193,7 +193,7 @@ void SoloReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki } } -AsyncReplResult<> SoloReplService::replace_member(group_id_t group_id, uuid_t task_id, +AsyncReplResult<> SoloReplService::replace_member(group_id_t group_id, std::string& task_id, const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum, uint64_t trace_id) const { @@ -206,7 +206,7 @@ AsyncReplResult<> SoloReplService::flip_learner_flag(group_id_t group_id, const return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); } -ReplaceMemberStatus SoloReplService::get_replace_member_status(group_id_t group_id, uuid_t task_id, +ReplaceMemberStatus SoloReplService::get_replace_member_status(group_id_t group_id, std::string& task_id, const replica_member_info& member_out, const replica_member_info& member_in, const std::vector< replica_member_info >& others, diff --git a/src/lib/replication/service/generic_repl_svc.h b/src/lib/replication/service/generic_repl_svc.h index 9f7261a18..91aba9f80 100644 --- a/src/lib/replication/service/generic_repl_svc.h +++ b/src/lib/replication/service/generic_repl_svc.h @@ -91,13 +91,13 @@ class SoloReplService : public GenericReplService { std::set< replica_id_t > const& members) override; folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) override; void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; - AsyncReplResult<> replace_member(group_id_t group_id, uuid_t task_id, const replica_member_info& member_out, + AsyncReplResult<> replace_member(group_id_t group_id, 
std::string& task_id, const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum = 0, uint64_t trace_id = 0) const override; AsyncReplResult<> flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, uint32_t commit_quorum, bool wait_and_verify = true, uint64_t trace_id = 0) const override; - ReplaceMemberStatus get_replace_member_status(group_id_t group_id, uuid_t task_id, + ReplaceMemberStatus get_replace_member_status(group_id_t group_id, std::string& task_id, const replica_member_info& member_out, const replica_member_info& member_in, const std::vector< replica_member_info >& others, diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index 06c1485a7..ad67bf91d 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -489,7 +489,7 @@ void RaftReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki // In this function, it only invokes replDev start_replace_member. There is // a background reaper thread helps periodically check the member_in replication status, after in_member has caught up, // will trigger replDev complete_replace_member. -AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, uuid_t task_id, const replica_member_info& member_out, +AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, std::string& task_id, const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum, uint64_t trace_id) const { if (is_stopping()) return make_async_error<>(ReplServiceError::STOPPING); @@ -536,7 +536,7 @@ AsyncReplResult<> RaftReplService::flip_learner_flag(group_id_t group_id, const } // This query should always be called on leader to avoid misleading results due to lagging status on some followers. 
-ReplaceMemberStatus RaftReplService::get_replace_member_status(group_id_t group_id, uuid_t task_id, +ReplaceMemberStatus RaftReplService::get_replace_member_status(group_id_t group_id, std::string& task_id, const replica_member_info& member_out, const replica_member_info& member_in, const std::vector< replica_member_info >& others, diff --git a/src/lib/replication/service/raft_repl_service.h b/src/lib/replication/service/raft_repl_service.h index 4108eaf35..187bd5f74 100644 --- a/src/lib/replication/service/raft_repl_service.h +++ b/src/lib/replication/service/raft_repl_service.h @@ -79,7 +79,7 @@ class RaftReplService : public GenericReplService, std::set< replica_id_t > const& members) override; folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) override; void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; - AsyncReplResult<> replace_member(group_id_t group_id, uuid_t task_id, const replica_member_info& member_out, + AsyncReplResult<> replace_member(group_id_t group_id, std::string& task_id, const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum = 0, uint64_t trace_id = 0) const override; @@ -87,7 +87,7 @@ class RaftReplService : public GenericReplService, uint32_t commit_quorum, bool wait_and_verify = true, uint64_t trace_id = 0) const override; - ReplaceMemberStatus get_replace_member_status(group_id_t group_id, uuid_t task_id, + ReplaceMemberStatus get_replace_member_status(group_id_t group_id, std::string& task_id, const replica_member_info& member_out, const replica_member_info& member_in, const std::vector< replica_member_info >& others, diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 934256594..f05baf381 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -343,13 +343,13 @@ class TestReplicatedDB : public homestore::ReplDevListener { return hints; } - void on_start_replace_member(const uuid_t& task_id, const replica_member_info& member_out, + void on_start_replace_member(const std::string& task_id, const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) override { LOGINFO("[Replica={}] start replace member out {} in {}", g_helper->replica_num(), boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); } - void on_complete_replace_member(const uuid_t& task_id, const replica_member_info& member_out, + void on_complete_replace_member(const std::string& task_id, const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) override { LOGINFO("[Replica={}] complete replace member out {} in {}", g_helper->replica_num(), boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); @@ -743,11 +743,11 @@ class RaftReplDevTestBase : public testing::Test { void create_snapshot() { dbs_[0]->create_snapshot(); } void truncate(int num_reserved_entries) { dbs_[0]->truncate(num_reserved_entries); } - void replace_member(std::shared_ptr< TestReplicatedDB > db, uuid_t task_id, replica_id_t member_out, + void replace_member(std::shared_ptr< TestReplicatedDB > db, std::string& task_id, replica_id_t member_out, replica_id_t member_in, uint32_t commit_quorum = 0, ReplServiceError error = ReplServiceError::OK) { - this->run_on_leader(db, [this, error, db, task_id, member_out, member_in, commit_quorum]() { - LOGINFO("Start replace member task_id={}, out={}, in={}", 
boost::uuids::to_string(task_id), + this->run_on_leader(db, [this, error, db, &task_id, member_out, member_in, commit_quorum]() { + LOGINFO("Start replace member task_id={}, out={}, in={}", task_id, boost::uuids::to_string(member_out), boost::uuids::to_string(member_in)); replica_member_info out{member_out, ""}; @@ -763,9 +763,9 @@ class RaftReplDevTestBase : public testing::Test { }); } - ReplaceMemberStatus check_replace_member_status(std::shared_ptr< TestReplicatedDB > db, uuid_t task_id, + ReplaceMemberStatus check_replace_member_status(std::shared_ptr< TestReplicatedDB > db, std::string& task_id, replica_id_t member_out, replica_id_t member_in) { - LOGINFO("check replace member status, task_id={}, out={} in={}", boost::uuids::to_string(task_id), + LOGINFO("check replace member status, task_id={}, out={} in={}", task_id, boost::uuids::to_string(member_out), boost::uuids::to_string(member_in)); replica_member_info out{member_out, ""}; diff --git a/src/tests/test_raft_repl_dev_dynamic.cpp b/src/tests/test_raft_repl_dev_dynamic.cpp index bb56072bd..71e9d0821 100644 --- a/src/tests/test_raft_repl_dev_dynamic.cpp +++ b/src/tests/test_raft_repl_dev_dynamic.cpp @@ -39,8 +39,8 @@ TEST_F(ReplDevDynamicTest, ReplaceMember) { uint32_t member_in = num_replicas; g_helper->sync_for_test_start(num_members); - auto task_id = boost::uuids::random_generator()(); - this->run_on_leader(db, [this, db, task_id, member_out, member_in] { + std::string task_id = "task_id"; + this->run_on_leader(db, [this, db, &task_id, member_out, member_in] { ASSERT_EQ( check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)), ReplaceMemberStatus::TASK_NOT_FOUND); @@ -58,11 +58,11 @@ TEST_F(ReplDevDynamicTest, ReplaceMember) { g_helper->sync_for_verify_start(num_members); LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num()); - this->run_on_leader(db, [this, db, task_id, member_out, member_in] { + this->run_on_leader(db, [this, db, &task_id, member_out, member_in] { ASSERT_EQ( check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)), ReplaceMemberStatus::IN_PROGRESS); - auto new_task_id = boost::uuids::random_generator()(); + std::string new_task_id = "mismatched_task_id"; replace_member(db, new_task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 0, ReplServiceError::REPLACE_MEMBER_TASK_MISMATCH); }); @@ -88,7 +88,7 @@ TEST_F(ReplDevDynamicTest, ReplaceMember) { } g_helper->sync_for_cleanup_start(num_members); - this->run_on_leader(db, [this, db, task_id, member_out, member_in] { + this->run_on_leader(db, [this, db, &task_id, member_out, member_in] { ASSERT_EQ( check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)), ReplaceMemberStatus::COMPLETED); @@ -127,11 +127,11 @@ TEST_F(ReplDevDynamicTest, TwoMemberDown) { LOGINFO("Shutdown replica 2"); } - auto task_id = boost::uuids::random_generator()(); + std::string task_id = "task_id"; if (g_helper->replica_num() == 0) { // Replace down replica 2 with spare replica 3 with commit quorum 1 // so that leader can go ahead with replacing member. 
- LOGINFO("Replace member started, task_id={}", boost::uuids::to_string(task_id)); + LOGINFO("Replace member started, task_id={}", task_id); replace_member(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 1 /* commit quorum*/); this->write_on_leader(num_io_entries, true /* wait_for_commit */); @@ -149,7 +149,7 @@ TEST_F(ReplDevDynamicTest, TwoMemberDown) { this->validate_data(); } g_helper->sync_for_verify_start(num_members); - this->run_on_leader(db, [this, db, task_id, member_out, member_in] { + this->run_on_leader(db, [this, db, &task_id, member_out, member_in] { ASSERT_EQ( check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)), ReplaceMemberStatus::IN_PROGRESS); @@ -197,7 +197,7 @@ TEST_F(ReplDevDynamicTest, OutMemberDown) { this->shutdown_replica(2); LOGINFO("Shutdown replica 2"); - auto task_id = boost::uuids::random_generator()(); + std::string task_id = "task_id"; if (g_helper->replica_num() == 0) { replace_member(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); std::this_thread::sleep_for(std::chrono::seconds(3)); @@ -222,7 +222,7 @@ TEST_F(ReplDevDynamicTest, OutMemberDown) { // data synced, waiting for removing learner LOGINFO("data synced, sync for completing replace member, replica={}", g_helper->replica_num()); g_helper->sync_for_verify_start(num_members); - this->run_on_leader(db, [this, db, task_id, member_out, member_in] { + this->run_on_leader(db, [this, db, &task_id, member_out, member_in] { ASSERT_EQ( check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)), ReplaceMemberStatus::IN_PROGRESS); @@ -245,7 +245,7 @@ TEST_F(ReplDevDynamicTest, OutMemberDown) { } g_helper->sync_for_test_start(num_members); if (g_helper->replica_num() != 2) { - this->run_on_leader(db, [this, db, task_id, member_out, member_in] { + this->run_on_leader(db, [this, db, &task_id, member_out, member_in] { auto status = check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); // out_member is down, so it can not response to remove req. Based on nuraft logic, leader will wait for @@ -282,7 +282,7 @@ TEST_F(ReplDevDynamicTest, LeaderReplace) { uint32_t member_in = num_replicas; g_helper->sync_for_test_start(num_members); - auto task_id = boost::uuids::random_generator()(); + std::string task_id = "task_id"; if (g_helper->replica_num() == member_out) { LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); // With existing raft repl dev group, write IO's, validate and call replace_member on leader. 
@@ -314,7 +314,7 @@ TEST_F(ReplDevDynamicTest, LeaderReplace) { } LOGINFO("data synced, sync_for_verify_state replica={} ", g_helper->replica_num()); - this->run_on_leader(db, [this, db, task_id, member_out, member_in] { + this->run_on_leader(db, [this, db, &task_id, member_out, member_in] { ASSERT_EQ( check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)), ReplaceMemberStatus::IN_PROGRESS); @@ -333,7 +333,7 @@ TEST_F(ReplDevDynamicTest, LeaderReplace) { } g_helper->sync_for_cleanup_start(num_members); - this->run_on_leader(db, [this, db, task_id, member_out, member_in] { + this->run_on_leader(db, [this, db, &task_id, member_out, member_in] { ASSERT_EQ( check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)), ReplaceMemberStatus::COMPLETED); @@ -361,7 +361,7 @@ TEST_F(ReplDevDynamicTest, OneMemberRestart) { LOGINFO("Restart replica 1, "); this->restart_replica(15); } - auto task_id = boost::uuids::random_generator()(); + std::string task_id = "task_id"; if (g_helper->replica_num() == 0) { // With existing raft repl dev group, write IO's, validate and call replace_member on leader. LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); @@ -383,7 +383,7 @@ TEST_F(ReplDevDynamicTest, OneMemberRestart) { } LOGINFO("data synced, sync_for_verify_state replica={} ", g_helper->replica_num()); - this->run_on_leader(db, [this, db, task_id, member_out, member_in] { + this->run_on_leader(db, [this, db, &task_id, member_out, member_in] { ASSERT_EQ( check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)), ReplaceMemberStatus::IN_PROGRESS); @@ -401,7 +401,7 @@ TEST_F(ReplDevDynamicTest, OneMemberRestart) { } g_helper->sync_for_cleanup_start(num_members); - this->run_on_leader(db, [this, db, task_id, member_out, member_in] { + this->run_on_leader(db, [this, db, &task_id, member_out, member_in] { ASSERT_EQ( check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)), ReplaceMemberStatus::COMPLETED); @@ -442,7 +442,7 @@ TEST_F(ReplDevDynamicTest, ValidateRequest) { this->write_on_leader(num_io_entries, true /* wait_for_commit */); } - auto task_id = boost::uuids::random_generator()(); + std::string task_id = "task_id"; if (g_helper->replica_num() == 0) { // generate uuid replica_id_t fake_member_out = boost::uuids::random_generator()(); diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index ffdf00e1d..501871ec1 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -133,9 +133,9 @@ class SoloReplDevTest : public testing::Test { cintrusive< repl_req_ctx >& ctx) override { LOGINFO("Received error={} on repl_dev", enum_name(error)); } - void on_start_replace_member(const uuid_t& task_id, const replica_member_info& member_out, + void on_start_replace_member(const std::string& task_id, const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) override {} - void on_complete_replace_member(const uuid_t& task_id, const replica_member_info& member_out, + void on_complete_replace_member(const std::string& task_id, const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) override {} void on_destroy(const group_id_t& group_id) override {} void notify_committed_lsn(int64_t lsn) override {} From 7b9e224c9a3c52777b69a763cb9e17a9b6548a59 Mon Sep 17 
00:00:00 2001 From: yawzhang Date: Tue, 8 Jul 2025 18:08:03 +0800 Subject: [PATCH 146/170] minor fix on log dev 1. do not truncate/flush if the log dev is not ready (e.g., concurrency issue for resource_mgr's device_truncate and create_repl_dev) 2. make sure all pending writes are flushed when the log dev is stopping --- conanfile.py | 2 +- src/lib/logstore/log_dev.cpp | 15 +++++++++++++++ src/lib/logstore/log_dev.hpp | 6 ++++++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index cc05287e6..892dd206f 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.18.0" + version = "6.18.1" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp index fc3d21184..764259756 100644 --- a/src/lib/logstore/log_dev.cpp +++ b/src/lib/logstore/log_dev.cpp @@ -86,6 +86,9 @@ void LogDev::start(bool format, std::shared_ptr< JournalVirtualDev > vdev) { m_last_flush_idx = m_log_idx - 1; } + // Now that we have created/loaded the logdev metablk, the log dev is ready to be used + m_is_ready = true; + if (allow_timer_flush()) start_timer(); handle_unopened_log_stores(format); @@ -112,6 +115,7 @@ LogDev::~LogDev() { m_log_records.reset(nullptr); m_logdev_meta.reset(); m_log_idx.store(0); + m_is_ready = false; m_pending_flush_size.store(0); m_last_flush_idx = -1; m_last_flush_ld_key = logdev_key{0, 0}; @@ -147,6 +151,9 @@ void LogDev::stop() { store.log_store->stop(); } + // trigger a new flush to make sure all pending writes are flushed + flush_under_guard(); + // after we call stop, we need to do any pending device truncations truncate(); m_id_logstore_map.clear(); @@ -458,6 +465,10 @@ bool LogDev::flush_under_guard() { } bool LogDev::flush() { + if (!is_ready()) { + THIS_LOGDEV_LOG(INFO, "LogDev is not ready to flush, log_dev={}", m_logdev_id); + return false; + } m_last_flush_time = Clock::now(); // We were able to win the flushing competition and now we gather all the flush data and reserve a slot. auto new_idx = m_log_idx.load(std::memory_order_acquire) - 1; @@ -566,6 +577,10 @@ void LogDev::on_flush_completion(LogGroup* lg) { } uint64_t LogDev::truncate() { + if (!is_ready()) { + THIS_LOGDEV_LOG(INFO, "LogDev is not ready to truncate, log_dev={}", m_logdev_id); + return 0; + } auto stopping = is_stopping(); incr_pending_request_num(); // Order of this lock has to be preserved. We take externally visible lock which is flush lock first.
This diff --git a/src/lib/logstore/log_dev.hpp b/src/lib/logstore/log_dev.hpp index e0b160a75..5b18f981b 100644 --- a/src/lib/logstore/log_dev.hpp +++ b/src/lib/logstore/log_dev.hpp @@ -723,6 +723,8 @@ class LogDev : public std::enable_shared_from_this< LogDev > { void start_timer(); folly::Future< int > stop_timer(); + bool is_ready() const { return m_is_ready.load(); } + bool allow_inline_flush() const { return uint32_cast(m_flush_mode) & uint32_cast(flush_mode_t::INLINE); } bool allow_timer_flush() const { return uint32_cast(m_flush_mode) & uint32_cast(flush_mode_t::TIMER); } bool allow_explicit_flush() const { return uint32_cast(m_flush_mode) & uint32_cast(flush_mode_t::EXPLICIT); } @@ -807,6 +809,10 @@ class LogDev : public std::enable_shared_from_this< LogDev > { std::atomic_uint64_t m_pending_callback{0}; folly::SharedMutexWritePriority m_stream_tracker_mtx; + // This is used to ensure that the logdev meta is created/loaded + // to avoid other threads accessing it before it is ready (e.g., resource_mgr's device truncate thread) + std::atomic_bool m_is_ready{false}; + private: // graceful shutdown related fields std::atomic_bool m_stopping{false}; From f7fd108e7c6867b404276371b9687c446420f9a7 Mon Sep 17 00:00:00 2001 From: yawzhang Date: Thu, 3 Jul 2025 22:35:24 +0800 Subject: [PATCH 147/170] support effective log truncation --- conanfile.py | 2 +- src/include/homestore/replication/repl_dev.h | 11 +- src/lib/common/homestore_config.fbs | 7 +- .../replication/log_store/repl_log_store.cpp | 10 +- .../replication/repl_dev/raft_repl_dev.cpp | 76 ++++++++++ src/lib/replication/repl_dev/raft_repl_dev.h | 12 ++ .../repl_dev/raft_state_machine.cpp | 3 + src/tests/test_common/raft_repl_test_base.hpp | 8 ++ src/tests/test_raft_repl_dev.cpp | 132 ++++++++++++++++++ 9 files changed, 250 insertions(+), 11 deletions(-) diff --git a/conanfile.py b/conanfile.py index 892dd206f..8cd093227 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.18.1" + version = "6.18.2" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index aef7fcf3b..aaf76b975 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -43,11 +43,12 @@ VENUM(repl_req_state_t, uint32_t, ) VENUM(journal_type_t, uint16_t, - HS_DATA_LINKED = 0, // Linked data where each entry will store physical blkid where data reside - HS_DATA_INLINED = 1, // Data is inlined in the header of journal entry - HS_CTRL_DESTROY = 2, // Control message to destroy the repl_dev - HS_CTRL_START_REPLACE = 3, // Control message to start replace a member - HS_CTRL_COMPLETE_REPLACE = 4, // Control message to complete replace a member + HS_DATA_LINKED = 0, // Linked data where each entry will store physical blkid where data reside + HS_DATA_INLINED = 1, // Data is inlined in the header of journal entry + HS_CTRL_DESTROY = 2, // Control message to destroy the repl_dev + HS_CTRL_START_REPLACE = 3, // Control message to start replace a member + HS_CTRL_COMPLETE_REPLACE = 4, // Control message to complete replace a member, + HS_CTRL_UPDATE_TRUNCATION_BOUNDARY = 5, // Control message to update truncation boundary ) ENUM(repl_dev_stage_t, uint8_t, INIT, ACTIVE, UNREADY, DESTROYING, DESTROYED, PERMANENT_DESTROYED); diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs index 
ec103c32e..5a63bb9d5 100644 --- a/src/lib/common/homestore_config.fbs +++ b/src/lib/common/homestore_config.fbs @@ -184,7 +184,8 @@ table ResourceLimits { /* num entries that raft logstore wants to reserve -- its truncate should not cross this */ /* 0 means HomeStore doesn't reserve anything and lets nuraft control the truncation */ - raft_logstore_reserve_threshold: uint32 = 0 (hotswap); + /* default reserve 1 million logs */ + raft_logstore_reserve_threshold: uint32 = 1000000 (hotswap); /* resource audit timer in ms */ resource_audit_timer_ms: uint32 = 120000; @@ -237,7 +238,9 @@ table Consensus { snapshot_freq_distance: uint32 = 2000; // Num reserved log items while triggering compact from raft server, only consumed by nuraft server; - num_reserved_log_items: uint32 = 20000; + // Set it the same as snapshot_freq_distance, so that every create_snapshot will trigger compact, + // which is helpful for truncating unused logs + num_reserved_log_items: uint32 = 2000; // Max append batch size max_append_batch_size: int32 = 64; diff --git a/src/lib/replication/log_store/repl_log_store.cpp b/src/lib/replication/log_store/repl_log_store.cpp index f9b3d454e..41de00b6e 100644 --- a/src/lib/replication/log_store/repl_log_store.cpp +++ b/src/lib/replication/log_store/repl_log_store.cpp @@ -114,8 +114,12 @@ std::string ReplLogStore::rdev_name() const { return m_rd.rdev_name(); } std::string ReplLogStore::identify_str() const { return m_rd.identify_str(); } bool ReplLogStore::compact(ulong compact_upto_lsn) { - RD_LOGD(NO_TRACE_ID, "Raft Channel: compact_to_lsn={}", compact_upto_lsn); - m_rd.on_compact(compact_upto_lsn); - return HomeRaftLogStore::compact(compact_upto_lsn); + auto truncation_upper_limit = m_rd.get_truncation_upper_limit(); + auto effective_compact_lsn = std::min(static_cast< repl_lsn_t >(compact_upto_lsn), truncation_upper_limit); + RD_LOGD(NO_TRACE_ID, + "Raft Channel: effective_compact_lsn={}, raft compact_to_lsn={}, local truncation_upper_limit={}", + effective_compact_lsn, compact_upto_lsn, truncation_upper_limit); + m_rd.on_compact(effective_compact_lsn); + return HomeRaftLogStore::compact(effective_compact_lsn); } } // namespace homestore
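Taken together, the changes in this patch mean the leader proposes a boundary of max(leader_commit - reserve_threshold, slowest_member_idx), and the log store then clamps whatever nuraft asks to compact with min(compact_upto_lsn, truncation_upper_limit). A standalone sketch of that arithmetic (plain C++; the function names are illustrative, not the HomeStore API):

#include <algorithm>
#include <cstdint>
#include <vector>

using lsn_t = int64_t;

// Boundary the leader proposes (mirrors propose_truncate_boundary below).
lsn_t proposed_truncation_limit(lsn_t leader_commit, lsn_t reserve_threshold,
                                const std::vector< lsn_t >& member_repl_idx) {
    lsn_t minimum = leader_commit;
    for (lsn_t idx : member_repl_idx) { minimum = std::min(minimum, idx); } // slowest member
    return std::max(leader_commit - reserve_threshold, minimum);
}

// What ReplLogStore::compact above does with nuraft's compaction request.
lsn_t effective_compact_lsn(lsn_t compact_upto_lsn, lsn_t truncation_upper_limit) {
    return std::min(compact_upto_lsn, truncation_upper_limit);
}

With leader_commit=550, reserve_threshold=200 and the slowest live member at 250, the proposal is max(350, 250) = 350, which is exactly the lower bound asserted by the RaftLogTruncationTest later in this patch.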
"); + // propose truncate boundary on leader if needed + if (is_leader()) { + propose_truncate_boundary(); + } + auto ret_val{true}; if (when_done) { when_done(ret_val, null_except); } } +void RaftReplDev::propose_truncate_boundary() { + incr_pending_request_num(); + auto repl_status = get_replication_status(); + repl_lsn_t leader_commit_idx = m_commit_upto_lsn.load(); + repl_lsn_t minimum_repl_idx = leader_commit_idx; + for (auto p : repl_status) { + if (p.id_ == m_my_repl_id) { continue; } + RD_LOGD(NO_TRACE_ID, "peer_repl_idx={}, minimum_repl_idx={}", p.replication_idx_, minimum_repl_idx); + minimum_repl_idx = std::min(minimum_repl_idx, static_cast< repl_lsn_t >(p.replication_idx_)); + + } + repl_lsn_t raft_logstore_reserve_threshold = HS_DYNAMIC_CONFIG(resource_limits.raft_logstore_reserve_threshold); + repl_lsn_t truncation_upper_limit = std::max(leader_commit_idx - raft_logstore_reserve_threshold, minimum_repl_idx); + RD_LOGD(NO_TRACE_ID, "calculated truncation_upper_limit={}, " + "leader_commit_idx={}, raft_logstore_reserve_threshold={}, minimum_repl_idx={}", + truncation_upper_limit, leader_commit_idx, raft_logstore_reserve_threshold, minimum_repl_idx); + if (truncation_upper_limit > 0) { + auto rreq = repl_req_ptr_t(new repl_req_ctx{}); + auto ctx = truncate_ctx(truncation_upper_limit); + + sisl::blob header(r_cast< uint8_t* >(&ctx), sizeof(truncate_ctx)); + rreq->init(repl_key{.server_id = server_id(), + .term = raft_server()->get_term(), + .dsn = m_next_dsn.fetch_add(1), + .traceID = std::numeric_limits< uint64_t >::max()}, + journal_type_t::HS_CTRL_UPDATE_TRUNCATION_BOUNDARY, true, header, sisl::blob{}, 0, m_listener); + + auto err = m_state_machine->propose_to_raft(std::move(rreq)); + if (err != ReplServiceError::OK) { + // failed to propose to raft to update truncation boundary + // the update will be retried next create_snapshot, so we just log the error + RD_LOGW(NO_TRACE_ID, "propose to raft for HS_CTRL_UPDATE_TRUNCATION_BOUNDARY req failed, err={}", err); + } + } + decr_pending_request_num(); +} + // 1 before repl_dev.stop() is called, the upper layer should make sure that there is no pending request. so graceful // shutdown can consider when stopping repl_dev, there is no pending request. // 2 before the log is appended to log store, repl_dev will guarantee the corresponding data is persisted on disk. so @@ -1415,6 +1457,8 @@ void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { start_replace_member(rreq); } else if (rreq->op_code() == journal_type_t::HS_CTRL_COMPLETE_REPLACE) { complete_replace_member(rreq); + } else if (rreq->op_code() == journal_type_t::HS_CTRL_UPDATE_TRUNCATION_BOUNDARY) { + update_truncation_boundary(rreq); } else { m_listener->on_commit(rreq->lsn(), rreq->header(), rreq->key(), {rreq->local_blkid()}, rreq); } @@ -1548,6 +1592,38 @@ void RaftReplDev::complete_replace_member(repl_req_ptr_t rreq) { RD_LOGI(rreq->traceID(), "Raft repl replace_member_task has been cleared."); } +void RaftReplDev::update_truncation_boundary(repl_req_ptr_t rreq) { + repl_lsn_t cur_checkpoint_lsn = 0; + { + std::unique_lock lg{m_sb_mtx}; + cur_checkpoint_lsn = m_rd_sb->checkpoint_lsn; + } + // expected truncation_upper_limit should not larger than the current checkpoint_lsn, this is to ensure that + // when a crash happens before index flushed to disk, all the logs larger than checkpoint_lsn are still available + // to replay. 
+ auto ctx = r_cast< const truncate_ctx* >(rreq->header().cbytes()); + auto exp_truncation_upper_limit = std::min(ctx->truncation_upper_limit, cur_checkpoint_lsn); + auto cur_truncation_upper_limit = m_truncation_upper_limit.load(); + // exp_truncation_upper_limit might be less than or equal to cur_truncation_upper_limit after Baseline Re-sync, + // we should skip the update to ensure the truncation_upper_limit is always increasing. + // for example: + // T1: Leader commits upto 10000, truncate logs upto 5000, while one of followers F1 is lagging behind with lsn 100 + // T2: F1 receives a snapshot with lsn 10000, starts catching up + // T3: Leader commits upto 11000, proposes truncation_upper_limit as 6000 + // T4: F1 catches up and commits upto 10000, this time truncation_upper_limit is updated as 10000 + // T5: F1 doing incremental re-sync, applies the log with truncation_upper_limit=6000, which is less than 10000 + if (exp_truncation_upper_limit <= cur_truncation_upper_limit) { + RD_LOGW(NO_TRACE_ID, + "exp_truncation_upper_limit {} is no larger than cur_truncation_upper_limit {}", + exp_truncation_upper_limit, cur_truncation_upper_limit); + return; + } + + while (cur_truncation_upper_limit < exp_truncation_upper_limit && + !m_truncation_upper_limit.compare_exchange_weak(cur_truncation_upper_limit, exp_truncation_upper_limit)) {} + RD_LOGI(NO_TRACE_ID, "Raft repl update truncation_upper_limit to {}", exp_truncation_upper_limit); +} + static bool blob_equals(sisl::blob const& a, sisl::blob const& b) { if (a.size() != b.size()) { return false; } return (std::memcmp(a.cbytes(), b.cbytes(), a.size()) == 0); }
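The guard plus CAS loop above only ever moves m_truncation_upper_limit forward, which is what defuses the stale value replayed in the T1-T5 timeline. The same pattern in isolation (a minimal sketch; update_monotonic_max is an illustrative name, not a HomeStore function):

#include <atomic>
#include <cstdint>

// Advance an atomic watermark only if the candidate moves it forward.
// compare_exchange_weak reloads `cur` on failure, so if a concurrent writer
// has already advanced past `candidate`, the loop exits without writing.
void update_monotonic_max(std::atomic< int64_t >& watermark, int64_t candidate) {
    int64_t cur = watermark.load();
    while (cur < candidate && !watermark.compare_exchange_weak(cur, candidate)) {}
}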
diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 8bd7639e5..e48511656 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -58,6 +58,13 @@ struct replace_member_ctx { } }; +struct truncate_ctx { + repl_lsn_t truncation_upper_limit = 0; + + truncate_ctx() = default; + explicit truncate_ctx(repl_lsn_t limit) : truncation_upper_limit(limit) {} +}; + class RaftReplDevMetrics : public sisl::MetricsGroup { public: explicit RaftReplDevMetrics(const char* inst_name) : sisl::MetricsGroup("RaftReplDev", inst_name) { @@ -212,6 +219,7 @@ class RaftReplDev : public ReplDev, // the state machine should be committed to before accepting traffic. This threshold ensures that // all potentially committed logs are committed before handling incoming requests. std::atomic< repl_lsn_t > m_traffic_ready_lsn{0}; + std::atomic< repl_lsn_t > m_truncation_upper_limit{0}; // LSN upto which it can truncate the logs in log store std::mutex m_sb_mtx; // Lock to protect the repl dev superblock @@ -309,6 +317,7 @@ class RaftReplDev : public ReplDev, repl_lsn_t get_last_commit_lsn() const override { return m_commit_upto_lsn.load(); } void set_last_commit_lsn(repl_lsn_t lsn) { m_commit_upto_lsn.store(lsn); } repl_lsn_t get_last_append_lsn() override { return raft_server()->get_last_log_idx(); } + repl_lsn_t get_truncation_upper_limit() const { return m_truncation_upper_limit.load(); } bool is_destroy_pending() const; bool is_destroyed() const; void set_stage(repl_dev_stage_t stage); @@ -470,6 +479,9 @@ class RaftReplDev : public ReplDev, void create_snp_resync_data(raft_buf_ptr_t& data_out); bool save_snp_resync_data(nuraft::buffer& data, nuraft::snapshot& s); + void update_truncation_boundary(repl_req_ptr_t rreq); + void propose_truncate_boundary(); + void report_blk_metrics_if_needed(repl_req_ptr_t rreq); ReplServiceError init_req_ctx(repl_req_ptr_t rreq, repl_key rkey, journal_type_t op_code, bool is_proposer, sisl::blob const& user_header, sisl::blob const& key, uint32_t data_size, diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index 3672cdff8..2217f3e3e 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -401,6 +401,9 @@ void RaftStateMachine::save_logical_snp_obj(nuraft::snapshot& s, ulong& obj_id, m_rd.m_listener->write_snapshot_obj(snp_ctx, snp_data); if (is_last_obj) { + // Nuraft will compact and truncate all logs when processing the last obj. + // Update the truncation upper limit here to ensure all stale logs are truncated.
+ m_rd.m_truncation_upper_limit.exchange(s_cast< repl_lsn_t >(s.get_last_log_idx())); hs()->cp_mgr().trigger_cp_flush(true).wait(); // ensure DSN is flushed to disk } diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index f05baf381..4fa73c817 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -444,6 +444,13 @@ class TestReplicatedDB : public homestore::ReplDevListener { LOGINFO("Manually truncated"); } + repl_lsn_t get_truncation_upper_limit() { + auto raft_repl_dev = std::dynamic_pointer_cast< RaftReplDev >(repl_dev()); + auto limit = raft_repl_dev->get_truncation_upper_limit(); + LOGINFO("Truncation upper limit is {}", limit); + return limit; + } + void set_zombie() { zombie_ = true; } bool is_zombie() { // Whether a group is zombie (non recoverable) @@ -742,6 +749,7 @@ class RaftReplDevTestBase : public testing::Test { void create_snapshot() { dbs_[0]->create_snapshot(); } void truncate(int num_reserved_entries) { dbs_[0]->truncate(num_reserved_entries); } + repl_lsn_t get_truncation_upper_limit() { return dbs_[0]->get_truncation_upper_limit(); } void replace_member(std::shared_ptr< TestReplicatedDB > db, std::string& task_id, replica_id_t member_out, replica_id_t member_in, uint32_t commit_quorum = 0, diff --git a/src/tests/test_raft_repl_dev.cpp b/src/tests/test_raft_repl_dev.cpp index f6d458943..e52c5f00f 100644 --- a/src/tests/test_raft_repl_dev.cpp +++ b/src/tests/test_raft_repl_dev.cpp @@ -558,6 +558,138 @@ TEST_F(RaftReplDevTest, ComputePriority) { g_helper->sync_for_cleanup_start(); } + +TEST_F(RaftReplDevTest, RaftLogTruncationTest) { + LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); + g_helper->sync_for_test_start(); + + auto pre_raft_logstore_reserve_threshold = 0; + HS_SETTINGS_FACTORY().modifiable_settings([&pre_raft_logstore_reserve_threshold](auto& s) { + pre_raft_logstore_reserve_threshold = s.resource_limits.raft_logstore_reserve_threshold; + s.resource_limits.raft_logstore_reserve_threshold = 200; + }); + HS_SETTINGS_FACTORY().save(); + + uint64_t entries_per_attempt = 100; + uint64_t total_entries = 0; + + LOGINFO("Write on leader num_entries={}", entries_per_attempt); + this->write_on_leader(entries_per_attempt, true /* wait_for_commit */); + total_entries += entries_per_attempt; + // wait for commit on all members + this->wait_for_commits(total_entries); + test_common::HSTestHelper::trigger_cp(true /* wait */); + g_helper->sync_for_verify_start(); + + // trigger snapshot to update log truncation upper limit + // sleep 1s to ensure the new truncation upper limit is updated + this->create_snapshot(); + std::this_thread::sleep_for(std::chrono::seconds{1}); + ASSERT_GT(this->get_truncation_upper_limit(), 0); + LOGINFO("After 100 entries written, truncation upper limit became {}", this->get_truncation_upper_limit()); + + // shutdown replica 1. + LOGINFO("Shutdown replica 1"); + this->shutdown_replica(1); + + // write another 100 entries on leader.
+ LOGINFO("Write on leader num_entries={}", entries_per_attempt); + if (g_helper->replica_num() == 0 || g_helper->replica_num() == 2) { + this->write_on_leader(entries_per_attempt, true /* wait_for_commit */); + // Wait for commmit on leader and follower 2 + this->wait_for_all_commits(); + LOGINFO("Got all commits for replica 0 and 2"); + test_common::HSTestHelper::trigger_cp(true /* wait */); + LOGINFO("Trigger cp after writing 100 entries for replica 0 and 2"); + } + total_entires += entries_per_attempt; + + // trigger snapshot and check the truncation upper limit on leader + // it should not larger than 200 because replica 1 is shutdown + if (g_helper->replica_num() == 0) { + this->create_snapshot(); + std::this_thread::sleep_for(std::chrono::seconds{1}); + ASSERT_LT(this->get_truncation_upper_limit(), 200); + LOGINFO("After another 100 entries written, truncation upper limit {}", this->get_truncation_upper_limit()); + } + + g_helper->sync_for_test_start(); + + // start replica 1 after this. + LOGINFO("Start replica 1"); + this->start_replica(1); + + // write on leader to have some entries saved in raft log store. + entries_per_attempt = 50; + LOGINFO("Write on leader num_entries={}", entries_per_attempt); + this->write_on_leader(entries_per_attempt, true /* wait_for_commit */); + total_entires += entries_per_attempt; + + // wait till all writes are down. + this->wait_for_commits(total_entires); + test_common::HSTestHelper::trigger_cp(true /* wait */); + g_helper->sync_for_verify_start(); + + // trigger snapshot and check the truncation upper limit + // it should no less than 250 on because all replicas has committed upto 250 + this->create_snapshot(); + std::this_thread::sleep_for(std::chrono::seconds{1}); + ASSERT_GE(this->get_truncation_upper_limit(), 250); + LOGINFO("After another 50 entries written, truncation upper limit became {}", this->get_truncation_upper_limit()); + + // wait all members sync and test raft_logstore_reserve_threshold limitation + g_helper->sync_for_test_start(); + + // shutdown replica1 again + LOGINFO("Shutdown replica 1 again"); + this->shutdown_replica(1); + + // write another 300 entries on leader to test one member lagged too much + entries_per_attempt = 300; + LOGINFO("Write on leader num_entries={}", entries_per_attempt); + if (g_helper->replica_num() == 0 || g_helper->replica_num() == 2) { + this->write_on_leader(entries_per_attempt, true /* wait_for_commit */); + // Wait for commmit on leader and follower 2 + this->wait_for_all_commits(); + LOGINFO("Got all commits for replica 0 and 2"); + test_common::HSTestHelper::trigger_cp(true /* wait */); + LOGINFO("Trigger cp after writing 300 entries for replica 0 and 2"); + } + total_entires += entries_per_attempt; + + // trigger snapshot and check the truncation upper limit on leader + // this time leader will use its commit_idx - resource_limits.raft_logstore_reserve_threshold >= 550 - 200 = 350 + if (g_helper->replica_num() == 0) { + this->create_snapshot(); + std::this_thread::sleep_for(std::chrono::seconds{1}); + ASSERT_GE(this->get_truncation_upper_limit(), 350); + ASSERT_LT(this->get_truncation_upper_limit(), 550); + LOGINFO("After another 300 entries written, truncation upper limit {}", this->get_truncation_upper_limit()); + } + g_helper->sync_for_verify_start(); + + // start replica1 again, wait for replica1 catch up + LOGINFO("Start replica 1 again"); + this->start_replica(1); + g_helper->sync_for_test_start(); + this->wait_for_commits(total_entires); + g_helper->sync_for_verify_start(); + + // 
validate all data written so far by reading them + LOGINFO("Validate all data written so far by reading them"); + this->validate_data(); + + // set the settings back and save. + LOGINFO("Set the raft_logstore_reserve_threshold back to previous value={}", pre_raft_logstore_reserve_threshold); + HS_SETTINGS_FACTORY().modifiable_settings([pre_raft_logstore_reserve_threshold](auto& s) { + s.resource_limits.raft_logstore_reserve_threshold = pre_raft_logstore_reserve_threshold; + }); + HS_SETTINGS_FACTORY().save(); + + g_helper->sync_for_cleanup_start(); + LOGINFO("RaftLogTruncationTest done"); +} + int main(int argc, char* argv[]) { int parsed_argc = argc; char** orig_argv = argv; From 2d26b250df1fdccbaffb271a39473eecfac730e1 Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Thu, 10 Jul 2025 11:00:23 +0800 Subject: [PATCH 148/170] do not use memory_order_relax for get_used_blks (#739) --- conanfile.py | 2 +- src/lib/blkalloc/append_blk_allocator.cpp | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/conanfile.py b/conanfile.py index 8cd093227..e64815c40 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.18.2" + version = "6.18.3" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/blkalloc/append_blk_allocator.cpp b/src/lib/blkalloc/append_blk_allocator.cpp index 05464d825..2cabcd9f2 100644 --- a/src/lib/blkalloc/append_blk_allocator.cpp +++ b/src/lib/blkalloc/append_blk_allocator.cpp @@ -86,8 +86,9 @@ BlkAllocStatus AppendBlkAllocator::alloc(blk_count_t nblks, const blk_alloc_hint // Push 1 blk to the vector which has all the requested nblks; out_bid = BlkId{m_last_append_offset.fetch_add(nblks), nblks, m_chunk_id}; - - // COUNTER_INCREMENT(m_metrics, num_alloc, 1); + LOGDEBUG("chunk {} has successfully allocated nblks: {}, totally used blks: {}, available_blks: {}, actual " + "available_blks(exclude reserved blks): {}, last_append_offset: {}", + m_chunk_id, nblks, get_used_blks(), available_blks(), avail_blks, m_last_append_offset.load()); return BlkAllocStatus::SUCCESS; } @@ -164,9 +165,9 @@ std::string AppendBlkAllocator::to_string() const { blk_num_t AppendBlkAllocator::available_blks() const { return get_total_blks() - get_used_blks(); } -blk_num_t AppendBlkAllocator::get_used_blks() const { return m_last_append_offset.load(std::memory_order_relaxed); } +blk_num_t AppendBlkAllocator::get_used_blks() const { return m_last_append_offset.load(); } -blk_num_t AppendBlkAllocator::get_defrag_nblks() const { return m_freeable_nblks.load(std::memory_order_relaxed); } +blk_num_t AppendBlkAllocator::get_defrag_nblks() const { return m_freeable_nblks.load(); } nlohmann::json AppendBlkAllocator::get_status(int log_level) const { nlohmann::json j; From f9e4340a94f2b0c23dd9cf07fbc71aa615bbcf4e Mon Sep 17 00:00:00 2001 From: yuwmao <148639999+yuwmao@users.noreply.github.com> Date: Thu, 10 Jul 2025 16:24:09 +0800 Subject: [PATCH 149/170] Support pausing/resuming state machine (#769) --- conanfile.py | 2 +- src/include/homestore/replication/repl_dev.h | 9 +++++++++ src/lib/replication/repl_dev/raft_repl_dev.cpp | 15 +++++++++++++++ src/lib/replication/repl_dev/raft_repl_dev.h | 3 +++ src/lib/replication/repl_dev/solo_repl_dev.h | 10 ++++++++-- 5 files changed, 36 insertions(+), 3 deletions(-) diff --git a/conanfile.py b/conanfile.py index e64815c40..2fed20568 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): 
name = "homestore" - version = "6.18.3" + version = "6.18.4" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index aaf76b975..0217b66f5 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -555,6 +555,15 @@ class ReplDev { /// @brief Clean up resources on this repl dev. virtual void purge() = 0; + /// @brief Pause repl dev state machine, timeout is in milliseconds. + virtual void pause_state_machine(size_t timeout) = 0; + + /// @brief Resume repl dev state machine. + virtual void resume_state_machine() = 0; + + /// @brief Check if the state machine is paused. + virtual bool is_state_machine_paused() = 0; + virtual std::shared_ptr< snapshot_context > deserialize_snapshot_context(sisl::io_blob_safe& snp_ctx) = 0; virtual void attach_listener(shared< ReplDevListener > listener) { m_listener = std::move(listener); } diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index e84638542..da7efaf38 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -2489,4 +2489,19 @@ bool RaftReplDev::is_ready_for_traffic() const { } return ready; } + +void RaftReplDev::pause_state_machine(size_t timeout) { + RD_LOGI(NO_TRACE_ID, "Pause state machine for group_id={}", group_id_str()); + raft_server()->pause_state_machine_execution(timeout); +} + +bool RaftReplDev::is_state_machine_paused() { + return raft_server()->is_state_machine_execution_paused(); +} + +void RaftReplDev::resume_state_machine() { + RD_LOGI(NO_TRACE_ID, "Resume state machine execution for group_id={}", group_id_str()); + raft_server()->resume_state_machine_execution(); +} + } // namespace homestore diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index e48511656..76144b236 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -328,6 +328,9 @@ class RaftReplDev : public ReplDev, bool is_ready_for_traffic() const override; // purge all resources (e.g., logs in logstore) is a very dangerous operation, it is not supported yet. 
     void purge() override { RD_REL_ASSERT(false, "NOT SUPPORTED YET"); }
+    void pause_state_machine(size_t timeout) override;
+    void resume_state_machine() override;
+    bool is_state_machine_paused() override;
 
     std::shared_ptr< snapshot_context > deserialize_snapshot_context(sisl::io_blob_safe& snp_ctx) override {
         return std::make_shared< nuraft_snapshot_context >(snp_ctx);
diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h
index 2ad78e4da..cf961dabe 100644
--- a/src/lib/replication/repl_dev/solo_repl_dev.h
+++ b/src/lib/replication/repl_dev/solo_repl_dev.h
@@ -48,6 +48,7 @@ class SoloReplDev : public ReplDev {
     uuid_t m_group_id;
     std::atomic< logstore_seq_num_t > m_commit_upto{-1};
     std::atomic< bool > m_is_recovered{false};
+    std::atomic< bool > m_paused{false};
 
 public:
     SoloReplDev(superblk< solo_repl_dev_superblk >&& rd_sb, bool load_existing);
@@ -80,10 +81,15 @@ class SoloReplDev : public ReplDev {
     bool is_ready_for_traffic() const override { return true; }
     void set_stage(repl_dev_stage_t stage) override {}
     repl_dev_stage_t get_stage() const override {
-        return repl_dev_stage_t::ACTIVE;
-    }
+        return repl_dev_stage_t::ACTIVE; }
     void purge() override {}
 
+    void pause_state_machine(size_t timeout) override { m_paused.store(true); }
+
+    void resume_state_machine() override { m_paused.store(false); }
+
+    bool is_state_machine_paused() override { return m_paused.load(); }
+
     std::shared_ptr< snapshot_context > deserialize_snapshot_context(sisl::io_blob_safe& snp_ctx) override {
         return nullptr;
     }

From cea838bf66e3b299a99456da3ea53b26064b51a8 Mon Sep 17 00:00:00 2001
From: raakella1 <114193113+raakella1@users.noreply.github.com>
Date: Fri, 11 Jul 2025 12:41:07 -0700
Subject: [PATCH 150/170] Fix bugs in repair links during crash recovery of the btree (#749)

* fix for put remove crash test bug
* Fix repair_links during crash recovery
* update root when the current root splits during repair_links
* add more trace logging for btree recovery
* call repair links on the buffer which is pruned due to zero down buffers
* upgrade conan version
* fix an issue in the prune buffer code
* Relax the sanity check condition about child key and previous parent key comparison
* add more comments to the code

---------

Co-authored-by: Ravi Nagarjun Akella
---
 conanfile.py                                |   2 +-
 .../homestore/btree/detail/btree_common.ipp |  11 +-
 src/include/homestore/index/index_table.hpp | 215 +++++++++++-------
 src/lib/index/index_cp.cpp                  |   5 +
 src/lib/index/index_cp.hpp                  |   1 +
 src/lib/index/wb_cache.cpp                  |  56 +++--
 src/lib/index/wb_cache.hpp                  |   1 +
 src/tests/test_index_crash_recovery.cpp     |  18 +-
 src/tests/test_scripts/index_test.py        |   8 +-
 9 files changed, 203 insertions(+), 114 deletions(-)

diff --git a/conanfile.py b/conanfile.py
index 2fed20568..a84974fba 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@
 class HomestoreConan(ConanFile):
     name = "homestore"
-    version = "6.18.4"
+    version = "6.18.6"
     homepage = "https://github.com/eBay/Homestore"
     description = "HomeStore Storage Engine"

diff --git a/src/include/homestore/btree/detail/btree_common.ipp b/src/include/homestore/btree/detail/btree_common.ipp
index 43e0c7c60..ec553396a 100644
--- a/src/include/homestore/btree/detail/btree_common.ipp
+++ b/src/include/homestore/btree/detail/btree_common.ipp
@@ -292,9 +292,14 @@ void Btree< K, V >::validate_sanity_child(const BtreeNodePtr& parent_node, uint3
     if(ind < parent_node->total_entries()){
         BT_REL_ASSERT_LE(cur_child_key.compare(parent_key), 0, " child {} {}-th key is greater
than its parent's {} {}-th key", child_node->to_string(), i , parent_node->to_string(), ind);
         if(ind>0) {
-            BT_REL_ASSERT_GT(cur_child_key.compare(previous_parent_key), 0,
-                             " child {} {}-th key is less than its parent's {} {}-th key", child_node->to_string(),
-                             i, parent_node->to_string(), ind - 1);
+            if(cur_child_key.compare(previous_parent_key) <= 0){
+                // There can be a transient case where a key appears in two children; once the replay is done, it is
+                // fixed. Consider parent P with children C1, C2, C3, C4. A key is deleted, resulting in a merge that
+                // deletes C3, and the same key is inserted again in the current cp. Our case is that P is dirtied, C3
+                // is deleted, and C4 is updated and flushed. During recovery we keep C3 and P remains the same. Since
+                // C4 was flushed, the key that was removed and re-inserted will show up in both C3 and C4. After the
+                // replay post recovery, C3 should be gone and the tree is valid again.
+                BT_LOG(DEBUG, "child {} {}-th key is less than or equal to its parent's {} {}-th key",
+                       child_node->to_string(), i, parent_node->to_string(), ind - 1);
+            }
         }
     }else
diff --git a/src/include/homestore/index/index_table.hpp b/src/include/homestore/index/index_table.hpp
index 121a136fd..30ba32321 100644
--- a/src/include/homestore/index/index_table.hpp
+++ b/src/include/homestore/index/index_table.hpp
@@ -438,6 +438,58 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
         return btree_status_t::success;
     }
 
+    bnodeid_t true_sibling_first_child(BtreeNodePtr const& parent_node) {
+        bnodeid_t sibling_first_child_id = empty_bnodeid;
+        if (!parent_node->is_leaf() && !parent_node->has_valid_edge()) {
+            BtreeNodePtr parent_right_sibling;
+            if (auto parent_right_sibling_id = find_true_sibling(parent_node); parent_right_sibling_id != empty_bnodeid) {
+                if (auto ret = read_node_impl(parent_right_sibling_id, parent_right_sibling); ret == btree_status_t::success) {
+                    if (parent_right_sibling->total_entries() > 0) {
+                        BtreeLinkInfo sibling_first_child_info;
+                        parent_right_sibling->get_nth_value(0, &sibling_first_child_info, false);
+                        sibling_first_child_id = sibling_first_child_info.bnode_id();
+                    } else if (parent_right_sibling->has_valid_edge()) {
+                        // If the right sibling has an edge, we can use that as the first child
+                        sibling_first_child_id = parent_right_sibling->get_edge_value().bnode_id();
+                    }
+                }
+            }
+        }
+        return sibling_first_child_id;
+    }
+
+    void update_root(BtreeNodePtr const& left_child, BtreeNodeList& new_nodes, void* cp_ctx) {
+        auto new_root = this->alloc_interior_node();
+        if (new_root == nullptr) { return; }
+        new_root->set_level(left_child->level() + 1);
+        auto cur_child = left_child;
+        uint32_t i = 0;
+        LOGTRACEMOD(wbcache, "Updating new root node={}", new_root->to_string());
+        do {
+            LOGTRACEMOD(wbcache, "Processing child {}", cur_child->to_string());
+            if (cur_child->has_valid_edge()) {
+                new_root->set_edge_value(BtreeLinkInfo{cur_child->node_id(), cur_child->link_version()});
+            } else {
+                auto child_last_key = cur_child->get_last_key< K >();
+                new_root->insert(new_root->total_entries(), child_last_key,
+                                 BtreeLinkInfo{cur_child->node_id(), cur_child->link_version()});
+            }
+            if (i == new_nodes.size()) { break; }
+            auto next_child_id = cur_child->next_bnode();
+            cur_child = new_nodes[i++];
+            DEBUG_ASSERT_EQ(next_child_id, cur_child->node_id(),
+                            "Next node id {} does not match current child node id {}",
+                            next_child_id, cur_child->node_id());
+        } while (true);
+
+        new_nodes.push_back(new_root);
+        LOGTRACEMOD(wbcache, "New root node created {}", 
new_root->to_string()); + on_root_changed(new_root, cp_ctx); + this->set_root_node_info(BtreeLinkInfo{new_root->node_id(), new_root->link_version()}); + } + // btree_status_t repair_links(BtreeNodePtr const& parent_node, void* cp_ctx) { LOGTRACEMOD(wbcache, "Repairing links for parent node [{}]", parent_node->to_string()); @@ -445,7 +497,11 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { // needs to be handled. Get the last key in the node auto last_parent_key = parent_node->get_last_key< K >(); - auto const is_parent_edge_node = parent_node->has_valid_edge(); + auto sibling_node_id = find_true_sibling(parent_node); + // during delete stale links, the current edge node can be deleted and its left sibling will become edge node. + // While repairing the left sibling, has_valid_edge() is false but we need to make it an edge node. + // So we check if the true_sibling is empty to determine if we need to make it an edge node. + auto const is_parent_edge_node = (sibling_node_id == empty_bnodeid); if ((parent_node->total_entries() == 0) && !is_parent_edge_node) { BT_LOG_ASSERT(false, "Parent node={} is empty and not an edge node but was asked to repair", parent_node->node_id()); @@ -458,6 +514,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { BtreeLinkInfo link_info; parent_node->get_nth_value(i, &link_info, true); orig_child_infos[link_info.bnode_id()] = parent_node->get_nth_key< K >(i, false /* copy */); + LOGTRACEMOD(wbcache, "Child node [{}] with key [{}] at index [{}]", link_info.bnode_id(), + orig_child_infos[link_info.bnode_id()].to_string(), i); } LOGTRACEMOD(wbcache, "Repairing node=[{}] with last_parent_key={}", parent_node->to_string(), last_parent_key.to_string()); @@ -482,43 +540,45 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { // siblings which has keys more than Y or end of list (name this parent sibling node F), // 2-2- Put last key of F to last key of P // 2-3 - set F as Next of A - BtreeNodeList siblings; BtreeNodePtr next_cur_child; BT_DBG_ASSERT(parent_node->has_valid_edge() || parent_node->total_entries(), "parent node {} doesn't have valid edge and no entries ", parent_node->to_string()); if (parent_node->total_entries() > 0) { - auto updated_last_key = last_parent_key; K last_child_last_key; K last_child_neighbor_key; - BtreeNodePtr cur_child; - BtreeLinkInfo cur_child_info; + BtreeNodePtr cur_child = child_node; + // We find the last child node by starting from the leftmost child and traversing through the + // next_bnode links until we reach the end or find a sibling first child. bool found_child = false; - uint32_t nentries = parent_node->total_entries() + parent_node->has_valid_edge() ? 
1 : 0; - - for (uint32_t i = nentries; i-- > 0;) { - parent_node->get_nth_value(i, &cur_child_info, false /* copy */); - if (auto ret = read_node_impl(cur_child_info.bnode_id(), cur_child); ret == btree_status_t::success) { - if (!cur_child->is_node_deleted() && cur_child->total_entries()) { - last_child_last_key = cur_child->get_last_key< K >(); - if (cur_child->next_bnode() != empty_bnodeid && - read_node_impl(cur_child->next_bnode(), next_cur_child) == btree_status_t::success) { - LOGTRACEMOD( - wbcache, - "Last child last key {} for child_node [{}] parent node [{}], next neigbor is [{}]", - last_child_last_key.to_string(), cur_child->to_string(), parent_node->to_string(), - next_cur_child->to_string()); - found_child = true; - break; - } - found_child = true; - break; - } - LOGTRACEMOD(wbcache, "PASSING child node {} so we need to check next child node", - cur_child->to_string()); + auto sibling_first_child = true_sibling_first_child(parent_node); + LOGTRACEMOD(wbcache, "Sibling first child id is {}", sibling_first_child); + while (cur_child != nullptr) { + LOGTRACEMOD(wbcache, "Processing child node [{}]", cur_child->to_string()); + if (!cur_child->is_node_deleted() && cur_child->total_entries() > 0) { + last_child_last_key = cur_child->get_last_key< K >(); + found_child = true; + } + + next_cur_child = nullptr; + if(cur_child->next_bnode() == empty_bnodeid || + read_node_impl(cur_child->next_bnode(), next_cur_child) != btree_status_t::success) { + break; // No next child, so we can stop here } + + if (sibling_first_child != empty_bnodeid && sibling_first_child == cur_child->next_bnode()) { + LOGTRACEMOD( + wbcache, + "Last child last key {} for child_node [{}] parent node [{}], next neigbor is [{}]", + last_child_last_key.to_string(), cur_child->to_string(), parent_node->to_string(), + next_cur_child->to_string()); + break; + } + cur_child = next_cur_child; } + // If we found a valid last child node, we adjust the parent_last_key by comparing it with the last + // child last key. if (found_child) { LOGTRACEMOD(wbcache, "Last child last key {} for parent node {}, child_node {}", last_child_last_key.to_string(), parent_node->to_string(), cur_child->to_string()); @@ -540,46 +600,16 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { // 2-1 traverse for sibling of parent until reaches to siblings which has keys more than 7563 // or end // of list (put all siblings in a list, here is F) , - BtreeNodePtr sibling; BtreeNodePtr true_sibling; - BtreeLinkInfo sibling_info; - - auto sibling_node_id = parent_node->next_bnode(); - while (sibling_node_id != empty_bnodeid) { - if (auto ret = read_node_impl(sibling_node_id, sibling); ret == btree_status_t::success) { - if (sibling->is_node_deleted()) { - // Do we need to free the sibling node here? 
- siblings.push_back(sibling); - sibling_node_id = sibling->next_bnode(); - LOGTRACEMOD(wbcache, "Sibling node [{}] is deleted, continue to next sibling", - sibling->to_string()); - continue; - } - auto sibling_last_key = sibling->get_last_key< K >(); - if (next_cur_child && sibling_last_key.compare(last_child_neighbor_key) < 0) { - siblings.push_back(sibling); - sibling_node_id = sibling->next_bnode(); - } else { - true_sibling = sibling; - break; - } - } - } - if (true_sibling) { - LOGTRACEMOD(wbcache, "True sibling [{}] for parent_node {}", true_sibling->to_string(), - parent_node->to_string()); - } else { - LOGTRACEMOD(wbcache, "No true sibling found for parent_node [{}]", parent_node->to_string()); - } - if (sibling_node_id != empty_bnodeid) { + if (sibling_node_id != empty_bnodeid && + read_node_impl(sibling_node_id, true_sibling) == btree_status_t::success) { last_parent_key = last_child_last_key; parent_node->set_next_bnode(true_sibling->node_id()); - for (auto sibling : siblings) { - LOGTRACEMOD(wbcache, "Sibling list [{}]", sibling->to_string()); - } - LOGTRACEMOD(wbcache, "True sibling [{}]", true_sibling->to_string()); - BtreeLinkInfo first_child_info; - parent_node->get_nth_value(0, &first_child_info, false); + LOGTRACEMOD(wbcache, "True sibling [{}] for parent_node {}", true_sibling->to_string(), + parent_node->to_string()); + } + if (!true_sibling) { + LOGTRACEMOD(wbcache, "No true sibling found for parent_node [{}]", parent_node->to_string()); } } else { LOGTRACEMOD(wbcache, @@ -589,7 +619,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } } } - + // Keep a copy of the node buffer, in case we need to revert back uint8_t* tmp_buffer = new uint8_t[this->m_node_size]; std::memcpy(tmp_buffer, parent_node->m_phys_node_buf, this->m_node_size); @@ -602,6 +632,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { BtreeNodeList new_parent_nodes; do { if (child_node->has_valid_edge() || (child_node->is_leaf() && child_node->next_bnode() == empty_bnodeid)) { + LOGTRACEMOD(wbcache, "Child node [{}] is an edge node or a leaf with no next", + child_node->to_string()); if (child_node->is_node_deleted()) { // Edge node is merged, we need to set the current last entry as edge if (cur_parent->total_entries() > 0) { @@ -619,14 +651,14 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } else { // Update edge and finish if (is_parent_edge_node) { + cur_parent->set_next_bnode(empty_bnodeid); cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); } else { - auto tsib_id = find_true_sibling(cur_parent); - if (tsib_id != empty_bnodeid) { - cur_parent->set_next_bnode(tsib_id); + if (sibling_node_id != empty_bnodeid) { + cur_parent->set_next_bnode(sibling_node_id); LOGTRACEMOD(wbcache, "True sibling [{}] for parent_node [{}], So don't add child [{}] here ", - tsib_id, cur_parent->to_string(), child_node->to_string()); + sibling_node_id, cur_parent->to_string(), child_node->to_string()); } else { cur_parent->set_next_bnode(empty_bnodeid); // if this child node previously belonged to this parent node, we need to add it but as edge @@ -647,8 +679,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } } - // - // } + LOGTRACEMOD(wbcache, "Repairing node=[{}], child_node=[{}] is an edge node, end loop", + cur_parent->to_string(), child_node->to_string()); break; } break; @@ -665,6 +697,11 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { // last_parent_key. 
That's why here we have to check if the child node is one of the original child // nodes first. if (!is_parent_edge_node && !orig_child_infos.contains(child_node->node_id())) { + LOGTRACEMOD( + wbcache, + "Child node [{}] is not one of the original child nodes, so we need to check if it is beyond the " + "last parent key {}", + child_node->to_string(), last_parent_key.to_string()); if (child_last_key.compare(last_parent_key) > 0) { // We have reached a child beyond this parent, we can stop now // TODO this case if child last key is less than last parent key to update the parent node. @@ -695,13 +732,13 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } if (valid_sibling != empty_bnodeid) { cur_parent->set_next_bnode(valid_sibling); - LOGTRACEMOD(wbcache, "Repairing node={}, child_node=[{}] is an edge node, end loop", - cur_parent->node_id(), child_node->to_string()); + LOGTRACEMOD(wbcache, "Repairing node=[{}], child_node=[{}] is an edge node, end loop", + cur_parent->to_string(), child_node->to_string()); } else { cur_parent->set_next_bnode(empty_bnodeid); - LOGTRACEMOD(wbcache, "Repairing node={}, child_node=[{}] is an edge node, end loop", - cur_parent->node_id(), child_node->to_string()); + LOGTRACEMOD(wbcache, "Repairing node=[{}], child_node=[{}] is an edge node, end loop", + cur_parent->to_string(), child_node->to_string()); } break; @@ -711,6 +748,9 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { if (!cur_parent->has_room_for_put(btree_put_type::INSERT, K::get_max_size(), BtreeLinkInfo::get_fixed_size())) { // No room in the parent_node, let us split the parent_node and continue + LOGTRACEMOD(wbcache, + "Repairing node={}, child_node=[{}] has no room for put, so we need to split the parent " + "node", cur_parent->node_id(), child_node->to_string()); auto new_parent = this->alloc_interior_node(); if (new_parent == nullptr) { ret = btree_status_t::space_not_avail; @@ -721,7 +761,6 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { cur_parent->set_next_bnode(new_parent->node_id()); new_parent->set_level(cur_parent->level()); cur_parent->inc_link_version(); - new_parent_nodes.push_back(new_parent); cur_parent = std::move(new_parent); } @@ -818,14 +857,20 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { // if last parent has the key less than the last child key, then we need to update the parent node with // the last child key if it doesn't have edge. 
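For intuition, a worked instance of the walk and key fix-up above, with hypothetical keys and node ids:

    // Parent P records [k10, k20] and its true sibling is F, whose first child is C3:
    //   level 1:  P --> F
    //   level 0:  C1(..k10) --> C2(..k25) --> C3(k26..)   (linked via next_bnode)
    // The walk starts at P's leftmost child C1 and stops when next_bnode equals C3,
    // the sibling's first child, so last_child_last_key = k25. Because C2 absorbed a
    // merge before the crash, k25 exceeds P's recorded last key k20; the code below
    // therefore lifts P's last entry to k25 before the links are re-stitched.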
auto last_parent = parent_node; - if (new_parent_nodes.size() > 0) { last_parent = new_parent_nodes[new_parent_nodes.size() - 1]; } + if (new_parent_nodes.size() > 0) { + last_parent = new_parent_nodes.back(); + // handle the case where we are splitting the root node + if (m_sb->root_node == parent_node->node_id()) { + update_root(parent_node, new_parent_nodes, cp_ctx); + } + } if (last_parent->total_entries() && !last_parent->has_valid_edge()) { if (last_parent->compare_nth_key(last_parent_key, last_parent->total_entries() - 1) < 0) { BtreeLinkInfo child_info; last_parent->get_nth_value(last_parent->total_entries() - 1, &child_info, false /* copy */); - parent_node->update(parent_node->total_entries() - 1, last_parent_key, child_info); + last_parent->update(last_parent->total_entries() - 1, last_parent_key, child_info); LOGTRACEMOD(wbcache, "Repairing parent node={} with last_parent_key={} and child_info={}", - parent_node->node_id(), last_parent_key.to_string(), child_info.to_string()); + last_parent->node_id(), last_parent_key.to_string(), child_info.to_string()); } // if last key of children is less than the last key of parent, then we need to update the last key of non // interior child @@ -871,17 +916,15 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { bnodeid_t find_true_sibling(BtreeNodePtr const& node) { if (node == nullptr) return empty_bnodeid; - bnodeid_t sibling_id = empty_bnodeid; - if (node->has_valid_edge()) { - sibling_id = node->get_edge_value().bnode_id(); - } else { - sibling_id = node->next_bnode(); - } + bnodeid_t sibling_id = node->next_bnode(); if (sibling_id == empty_bnodeid) { return empty_bnodeid; } else { BtreeNodePtr sibling_node; - if (read_node_impl(sibling_id, sibling_node) != btree_status_t::success) { return empty_bnodeid; } + if (read_node_impl(sibling_id, sibling_node) != btree_status_t::success) { + LOGTRACEMOD(wbcache, "Failed to read sibling node with id {}", sibling_id); + return empty_bnodeid; + } if (sibling_node->is_node_deleted()) { LOGTRACEMOD(wbcache, "Sibling node [{}] is not the sibling for parent_node {}", diff --git a/src/lib/index/index_cp.cpp b/src/lib/index/index_cp.cpp index fd411526a..df41d0799 100644 --- a/src/lib/index/index_cp.cpp +++ b/src/lib/index/index_cp.cpp @@ -81,6 +81,11 @@ std::optional< IndexBufferPtr > IndexCPContext::next_dirty() { return ret; } +std::string IndexCPContext::to_string_small() { + return fmt::format("IndexCPContext cpid={}, dirty_buf_count={}, dirty_buf_list_size={}", m_cp->id(), m_dirty_buf_count.get(), + m_dirty_buf_list.size()); +} + std::string IndexCPContext::to_string() { std::string str{fmt::format("IndexCPContext cpid={} dirty_buf_count={} dirty_buf_list_size={}\n", m_cp->id(), m_dirty_buf_count.get(), m_dirty_buf_list.size())}; diff --git a/src/lib/index/index_cp.hpp b/src/lib/index/index_cp.hpp index dffb3113c..ad29fe1c4 100644 --- a/src/lib/index/index_cp.hpp +++ b/src/lib/index/index_cp.hpp @@ -163,6 +163,7 @@ struct IndexCPContext : public VDevCPContext { void prepare_flush_iteration(); std::optional< IndexBufferPtr > next_dirty(); std::string to_string(); + std::string to_string_small(); std::string to_string_with_dags(); uint16_t num_dags(); void to_string_dot(const std::string& filename); diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp index 4c81bd8c0..a935a311a 100644 --- a/src/lib/index/wb_cache.cpp +++ b/src/lib/index/wb_cache.cpp @@ -515,6 +515,30 @@ std::string IndexWBCache::to_string_dag_bufs(DagMap& dags, cp_id_t cp_id) { return str; } +void 
IndexWBCache::prune_up_buffers(IndexBufferPtr const& buf, std::vector< IndexBufferPtr >& pruned_bufs_to_repair) {
+    auto up_buf = buf->m_up_buffer;
+    if (!up_buf || !up_buf->m_wait_for_down_buffers.testz()) { return; }
+    // Read the grand up buffer only after the null check on up_buf above.
+    auto grand_up_buf = up_buf->m_up_buffer;
+
+    // The up buffer has no remaining down-buffer dependencies, so prune it and propagate the decrement up its chain.
+    LOGINFOMOD(wbcache, "\n\npruning up_buffer due to zero dependency of child\n up buffer {}\n buffer {}",
+               up_buf->to_string(), buf->to_string());
+    update_up_buffer_counters(up_buf);
+
+    pruned_bufs_to_repair.push_back(up_buf);
+    if (grand_up_buf && !grand_up_buf->is_meta_buf() && grand_up_buf->m_wait_for_down_buffers.testz()) {
+        LOGTRACEMOD(
+            wbcache,
+            "\nadding grand_buffer to repair list due to zero dependency of child\n grand buffer {}\n buffer {}",
+            grand_up_buf->to_string(), buf->to_string());
+        pruned_bufs_to_repair.push_back(grand_up_buf);
+    }
+}
+
 void IndexWBCache::recover(sisl::byte_view sb) {
     // If sb is empty, its possible a first time boot.
     if ((sb.bytes() == nullptr) || (sb.size() == 0)) {
@@ -573,6 +597,7 @@ void IndexWBCache::recover(sisl::byte_view sb) {
     potential_parent_recovered_bufs(
         [](const IndexBufferPtr& a, const IndexBufferPtr& b) { return a->m_node_level < b->m_node_level; });
 
+    std::vector< IndexBufferPtr > pruned_bufs_to_repair;
     LOGTRACEMOD(wbcache, "\n\n\nRecovery processing begins\n\n\n");
     for (auto const& [_, buf] : bufs) {
         load_buf(buf);
@@ -612,14 +637,7 @@
                 LOGTRACEMOD(wbcache, "remove_down_buffer {} from up buffer {}", buf->to_string(),
                             buf->m_up_buffer->to_string());
                 buf->m_up_buffer->remove_down_buffer(buf);
-                if (buf->m_up_buffer->m_wait_for_down_buffers.testz()) {
-                    // if up buffer has upbuffer, then we need to decrement its wait_for_down_buffers
-                    LOGINFOMOD(wbcache,
-                               "\n\npruning up_buffer due to zero dependency of child\n up buffer {}\n buffer {}",
-                               buf->m_up_buffer ? 
buf->m_up_buffer->to_string() : std::string("nullptr"), - buf->to_string()); - update_up_buffer_counters(buf->m_up_buffer); - } + prune_up_buffers(buf, pruned_bufs_to_repair); // buf->m_up_buffer = nullptr; } } @@ -651,6 +667,7 @@ void IndexWBCache::recover(sisl::byte_view sb) { // add deleted bufs to logs here as well auto modified_dags = generate_dag_buffers(bufs); LOGTRACEMOD(wbcache, "All unclean bufs list\n{}", detailed_log({}, pending_bufs)); + LOGTRACEMOD(wbcache, "All pruned bufs for recovery\n{}", detailed_log({}, pruned_bufs_to_repair)); LOGTRACEMOD(wbcache, "After recovery: {}", to_string_dag_bufs(modified_dags, icp_ctx->id())); #endif @@ -686,6 +703,15 @@ void IndexWBCache::recover(sisl::byte_view sb) { recover_buf(buf); } + // When we prune a buffer due to zero down dependency, there is a case where the key range of the parent needs to be adjusted. + // This can happen when a child is merged and its right sibling is flushed before the parent is flushed. + // And during recovery, we prune the node and keep the deleted child and keep the parent as is. + // We need to call repair_links directly on them as the recovery_buf() path will not trigger it. + for (auto const& buf : pruned_bufs_to_repair) { + LOGTRACEMOD(wbcache, "pruned buf {} is repaired", buf->to_string()); + index_service().repair_index_node(buf->m_index_ordinal, buf); + } + for (auto const& buf : deleted_bufs) { LOGTRACEMOD(wbcache, "freeing buf after repairing (last step) {}", buf->to_string()); m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(icp_ctx)); diff --git a/src/lib/index/wb_cache.hpp b/src/lib/index/wb_cache.hpp index 82221ab0d..42ec5270c 100644 --- a/src/lib/index/wb_cache.hpp +++ b/src/lib/index/wb_cache.hpp @@ -90,5 +90,6 @@ class IndexWBCache : public IndexWBCacheBase { bool was_node_committed(IndexBufferPtr const& buf); void load_buf(IndexBufferPtr const& buf); void update_up_buffer_counters(IndexBufferPtr const& buf); + void prune_up_buffers(IndexBufferPtr const& buf, std::vector< IndexBufferPtr >& bufs_to_repair); }; } // namespace homestore diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp index 254432cb5..06a2f26d2 100644 --- a/src/tests/test_index_crash_recovery.cpp +++ b/src/tests/test_index_crash_recovery.cpp @@ -60,6 +60,7 @@ SISL_OPTION_GROUP( (save_to_file, "", "save_to_file", "save to file", ::cxxopts::value< bool >()->default_value("0"), ""), (cleanup_after_shutdown, "", "cleanup_after_shutdown", "cleanup after shutdown", ::cxxopts::value< bool >()->default_value("1"), ""), + (print_keys_verbose_logging, "", "print_keys_verbose_logging", "print_keys_verbose_logging", ::cxxopts::value< bool >()->default_value("0"), ""), (seed, "", "seed", "random engine seed, use random if not defined", ::cxxopts::value< uint64_t >()->default_value("0"), "number")) @@ -71,6 +72,11 @@ void log_obj_life_counter() { LOGINFO("Object Life Counter\n:{}", str); } +#define print_keys_logging(msg) \ + if (SISL_OPTIONS.count("print_keys_verbose_logging")) { \ + this->print_keys(msg); \ + } + enum class OperationType { Put, Remove, @@ -481,7 +487,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT } void crash_and_recover_common(OperationList& operations, std::string filename = "") { - // this->print_keys("Btree prior to CP and susbsequent simulated crash: "); + print_keys_logging("Btree prior to CP and susbsequent simulated crash: "); LOGINFO("Before Crash: {} keys in shadow map and it is actually {} keys in tree - operations size 
{}", this->m_shadow_map.size(), tree_key_count(), operations.size()); @@ -491,6 +497,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT this->visualize_keys(b_filename); } + print_keys_logging("Before crash"); trigger_cp(false); LOGINFO("waiting for crash to recover"); this->wait_for_crash_recovery(true); @@ -500,7 +507,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT LOGINFO("Visualize the tree file after recovery : {}", rec_filename); this->visualize_keys(rec_filename); } - // this->print_keys("Post crash and recovery, btree structure: "); + print_keys_logging("Post crash and recovery, btree structure: "); sanity_check(operations); // Added to the index service right after recovery. Not needed here // test_common::HSTestHelper::trigger_cp(true); @@ -512,7 +519,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT LOGINFO("Visualize the tree after reapply {}", re_filename); this->visualize_keys(re_filename); } - // this->print_keys("Post reapply, btree structure: "); + print_keys_logging("Post reapply, btree structure: "); this->get_all(); LOGINFO("After reapply: {} keys in shadow map and actually {} in tress", this->m_shadow_map.size(), @@ -584,13 +591,14 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT test_common::HSTestHelper::trigger_cp(true); this->get_all(); this->m_shadow_map.save(this->m_shadow_filename); - // this->print_keys("reapply: after preload"); + print_keys_logging("reapply: after preload"); this->visualize_keys("tree_after_preload.dot"); for (uint32_t round = 1; round <= crash_test_options.rounds && !time_to_stop(); round++) { LOGINFO("\n\n\n\n\n\nRound {} of {}\n\n\n\n\n\n", round, crash_test_options.rounds); bool print_time = false; elapsed_time = get_elapsed_time_sec(m_start_time); + print_keys_logging(fmt::format("Round {}: before crash", round)); if (crash_test_options.load_mode) { operations = SequenceGenerator::load_from_file(fmt::format("/tmp/operations_{}.txt", round)); @@ -732,7 +740,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT this->m_run_time, elapsed_time * 100.0 / this->m_run_time, this->tree_key_count(), crash_test_options.num_entries, this->tree_key_count() * 100.0 / crash_test_options.num_entries); } - // this->print_keys(fmt::format("reapply: after round {}", round)); + print_keys_logging(fmt::format("reapply: after round {}", round)); if (renew_btree_after_crash) { this->reset_btree(); }; } this->destroy_btree(); diff --git a/src/tests/test_scripts/index_test.py b/src/tests/test_scripts/index_test.py index df55a30b9..25060a89d 100755 --- a/src/tests/test_scripts/index_test.py +++ b/src/tests/test_scripts/index_test.py @@ -12,7 +12,7 @@ class TestFailedError(Exception): def run_test(options, type): cmd_opts = f"--gtest_filter=BtreeConcurrentTest/{type}.ConcurrentAllOps --gtest_break_on_failure --cleanup_after_shutdown={options['cleanup_after_shutdown']} --init_device={options['init_device']} --preload_size={options['preload_size']} {options['log_mods']} --run_time={options['run_time']} --num_iters={options['num_iters']} --num_entries={options['num_entries']} --num_threads={options['threads']} --num_fibers={options['fibers']} {options['dev_list']} {options['op_list']}" - # print(f"Running test with options: {cmd_opts}") + print(f"Running test with options: {cmd_opts}") try: subprocess.check_call(f"{options['dirpath']}test_index_btree {cmd_opts}", stderr=subprocess.STDOUT, 
                              shell=True)
     except subprocess.CalledProcessError as e:
@@ -23,7 +23,7 @@ def run_test(options, type):
 
 def run_crash_test(options, crash_type='put', type=0):
     cmd_opts = f"--gtest_filter=IndexCrashTest/{type}.long_running_{crash_type}_crash --gtest_break_on_failure --min_keys_in_node={options['min_keys_in_node']} --max_keys_in_node={options['max_keys_in_node']} --num_entries_per_rounds={options['num_entries_per_rounds']} --init_device={options['init_device']} {options['log_mods']} --run_time={options['run_time']} --num_entries={options['num_entries']} --num_rounds={options['num_rounds']} {options['dev_list']} "
-    # print(f"Running test with options: {cmd_opts}")
+    print(f"Running test with options: {cmd_opts}")
     try:
         subprocess.check_call(f"{options['dirpath']}test_index_crash_recovery {cmd_opts}", stderr=subprocess.STDOUT,
                               shell=True)
@@ -146,8 +146,8 @@ def long_running(*args):
     options = parse_arguments()
     long_runnig_index(options, 0)
     long_running_clean_shutdown(options, 0)
-    long_runnig_index(options, 1)
-    long_running_clean_shutdown(options, 1)
+    # long_runnig_index(options, 1)
+    # long_running_clean_shutdown(options, 1)
     for i in range(20):
         print(f"Iteration {i + 1}")
         long_running_crash_put_remove(options)

From 898479211f07ab66bb61c0ddeb4b833151eb941b Mon Sep 17 00:00:00 2001
From: yuwmao <148639999+yuwmao@users.noreply.github.com>
Date: Tue, 15 Jul 2025 23:17:17 +0800
Subject: [PATCH 151/170] Support formatting new disk after bootstrap (#766)

In the disk replacement scenario, we replace a bad device with a brand new
device, which needs to be formatted. Formatting has three logical parts:

1. First block: this can be recovered from the other existing devices.
2. pdev info: 2.1 pdev header, 2.2 format chunk slots.
3. vdev info: this can be recovered from the other existing devices, and the
   missing chunks can be inferred. The new pdev is added into the vdev as
   needed.

Note:
The pdev_id is monotonically increasing. A chunk_id might be reused, so a
custom_chunk_selector should pay extra attention to it.
Expose pdev_name in VChunk, so users can get the logical-entity-to-physical-
device map, which is helpful for admins and operators.
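To make the recovery flow concrete, here is a paraphrased sketch of the boot
path this patch adds. It is editorial shorthand, not verbatim code from the
diff below; every identifier except load_devices_sketch() does appear in the
patch:

    // Editorial paraphrase of the new DeviceManager load path:
    void DeviceManager::load_devices_sketch() {
        std::vector< dev_info > to_format;
        for (auto& d : m_dev_infos) {
            auto fblk = PhysicalDev::read_first_block(d.dev_name, device_open_flags(d.dev_name));
            if (!fblk.is_valid()) {
                to_format.push_back(d); // brand-new disk: no valid first block yet
            }
            // else: the uuid is validated and the pdev is loaded as before
        }
        for (auto& d : to_format) { format_single_device(d); } // reuses the recovered first-block header
        load_vdevs(); // vdev info is recovered from the surviving pdevs
        // Finally, every static multi-pdev vdev that is undersized receives a
        // proportional share of chunks on the new pdevs via add_pdev_to_vdev(),
        // and the formatting is committed via commit_formatting().
    }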
--- conanfile.py | 2 +- src/include/homestore/vchunk.h | 1 + src/lib/device/device.h | 10 +- src/lib/device/device_manager.cpp | 353 ++++++++++++------ src/lib/device/hs_super_blk.h | 3 + src/lib/device/vchunk.cpp | 2 + .../replication/repl_dev/raft_repl_dev.cpp | 2 +- src/tests/test_device_manager.cpp | 214 ++++++++++- 8 files changed, 473 insertions(+), 114 deletions(-) diff --git a/conanfile.py b/conanfile.py index a84974fba..6e220bb31 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.18.6" + version = "6.19.0" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/vchunk.h b/src/include/homestore/vchunk.h index c3f020aa1..9dc0dd208 100644 --- a/src/include/homestore/vchunk.h +++ b/src/include/homestore/vchunk.h @@ -34,6 +34,7 @@ class VChunk { blk_num_t get_used_blks() const; blk_num_t get_defrag_nblks() const; uint32_t get_pdev_id() const; + const std::string& get_pdev_name() const; uint16_t get_chunk_id() const; cshared< Chunk > get_internal_chunk() const; uint64_t size() const; diff --git a/src/lib/device/device.h b/src/lib/device/device.h index 1c3843534..fa907f002 100644 --- a/src/lib/device/device.h +++ b/src/lib/device/device.h @@ -133,7 +133,8 @@ class DeviceManager { sisl::sparse_vector< std::unique_ptr< PhysicalDev > > m_all_pdevs; std::map< HSDevType, std::vector< PhysicalDev* > > m_pdevs_by_type; - uint32_t m_cur_pdev_id{0}; + uint32_t m_cur_pdev_id{0}; // This is a monotonically increasing value. In case of disk replacement, this value is + // not inherited, new device will get a new id. std::map< uint16_t, shared< Chunk > > m_chunks; // Chunks organized as array (indexed on chunk id) sisl::Bitset m_chunk_id_bm{hs_super_blk::MAX_CHUNKS_IN_SYSTEM}; // Bitmap to keep track of chunk ids available @@ -155,6 +156,7 @@ class DeviceManager { bool is_first_time_boot() const { return m_first_time_boot; } void format_devices(); + uint32_t format_single_device(dev_info& dinfo); void commit_formatting(); void load_devices(); void close_devices(); @@ -165,7 +167,11 @@ class DeviceManager { /// @param event_cb Event handler in case of /// @return shared< VirtualDev > create_vdev(vdev_parameters&& vdev_param); - + void compose_vparam(uint64_t vdev_id, vdev_parameters& vparam, std::vector< PhysicalDev* > pdevs); + std::map< PhysicalDev*, uint32_t > calculate_vdev_chunk_num_on_new_pdevs(shared< VirtualDev > vdev, + std::vector< PhysicalDev* > pdevs, + uint64_t total_chunk_num); + void add_pdev_to_vdev(shared< VirtualDev > vdev, PhysicalDev* pdev, uint32_t total_chunk_num_in_pdev); const Chunk* get_chunk(uint16_t chunk_id) { return get_chunk_mutable(chunk_id); } Chunk* get_chunk_mutable(uint16_t chunk_id) { diff --git a/src/lib/device/device_manager.cpp b/src/lib/device/device_manager.cpp index 28eb37e33..76169f15a 100644 --- a/src/lib/device/device_manager.cpp +++ b/src/lib/device/device_manager.cpp @@ -29,6 +29,8 @@ #include "common/homestore_utils.hpp" #include "common/homestore_assert.hpp" +#include + namespace homestore { static int determine_open_flags(io_flag oflags) { @@ -60,6 +62,8 @@ static bool is_hdd(const std::string& devname) { static void populate_vdev_info(const vdev_parameters& vparam, uint32_t vdev_id, const std::vector< PhysicalDev* >& pdevs, vdev_info* out_info); +static void populate_vparam(vdev_parameters& vparam, vdev_info& vinfo); + DeviceManager::DeviceManager(const std::vector< dev_info >& devs, vdev_create_cb_t 
vdev_create_cb) : m_dev_infos{devs}, m_vdev_create_cb{std::move(vdev_create_cb)} { bool found_hdd_dev{false}; @@ -96,6 +100,8 @@ DeviceManager::DeviceManager(const std::vector< dev_info >& devs, vdev_create_cb } void DeviceManager::format_devices() { + // Only the first time boot, we will generate the first block header. After that, the first block header will be + // loaded from the existing devices. ++m_first_blk_hdr.gen_number; m_first_blk_hdr.version = first_block_header::CURRENT_SUPERBLOCK_VERSION; std::strncpy(m_first_blk_hdr.product_name, first_block_header::PRODUCT_NAME, @@ -108,38 +114,44 @@ void DeviceManager::format_devices() { // Get common iomgr_attributes for (auto& dinfo : m_dev_infos) { - auto attr = iomgr::DriveInterface::get_attributes(dinfo.dev_name); - if (dinfo.dev_size == 0) { dinfo.dev_size = PhysicalDev::get_dev_size(dinfo.dev_name); } - auto sb_size = hs_super_blk::total_used_size(dinfo); - auto buf = hs_utils::iobuf_alloc(sb_size, sisl::buftag::superblk, attr.align_size); - std::memset(buf, 0, sb_size); - - first_block* fblk = r_cast< first_block* >(buf); - fblk->magic = first_block::HOMESTORE_MAGIC; - fblk->checksum = 0; // Computed while writing the first block - fblk->formatting_done = 0x0; // Formatting is not done yet, until homestore is completely started - fblk->hdr = m_first_blk_hdr; // Entire header is copied as is - auto pdev_id = populate_pdev_info(dinfo, attr, m_first_blk_hdr.system_uuid, fblk->this_pdev_hdr); - fblk->checksum = crc32_ieee(init_crc32, uintptr_cast(fblk), first_block::s_atomic_fb_size); - - auto pdev = std::make_unique< PhysicalDev >(dinfo, device_open_flags(dinfo.dev_name), fblk->this_pdev_hdr); - - LOGINFO("Formatting Homestore on Device={} with first block as: [{}] total_super_blk_size={}", dinfo.dev_name, - fblk->to_string(), sb_size); - pdev->write_super_block(buf, sb_size, hs_super_blk::first_block_offset()); + format_single_device(dinfo); + } +} - auto it = m_pdevs_by_type.find(dinfo.dev_type); - if (it == m_pdevs_by_type.end()) { - bool happened; - std::tie(it, happened) = m_pdevs_by_type.insert(std::pair{dinfo.dev_type, std::vector< PhysicalDev* >{}}); - } - it->second.push_back(pdev.get()); +uint32_t DeviceManager::format_single_device(dev_info& dinfo) { + HS_LOG_ASSERT(!m_first_blk_hdr.is_empty(), "Empty first block header, cannot format device {}", dinfo.dev_name); + auto attr = iomgr::DriveInterface::get_attributes(dinfo.dev_name); + if (dinfo.dev_size == 0) { dinfo.dev_size = PhysicalDev::get_dev_size(dinfo.dev_name); } + auto sb_size = hs_super_blk::total_used_size(dinfo); + auto buf = hs_utils::iobuf_alloc(sb_size, sisl::buftag::superblk, attr.align_size); + std::memset(buf, 0, sb_size); + + first_block* fblk = r_cast< first_block* >(buf); + fblk->magic = first_block::HOMESTORE_MAGIC; + fblk->checksum = 0; // Computed while writing the first block + fblk->formatting_done = 0x0; // Formatting is not done yet, until homestore is completely started + fblk->hdr = m_first_blk_hdr; // Entire header is copied as is + auto pdev_id = populate_pdev_info(dinfo, attr, m_first_blk_hdr.system_uuid, fblk->this_pdev_hdr); + fblk->checksum = crc32_ieee(init_crc32, uintptr_cast(fblk), first_block::s_atomic_fb_size); + + auto pdev = std::make_unique< PhysicalDev >(dinfo, device_open_flags(dinfo.dev_name), fblk->this_pdev_hdr); + + LOGINFO("Formatting Homestore on Device[dev_name={}, pdev_id={}] with first block as: [{}] total_super_blk_size={}", + dinfo.dev_name, pdev_id, fblk->to_string(), sb_size); + pdev->write_super_block(buf, sb_size, 
hs_super_blk::first_block_offset()); + + auto it = m_pdevs_by_type.find(dinfo.dev_type); + if (it == m_pdevs_by_type.end()) { + bool happened; + std::tie(it, happened) = m_pdevs_by_type.insert(std::pair{dinfo.dev_type, std::vector< PhysicalDev* >{}}); + } + it->second.push_back(pdev.get()); - pdev->format_chunks(); - m_all_pdevs[pdev_id] = std::move(pdev); + pdev->format_chunks(); + m_all_pdevs[pdev_id] = std::move(pdev); - hs_utils::iobuf_free(buf, sisl::buftag::superblk); - } + hs_utils::iobuf_free(buf, sisl::buftag::superblk); + return pdev_id; } void DeviceManager::load_devices() { @@ -153,29 +165,96 @@ void DeviceManager::load_devices() { m_boot_in_degraded_mode = true; } - for (const auto& d : m_dev_infos) { + // 1. Load all physical devices. + std::vector< dev_info > pdevs_to_format; + for (auto& d : m_dev_infos) { first_block fblk = PhysicalDev::read_first_block(d.dev_name, device_open_flags(d.dev_name)); pdev_info_header* pinfo = &fblk.this_pdev_hdr; - RELEASE_ASSERT_EQ(pinfo->get_system_uuid_str(), m_first_blk_hdr.get_system_uuid_str(), - "Device {} has uuid stamp different than this instance uuid. Perhaps device from other " - "homestore is provided?", - d.dev_name); - - auto pdev = std::make_unique< PhysicalDev >(d, device_open_flags(d.dev_name), *pinfo); - LOGINFO("Loading Homestore from Device={} with first block as: [{}]", d.dev_name, fblk.to_string()); - - auto it = m_pdevs_by_type.find(d.dev_type); - if (it == m_pdevs_by_type.end()) { - bool happened; - std::tie(it, happened) = m_pdevs_by_type.insert(std::pair{d.dev_type, std::vector< PhysicalDev* >{}}); + if (!fblk.is_valid()) { + pdevs_to_format.emplace_back(d); + LOGINFO("Empty first block found on device {}, format it", d.dev_name); + } else { + RELEASE_ASSERT_EQ(pinfo->get_system_uuid_str(), m_first_blk_hdr.get_system_uuid_str(), + "Device {} has uuid stamp different than this instance uuid. Perhaps device from other " + "homestore is provided?", + d.dev_name); + + auto pdev = std::make_unique< PhysicalDev >(d, device_open_flags(d.dev_name), *pinfo); + LOGINFO("Loading Homestore from Device={} with first block as: [{}]", d.dev_name, fblk.to_string()); + + auto it = m_pdevs_by_type.find(d.dev_type); + if (it == m_pdevs_by_type.end()) { + bool happened; + std::tie(it, happened) = m_pdevs_by_type.insert(std::pair{d.dev_type, std::vector< PhysicalDev* >{}}); + } + it->second.push_back(pdev.get()); + m_all_pdevs[pinfo->pdev_id] = std::move(pdev); + m_cur_pdev_id = std::max(m_cur_pdev_id, pinfo->pdev_id); } - it->second.push_back(pdev.get()); + } - m_all_pdevs[pinfo->pdev_id] = std::move(pdev); + // 2. format new devices. + for (auto& d : pdevs_to_format) { + auto pdev_id = format_single_device(d); + LOGINFO("Device {} has been formatted, pdev_id {}", d.dev_name, pdev_id); } + // 3. Recover vdevs from the physical devices. load_vdevs(); + + if (pdevs_to_format.empty()) return; + + // 4. Add new physical devices to existing vdevs. 
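Before the step-4 loop below, an aside on the VChunk::get_pdev_name() accessor
this patch exposes: it lets operator tooling dump the chunk-to-physical-device
mapping after a replacement. A minimal sketch (the function and its input are
illustrative):

    // Illustrative: report which physical device backs each chunk of a vdev.
    void dump_chunk_placement(std::vector< homestore::VChunk > const& vchunks) {
        for (auto const& vc : vchunks) {
            LOGINFO("chunk_id={} pdev_id={} pdev_name={}", vc.get_chunk_id(), vc.get_pdev_id(),
                    vc.get_pdev_name()); // get_pdev_name() is the newly exposed accessor
        }
    }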
+ for (auto vdev : m_vdevs) { + vdev_parameters vparam; + auto vinfo = vdev->info(); + populate_vparam(vparam, vinfo); + if (vparam.size_type == vdev_size_type_t::VDEV_SIZE_DYNAMIC || + vparam.multi_pdev_opts == vdev_multi_pdev_opts_t::SINGLE_FIRST_PDEV || + vparam.multi_pdev_opts == vdev_multi_pdev_opts_t::SINGLE_ANY_PDEV) { + LOGINFO("Skipping adding new devices to vdev {}, as it is dynamic or single pdev type", vinfo.get_name()); + continue; + } + + std::vector< PhysicalDev* > pdevs = pdevs_by_type_internal(vparam.dev_type); + RELEASE_ASSERT_GT(pdevs.size(), 0, + "vdev is loaded from at least one pdev, but unable to find any pdevs for given vdev type"); + RELEASE_ASSERT(vparam.blk_size % pdevs[0]->align_size() == 0, "blk_size should be multiple of pdev align_size"); + + // vparam.num_chunks will be inferred. + compose_vparam(vdev->info().vdev_id, vparam, pdevs); + if (vdev->get_pdevs().size() == pdevs.size()) { + LOGDEBUG("Virtual device {} is already sized correctly, no new devices to add", + vdev->info().get_name()); + continue; + } + LOGINFO( + "Virtual device {} is undersized, pdevs already added={}, qualified pdevs ={}, need to add new devices to it", + vdev->info().get_name(), vdev->get_pdevs().size(), pdevs.size()); + + // calculate the number of chunks to be created in each new pdev + auto pdev_chunk_num_map = calculate_vdev_chunk_num_on_new_pdevs(vdev, pdevs, vparam.num_chunks); + + std::unique_lock lg{m_vdev_mutex}; + auto buf = hs_utils::iobuf_alloc(vdev_info::size, sisl::buftag::superblk, pdevs[0]->align_size()); + std::memcpy(buf, &vinfo, sizeof(vdev_info)); + uint64_t offset = hs_super_blk::vdev_sb_offset() + (vdev->info().vdev_id * vdev_info::size); + + // add the new pdevs to the vdev + for (auto pdev_to_add : pdev_chunk_num_map) { + auto pdev = pdev_to_add.first; + add_pdev_to_vdev(vdev, pdev_to_add.first, pdev_to_add.second); + LOGINFO("Added pdev[name={}, id={}] with total_chunk_num_in_pdev={} to vdev {}", pdev->get_devname(), + pdev->pdev_id(), pdev_to_add.second, vdev->info().get_name()); + + // Update vdev info in the super block area of the pdev + pdev->write_super_block(buf, vdev_info::size, offset); + } + + hs_utils::iobuf_free(buf, sisl::buftag::superblk); + commit_formatting(); + } } void DeviceManager::commit_formatting() { @@ -212,10 +291,62 @@ shared< VirtualDev > DeviceManager::create_vdev(vdev_parameters&& vparam) { auto vdev_id = m_vdev_id_bm.get_next_reset_bit(0u); if (vdev_id == sisl::Bitset::npos) { throw std::out_of_range("System has no room for additional vdev"); } m_vdev_id_bm.set_bit(vdev_id); - std::vector< PhysicalDev* > pdevs = pdevs_by_type_internal(vparam.dev_type); RELEASE_ASSERT_GT(pdevs.size(), 0, "Unable to find any pdevs for given vdev type, can't create vdev"); RELEASE_ASSERT(vparam.blk_size % pdevs[0]->align_size() == 0, "blk_size should be multiple of pdev align_size"); + + // Populate the vdev parameters based on the given cfg and pdevs + compose_vparam(vdev_id, vparam, pdevs); + + // Convert the vparameters to the vdev_info + auto buf = hs_utils::iobuf_alloc(vdev_info::size, sisl::buftag::superblk, pdevs[0]->align_size()); + auto vinfo = new (buf) vdev_info(); + populate_vdev_info(vparam, vdev_id, pdevs, vinfo); + + // Do a callback for the upper layer to create the vdev instance from vdev_info + shared< VirtualDev > vdev = m_vdev_create_cb(*vinfo, false /* load_existing */); + m_vdevs[vdev_id] = vdev; + + // different type might have different capacity, so we need to spread all the newly created chunks to all pdevs + // according 
to their capacity
+
+    auto pdev_chunk_num_map = calculate_vdev_chunk_num_on_new_pdevs(vdev, pdevs, vparam.num_chunks);
+
+    uint32_t total_created_chunks{0};
+
+    for (auto& pdev : pdevs) {
+        if (total_created_chunks >= vparam.num_chunks) break;
+
+        // the total number of chunks to be created in this pdev
+        auto total_chunk_num_in_pdev = pdev_chunk_num_map[pdev];
+
+        RELEASE_ASSERT(vparam.num_chunks >= total_chunk_num_in_pdev,
+                       "chunks in pdev {} is {}, larger than total chunks {} , which is expected to be created ",
+                       pdev->get_devname(), total_chunk_num_in_pdev, vparam.num_chunks);
+
+        LOGINFO("{} chunks are created on pdev {} for vdev {}, pdev data size is {}", total_chunk_num_in_pdev,
+                pdev->get_devname(), vparam.vdev_name, pdev->data_size());
+
+        add_pdev_to_vdev(vdev, pdev, total_chunk_num_in_pdev);
+        total_created_chunks += total_chunk_num_in_pdev;
+    }
+
+    LOGINFO("{} chunks are created for vdev {}, expected {}", total_created_chunks, vparam.vdev_name, vparam.num_chunks);
+    // Handle any initialization needed.
+    vdev->init();
+    // Locate and write the vdev info in the super blk area of all pdevs this vdev will be created on
+    for (auto& pdev : pdevs) {
+        uint64_t offset = hs_super_blk::vdev_sb_offset() + (vdev_id * vdev_info::size);
+        pdev->write_super_block(buf, vdev_info::size, offset);
+    }
+
+    vinfo->~vdev_info();
+    hs_utils::iobuf_free(buf, sisl::buftag::superblk);
+    LOGINFO("Virtual Dev={} of size={} successfully created", vparam.vdev_name, in_bytes(vparam.vdev_size));
+    return vdev;
+}
+
+void DeviceManager::compose_vparam(uint64_t vdev_id, vdev_parameters& vparam, std::vector< PhysicalDev* > pdevs) {
     // Identify the number of chunks
     if (vparam.multi_pdev_opts == vdev_multi_pdev_opts_t::ALL_PDEV_STRIPED) {
         auto total_streams = std::accumulate(pdevs.begin(), pdevs.end(), 0u,
@@ -317,74 +448,66 @@ shared< VirtualDev > DeviceManager::create_vdev(vdev_parameters&& vparam) {
         "adjusted as follows: VDev_Size={} Num_pdevs={} Total_chunks_across_all_pdevs={} Each_Chunk_Size={}",
         vparam.vdev_name, in_bytes(input_vdev_size), vdev_id, vparam.multi_pdev_opts, in_bytes(vparam.vdev_size),
         pdevs.size(), vparam.num_chunks, in_bytes(vparam.chunk_size));
+}
 
-    // Convert the vparameters to the vdev_info
-    auto buf = hs_utils::iobuf_alloc(vdev_info::size, sisl::buftag::superblk, pdevs[0]->align_size());
-    auto vinfo = new (buf) vdev_info();
-    populate_vdev_info(vparam, vdev_id, pdevs, vinfo);
-
-    // Do a callback for the upper layer to create the vdev instance from vdev_info
-    shared< VirtualDev > vdev = m_vdev_create_cb(*vinfo, false /* load_existing */);
-    m_vdevs[vdev_id] = vdev;
-
-    // different type might have different capacity, so we need to spread all the newly created chunks to all pdevs
-    // according to their capacity
-
-    // the total size of all pdevs of a certain type
-    uint64_t total_type_size = std::accumulate(pdevs.begin(), pdevs.end(), 0ull,
+// The actual total chunk num might not be the same as vdev.num_chunks, as it is calculated based on the pdevs data
+// size proportion. 
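A worked example of this proportioning, with illustrative numbers (none of the
values below come from the patch; the formula mirrors the function that
follows):

    // A vdev holds 64 chunks of 32 GiB; its already-added pdevs total 2 TiB of
    // data size, and a new 1 TiB pdev joins.
    uint32_t chunk_num = 64;
    uint64_t existing_size = 2ull << 40; // data size of the already-added pdevs
    uint64_t new_pdev_size = 1ull << 40;
    uint64_t chunk_size = 32ull << 30;
    auto expected = static_cast< uint32_t >(chunk_num * (new_pdev_size / static_cast< float >(existing_size))); // 32
    auto available = static_cast< uint32_t >(new_pdev_size / chunk_size); // 32 chunks fit on the new pdev
    auto to_create = std::min(expected, available); // the new pdev gets 32 chunks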
+std::map< PhysicalDev*, uint32_t >
+DeviceManager::calculate_vdev_chunk_num_on_new_pdevs(shared< VirtualDev > vdev, std::vector< PhysicalDev* > pdevs,
+ uint64_t total_chunk_num) {
+ auto added_pdevs = vdev->get_pdevs();
+ uint64_t total_pdev_data_size = 0;
+ uint32_t chunk_num = 0;
+ if (added_pdevs.size() == 0) {
+ // the vdev is newly created, so we need to calculate the total bytes of all pdevs
+ total_pdev_data_size = std::accumulate(pdevs.begin(), pdevs.end(), 0ull,
 [](uint64_t r, const PhysicalDev* a) { return r + a->data_size(); });
+ chunk_num = total_chunk_num;
+ LOGDEBUG("total size of type {} in this homestore is {}", vdev->get_dev_type(), total_pdev_data_size)
+ } else {
+ // the vdev is recovered from existing pdevs; in this case, calculate the number of chunks needed based on the
+ // proportional relationship between the size of the new disk and the existing disks.
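+ // Hypothetical example: a 3-pdev striped vdev with 2 chunks per pdev loses one 100 GiB pdev; the
+ // two survivors (200 GiB combined) still hold chunk_num = 4, so an equally sized replacement pdev
+ // is assigned about 4 * (100 / 200) = 2 chunks, restoring the share the failed pdev carried.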
+ total_pdev_data_size = std::accumulate(added_pdevs.begin(), added_pdevs.end(), 0ull, + [](uint64_t r, const PhysicalDev* a) { return r + a->data_size(); }); + chunk_num = vdev->get_total_chunk_num(); + LOGDEBUG("size of all added pdevs={}, current_chunk_num={} of type {} in vdev {}", total_pdev_data_size, + chunk_num, vdev->get_dev_type(), vdev->info().get_name()); + } + std::map< PhysicalDev*, uint32_t > pdev_chunk_num_map; + for (auto pdev : pdevs) { + if (added_pdevs.contains(pdev)) { + LOGDEBUG("pdev {} is already added to vdev {}, skip it", pdev->get_devname(), vdev->info().get_name()); + continue; } - - total_created_chunks += total_chunk_num_in_pdev; + auto expect_chunk_num_on_pdev = + static_cast< uint32_t >(chunk_num * (pdev->data_size() / static_cast< float >(total_pdev_data_size))); + auto available_chunks_on_pdev = static_cast< uint32_t >(pdev->data_size() / vdev->info().chunk_size); + pdev_chunk_num_map[pdev] = std::min(expect_chunk_num_on_pdev, available_chunks_on_pdev); + LOGINFO("pdev {} should add {} chunks to vdev {} , expect_chunk_num_on_pdev={}, available_chunks_on_pdev={}", + pdev->get_devname(), pdev_chunk_num_map[pdev], vdev->info().get_name(), expect_chunk_num_on_pdev, + available_chunks_on_pdev); } + return pdev_chunk_num_map; +} - LOGINFO("{} chunks is created for vdev {}, expected {}", total_created_chunks, vparam.vdev_name, vparam.num_chunks); - // Handle any initialization needed. - vdev->init(); +void DeviceManager::add_pdev_to_vdev(shared< VirtualDev > vdev, PhysicalDev* pdev, uint32_t chunks_on_pdev) { + std::vector< uint32_t > chunk_ids; - // Locate and write the vdev info in the super blk area of all pdevs this vdev will be created on - for (auto& pdev : pdevs) { - uint64_t offset = hs_super_blk::vdev_sb_offset() + (vdev_id * vdev_info::size); - pdev->write_super_block(buf, vdev_info::size, offset); + LOGINFO("Add pdev {} to vdev {}, chunks_on_pdev={}", pdev->get_devname(), vdev->info().get_name(), chunks_on_pdev); + // Create chunk ids for all chunks in each of these pdevs + for (uint32_t c{0}; c < chunks_on_pdev; ++c) { + auto chunk_id = m_chunk_id_bm.get_next_reset_bit(0u); + if (chunk_id == sisl::Bitset::npos) { throw std::out_of_range("System has no room for additional chunks"); } + m_chunk_id_bm.set_bit(chunk_id); + chunk_ids.push_back(chunk_id); } - vinfo->~vdev_info(); - hs_utils::iobuf_free(buf, sisl::buftag::superblk); - LOGINFO("Virtal Dev={} of size={} successfully created", vparam.vdev_name, in_bytes(vparam.vdev_size)); - return vdev; + // Create all chunks at one shot and add each one to the vdev + auto chunks = pdev->create_chunks(chunk_ids, vdev->info().get_vdev_id(), vdev->info().chunk_size); + for (auto& chunk : chunks) { + vdev->add_chunk(chunk, true /* fresh_chunk */); + m_chunks[chunk->chunk_id()] = chunk; + } } void DeviceManager::load_vdevs() { @@ -404,6 +527,7 @@ void DeviceManager::load_vdevs() { for (auto& pdev : m_all_pdevs) { // we might have some missing pdevs in the sparse_vector m_all_pdevs, so skip them if (!pdev) continue; + // Empty device will skip this callback. pdev->load_chunks([this](cshared< Chunk >& chunk) -> bool { // Found a chunk for which vdev information is missing if (m_vdevs[chunk->vdev_id()] == nullptr) { @@ -520,7 +644,7 @@ uint32_t DeviceManager::populate_pdev_info(const dev_info& dinfo, const iomgr::d const uuid_t& uuid, pdev_info_header& pinfo) { bool hdd = is_hdd(dinfo.dev_name); - pinfo.pdev_id = m_cur_pdev_id++; + pinfo.pdev_id = ++m_cur_pdev_id; pinfo.mirror_super_block = hdd ? 
0x01 : 0x00; pinfo.max_pdev_chunks = hs_super_blk::max_chunks_in_pdev(dinfo); @@ -572,6 +696,23 @@ static void populate_vdev_info(const vdev_parameters& vparam, uint32_t vdev_id, out_info->compute_checksum(); } +// This function populates the vdev_parameters from the vdev_info(loaded from existing disks) in the vdev recovery +// process. Because vdev_info doesn't store chunk_num, leave vparam.chunk_num empty and it will be calculated in +// `compose_vparam` as an intermediate param to calculate the chunk num on each pdev. +static void populate_vparam(vdev_parameters& vparam, vdev_info& vinfo) { + vparam.vdev_size = vinfo.vdev_size; + vparam.chunk_size = vinfo.chunk_size; + vparam.blk_size = vinfo.blk_size; + vparam.multi_pdev_opts = static_cast< vdev_multi_pdev_opts_t >(vinfo.multi_pdev_choice); + vparam.dev_type = static_cast< HSDevType >(vinfo.hs_dev_type); + vparam.vdev_name = vinfo.name; + vparam.context_data = sisl::blob{vinfo.get_user_private(), vinfo.user_private_size}; + vparam.alloc_type = static_cast< blk_allocator_type_t >(vinfo.alloc_type); + vparam.chunk_sel_type = static_cast< chunk_selector_type_t >(vinfo.chunk_sel_type); + vparam.size_type = vinfo.size_type; + vparam.use_slab_allocator = vinfo.use_slab_allocator == 1; +} + std::vector< vdev_info > DeviceManager::read_vdev_infos(const std::vector< PhysicalDev* >& pdevs) { std::vector< vdev_info > ret_vinfos; auto buf = diff --git a/src/lib/device/hs_super_blk.h b/src/lib/device/hs_super_blk.h index 9d0a3140d..bdce07a4d 100644 --- a/src/lib/device/hs_super_blk.h +++ b/src/lib/device/hs_super_blk.h @@ -100,6 +100,9 @@ struct first_block_header { get_product_name(), get_system_uuid_str()); return str; } + bool is_empty() const { + return gen_number == 0 && version == 0 && std::string(product_name).empty(); + } }; struct pdev_info_header { diff --git a/src/lib/device/vchunk.cpp b/src/lib/device/vchunk.cpp index 69d8b9579..ecc9c132b 100644 --- a/src/lib/device/vchunk.cpp +++ b/src/lib/device/vchunk.cpp @@ -35,6 +35,8 @@ blk_num_t VChunk::get_defrag_nblks() const { return m_internal_chunk->blk_alloca uint32_t VChunk::get_pdev_id() const { return m_internal_chunk->physical_dev()->pdev_id(); } +const std::string& VChunk::get_pdev_name() const { return m_internal_chunk->physical_dev()->get_devname(); } + uint16_t VChunk::get_chunk_id() const { return m_internal_chunk->chunk_id(); } uint64_t VChunk::size() const { return m_internal_chunk->size(); } diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index da7efaf38..a222e4f48 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -2068,7 +2068,7 @@ void RaftReplDev::monitor_replace_member_replication_status() { if (!catch_up) { RD_LOGD(NO_TRACE_ID, "Checking replace member status, task_id={},replica_in={} with lsn={}, replica_out={} with lsn={}", - boost::uuids::to_string(replica_in), boost::uuids::to_string(replica_in), in_lsn, + task_id, boost::uuids::to_string(replica_in), in_lsn, boost::uuids::to_string(replica_out), out_lsn); return; } diff --git a/src/tests/test_device_manager.cpp b/src/tests/test_device_manager.cpp index 4077bc917..2ca1df3a8 100644 --- a/src/tests/test_device_manager.cpp +++ b/src/tests/test_device_manager.cpp @@ -79,9 +79,11 @@ class DeviceMgrTest : public ::testing::Test { return std::make_shared< homestore::VirtualDev >(*m_dmgr, vinfo_tmp, nullptr /* event_cb */, false); }); if (m_dmgr->is_first_time_boot()) { + LOGINFO("First time boot, 
formatting devices"); m_dmgr->format_devices(); m_dmgr->commit_formatting(); } else { + LOGINFO("Not first time boot, loading devices"); m_dmgr->load_devices(); } m_pdevs = m_dmgr->get_pdevs_by_dev_type(homestore::HSDevType::Data); @@ -95,6 +97,12 @@ class DeviceMgrTest : public ::testing::Test { setup_device_manager(); } + void add_data_file(std::string fname, uint64_t data_dev_size) { + init_file(fname, data_dev_size); + m_data_dev_names.emplace_back(fname); + m_dev_infos.emplace_back(std::filesystem::canonical(fname).string(), homestore::HSDevType::Data); + } + virtual void SetUp() override { auto const data_ndevices = SISL_OPTIONS["num_data_devs"].as< uint32_t >(); auto const data_dev_size = SISL_OPTIONS["data_dev_size_mb"].as< uint64_t >() * 1024 * 1024; @@ -103,9 +111,7 @@ class DeviceMgrTest : public ::testing::Test { homestore::in_bytes(data_dev_size)); for (uint32_t i{0}; i < data_ndevices; ++i) { auto fname = std::string{"/tmp/test_devmgr_data_" + std::to_string(i + 1)}; - init_file(fname, data_dev_size); - m_data_dev_names.emplace_back(fname); - m_dev_infos.emplace_back(std::filesystem::canonical(fname).string(), homestore::HSDevType::Data); + add_data_file(fname, data_dev_size); } setup_device_manager(); @@ -118,7 +124,7 @@ class DeviceMgrTest : public ::testing::Test { remove_files(m_data_dev_names); } - void validate_striped_vdevs() { + void validate_striped_vdevs(uint32_t expected_pdev_num = 3) { for (auto& vdev : m_vdevs) { auto chunks = vdev->get_chunks(); ASSERT_EQ(vdev->get_total_chunk_num(), m_pdevs.size() * 2) @@ -134,6 +140,8 @@ class DeviceMgrTest : public ::testing::Test { if (!inserted) { ++(it->second); } } + ASSERT_TRUE(chunks_in_pdev_count.size() == expected_pdev_num) + << "pdev num mismatch, expected " << expected_pdev_num << " but found " << chunks_in_pdev_count.size(); for (const auto& [pdev, count] : chunks_in_pdev_count) { ASSERT_EQ(count, 2) << "Every pdev should have exactly 2 chunks, that has not happened here"; } @@ -182,6 +190,204 @@ TEST_F(DeviceMgrTest, StripedVDevCreation) { this->validate_striped_vdevs(); } +TEST_F(DeviceMgrTest, ReplaceDeviceWithEmptyDevice) { + static constexpr uint32_t num_test_vdevs = 5; + uint64_t avail_size{0}; + for (auto& pdev : m_pdevs) { + avail_size += pdev->data_size(); + } + + uint32_t size_pct = 4; + uint64_t remain_size = avail_size; + + LOGINFO("Step 1: Creating {} vdevs with combined size as {}", num_test_vdevs, in_bytes(avail_size)); + for (uint32_t i = 0; i < num_test_vdevs; ++i) { + std::string name = "test_vdev_" + std::to_string(i + 1); + uint64_t size = std::min(remain_size, (avail_size * size_pct) / 100); + remain_size -= size; + size_pct *= 2; // Double the next vdev size + + LOGINFO("Step 1a: Creating vdev of name={} with size={}", name, in_bytes(size)); + auto vdev = + m_dmgr->create_vdev(homestore::vdev_parameters{.vdev_name = name, + .vdev_size = size, + .num_chunks = uint32_cast(m_pdevs.size() * 2), + .blk_size = 4096, + .dev_type = HSDevType::Data, + .alloc_type = blk_allocator_type_t::none, + .chunk_sel_type = chunk_selector_type_t::NONE, + .multi_pdev_opts = vdev_multi_pdev_opts_t::ALL_PDEV_STRIPED, + .context_data = sisl::blob{}}); + m_vdevs.push_back(std::move(vdev)); + } + + LOGINFO("Step 2: Validating all vdevs if they have created with correct number of chunks"); + this->validate_striped_vdevs(); + + auto fpath = m_data_dev_names[0]; + m_data_dev_names.erase(m_data_dev_names.begin()); + auto dinfo = m_dev_infos[0]; + m_dev_infos.erase(m_dev_infos.begin()); + LOGINFO("Step 3a: Remove device to 
simulate device failure, file={}", fpath); + if (std::filesystem::exists(fpath)) { std::filesystem::remove(fpath); } + LOGINFO("Step 3b: Restart dmgr", fpath); + this->restart(); + + LOGINFO("Step 4: Validate after one device is removed"); + this->validate_striped_vdevs(2); + + LOGINFO("Step 5: Recreate file to simulate a new device", fpath); + auto const data_dev_size = SISL_OPTIONS["data_dev_size_mb"].as< uint64_t >() * 1024 * 1024; + this->add_data_file(fpath, data_dev_size); + + LOGINFO("Step 6: Restart and validate if new device can be added to vdevs"); + this->restart(); + this->validate_striped_vdevs(); + + LOGINFO("Step 7: Restart and validate again"); + this->restart(); + this->validate_striped_vdevs(); +} + +TEST_F(DeviceMgrTest, ReplaceTwoDevicesAtOnce) { + static constexpr uint32_t num_test_vdevs = 5; + uint64_t avail_size{0}; + for (auto& pdev : m_pdevs) { + avail_size += pdev->data_size(); + } + + uint32_t size_pct = 4; + uint64_t remain_size = avail_size; + + LOGINFO("Step 1: Creating {} vdevs with combined size as {}", num_test_vdevs, in_bytes(avail_size)); + for (uint32_t i = 0; i < num_test_vdevs; ++i) { + std::string name = "test_vdev_" + std::to_string(i + 1); + uint64_t size = std::min(remain_size, (avail_size * size_pct) / 100); + remain_size -= size; + size_pct *= 2; // Double the next vdev size + + LOGINFO("Step 1a: Creating vdev of name={} with size={}", name, in_bytes(size)); + auto vdev = + m_dmgr->create_vdev(homestore::vdev_parameters{.vdev_name = name, + .vdev_size = size, + .num_chunks = uint32_cast(m_pdevs.size() * 2), + .blk_size = 4096, + .dev_type = HSDevType::Data, + .alloc_type = blk_allocator_type_t::none, + .chunk_sel_type = chunk_selector_type_t::NONE, + .multi_pdev_opts = vdev_multi_pdev_opts_t::ALL_PDEV_STRIPED, + .context_data = sisl::blob{}}); + m_vdevs.push_back(std::move(vdev)); + } + + LOGINFO("Step 2: Validating all vdevs if they have created with correct number of chunks"); + this->validate_striped_vdevs(); + + auto fpath1 = m_data_dev_names[0]; + m_data_dev_names.erase(m_data_dev_names.begin()); + auto dinfo = m_dev_infos[0]; + m_dev_infos.erase(m_dev_infos.begin()); + LOGINFO("Step 3a: Remove device to simulate device failure, file={}", fpath1); + if (std::filesystem::exists(fpath1)) { std::filesystem::remove(fpath1); } + + auto fpath2 = m_data_dev_names[1]; + m_data_dev_names.erase(m_data_dev_names.end()); + auto dinfo2 = m_dev_infos[1]; + m_dev_infos.erase(m_dev_infos.end()); + LOGINFO("Step 3a: Remove device to simulate device failure, file={}", fpath2); + if (std::filesystem::exists(fpath2)) { std::filesystem::remove(fpath2); } + + LOGINFO("Step 3b: Restart dmgr"); + this->restart(); + + LOGINFO("Step 4: Validate after one device is removed"); + this->validate_striped_vdevs(1); + + LOGINFO("Step 5: Recreate files to simulate new devices"); + auto const data_dev_size = SISL_OPTIONS["data_dev_size_mb"].as< uint64_t >() * 1024 * 1024; + this->add_data_file(fpath1, data_dev_size); + this->add_data_file(fpath2, data_dev_size); + + LOGINFO("Step 6: Restart and validate if new device can be added to vdevs"); + this->restart(); + this->validate_striped_vdevs(); + + LOGINFO("Step 7: Restart and validate again"); + this->restart(); + this->validate_striped_vdevs(); +} + +TEST_F(DeviceMgrTest, ReplaceTwoDevicesOneByOne) { + static constexpr uint32_t num_test_vdevs = 5; + uint64_t avail_size{0}; + for (auto& pdev : m_pdevs) { + avail_size += pdev->data_size(); + } + + uint32_t size_pct = 4; + uint64_t remain_size = avail_size; + + LOGINFO("Step 1: 
Creating {} vdevs with combined size as {}", num_test_vdevs, in_bytes(avail_size)); + for (uint32_t i = 0; i < num_test_vdevs; ++i) { + std::string name = "test_vdev_" + std::to_string(i + 1); + uint64_t size = std::min(remain_size, (avail_size * size_pct) / 100); + remain_size -= size; + size_pct *= 2; // Double the next vdev size + + LOGINFO("Step 1a: Creating vdev of name={} with size={}", name, in_bytes(size)); + auto vdev = + m_dmgr->create_vdev(homestore::vdev_parameters{.vdev_name = name, + .vdev_size = size, + .num_chunks = uint32_cast(m_pdevs.size() * 2), + .blk_size = 4096, + .dev_type = HSDevType::Data, + .alloc_type = blk_allocator_type_t::none, + .chunk_sel_type = chunk_selector_type_t::NONE, + .multi_pdev_opts = vdev_multi_pdev_opts_t::ALL_PDEV_STRIPED, + .context_data = sisl::blob{}}); + m_vdevs.push_back(std::move(vdev)); + } + + LOGINFO("Step 2: Validating all vdevs if they have created with correct number of chunks"); + this->validate_striped_vdevs(); + + auto fpath1 = m_data_dev_names[0]; + m_data_dev_names.erase(m_data_dev_names.begin()); + auto dinfo = m_dev_infos[0]; + m_dev_infos.erase(m_dev_infos.begin()); + LOGINFO("Step 3a: Remove device to simulate device failure, file={}", fpath1); + if (std::filesystem::exists(fpath1)) { std::filesystem::remove(fpath1); } + + auto fpath2 = m_data_dev_names[1]; + m_data_dev_names.erase(m_data_dev_names.end()); + auto dinfo2 = m_dev_infos[1]; + m_dev_infos.erase(m_dev_infos.end()); + LOGINFO("Step 3a: Remove device to simulate device failure, file={}", fpath2); + if (std::filesystem::exists(fpath2)) { std::filesystem::remove(fpath2); } + + LOGINFO("Step 3b: Restart dmgr after removing devices"); + this->restart(); + + LOGINFO("Step 4: Validate after devices is removed"); + this->validate_striped_vdevs(1); + + LOGINFO("Step 5: Recreate file to simulate replacement with a new device, file={}", fpath1); + auto const data_dev_size = SISL_OPTIONS["data_dev_size_mb"].as< uint64_t >() * 1024 * 1024; + this->add_data_file(fpath1, data_dev_size); + + this->restart(); + this->validate_striped_vdevs(2); + + LOGINFO("Step 6: Recreate file to simulate replacement with a new device, file={}", fpath2); + this->add_data_file(fpath2, data_dev_size); + this->restart(); + this->validate_striped_vdevs(); + + LOGINFO("Step 7: Restart and validate again"); + this->restart(); + this->validate_striped_vdevs(); +} + TEST_F(DeviceMgrTest, SmallStripedVDevCreation) { std::string name = "test_vdev_small"; From 1eb131b978fed7209eae8920da092e201cb64f72 Mon Sep 17 00:00:00 2001 From: yuwmao <148639999+yuwmao@users.noreply.github.com> Date: Thu, 17 Jul 2025 08:06:52 +0800 Subject: [PATCH 152/170] Refine the algorithm to calculate pdev chunks on vdev (#772) --- conanfile.py | 2 +- src/lib/device/device_manager.cpp | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/conanfile.py b/conanfile.py index 6e220bb31..e3e8831e0 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.19.0" + version = "6.19.1" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/device/device_manager.cpp b/src/lib/device/device_manager.cpp index 76169f15a..c34405ada 100644 --- a/src/lib/device/device_manager.cpp +++ b/src/lib/device/device_manager.cpp @@ -479,13 +479,16 @@ DeviceManager::calculate_vdev_chunk_num_on_new_pdevs(shared< VirtualDev > vdev, LOGDEBUG("pdev {} is already added to vdev {}, skip it", pdev->get_devname(), 
vdev->info().get_name()); continue; } - auto expect_chunk_num_on_pdev = - static_cast< uint32_t >(chunk_num * (pdev->data_size() / static_cast< float >(total_pdev_data_size))); + // the device size is expected to be the same, so multiple should be an integer, and chunk_num can be divisible + // by multiple. + auto multiple = static_cast< float >(total_pdev_data_size) / pdev->data_size(); + auto expect_chunk_num_on_pdev = static_cast< uint32_t >(chunk_num / multiple); auto available_chunks_on_pdev = static_cast< uint32_t >(pdev->data_size() / vdev->info().chunk_size); pdev_chunk_num_map[pdev] = std::min(expect_chunk_num_on_pdev, available_chunks_on_pdev); - LOGINFO("pdev {} should add {} chunks to vdev {} , expect_chunk_num_on_pdev={}, available_chunks_on_pdev={}", + LOGINFO("pdev {} should add {} chunks to vdev {} , expect_chunk_num_on_pdev={}, available_chunks_on_pdev={}, " + "pdev_size={}", pdev->get_devname(), pdev_chunk_num_map[pdev], vdev->info().get_name(), expect_chunk_num_on_pdev, - available_chunks_on_pdev); + available_chunks_on_pdev, pdev->data_size()); } return pdev_chunk_num_map; } From ce24955cdb53a8017a9315b352d9edbfe7a30f20 Mon Sep 17 00:00:00 2001 From: yuwmao <148639999+yuwmao@users.noreply.github.com> Date: Mon, 21 Jul 2025 15:55:27 +0800 Subject: [PATCH 153/170] Record cur_pdev_id in first block header (#776) Record cur_pdev_id in first block header. gen_number in first block header is used to track the value change of the first block attributes. Compare gen_number to identify new first block header in load_devices() and increase gen_number every time the attributes(like cur_pdev_id) changed. The gen_number conflicts might arise due to interruption during sequential commit_formatting, but it can be identified and corrected to the latest one next startup. --- conanfile.py | 2 +- src/lib/device/device.h | 2 - src/lib/device/device_manager.cpp | 117 +++++++++++++++++------------- src/lib/device/hs_super_blk.h | 12 +-- src/tests/test_device_manager.cpp | 39 ++++++++++ 5 files changed, 113 insertions(+), 59 deletions(-) diff --git a/conanfile.py b/conanfile.py index e3e8831e0..b63ef6b7a 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.19.1" + version = "6.20.0" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/device/device.h b/src/lib/device/device.h index fa907f002..3d9818ed0 100644 --- a/src/lib/device/device.h +++ b/src/lib/device/device.h @@ -133,8 +133,6 @@ class DeviceManager { sisl::sparse_vector< std::unique_ptr< PhysicalDev > > m_all_pdevs; std::map< HSDevType, std::vector< PhysicalDev* > > m_pdevs_by_type; - uint32_t m_cur_pdev_id{0}; // This is a monotonically increasing value. In case of disk replacement, this value is - // not inherited, new device will get a new id. std::map< uint16_t, shared< Chunk > > m_chunks; // Chunks organized as array (indexed on chunk id) sisl::Bitset m_chunk_id_bm{hs_super_blk::MAX_CHUNKS_IN_SYSTEM}; // Bitmap to keep track of chunk ids available diff --git a/src/lib/device/device_manager.cpp b/src/lib/device/device_manager.cpp index c34405ada..ff4cb5d88 100644 --- a/src/lib/device/device_manager.cpp +++ b/src/lib/device/device_manager.cpp @@ -167,6 +167,7 @@ void DeviceManager::load_devices() { // 1. Load all physical devices. 
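+ // A device's first block carries a gen_number; an interrupted commit_formatting() can leave the
+ // devices at mixed generations, so the highest gen_number seen below wins and is re-flushed to
+ // every pdev by commit_formatting() at the end of this load.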
std::vector< dev_info > pdevs_to_format; + auto stale_first_blk_found = false; for (auto& d : m_dev_infos) { first_block fblk = PhysicalDev::read_first_block(d.dev_name, device_open_flags(d.dev_name)); pdev_info_header* pinfo = &fblk.this_pdev_hdr; @@ -190,7 +191,14 @@ void DeviceManager::load_devices() { } it->second.push_back(pdev.get()); m_all_pdevs[pinfo->pdev_id] = std::move(pdev); - m_cur_pdev_id = std::max(m_cur_pdev_id, pinfo->pdev_id); + stale_first_blk_found = fblk.hdr.gen_number != m_first_blk_hdr.gen_number; + if (fblk.hdr.gen_number > m_first_blk_hdr.gen_number) { + // cur_pdev_id will be updated to the max pdev id found in the formatted devices. The stale number will + // be flushed in commit_formatting(). + LOGINFO("newer generation number {} found in device {}, updating first block header", + fblk.hdr.gen_number, d.dev_name); + m_first_blk_hdr = fblk.hdr; + } } } @@ -203,62 +211,69 @@ void DeviceManager::load_devices() { // 3. Recover vdevs from the physical devices. load_vdevs(); - if (pdevs_to_format.empty()) return; - - // 4. Add new physical devices to existing vdevs. - for (auto vdev : m_vdevs) { - vdev_parameters vparam; - auto vinfo = vdev->info(); - populate_vparam(vparam, vinfo); - if (vparam.size_type == vdev_size_type_t::VDEV_SIZE_DYNAMIC || - vparam.multi_pdev_opts == vdev_multi_pdev_opts_t::SINGLE_FIRST_PDEV || - vparam.multi_pdev_opts == vdev_multi_pdev_opts_t::SINGLE_ANY_PDEV) { - LOGINFO("Skipping adding new devices to vdev {}, as it is dynamic or single pdev type", vinfo.get_name()); - continue; - } + if (pdevs_to_format.empty() && !stale_first_blk_found) return; + + if (!pdevs_to_format.empty()) { + ++m_first_blk_hdr.gen_number; + // 4. Add new physical devices to existing vdevs. + for (auto vdev : m_vdevs) { + vdev_parameters vparam; + auto vinfo = vdev->info(); + populate_vparam(vparam, vinfo); + if (vparam.size_type == vdev_size_type_t::VDEV_SIZE_DYNAMIC || + vparam.multi_pdev_opts == vdev_multi_pdev_opts_t::SINGLE_FIRST_PDEV || + vparam.multi_pdev_opts == vdev_multi_pdev_opts_t::SINGLE_ANY_PDEV) { + LOGINFO("Skipping adding new devices to vdev {}, as it is dynamic or single pdev type", + vinfo.get_name()); + continue; + } - std::vector< PhysicalDev* > pdevs = pdevs_by_type_internal(vparam.dev_type); - RELEASE_ASSERT_GT(pdevs.size(), 0, - "vdev is loaded from at least one pdev, but unable to find any pdevs for given vdev type"); - RELEASE_ASSERT(vparam.blk_size % pdevs[0]->align_size() == 0, "blk_size should be multiple of pdev align_size"); + std::vector< PhysicalDev* > pdevs = pdevs_by_type_internal(vparam.dev_type); + RELEASE_ASSERT_GT( + pdevs.size(), 0, + "vdev is loaded from at least one pdev, but unable to find any pdevs for given vdev type"); + RELEASE_ASSERT(vparam.blk_size % pdevs[0]->align_size() == 0, + "blk_size should be multiple of pdev align_size"); + + // vparam.num_chunks will be inferred. 
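+ // (compose_vparam() re-derives num_chunks and chunk_size for the full qualified pdev set;
+ // calculate_vdev_chunk_num_on_new_pdevs() below then sizes only the newly added pdevs.)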
+ compose_vparam(vdev->info().vdev_id, vparam, pdevs); + if (vdev->get_pdevs().size() == pdevs.size()) { + LOGDEBUG("Virtual device {} is already sized correctly, no new devices to add", + vdev->info().get_name()); + continue; + } + LOGINFO("Virtual device {} is undersized, pdevs already added={}, qualified pdevs ={}, need to add new " + "devices to it", + vdev->info().get_name(), vdev->get_pdevs().size(), pdevs.size()); + + // calculate the number of chunks to be created in each new pdev + auto pdev_chunk_num_map = calculate_vdev_chunk_num_on_new_pdevs(vdev, pdevs, vparam.num_chunks); + + std::unique_lock lg{m_vdev_mutex}; + auto buf = hs_utils::iobuf_alloc(vdev_info::size, sisl::buftag::superblk, pdevs[0]->align_size()); + std::memcpy(buf, &vinfo, sizeof(vdev_info)); + uint64_t offset = hs_super_blk::vdev_sb_offset() + (vdev->info().vdev_id * vdev_info::size); + + // add the new pdevs to the vdev + for (auto pdev_to_add : pdev_chunk_num_map) { + auto pdev = pdev_to_add.first; + add_pdev_to_vdev(vdev, pdev_to_add.first, pdev_to_add.second); + LOGINFO("Added pdev[name={}, id={}] with total_chunk_num_in_pdev={} to vdev {}", pdev->get_devname(), + pdev->pdev_id(), pdev_to_add.second, vdev->info().get_name()); + + // Update vdev info in the super block area of the pdev + pdev->write_super_block(buf, vdev_info::size, offset); + } - // vparam.num_chunks will be inferred. - compose_vparam(vdev->info().vdev_id, vparam, pdevs); - if (vdev->get_pdevs().size() == pdevs.size()) { - LOGDEBUG("Virtual device {} is already sized correctly, no new devices to add", - vdev->info().get_name()); - continue; + hs_utils::iobuf_free(buf, sisl::buftag::superblk); } - LOGINFO( - "Virtual device {} is undersized, pdevs already added={}, qualified pdevs ={}, need to add new devices to it", - vdev->info().get_name(), vdev->get_pdevs().size(), pdevs.size()); - - // calculate the number of chunks to be created in each new pdev - auto pdev_chunk_num_map = calculate_vdev_chunk_num_on_new_pdevs(vdev, pdevs, vparam.num_chunks); - - std::unique_lock lg{m_vdev_mutex}; - auto buf = hs_utils::iobuf_alloc(vdev_info::size, sisl::buftag::superblk, pdevs[0]->align_size()); - std::memcpy(buf, &vinfo, sizeof(vdev_info)); - uint64_t offset = hs_super_blk::vdev_sb_offset() + (vdev->info().vdev_id * vdev_info::size); - - // add the new pdevs to the vdev - for (auto pdev_to_add : pdev_chunk_num_map) { - auto pdev = pdev_to_add.first; - add_pdev_to_vdev(vdev, pdev_to_add.first, pdev_to_add.second); - LOGINFO("Added pdev[name={}, id={}] with total_chunk_num_in_pdev={} to vdev {}", pdev->get_devname(), - pdev->pdev_id(), pdev_to_add.second, vdev->info().get_name()); - - // Update vdev info in the super block area of the pdev - pdev->write_super_block(buf, vdev_info::size, offset); - } - - hs_utils::iobuf_free(buf, sisl::buftag::superblk); - commit_formatting(); } + commit_formatting(); } void DeviceManager::commit_formatting() { auto buf = hs_utils::iobuf_alloc(hs_super_blk::first_block_size(), sisl::buftag::superblk, 512); + LOGINFO("commit formatting first block with gen_number={}", m_first_blk_hdr.gen_number); for (auto& pdev : m_all_pdevs) { if (!pdev) { continue; } @@ -269,6 +284,8 @@ void DeviceManager::commit_formatting() { } first_block* fblk = r_cast< first_block* >(buf); + fblk->hdr.gen_number = m_first_blk_hdr.gen_number; + fblk->hdr.cur_pdev_id = m_first_blk_hdr.cur_pdev_id; fblk->formatting_done = 0x1; fblk->checksum = crc32_ieee(init_crc32, uintptr_cast(fblk), first_block::s_atomic_fb_size); @@ -647,7 +664,7 @@ uint32_t 
DeviceManager::populate_pdev_info(const dev_info& dinfo, const iomgr::d const uuid_t& uuid, pdev_info_header& pinfo) { bool hdd = is_hdd(dinfo.dev_name); - pinfo.pdev_id = ++m_cur_pdev_id; + pinfo.pdev_id = ++m_first_blk_hdr.cur_pdev_id; pinfo.mirror_super_block = hdd ? 0x01 : 0x00; pinfo.max_pdev_chunks = hs_super_blk::max_chunks_in_pdev(dinfo); diff --git a/src/lib/device/hs_super_blk.h b/src/lib/device/hs_super_blk.h index bdce07a4d..18a9e963b 100644 --- a/src/lib/device/hs_super_blk.h +++ b/src/lib/device/hs_super_blk.h @@ -77,16 +77,18 @@ struct disk_attr { struct first_block_header { static constexpr const char* PRODUCT_NAME{"HomeStore4x"}; static constexpr size_t s_product_name_size{64}; - static constexpr uint32_t CURRENT_SUPERBLOCK_VERSION{4}; + static constexpr uint32_t CURRENT_SUPERBLOCK_VERSION{5}; public: - uint64_t gen_number{0}; // Generation count of this structure - uint32_t version{0}; // Version Id of this structure + uint64_t gen_number{0}; // Generation count of this structure, will be incremented on every fields change + uint32_t version{0}; // Version Id of this structure char product_name[s_product_name_size]{}; // Product name uint32_t num_pdevs{0}; // Total number of pdevs homestore is being created on uint32_t max_vdevs{0}; // Max VDevs possible, this cannot be changed post formatting uint32_t max_system_chunks{0}; // Max Chunks possible, this cannot be changed post formatting + uint32_t cur_pdev_id{0}; // The current max pdev id of all formatted disks and used to assign next pdev id for new + // disks. It is a monotonically increasing value and is not inherited in case of disk replacement. uuid_t system_uuid; public: @@ -100,9 +102,7 @@ struct first_block_header { get_product_name(), get_system_uuid_str()); return str; } - bool is_empty() const { - return gen_number == 0 && version == 0 && std::string(product_name).empty(); - } + bool is_empty() const { return gen_number == 0 && version == 0 && std::string(product_name).empty(); } }; struct pdev_info_header { diff --git a/src/tests/test_device_manager.cpp b/src/tests/test_device_manager.cpp index 2ca1df3a8..6a53d7773 100644 --- a/src/tests/test_device_manager.cpp +++ b/src/tests/test_device_manager.cpp @@ -224,6 +224,12 @@ TEST_F(DeviceMgrTest, ReplaceDeviceWithEmptyDevice) { LOGINFO("Step 2: Validating all vdevs if they have created with correct number of chunks"); this->validate_striped_vdevs(); + std::set< uint32_t > pdev_ids; + std::vector< PhysicalDev* > pdevs = m_dmgr->get_pdevs_by_dev_type(HSDevType::Data); + for (auto d : pdevs) { + pdev_ids.insert(d->pdev_id()); + } + auto fpath = m_data_dev_names[0]; m_data_dev_names.erase(m_data_dev_names.begin()); auto dinfo = m_dev_infos[0]; @@ -247,6 +253,13 @@ TEST_F(DeviceMgrTest, ReplaceDeviceWithEmptyDevice) { LOGINFO("Step 7: Restart and validate again"); this->restart(); this->validate_striped_vdevs(); + + pdevs = m_dmgr->get_pdevs_by_dev_type(HSDevType::Data); + for (auto d : pdevs) { + pdev_ids.insert(d->pdev_id()); + } + ASSERT_EQ(pdev_ids.size(), m_pdevs.size() + 1) + << "Pdev ids size mismatch after replacing two devices, duplicate pdev ids found or missing pdev ids"; } TEST_F(DeviceMgrTest, ReplaceTwoDevicesAtOnce) { @@ -283,6 +296,12 @@ TEST_F(DeviceMgrTest, ReplaceTwoDevicesAtOnce) { LOGINFO("Step 2: Validating all vdevs if they have created with correct number of chunks"); this->validate_striped_vdevs(); + std::set< uint32_t > pdev_ids; + std::vector< PhysicalDev* > pdevs = m_dmgr->get_pdevs_by_dev_type(HSDevType::Data); + for (auto d : pdevs) { + 
pdev_ids.insert(d->pdev_id()); + } + auto fpath1 = m_data_dev_names[0]; m_data_dev_names.erase(m_data_dev_names.begin()); auto dinfo = m_dev_infos[0]; @@ -315,6 +334,13 @@ TEST_F(DeviceMgrTest, ReplaceTwoDevicesAtOnce) { LOGINFO("Step 7: Restart and validate again"); this->restart(); this->validate_striped_vdevs(); + + pdevs = m_dmgr->get_pdevs_by_dev_type(HSDevType::Data); + for (auto d : pdevs) { + pdev_ids.insert(d->pdev_id()); + } + ASSERT_EQ(pdev_ids.size(), m_pdevs.size() + 2) + << "Pdev ids size mismatch after replacing two devices, duplicate pdev ids found or missing pdev ids"; } TEST_F(DeviceMgrTest, ReplaceTwoDevicesOneByOne) { @@ -351,6 +377,12 @@ TEST_F(DeviceMgrTest, ReplaceTwoDevicesOneByOne) { LOGINFO("Step 2: Validating all vdevs if they have created with correct number of chunks"); this->validate_striped_vdevs(); + std::set< uint32_t > pdev_ids; + std::vector< PhysicalDev* > pdevs = m_dmgr->get_pdevs_by_dev_type(HSDevType::Data); + for (auto d : pdevs) { + pdev_ids.insert(d->pdev_id()); + } + auto fpath1 = m_data_dev_names[0]; m_data_dev_names.erase(m_data_dev_names.begin()); auto dinfo = m_dev_infos[0]; @@ -386,6 +418,13 @@ TEST_F(DeviceMgrTest, ReplaceTwoDevicesOneByOne) { LOGINFO("Step 7: Restart and validate again"); this->restart(); this->validate_striped_vdevs(); + + pdevs = m_dmgr->get_pdevs_by_dev_type(HSDevType::Data); + for (auto d : pdevs) { + pdev_ids.insert(d->pdev_id()); + } + ASSERT_EQ(pdev_ids.size(), m_pdevs.size() + 2) + << "Pdev ids size mismatch after replacing two devices, duplicate pdev ids found or missing pdev ids"; } TEST_F(DeviceMgrTest, SmallStripedVDevCreation) { From 2592b01351d5a7d29dac3eeb87db1844953d90e0 Mon Sep 17 00:00:00 2001 From: yuwmao <148639999+yuwmao@users.noreply.github.com> Date: Mon, 21 Jul 2025 22:38:59 +0800 Subject: [PATCH 154/170] Fix monitor_replace_member_replication_status (#774) --- conanfile.py | 2 +- src/lib/replication/repl_dev/raft_repl_dev.cpp | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index b63ef6b7a..70a17dcd9 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.20.0" + version = "6.20.1" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index a222e4f48..18f7c1eb4 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -2052,16 +2052,25 @@ void RaftReplDev::monitor_replace_member_replication_status() { repl_lsn_t in_lsn = 0; repl_lsn_t out_lsn = 0; repl_lsn_t laggy = HS_DYNAMIC_CONFIG(consensus.laggy_threshold); + auto in_member_found = false; for (auto& peer : peers) { if (peer.id_ == replica_out) { out_lsn = peer.replication_idx_; RD_LOGD(NO_TRACE_ID, "Replica out {} with lsn {}", boost::uuids::to_string(peer.id_), out_lsn); } else if (peer.id_ == replica_in) { + in_member_found = true; in_lsn = peer.replication_idx_; RD_LOGD(NO_TRACE_ID, "Replica in {} with lsn {}", boost::uuids::to_string(peer.id_), in_lsn); } } + if (!in_member_found) { + RD_LOGW(NO_TRACE_ID, + "Checking replace member status, task_id={}, Replica in {} not found in the peers, add_member might " + "fail, wait for users to retry or rollback the task", + task_id, boost::uuids::to_string(replica_in)); + return; + } // TODO optimize the condition bool catch_up = in_lsn + laggy >= out_lsn; From 
8cf9553762765b28fd49cac431a2cd30876e01bb Mon Sep 17 00:00:00 2001 From: Sanal Date: Mon, 21 Jul 2025 10:37:27 -0700 Subject: [PATCH 155/170] Run cp mgr timer in its own reactor. (#761) Add reactor for cp manager timer. In high intensive IO tests, cp timer's are not executed. --- conanfile.py | 2 +- src/include/homestore/checkpoint/cp_mgr.hpp | 3 ++ src/lib/checkpoint/cp_mgr.cpp | 37 ++++++++++++++++++--- 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/conanfile.py b/conanfile.py index 70a17dcd9..2fdf3dc80 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.20.1" + version = "6.20.2" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/checkpoint/cp_mgr.hpp b/src/include/homestore/checkpoint/cp_mgr.hpp index 634524293..43bf8b3ae 100644 --- a/src/include/homestore/checkpoint/cp_mgr.hpp +++ b/src/include/homestore/checkpoint/cp_mgr.hpp @@ -163,6 +163,7 @@ class CPManager { bool m_in_flush_phase{false}; bool m_pending_trigger_cp{false}; // Is there is a waiter for a cp flush to start folly::SharedPromise< bool > m_pending_trigger_cp_comp; + iomgr::io_fiber_t m_timer_fiber; public: CPManager(); @@ -231,6 +232,8 @@ class CPManager { void start_cp_thread(); folly::Future< bool > do_trigger_cp_flush(bool force, bool flush_on_shutdown); uint64_t cp_timer_us(); + void start_timer_thread(); + void stop_timer_thread(); }; extern CPManager& cp_mgr(); diff --git a/src/lib/checkpoint/cp_mgr.cpp b/src/lib/checkpoint/cp_mgr.cpp index 0acc8588c..15c175bc0 100644 --- a/src/lib/checkpoint/cp_mgr.cpp +++ b/src/lib/checkpoint/cp_mgr.cpp @@ -14,6 +14,7 @@ * *********************************************************************************/ #include +#include #include #include @@ -41,6 +42,7 @@ CPManager::CPManager() : resource_mgr().register_dirty_buf_exceed_cb( [this]([[maybe_unused]] int64_t dirty_buf_count, bool critical) { this->trigger_cp_flush(false /* force */); }); + start_timer_thread(); start_cp_thread(); } @@ -64,12 +66,38 @@ uint64_t CPManager::cp_timer_us() { } } +void CPManager::start_timer_thread() { + std::latch latch{1}; + m_timer_fiber = nullptr; + iomanager.create_reactor("cp_timer_thread", iomgr::TIGHT_LOOP | iomgr::ADAPTIVE_LOOP, 1 /* num_fibers */, + [this, &latch](bool is_started) { + if (is_started) { + m_timer_fiber = iomanager.iofiber_self(); + latch.count_down(); + } + }); + latch.wait(); +} + +void CPManager::stop_timer_thread() { + std::latch latch{1}; + iomanager.run_on_forget(m_timer_fiber, [this, &latch]() mutable { + if (m_cp_timer_hdl != iomgr::null_timer_handle) { + iomanager.cancel_timer(m_cp_timer_hdl, true); + m_cp_timer_hdl = iomgr::null_timer_handle; + } + latch.count_down(); + }); + latch.wait(); +} + void CPManager::start_timer() { auto usecs = cp_timer_us(); LOGINFO("cp timer is set to {} usec", usecs); - m_cp_timer_hdl = iomanager.schedule_global_timer( - usecs * 1000, true, nullptr /*cookie*/, iomgr::reactor_regex::all_worker, - [this](void*) { trigger_cp_flush(false /* false */); }, true /* wait_to_schedule */); + iomanager.run_on_wait(m_timer_fiber, [this, usecs]() { + m_cp_timer_hdl = iomanager.schedule_thread_timer(usecs * 1000, true /* recurring */, nullptr /* cookie */, + [this](void*) { trigger_cp_flush(false /* false */); }); + }); } void CPManager::on_meta_blk_found(const sisl::byte_view& buf, void* meta_cookie) { @@ -87,8 +115,7 @@ void CPManager::create_first_cp() { void CPManager::shutdown() { 
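+ // The recurring CP timer now lives on a dedicated fiber (see start_timer_thread()), so the cancel
+ // is marshalled onto that fiber by stop_timer_thread() rather than cancelled globally.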
LOGINFO("Stopping cp timer"); - iomanager.cancel_timer(m_cp_timer_hdl, true); - m_cp_timer_hdl = iomgr::null_timer_handle; + stop_timer_thread(); { std::unique_lock< std::mutex > lk(m_trigger_cp_mtx); From 03dd4ce23742d8a0bd8989b5897cfeb6dd90bf65 Mon Sep 17 00:00:00 2001 From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com> Date: Mon, 21 Jul 2025 15:34:21 -0700 Subject: [PATCH 156/170] Add index chunk selector hb (#768) * Add index chunk selector --- conanfile.py | 2 +- src/include/homestore/homestore.hpp | 3 ++- .../homestore/index/index_internal.hpp | 18 +++++++++++++ src/include/homestore/index/index_table.hpp | 21 +++++++++++----- src/include/homestore/index/wb_cache_base.hpp | 3 ++- src/include/homestore/index_service.hpp | 10 +++++--- src/lib/homestore.cpp | 25 +++++++++++-------- src/lib/index/index_service.cpp | 16 ++++++------ src/lib/index/wb_cache.cpp | 9 ++++--- src/lib/index/wb_cache.hpp | 2 +- .../test_common/homestore_test_common.hpp | 9 +++++-- 11 files changed, 82 insertions(+), 36 deletions(-) diff --git a/conanfile.py b/conanfile.py index 2fdf3dc80..e7d5157b7 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.20.2" + version = "6.20.3" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/homestore.hpp b/src/include/homestore/homestore.hpp index 42e20a150..c4e518244 100644 --- a/src/include/homestore/homestore.hpp +++ b/src/include/homestore/homestore.hpp @@ -145,7 +145,8 @@ class HomeStore { ///////////////////////////// Member functions ///////////////////////////////////////////// HomeStore& with_data_service(cshared< ChunkSelector >& custom_chunk_selector = nullptr); HomeStore& with_log_service(); - HomeStore& with_index_service(std::unique_ptr< IndexServiceCallbacks > cbs); + HomeStore& with_index_service(std::unique_ptr< IndexServiceCallbacks > cbs, + cshared< ChunkSelector >& custom_chunk_selector = nullptr); HomeStore& with_repl_data_service(cshared< ReplApplication >& repl_app, cshared< ChunkSelector >& custom_chunk_selector = nullptr); diff --git a/src/include/homestore/index/index_internal.hpp b/src/include/homestore/index/index_internal.hpp index 6918f9741..4d5ce6afe 100644 --- a/src/include/homestore/index/index_internal.hpp +++ b/src/include/homestore/index/index_internal.hpp @@ -56,6 +56,24 @@ struct index_table_sb { uint32_t user_sb_size; // Size of the user superblk uint8_t user_sb_bytes[0]; + uint32_t pdev_id; + uint32_t index_num_chunks {0}; + // List of chunk ids allocated for this index table are stored after this. 
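+ // Resulting on-disk layout (flexible-array style):
+ //   [index_table_sb fields][chunk_id 0][chunk_id 1] ... [chunk_id index_num_chunks - 1]
+ // hence the superblk is created with sizeof(index_table_sb) + chunk_ids.size() * sizeof(chunk_num_t).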
+ void init_chunks(std::vector< chunk_num_t > const& chunk_ids) {
+ index_num_chunks = chunk_ids.size();
+ auto chunk_id_ptr = get_chunk_ids_mutable();
+ for (auto& chunk_id : chunk_ids) {
+ *chunk_id_ptr = chunk_id;
+ chunk_id_ptr++;
+ }
+ }
+ chunk_num_t* get_chunk_ids_mutable() {
+ return r_cast< chunk_num_t* >(uintptr_cast(this) + sizeof(index_table_sb));
+ }
+ const chunk_num_t* get_chunk_ids() const {
+ return r_cast< const chunk_num_t* >(reinterpret_cast< const uint8_t* >(this) + sizeof(index_table_sb));
+ }
+
 };
 #pragma pack()
diff --git a/src/include/homestore/index/index_table.hpp b/src/include/homestore/index/index_table.hpp
index 30ba32321..69d8a0010 100644
--- a/src/include/homestore/index/index_table.hpp
+++ b/src/include/homestore/index/index_table.hpp
@@ -35,6 +35,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
 private:
 superblk< index_table_sb > m_sb;
 shared< MetaIndexBuffer > m_sb_buffer;
+ static constexpr uint32_t INVALID_ORDINAL = std::numeric_limits< uint32_t >::max();

 // graceful shutdown

 private:
@@ -58,12 +59,17 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
 }
 }

- IndexTable(uuid_t uuid, uuid_t parent_uuid, uint32_t user_sb_size, const BtreeConfig& cfg) :
+ IndexTable(uuid_t uuid, uuid_t parent_uuid, uint32_t user_sb_size, const BtreeConfig& cfg, uint32_t ordinal = INVALID_ORDINAL, const std::vector< chunk_num_t >& chunk_ids = {}, uint32_t pdev_id = 0) :
 Btree< K, V >{cfg}, m_sb{"index"} {
+ auto ord_num = (ordinal == INVALID_ORDINAL) ? (hs()->index_service().reserve_ordinal()) : ordinal;
+ BT_LOG_ASSERT(!hs()->index_service().get_index_table(ord_num), "table with ordinal {} already exists", ord_num);
+
 // Create a superblk for the index table and create MetaIndexBuffer corresponding to that
- m_sb.create(sizeof(index_table_sb));
+ m_sb.create(sizeof(index_table_sb) + (chunk_ids.size() * sizeof(chunk_num_t)));
+ m_sb->init_chunks(chunk_ids);
+ m_sb->pdev_id = pdev_id;
+ m_sb->ordinal = ord_num;
 m_sb->uuid = uuid;
- m_sb->ordinal = hs()->index_service().reserve_ordinal();
 m_sb->parent_uuid = parent_uuid;
 m_sb->user_sb_size = user_sb_size;
 m_sb.write();
@@ -106,8 +112,11 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
 btree_status_t destroy() override {
 if (is_stopping()) return btree_status_t::stopping;
 incr_pending_request_num();
- auto cpg = cp_mgr().cp_guard();
- Btree< K, V >::destroy_btree(cpg.context(cp_consumer_t::INDEX_SVC));
+ auto chunk_selector{hs()->index_service().get_chunk_selector()};
+ if (!chunk_selector) {
+ auto cpg = cp_mgr().cp_guard();
+ Btree< K, V >::destroy_btree(cpg.context(cp_consumer_t::INDEX_SVC));
+ }
 m_sb.destroy();
 m_sb_buffer->m_valid = false;
 decr_pending_request_num();
@@ -239,7 +248,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
 protected:
 ////////////////// Override Implementation of underlying store requirements //////////////////
 BtreeNodePtr alloc_node(bool is_leaf) override {
- return wb_cache().alloc_buf([this, is_leaf](const IndexBufferPtr& idx_buf) -> BtreeNodePtr {
+ return wb_cache().alloc_buf(ordinal(), [this, is_leaf](const IndexBufferPtr& idx_buf) -> BtreeNodePtr {
 BtreeNode* n = this->init_node(idx_buf->raw_buffer(), idx_buf->blkid().to_integer(), true, is_leaf);
 static_cast< IndexBtreeNode* >(n)->attach_buf(idx_buf);
 return BtreeNodePtr{n};
diff --git a/src/include/homestore/index/wb_cache_base.hpp b/src/include/homestore/index/wb_cache_base.hpp
index 4624f9444..3fb33d79b 100644
--- a/src/include/homestore/index/wb_cache_base.hpp
+++ b/src/include/homestore/index/wb_cache_base.hpp
@@ -36,9 
+36,10 @@ class IndexWBCacheBase { /// @brief Allocate the buffer and initialize the btree node. It adds the node to the wb cache. /// @tparam K Key type of the Index + /// @param ordinal The index table ordinal used when custom index chunk selector exists /// @param node_initializer Callback to be called upon which buffer is turned into btree node /// @return Node which was created by the node_initializer - virtual BtreeNodePtr alloc_buf(node_initializer_t&& node_initializer) = 0; + virtual BtreeNodePtr alloc_buf(uint32_t ordinal, node_initializer_t&& node_initializer) = 0; /// @brief Write buffer /// @param buf diff --git a/src/include/homestore/index_service.hpp b/src/include/homestore/index_service.hpp index e14f6c18f..6e5244543 100644 --- a/src/include/homestore/index_service.hpp +++ b/src/include/homestore/index_service.hpp @@ -30,6 +30,7 @@ namespace homestore { class IndexWBCacheBase; class IndexTableBase; class VirtualDev; +class ChunkSelector; class IndexServiceCallbacks { public: @@ -49,21 +50,22 @@ class IndexService { std::pair< meta_blk*, sisl::byte_view >{nullptr, sisl::byte_view{}}}; std::vector< std::pair< meta_blk*, sisl::byte_view > > m_itable_sbs; std::unique_ptr< sisl::IDReserver > m_ordinal_reserver; + std::shared_ptr< ChunkSelector > m_custom_chunk_selector; mutable std::mutex m_index_map_mtx; std::map< uuid_t, std::shared_ptr< IndexTableBase > > m_index_map; std::unordered_map< uint32_t, std::shared_ptr< IndexTableBase > > m_ordinal_index_map; public: - IndexService(std::unique_ptr< IndexServiceCallbacks > cbs); + IndexService(std::unique_ptr< IndexServiceCallbacks > cbs, shared< ChunkSelector > custom_chunk_selector = nullptr); ~IndexService(); // Creates the vdev that is needed to initialize the device - void create_vdev(uint64_t size, HSDevType devType, uint32_t num_chunks); - + void create_vdev(uint64_t size, HSDevType devType, uint32_t num_chunks, + chunk_selector_type_t chunk_sel_type = chunk_selector_type_t::ROUND_ROBIN); // Open the existing vdev which is represnted by the vdev_info_block shared< VirtualDev > open_vdev(const vdev_info& vb, bool load_existing); - + std::shared_ptr< ChunkSelector > get_chunk_selector(){ return m_custom_chunk_selector;}; // for now, we don't support start after stop and there is no use case for this. 
// TODO: support start after stop if necessary diff --git a/src/lib/homestore.cpp b/src/lib/homestore.cpp index 2475362f9..bdb377a47 100644 --- a/src/lib/homestore.cpp +++ b/src/lib/homestore.cpp @@ -57,7 +57,8 @@ namespace homestore { HomeStoreSafePtr HomeStore::s_instance{nullptr}; static std::unique_ptr< IndexServiceCallbacks > s_index_cbs; -static shared< ChunkSelector > s_custom_chunk_selector{nullptr}; +static shared< ChunkSelector > s_custom_data_chunk_selector{nullptr}; +static shared< ChunkSelector > s_custom_index_chunk_selector{nullptr}; static shared< ReplApplication > s_repl_app{nullptr}; std::string version = PACKAGE_VERSION; @@ -69,13 +70,14 @@ HomeStore* HomeStore::instance() { HomeStore& HomeStore::with_data_service(cshared< ChunkSelector >& custom_chunk_selector) { m_services.svcs |= HS_SERVICE::DATA; m_services.svcs &= ~HS_SERVICE::REPLICATION; // ReplicationDataSvc or DataSvc are mutually exclusive - s_custom_chunk_selector = std::move(custom_chunk_selector); + s_custom_data_chunk_selector = std::move(custom_chunk_selector); return *this; } -HomeStore& HomeStore::with_index_service(std::unique_ptr< IndexServiceCallbacks > cbs) { +HomeStore& HomeStore::with_index_service(std::unique_ptr< IndexServiceCallbacks > cbs, cshared< ChunkSelector >& custom_chunk_selector) { m_services.svcs |= HS_SERVICE::INDEX; s_index_cbs = std::move(cbs); + s_custom_index_chunk_selector = std::move(custom_chunk_selector); return *this; } @@ -89,7 +91,7 @@ HomeStore& HomeStore::with_repl_data_service(cshared< ReplApplication >& repl_ap m_services.svcs |= HS_SERVICE::REPLICATION | HS_SERVICE::LOG; m_services.svcs &= ~HS_SERVICE::DATA; // ReplicationDataSvc or DataSvc are mutually exclusive s_repl_app = repl_app; - s_custom_chunk_selector = std::move(custom_chunk_selector); + s_custom_data_chunk_selector = std::move(custom_chunk_selector); return *this; } @@ -144,8 +146,8 @@ bool HomeStore::start(const hs_input_params& input, hs_before_services_starting_ HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size) < s_cast< int >(data_fetch_max_size_in_byte)) { LOGERROR("max_grpc_message_size {} is too small to hold max_data_size {}, max_snapshot_batch_size {} and " "data_fetch_max_size {}", - HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size), input.max_data_size, - input.max_snapshot_batch_size, data_fetch_max_size_in_byte); + HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size), input.max_data_size, input.max_snapshot_batch_size, + data_fetch_max_size_in_byte); throw std::invalid_argument("max_grpc_message_size is insufficient for the configured data or snapshot sizes"); } @@ -157,15 +159,18 @@ bool HomeStore::start(const hs_input_params& input, hs_before_services_starting_ LOGINFO("Homestore is loading with following services: {}", m_services.list()); if (has_meta_service()) { m_meta_service = std::make_unique< MetaBlkService >(); } - if (has_index_service()) { m_index_service = std::make_unique< IndexService >(std::move(s_index_cbs)); } + if (has_index_service()) { + m_index_service = std::make_unique< IndexService >(std::move(s_index_cbs), + std::move(s_custom_index_chunk_selector)); + } if (has_repl_data_service()) { m_log_service = std::make_unique< LogStoreService >(); - m_data_service = std::make_unique< BlkDataService >(std::move(s_custom_chunk_selector)); + m_data_service = std::make_unique< BlkDataService >(std::move(s_custom_data_chunk_selector)); m_repl_service = GenericReplService::create(std::move(s_repl_app)); } else { if (has_log_service()) { m_log_service = std::make_unique< 
LogStoreService >(); } if (has_data_service()) { - m_data_service = std::make_unique< BlkDataService >(std::move(s_custom_chunk_selector)); + m_data_service = std::make_unique< BlkDataService >(std::move(s_custom_data_chunk_selector)); } } m_cp_mgr = std::make_unique< CPManager >(); @@ -235,7 +240,7 @@ void HomeStore::format_and_start(std::map< uint32_t, hs_format_params >&& format fparams.dev_type, fparams.chunk_size)); } else if ((svc_type & HS_SERVICE::INDEX) && has_index_service()) { m_index_service->create_vdev(pct_to_size(fparams.size_pct, fparams.dev_type), fparams.dev_type, - fparams.num_chunks); + fparams.num_chunks, fparams.chunk_sel_type); } else if ((svc_type & HS_SERVICE::DATA) && has_data_service()) { m_data_service->create_vdev(pct_to_size(fparams.size_pct, fparams.dev_type), fparams.dev_type, fparams.block_size, fparams.alloc_type, fparams.chunk_sel_type, diff --git a/src/lib/index/index_service.cpp b/src/lib/index/index_service.cpp index c22e70b15..98744fb2d 100644 --- a/src/lib/index/index_service.cpp +++ b/src/lib/index/index_service.cpp @@ -28,7 +28,8 @@ namespace homestore { IndexService& index_service() { return hs()->index_service(); } -IndexService::IndexService(std::unique_ptr< IndexServiceCallbacks > cbs) : m_svc_cbs{std::move(cbs)} { +IndexService::IndexService(std::unique_ptr< IndexServiceCallbacks > cbs, shared< ChunkSelector > chunk_selector) : + m_svc_cbs {std::move(cbs)}, m_custom_chunk_selector{std::move(chunk_selector)} { m_ordinal_reserver = std::make_unique< sisl::IDReserver >(); meta_service().register_handler( "index", @@ -45,7 +46,8 @@ IndexService::IndexService(std::unique_ptr< IndexServiceCallbacks > cbs) : m_svc nullptr); } -void IndexService::create_vdev(uint64_t size, HSDevType devType, uint32_t num_chunks) { +void IndexService::create_vdev(uint64_t size, HSDevType devType, uint32_t num_chunks, + chunk_selector_type_t chunk_sel_type) { auto const atomic_page_size = hs()->device_mgr()->atomic_page_size(devType); hs_vdev_context vdev_ctx; vdev_ctx.type = hs_vdev_type_t::INDEX_VDEV; @@ -56,14 +58,14 @@ void IndexService::create_vdev(uint64_t size, HSDevType devType, uint32_t num_ch .blk_size = atomic_page_size, .dev_type = devType, .alloc_type = blk_allocator_type_t::fixed, - .chunk_sel_type = chunk_selector_type_t::ROUND_ROBIN, + .chunk_sel_type = chunk_sel_type, .multi_pdev_opts = vdev_multi_pdev_opts_t::ALL_PDEV_STRIPED, .context_data = vdev_ctx.to_blob()}); } shared< VirtualDev > IndexService::open_vdev(const vdev_info& vinfo, bool load_existing) { - m_vdev = - std::make_shared< VirtualDev >(*(hs()->device_mgr()), vinfo, nullptr /* event_cb */, true /* auto_recovery */); + m_vdev = std::make_shared< VirtualDev >(*(hs()->device_mgr()), vinfo, nullptr /* event_cb */, + true /* auto_recovery */, m_custom_chunk_selector); return m_vdev; } @@ -92,8 +94,8 @@ void IndexService::start() { tbl->audit_tree(); #endif } - // Force taking cp after recovery done. This makes sure that the index table is in consistent state and dirty buffer - // after recovery can be added to dirty list for flushing in the new cp + // Force taking cp after recovery done. 
This makes sure that the index table is in consistent state and dirty + // buffer after recovery can be added to dirty list for flushing in the new cp hs()->cp_mgr().trigger_cp_flush(true /* force */); } diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp index a935a311a..1459a6361 100644 --- a/src/lib/index/wb_cache.cpp +++ b/src/lib/index/wb_cache.cpp @@ -92,13 +92,16 @@ void IndexWBCache::start_flush_threads() { } } -BtreeNodePtr IndexWBCache::alloc_buf(node_initializer_t&& node_initializer) { +BtreeNodePtr IndexWBCache::alloc_buf(uint32_t ordinal,node_initializer_t&& node_initializer) { auto cpg = cp_mgr().cp_guard(); auto cp_ctx = r_cast< IndexCPContext* >(cpg.context(cp_consumer_t::INDEX_SVC)); // Alloc a block of data from underlying vdev - BlkId blkid; - auto ret = m_vdev->alloc_contiguous_blks(1, blk_alloc_hints{}, blkid); + MultiBlkId blkid; + // Ordinal used as a hint in the case of custom chunk selector exists + blk_alloc_hints hints; + hints.application_hint = ordinal; + auto ret = m_vdev->alloc_contiguous_blks(1, hints, blkid); if (ret != BlkAllocStatus::SUCCESS) { return nullptr; } // Alloc buffer and initialize the node diff --git a/src/lib/index/wb_cache.hpp b/src/lib/index/wb_cache.hpp index 42ec5270c..684b3f35d 100644 --- a/src/lib/index/wb_cache.hpp +++ b/src/lib/index/wb_cache.hpp @@ -46,7 +46,7 @@ class IndexWBCache : public IndexWBCacheBase { IndexWBCache(const std::shared_ptr< VirtualDev >& vdev, std::pair< meta_blk*, sisl::byte_view > sb, const std::shared_ptr< sisl::Evictor >& evictor, uint32_t node_size); - BtreeNodePtr alloc_buf(node_initializer_t&& node_initializer) override; + BtreeNodePtr alloc_buf(uint32_t ordinal, node_initializer_t&& node_initializer) override; void write_buf(const BtreeNodePtr& node, const IndexBufferPtr& buf, CPContext* cp_ctx) override; void read_buf(bnodeid_t id, BtreeNodePtr& node, node_initializer_t&& node_initializer) override; diff --git a/src/tests/test_common/homestore_test_common.hpp b/src/tests/test_common/homestore_test_common.hpp index ee7faeb7e..9ac750372 100644 --- a/src/tests/test_common/homestore_test_common.hpp +++ b/src/tests/test_common/homestore_test_common.hpp @@ -161,6 +161,7 @@ class HSTestHelper { blk_allocator_type_t blkalloc_type{blk_allocator_type_t::varsize}; uint32_t blk_size{0}; shared< ChunkSelector > custom_chunk_selector{nullptr}; + shared< ChunkSelector > index_chunk_selector{nullptr}; IndexServiceCallbacks* index_svc_cbs{nullptr}; shared< ReplApplication > repl_app{nullptr}; chunk_num_t num_chunks{1}; @@ -439,7 +440,7 @@ class HSTestHelper { if (svc == HS_SERVICE::DATA) { hsi->with_data_service(tp.custom_chunk_selector); } else if (svc == HS_SERVICE::INDEX) { - hsi->with_index_service(std::unique_ptr< IndexServiceCallbacks >(tp.index_svc_cbs)); + hsi->with_index_service(std::unique_ptr< IndexServiceCallbacks >(tp.index_svc_cbs), tp.index_chunk_selector); } else if ((svc == HS_SERVICE::LOG)) { hsi->with_log_service(); } else if (svc == HS_SERVICE::REPLICATION) { @@ -480,7 +481,11 @@ class HSTestHelper { ? chunk_selector_type_t::CUSTOM : chunk_selector_type_t::ROUND_ROBIN}}, {HS_SERVICE::INDEX, - {.dev_type = homestore::HSDevType::Fast, .size_pct = svc_params[HS_SERVICE::INDEX].size_pct}}, + {.dev_type = homestore::HSDevType::Fast, + .size_pct = svc_params[HS_SERVICE::INDEX].size_pct, + .chunk_sel_type = svc_params[HS_SERVICE::INDEX].custom_chunk_selector + ? 
chunk_selector_type_t::CUSTOM + : chunk_selector_type_t::ROUND_ROBIN}}, {HS_SERVICE::REPLICATION, {.size_pct = svc_params[HS_SERVICE::REPLICATION].size_pct, .alloc_type = svc_params[HS_SERVICE::REPLICATION].blkalloc_type, From 33362c13970b8059ccdcd559fd411575ad202a7e Mon Sep 17 00:00:00 2001 From: Yaming Kuang <1477567+yamingk@users.noreply.github.com> Date: Tue, 22 Jul 2025 10:52:26 -0700 Subject: [PATCH 157/170] Issue: 771 Fault Containment Service (#773) * Issue:771 Expose Fault Containment Service --- conanfile.py | 2 +- src/include/homestore/fault_cmt_service.hpp | 45 +++++++++++++ src/include/homestore/homestore.hpp | 11 +++ src/include/homestore/index/index_table.hpp | 74 ++++++++++++--------- src/lib/homestore.cpp | 11 +++ src/lib/index/index_cp.cpp | 4 +- 6 files changed, 111 insertions(+), 36 deletions(-) create mode 100644 src/include/homestore/fault_cmt_service.hpp diff --git a/conanfile.py b/conanfile.py index e7d5157b7..7b0835273 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.20.3" + version = "6.20.5" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/fault_cmt_service.hpp b/src/include/homestore/fault_cmt_service.hpp new file mode 100644 index 000000000..484d18402 --- /dev/null +++ b/src/include/homestore/fault_cmt_service.hpp @@ -0,0 +1,45 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. 
+ * + *********************************************************************************/ +#pragma once +#include <cassert> +#include <memory> +#include <string> + +#include <sisl/utility/enum.hpp> +#include <homestore/homestore_decl.hpp> + +namespace homestore { +ENUM(FaultContainmentEvent, uint8_t, ENTER = 0, EXIT = 1, ENTER_GLOBAL = 2); + +class FaultContainmentCallback { +public: + virtual ~FaultContainmentCallback() = default; + virtual void on_fault_containment(FaultContainmentEvent evt, void* cookie, const std::string& reason) { assert(0); } +}; + +class FaultContainmentService { +private: + std::unique_ptr< FaultContainmentCallback > m_cb; + +public: + FaultContainmentService(std::unique_ptr< FaultContainmentCallback > cb) : m_cb(std::move(cb)) {} + ~FaultContainmentService() = default; + void trigger_fc(FaultContainmentEvent evt, void* cookie, const std::string& reason = "") { + m_cb->on_fault_containment(evt, cookie, reason); + } +}; + +} // namespace homestore diff --git a/src/include/homestore/homestore.hpp b/src/include/homestore/homestore.hpp index c4e518244..69d766092 100644 --- a/src/include/homestore/homestore.hpp +++ b/src/include/homestore/homestore.hpp @@ -53,6 +53,8 @@ class VirtualDev; class ChunkSelector; class ReplDevListener; class ReplApplication; +class FaultContainmentService; +class FaultContainmentCallback; #ifdef _PRERELEASE class CrashSimulator; @@ -83,6 +85,7 @@ struct HS_SERVICE { static constexpr uint32_t DATA = 1 << 2; static constexpr uint32_t INDEX = 1 << 3; static constexpr uint32_t REPLICATION = 1 << 4; + static constexpr uint32_t FAULT_CMT = 1 << 5; uint32_t svcs; @@ -95,6 +98,7 @@ struct HS_SERVICE { if (svcs & INDEX) { str += "index,"; } if (svcs & LOG) { str += "log,"; } if (svcs & REPLICATION) { str += "replication,"; } + if (svcs & FAULT_CMT) { str += "fault_containment,"; } return str; } }; @@ -116,6 +120,7 @@ class HomeStore { std::unique_ptr< LogStoreService > m_log_service; std::unique_ptr< IndexService > m_index_service; std::shared_ptr< ReplicationService > m_repl_service; + std::unique_ptr< FaultContainmentService > m_fc_service; std::unique_ptr< DeviceManager > m_dev_mgr; shared< sisl::logging::logger_t > m_periodic_logger; @@ -149,6 +154,7 @@ class HomeStore { cshared< ChunkSelector >& custom_chunk_selector = nullptr); HomeStore& with_repl_data_service(cshared< ReplApplication >& repl_app, cshared< ChunkSelector >& custom_chunk_selector = nullptr); + HomeStore& with_fault_containment(std::unique_ptr< FaultContainmentCallback > cb); bool start(const hs_input_params& input, hs_before_services_starting_cb_t svcs_starting_cb = nullptr); void format_and_start(std::map< uint32_t, hs_format_params >&& format_opts); @@ -164,6 +170,7 @@ class HomeStore { bool has_meta_service() const; bool has_log_service() const; bool has_repl_data_service() const; + bool has_fc_service() const; BlkDataService& data_service() { return *m_data_service; } MetaBlkService& meta_service() { return *m_meta_service; } @@ -173,6 +180,10 @@ class HomeStore { return *m_index_service; } ReplicationService& repl_service() { return *m_repl_service; } + FaultContainmentService& fc_service() { + if (!m_fc_service) { throw std::runtime_error("fc_service is nullptr"); } + return *m_fc_service; + } DeviceManager* device_mgr() { return m_dev_mgr.get(); } ResourceMgr& resource_mgr() { return *m_resource_mgr.get(); } CPManager& cp_mgr() { return *m_cp_mgr.get(); } diff --git a/src/include/homestore/index/index_table.hpp b/src/include/homestore/index/index_table.hpp index 69d8a0010..6a9f004bd 100644 --- a/src/include/homestore/index/index_table.hpp +++
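
Taken together, the new service is a callback registry: homestore raises an event instead of asserting, and the application decides how to contain the fault. A hedged usage sketch; MyFcCallback and the log line are illustrative, while with_fault_containment(), has_fc_service() and trigger_fc() are the APIs added by this patch:

    class MyFcCallback : public homestore::FaultContainmentCallback {
    public:
        void on_fault_containment(homestore::FaultContainmentEvent evt, void* cookie,
                                  const std::string& reason) override {
            // e.g. fence the index identified by cookie and surface an alert
            LOGERROR("fault containment evt={} reason={}", enum_name(evt), reason);
        }
    };

    // At boot, before start():
    HomeStore::instance()->with_fault_containment(std::make_unique< MyFcCallback >());

    // At a corruption site, instead of asserting:
    if (hs()->has_fc_service()) {
        hs()->fc_service().trigger_fc(FaultContainmentEvent::ENTER, cookie, "bad root link");
    }
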
b/src/include/homestore/index/index_table.hpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -180,9 +181,18 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { static_cast< IndexBtreeNode* >(n)->attach_buf(idx_buf); auto edge_id = n->next_bnode(); - BT_DBG_ASSERT(!n->has_valid_edge(), - "root {} already has a valid edge {}, so we should have found the new root node", - n->to_string(), n->get_edge_value().bnode_id()); + if (n->has_valid_edge() && hs()->has_fc_service()) { + auto const reason = + fmt::format("root {} already has a valid edge {}, so we should have found the new root node", + n->to_string(), n->get_edge_value().bnode_id()); + hs()->fc_service().trigger_fc(FaultContainmentEvent::ENTER, static_cast< void* >(&(m_sb->parent_uuid)), + reason); + return; + } else { + BT_REL_ASSERT(!n->has_valid_edge(), + "root {} already has a valid edge {}, so we should have found the new root node", + n->to_string(), n->get_edge_value().bnode_id()); + } n->set_next_bnode(empty_bnodeid); n->set_edge_value(BtreeLinkInfo{edge_id, 0}); LOGTRACEMOD(wbcache, "change root node {}: edge updated to {} and invalidate the next node! ", n->node_id(), @@ -451,8 +461,10 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { bnodeid_t sibling_first_child_id = empty_bnodeid; if (!parent_node->is_leaf() && !parent_node->has_valid_edge()) { BtreeNodePtr parent_right_sibling; - if (auto parent_right_sibling_id = find_true_sibling(parent_node); parent_right_sibling_id != empty_bnodeid) { - if (auto ret = read_node_impl(parent_right_sibling_id, parent_right_sibling); ret == btree_status_t::success) { + if (auto parent_right_sibling_id = find_true_sibling(parent_node); + parent_right_sibling_id != empty_bnodeid) { + if (auto ret = read_node_impl(parent_right_sibling_id, parent_right_sibling); + ret == btree_status_t::success) { if (parent_right_sibling->total_entries() > 0) { BtreeLinkInfo sibling_first_child_info; parent_right_sibling->get_nth_value(0, &sibling_first_child_info, false); @@ -467,11 +479,9 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { return sibling_first_child_id; } - void update_root(BtreeNodePtr const& left_child, BtreeNodeList& new_nodes, void* cp_ctx) { + void update_root(BtreeNodePtr const& left_child, BtreeNodeList& new_nodes, void* cp_ctx) { auto new_root = this->alloc_interior_node(); - if (new_root == nullptr) { - return; - } + if (new_root == nullptr) { return; } new_root->set_level(left_child->level() + 1); auto cur_child = left_child; uint32_t i = 0; @@ -483,16 +493,16 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } else { auto child_last_key = cur_child->get_last_key< K >(); new_root->insert(new_root->total_entries(), child_last_key, - BtreeLinkInfo{cur_child->node_id(), cur_child->link_version()}); + BtreeLinkInfo{cur_child->node_id(), cur_child->link_version()}); } if (i == new_nodes.size()) { break; } auto next_child_id = cur_child->next_bnode(); cur_child = new_nodes[i++]; DEBUG_ASSERT_EQ(next_child_id, cur_child->node_id(), - "Next node id {} does not match current child node id {}", - next_child_id, cur_child->node_id()); + "Next node id {} does not match current child node id {}", next_child_id, + cur_child->node_id()); } while (true); - + new_nodes.push_back(new_root); LOGTRACEMOD(wbcache, "New root node created {}", new_root->to_string()); on_root_changed(new_root, cp_ctx); @@ -568,19 +578,18 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { 
last_child_last_key = cur_child->get_last_key< K >(); found_child = true; } - + next_cur_child = nullptr; - if(cur_child->next_bnode() == empty_bnodeid || - read_node_impl(cur_child->next_bnode(), next_cur_child) != btree_status_t::success) { + if (cur_child->next_bnode() == empty_bnodeid || + read_node_impl(cur_child->next_bnode(), next_cur_child) != btree_status_t::success) { break; // No next child, so we can stop here } - + if (sibling_first_child != empty_bnodeid && sibling_first_child == cur_child->next_bnode()) { - LOGTRACEMOD( - wbcache, - "Last child last key {} for child_node [{}] parent node [{}], next neigbor is [{}]", - last_child_last_key.to_string(), cur_child->to_string(), parent_node->to_string(), - next_cur_child->to_string()); + LOGTRACEMOD(wbcache, + "Last child last key {} for child_node [{}] parent node [{}], next neigbor is [{}]", + last_child_last_key.to_string(), cur_child->to_string(), parent_node->to_string(), + next_cur_child->to_string()); break; } cur_child = next_cur_child; @@ -615,7 +624,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { last_parent_key = last_child_last_key; parent_node->set_next_bnode(true_sibling->node_id()); LOGTRACEMOD(wbcache, "True sibling [{}] for parent_node {}", true_sibling->to_string(), - parent_node->to_string()); + parent_node->to_string()); } if (!true_sibling) { LOGTRACEMOD(wbcache, "No true sibling found for parent_node [{}]", parent_node->to_string()); @@ -628,7 +637,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } } } - + // Keep a copy of the node buffer, in case we need to revert back uint8_t* tmp_buffer = new uint8_t[this->m_node_size]; std::memcpy(tmp_buffer, parent_node->m_phys_node_buf, this->m_node_size); @@ -641,8 +650,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { BtreeNodeList new_parent_nodes; do { if (child_node->has_valid_edge() || (child_node->is_leaf() && child_node->next_bnode() == empty_bnodeid)) { - LOGTRACEMOD(wbcache, "Child node [{}] is an edge node or a leaf with no next", - child_node->to_string()); + LOGTRACEMOD(wbcache, "Child node [{}] is an edge node or a leaf with no next", child_node->to_string()); if (child_node->is_node_deleted()) { // Edge node is merged, we need to set the current last entry as edge if (cur_parent->total_entries() > 0) { @@ -683,6 +691,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { LOGTRACEMOD(wbcache, "No true sibling found for parent_node [{}]", cur_parent->to_string()); } + BT_REL_ASSERT(cur_parent->total_entries() != 0 || cur_parent->has_valid_edge(), "Parent node [{}] cannot be empty", cur_parent->to_string()); } @@ -759,7 +768,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { // No room in the parent_node, let us split the parent_node and continue LOGTRACEMOD(wbcache, "Repairing node={}, child_node=[{}] has no room for put, so we need to split the parent " - "node", cur_parent->node_id(), child_node->to_string()); + "node", + cur_parent->node_id(), child_node->to_string()); auto new_parent = this->alloc_interior_node(); if (new_parent == nullptr) { ret = btree_status_t::space_not_avail; @@ -866,12 +876,10 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { // if last parent has the key less than the last child key, then we need to update the parent node with // the last child key if it doesn't have edge. 
auto last_parent = parent_node; - if (new_parent_nodes.size() > 0) { + if (new_parent_nodes.size() > 0) { last_parent = new_parent_nodes.back(); // handle the case where we are splitting the root node - if (m_sb->root_node == parent_node->node_id()) { - update_root(parent_node, new_parent_nodes, cp_ctx); - } + if (m_sb->root_node == parent_node->node_id()) { update_root(parent_node, new_parent_nodes, cp_ctx); } } if (last_parent->total_entries() && !last_parent->has_valid_edge()) { if (last_parent->compare_nth_key(last_parent_key, last_parent->total_entries() - 1) < 0) { @@ -930,9 +938,9 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { return empty_bnodeid; } else { BtreeNodePtr sibling_node; - if (read_node_impl(sibling_id, sibling_node) != btree_status_t::success) { + if (read_node_impl(sibling_id, sibling_node) != btree_status_t::success) { LOGTRACEMOD(wbcache, "Failed to read sibling node with id {}", sibling_id); - return empty_bnodeid; + return empty_bnodeid; } if (sibling_node->is_node_deleted()) { diff --git a/src/lib/homestore.cpp b/src/lib/homestore.cpp index bdb377a47..bec7c2e49 100644 --- a/src/lib/homestore.cpp +++ b/src/lib/homestore.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -56,6 +57,7 @@ namespace homestore { HomeStoreSafePtr HomeStore::s_instance{nullptr}; +static std::unique_ptr< FaultContainmentCallback > s_fc_cb; static std::unique_ptr< IndexServiceCallbacks > s_index_cbs; static shared< ChunkSelector > s_custom_data_chunk_selector{nullptr}; static shared< ChunkSelector > s_custom_index_chunk_selector{nullptr}; @@ -67,6 +69,12 @@ HomeStore* HomeStore::instance() { return s_instance.get(); } +HomeStore& HomeStore::with_fault_containment(std::unique_ptr< FaultContainmentCallback > cb) { + m_services.svcs |= HS_SERVICE::FAULT_CMT; + s_fc_cb = std::move(cb); + return *this; +} + HomeStore& HomeStore::with_data_service(cshared< ChunkSelector >& custom_chunk_selector) { m_services.svcs |= HS_SERVICE::DATA; m_services.svcs &= ~HS_SERVICE::REPLICATION; // ReplicationDataSvc or DataSvc are mutually exclusive @@ -173,6 +181,8 @@ bool HomeStore::start(const hs_input_params& input, hs_before_services_starting_ m_data_service = std::make_unique< BlkDataService >(std::move(s_custom_data_chunk_selector)); } } + if (has_fc_service()) { m_fc_service = std::make_unique< FaultContainmentService >(std::move(s_fc_cb)); } + m_cp_mgr = std::make_unique< CPManager >(); m_dev_mgr = std::make_unique< DeviceManager >(input.devices, bind_this(HomeStore::create_vdev_cb, 2)); @@ -393,6 +403,7 @@ bool HomeStore::has_log_service() const { auto const s = m_services.svcs; return (s & HS_SERVICE::LOG); } +bool HomeStore::has_fc_service() const { return (m_services.svcs & HS_SERVICE::FAULT_CMT); } #if 0 void HomeStore::init_cache() { diff --git a/src/lib/index/index_cp.cpp b/src/lib/index/index_cp.cpp index df41d0799..c75b513a1 100644 --- a/src/lib/index/index_cp.cpp +++ b/src/lib/index/index_cp.cpp @@ -82,8 +82,8 @@ std::optional< IndexBufferPtr > IndexCPContext::next_dirty() { } std::string IndexCPContext::to_string_small() { - return fmt::format("IndexCPContext cpid={}, dirty_buf_count={}, dirty_buf_list_size={}", m_cp->id(), m_dirty_buf_count.get(), - m_dirty_buf_list.size()); + return fmt::format("IndexCPContext cpid={}, dirty_buf_count={}, dirty_buf_list_size={}", m_cp->id(), + m_dirty_buf_count.get(), m_dirty_buf_list.size()); } std::string IndexCPContext::to_string() { From be86e4456491ae01712fd54f856f94a2b7561cee Mon Sep 17 00:00:00 2001 
From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com> Date: Tue, 22 Jul 2025 12:30:19 -0700 Subject: [PATCH 158/170] Fix negative metrics and return back 1.3 metrics (#775) --- conanfile.py | 2 +- src/include/homestore/blk.h | 19 ++-- src/include/homestore/btree/btree.hpp | 16 ++- src/include/homestore/btree/btree.ipp | 13 ++- .../homestore/btree/detail/btree_common.ipp | 52 +++++++--- .../homestore/btree/detail/btree_internal.hpp | 5 +- .../btree/detail/btree_mutate_impl.ipp | 1 + .../homestore/btree/detail/btree_node_mgr.ipp | 13 ++- .../btree/detail/btree_remove_impl.ipp | 3 +- .../homestore/btree/detail/prefix_node.hpp | 5 +- .../homestore/btree/detail/simple_node.hpp | 23 ++--- src/include/homestore/checkpoint/cp.hpp | 4 +- .../homestore/index/index_internal.hpp | 14 +-- src/include/homestore/index/index_table.hpp | 46 +++++++-- src/include/homestore/index_service.hpp | 3 +- src/include/homestore/replication/repl_dev.h | 10 +- src/include/homestore/replication_service.hpp | 17 ++-- src/lib/blkalloc/fixed_blk_allocator.h | 2 +- src/lib/blkalloc/varsize_blk_allocator.h | 2 +- src/lib/common/crash_simulator.hpp | 2 +- src/lib/common/homestore_utils.cpp | 2 +- src/lib/common/homestore_utils.hpp | 3 +- src/lib/device/chunk.cpp | 2 +- src/lib/device/journal_vdev.cpp | 18 ++-- src/lib/homestore.cpp | 9 +- src/lib/index/index_cp.hpp | 5 +- src/lib/index/index_service.cpp | 23 ++++- src/lib/index/wb_cache.cpp | 32 +++--- src/lib/index/wb_cache.hpp | 1 + .../replication/repl_dev/raft_repl_dev.cpp | 98 ++++++++----------- src/lib/replication/repl_dev/raft_repl_dev.h | 12 ++- src/lib/replication/repl_dev/solo_repl_dev.h | 3 +- .../replication/service/generic_repl_svc.cpp | 2 +- .../replication/service/raft_repl_service.cpp | 23 +++-- .../replication/service/raft_repl_service.h | 6 +- src/tests/btree_helpers/btree_test_helper.hpp | 16 ++- src/tests/test_blk_cache_queue.cpp | 4 +- src/tests/test_btree_node.cpp | 4 +- .../test_common/homestore_test_common.hpp | 7 +- src/tests/test_common/raft_repl_test_base.hpp | 8 +- src/tests/test_data_service.cpp | 2 +- src/tests/test_index_crash_recovery.cpp | 59 ++++++++++- src/tests/test_meta_blk_mgr.cpp | 4 +- src/tests/test_raft_repl_dev.cpp | 1 - 44 files changed, 366 insertions(+), 230 deletions(-) diff --git a/conanfile.py b/conanfile.py index 7b0835273..bc9952b3f 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.20.5" + version = "6.20.6" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/blk.h b/src/include/homestore/blk.h index d85185b97..beb88b69f 100644 --- a/src/include/homestore/blk.h +++ b/src/include/homestore/blk.h @@ -248,15 +248,16 @@ VENUM(BlkAllocStatus, uint32_t, ); struct blk_alloc_hints { - blk_temp_t desired_temp{0}; // Temperature hint for the device - std::optional< uint32_t > reserved_blks; // Reserved blks in a chunk - std::optional< uint32_t > pdev_id_hint; // which physical device to pick (hint if any) -1 for don't care - std::optional< chunk_num_t > chunk_id_hint; // any specific chunk id to pick for this allocation - std::optional<MultiBlkId> committed_blk_id; // blk id indicates the blk was already allocated and committed, don't allocate and commit again - std::optional< stream_id_t > stream_id_hint; // any specific stream to pick - std::optional< uint64_t > application_hint; // hints in uint64 what will be passed opaque to select_chunk - bool
can_look_for_other_chunk{true}; // If alloc on device not available can I pick other device - bool is_contiguous{true}; // Should the entire allocation be one contiguous block + blk_temp_t desired_temp{0}; // Temperature hint for the device + std::optional< uint32_t > reserved_blks; // Reserved blks in a chunk + std::optional< uint32_t > pdev_id_hint; // which physical device to pick (hint if any) -1 for don't care + std::optional< chunk_num_t > chunk_id_hint; // any specific chunk id to pick for this allocation + std::optional< MultiBlkId > committed_blk_id; // blk id indicates the blk was already allocated and committed, + // don't allocate and commit again + std::optional< stream_id_t > stream_id_hint; // any specific stream to pick + std::optional< uint64_t > application_hint; // hints in uint64 what will be passed opaque to select_chunk + bool can_look_for_other_chunk{true}; // If alloc on device not available can I pick other device + bool is_contiguous{true}; // Should the entire allocation be one contiguous block bool partial_alloc_ok{false}; // ok to allocate only portion of nblks? Mutually exclusive with is_contiguous uint32_t min_blks_per_piece{1}; // blks allocated in a blkid should be atleast this size per entry uint32_t max_blks_per_piece{max_blks_per_blkid()}; // Number of blks on every entry diff --git a/src/include/homestore/btree/btree.hpp b/src/include/homestore/btree/btree.hpp index 3ba74623f..bdc89c11c 100644 --- a/src/include/homestore/btree/btree.hpp +++ b/src/include/homestore/btree/btree.hpp @@ -79,7 +79,9 @@ class Btree { BtreeMetrics m_metrics; std::atomic< bool > m_destroyed{false}; - std::atomic< uint64_t > m_total_nodes{0}; + std::atomic< uint64_t > m_total_leaf_nodes{0}; + std::atomic< uint64_t > m_total_interior_nodes{0}; + std::atomic< uint8_t > m_btree_depth{0}; uint32_t m_node_size{4096}; #ifndef NDEBUG std::atomic< uint64_t > m_req_id{0}; @@ -124,6 +126,10 @@ class Btree { std::string to_custom_string(to_string_cb_t< K, V > const& cb) const; std::string visualize_tree_keys(const std::string& file) const; uint64_t count_keys(bnodeid_t bnodeid = 0) const; + std::pair< uint64_t, uint64_t > compute_node_count(); + std::pair< uint64_t, uint64_t > get_num_nodes() const; + uint16_t compute_btree_depth(); + uint16_t get_btree_depth() const; nlohmann::json get_metrics_in_json(bool updated = true); bnodeid_t root_node_id() const; @@ -198,13 +204,13 @@ class Btree { btree_status_t post_order_traversal(const BtreeNodePtr& node, locktype_t acq_lock, const auto& cb); void get_all_kvs(std::vector< std::pair< K, V > >& kvs) const; btree_status_t do_destroy(uint64_t& n_freed_nodes, void* context); - uint64_t get_btree_node_cnt() const; - uint64_t get_child_node_cnt(bnodeid_t bnodeid) const; + void get_child_node_count(bnodeid_t bnodeid, uint64_t& interior_cnt, uint64_t& leaf_cnt) const; void to_string(bnodeid_t bnodeid, std::string& buf) const; - void to_custom_string_internal(bnodeid_t bnodeid, std::string& buf, to_string_cb_t< K, V > const& cb, int nindent=-1) const; + void to_custom_string_internal(bnodeid_t bnodeid, std::string& buf, to_string_cb_t< K, V > const& cb, + int nindent = -1) const; void to_dot_keys(bnodeid_t bnodeid, std::string& buf, std::map< uint32_t, std::vector< uint64_t > >& l_map, std::map< uint64_t, BtreeVisualizeVariables >& info_map) const; - void sanity_sub_tree(bnodeid_t bnodeid=0) const; + void sanity_sub_tree(bnodeid_t bnodeid = 0) const; void validate_sanity_child(const BtreeNodePtr& parent_node, uint32_t ind) const; void 
validate_sanity_next_child(const BtreeNodePtr& parent_node, uint32_t ind) const; void print_node(const bnodeid_t& bnodeid) const; diff --git a/src/include/homestore/btree/btree.ipp b/src/include/homestore/btree/btree.ipp index 35c42ab97..ca7a18294 100644 --- a/src/include/homestore/btree/btree.ipp +++ b/src/include/homestore/btree/btree.ipp @@ -49,6 +49,15 @@ void Btree< K, V >::set_root_node_info(const BtreeLinkInfo& info) { m_root_node_info = info; } +template < typename K, typename V > +uint16_t Btree< K, V >::get_btree_depth() const {return m_btree_depth;} + +template < typename K, typename V > +std::pair<uint64_t, uint64_t> Btree< K, V >::get_num_nodes() const { + return {m_total_interior_nodes, m_total_leaf_nodes}; +} + + template < typename K, typename V > std::pair< btree_status_t, uint64_t > Btree< K, V >::destroy_btree(void* context) { btree_status_t ret{btree_status_t::success}; @@ -145,7 +154,7 @@ template < typename ReqT > btree_status_t Btree< K, V >::get(ReqT& greq) const { static_assert(std::is_same_v< BtreeSingleGetRequest, ReqT > || std::is_same_v< BtreeGetAnyRequest< K >, ReqT >, "get api is called with non get request type"); - + COUNTER_INCREMENT(m_metrics, btree_query_ops_count, 1); btree_status_t ret = btree_status_t::success; m_btree_lock.lock_shared(); @@ -171,7 +180,7 @@ btree_status_t Btree< K, V >::remove(ReqT& req) { std::is_same_v< ReqT, BtreeRangeRemoveRequest< K > > || std::is_same_v< ReqT, BtreeRemoveAnyRequest< K > >, "remove api is called with non remove request type"); - + COUNTER_INCREMENT(m_metrics, btree_remove_ops_count, 1); locktype_t acq_lock = locktype_t::READ; m_btree_lock.lock_shared(); diff --git a/src/include/homestore/btree/detail/btree_common.ipp b/src/include/homestore/btree/detail/btree_common.ipp index ec553396a..e0954754b 100644 --- a/src/include/homestore/btree/detail/btree_common.ipp +++ b/src/include/homestore/btree/detail/btree_common.ipp @@ -96,32 +96,56 @@ btree_status_t Btree< K, V >::do_destroy(uint64_t& n_freed_nodes, void* context) } template < typename K, typename V > -uint64_t Btree< K, V >::get_btree_node_cnt() const { - uint64_t cnt = 1; /* increment it for root */ +std::pair<uint64_t, uint64_t> Btree< K, V >::compute_node_count() { + uint64_t leaf_cnt = 0; + uint64_t interior_cnt = 0; m_btree_lock.lock_shared(); - cnt += get_child_node_cnt(m_root_node_info.bnode_id()); + get_child_node_count(m_root_node_info.bnode_id(), interior_cnt, leaf_cnt); + m_total_leaf_nodes = leaf_cnt; + m_total_interior_nodes= interior_cnt; m_btree_lock.unlock_shared(); - return cnt; + return {interior_cnt, leaf_cnt}; } template < typename K, typename V > -uint64_t Btree< K, V >::get_child_node_cnt(bnodeid_t bnodeid) const { - uint64_t cnt{0}; +uint16_t Btree< K, V >::compute_btree_depth() { + m_btree_lock.lock_shared(); + BtreeNodePtr root; + locktype_t acq_lock = locktype_t::READ; + if (read_and_lock_node(m_root_node_info.bnode_id(), root, acq_lock, acq_lock, nullptr) != btree_status_t::success){ return -1; } + m_btree_depth = root->level(); + unlock_node(root, acq_lock); + m_btree_lock.unlock_shared(); + return m_btree_depth; +} + +template < typename K, typename V > +void Btree< K, V >::get_child_node_count(bnodeid_t bnodeid, uint64_t& interior_cnt, uint64_t& leaf_cnt) const { BtreeNodePtr node; locktype_t acq_lock = locktype_t::READ; - if (read_and_lock_node(bnodeid, node, acq_lock, acq_lock, nullptr) != btree_status_t::success) { return cnt; } + if (read_and_lock_node(bnodeid, node, acq_lock, acq_lock, nullptr) != btree_status_t::success) { return ; } + if(node->is_leaf()) {
++leaf_cnt; + } else { + ++interior_cnt; + } if (!node->is_leaf()) { - uint32_t i = 0; - while (i < node->total_entries()) { - BtreeLinkInfo p = node->get_nth_key< K >(i, false); - cnt += get_child_node_cnt(p.bnode_id()) + 1; - ++i; + if(node->level()==1){ + leaf_cnt += node->total_entries() + (node->has_valid_edge()?1:0); + }else{ + uint32_t i = 0; + while (i < node->total_entries()) { + BtreeLinkInfo p; + node->get_nth_value(i, &p, false); + get_child_node_count(p.bnode_id(), interior_cnt, leaf_cnt); + ++i; + } + if (node->has_valid_edge()) {get_child_node_count(node->edge_id(), interior_cnt, leaf_cnt); } } - if (node->has_valid_edge()) { cnt += get_child_node_cnt(node->edge_id()) + 1; } } unlock_node(node, acq_lock); - return cnt; + return ; } template < typename K, typename V > diff --git a/src/include/homestore/btree/detail/btree_internal.hpp b/src/include/homestore/btree/detail/btree_internal.hpp index 8f2b267ac..44ba95828 100644 --- a/src/include/homestore/btree/detail/btree_internal.hpp +++ b/src/include/homestore/btree/detail/btree_internal.hpp @@ -256,13 +256,14 @@ struct BtreeConfig { std::string m_btree_name; // Unique name for the btree bool m_merge_turned_on{true}; uint8_t m_max_merge_level{1}; + private: uint32_t m_suggested_min_size; // Precomputed values uint32_t m_ideal_fill_size; public: BtreeConfig(uint32_t node_size, const std::string& btree_name = "") : - m_node_size{node_size}, m_btree_name{btree_name.empty() ? std::string("btree") : btree_name}{ + m_node_size{node_size}, m_btree_name{btree_name.empty() ? std::string("btree") : btree_name} { set_node_data_size(node_size - 512); // Just put estimate at this point of time. } @@ -320,8 +321,6 @@ class BtreeMetrics : public sisl::MetricsGroup { REGISTER_COUNTER(btree_retry_count, "number of retries"); REGISTER_COUNTER(write_err_cnt, "number of errors in write"); REGISTER_COUNTER(query_err_cnt, "number of errors in query"); - REGISTER_COUNTER(read_node_count_in_write_ops, "number of nodes read in write_op"); - REGISTER_COUNTER(read_node_count_in_query_ops, "number of nodes read in query_op"); REGISTER_COUNTER(btree_write_ops_count, "number of btree operations"); REGISTER_COUNTER(btree_query_ops_count, "number of btree operations"); REGISTER_COUNTER(btree_remove_ops_count, "number of btree operations"); diff --git a/src/include/homestore/btree/detail/btree_mutate_impl.ipp b/src/include/homestore/btree/detail/btree_mutate_impl.ipp index 5247a6e22..441a3fed0 100644 --- a/src/include/homestore/btree/detail/btree_mutate_impl.ipp +++ b/src/include/homestore/btree/detail/btree_mutate_impl.ipp @@ -241,6 +241,7 @@ btree_status_t Btree< K, V >::check_split_root(ReqT& req) { } else { if (req.route_tracing) { append_route_trace(req, child_node, btree_event_t::SPLIT); } m_root_node_info = BtreeLinkInfo{root->node_id(), root->link_version()}; + this->m_btree_depth = root->level(); unlock_node(child_node, locktype_t::WRITE); COUNTER_INCREMENT(m_metrics, btree_depth, 1); } diff --git a/src/include/homestore/btree/detail/btree_node_mgr.ipp b/src/include/homestore/btree/detail/btree_node_mgr.ipp index aa536a728..e5e74e2b0 100644 --- a/src/include/homestore/btree/detail/btree_node_mgr.ipp +++ b/src/include/homestore/btree/detail/btree_node_mgr.ipp @@ -146,6 +146,9 @@ btree_status_t Btree< K, V >::upgrade_node_locks(const BtreeNodePtr& parent_node // If the node things have been changed between unlock and lock example, it has been made invalid (probably by merge // nodes) ask caller to start over again. 
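
The split counters make the walk above cheap to reason about: interior nodes are counted by recursion, while a level-1 parent adds its children (plus edge) to the leaf count without descending. A small usage sketch of the new accessors; the btree pointer is illustrative:

    // compute_node_count() walks the tree under the shared lock and caches the
    // totals; get_num_nodes() later returns the cached {interior, leaf} pair.
    auto const [interior_cnt, leaf_cnt] = btree->compute_node_count();
    auto const depth = btree->compute_btree_depth(); // root level; leaves are level 0
    LOGINFO("index btree: {} interior, {} leaf, depth {}", interior_cnt, leaf_cnt, depth);
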
+ if(parent_prev_gen != parent_node->node_gen() || child_prev_gen != child_node->node_gen()) { + COUNTER_INCREMENT(m_metrics, btree_num_pc_gen_mismatch, 1); + } if (parent_node->is_node_deleted() || (parent_prev_gen != parent_node->node_gen()) || child_node->is_node_deleted() || (child_prev_gen != child_node->node_gen())) { unlock_node(child_node, locktype_t::WRITE); @@ -195,7 +198,9 @@ btree_status_t Btree< K, V >::upgrade_node_lock(const BtreeNodePtr& node, lockty auto ret = lock_node(node, locktype_t::WRITE, context); if (ret != btree_status_t::success) { return ret; } - + if(prev_gen != node->node_gen()) { + COUNTER_INCREMENT(m_metrics, btree_num_gen_mismatch, 1); + } if (node->is_node_deleted() || (prev_gen != node->node_gen())) { unlock_node(node, locktype_t::WRITE); return btree_status_t::retry; @@ -232,7 +237,7 @@ BtreeNodePtr Btree< K, V >::alloc_leaf_node() { BtreeNodePtr n = alloc_node(true /* is_leaf */); if (n) { COUNTER_INCREMENT(m_metrics, btree_leaf_node_count, 1); - ++m_total_nodes; + ++m_total_leaf_nodes; } return n; } @@ -242,7 +247,7 @@ BtreeNodePtr Btree< K, V >::alloc_interior_node() { BtreeNodePtr n = alloc_node(false /* is_leaf */); if (n) { COUNTER_INCREMENT(m_metrics, btree_int_node_count, 1); - ++m_total_nodes; + ++m_total_interior_nodes; } return n; } @@ -302,7 +307,7 @@ void Btree< K, V >::free_node(const BtreeNodePtr& node, locktype_t cur_lock, voi node->set_node_deleted(); unlock_node(node, cur_lock); } - --m_total_nodes; + node->is_leaf()?--m_total_leaf_nodes:--m_total_interior_nodes; free_node_impl(node, context); // intrusive_ptr_release(node.get()); diff --git a/src/include/homestore/btree/detail/btree_remove_impl.ipp b/src/include/homestore/btree/detail/btree_remove_impl.ipp index de991edba..73b68e927 100644 --- a/src/include/homestore/btree/detail/btree_remove_impl.ipp +++ b/src/include/homestore/btree/detail/btree_remove_impl.ipp @@ -211,6 +211,7 @@ btree_status_t Btree< K, V >::check_collapse_root(ReqT& req) { free_node(root, locktype_t::WRITE, req.m_op_context); m_root_node_info = child->link_info(); + this->m_btree_depth = child->level(); unlock_node(child, locktype_t::WRITE); COUNTER_DECREMENT(m_metrics, btree_depth, 1); @@ -352,7 +353,7 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const available_size = 0; while (src_cursor.ith_node < old_nodes.size()) { if (available_size == 0) { - new_node.reset(alloc_node(leftmost_node->is_leaf()).get()); + new_node.reset(leftmost_node->is_leaf() ? 
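
The new gen-mismatch counters instrument the existing optimistic upgrade: the generation is sampled under the read lock, the lock is dropped and re-taken in write mode, and a generation change means another fiber mutated the node in between, so the caller retries. Paraphrased pattern (not the literal homestore code):

    auto const prev_gen = node->node_gen();      // sample under READ lock
    unlock_node(node, locktype_t::READ);
    lock_node(node, locktype_t::WRITE, context); // window where others may write
    if (node->is_node_deleted() || (prev_gen != node->node_gen())) {
        unlock_node(node, locktype_t::WRITE);    // counted as a gen mismatch above
        return btree_status_t::retry;
    }
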
alloc_leaf_node().get() : alloc_interior_node().get()); if (new_node == nullptr) { ret = btree_status_t::merge_failed; goto out; diff --git a/src/include/homestore/btree/detail/prefix_node.hpp b/src/include/homestore/btree/detail/prefix_node.hpp index 7ba617fb2..43179477c 100644 --- a/src/include/homestore/btree/detail/prefix_node.hpp +++ b/src/include/homestore/btree/detail/prefix_node.hpp @@ -254,8 +254,9 @@ class FixedPrefixNode : public VariantNode< K, V > { } V new_val{s_cast< V const& >(val)}; new_val.shift(s_cast< K const& >(cur_key).distance(first_input_key), app_ctx); - if(get_prefix_entry_c(prefix_slot)->compare(cur_key, new_val)) { - LOGTRACEMOD(btree, "Adding new prefix entry for key={} val={}", cur_key.to_string(), new_val.to_string()); + if (get_prefix_entry_c(prefix_slot)->compare(cur_key, new_val)) { + LOGTRACEMOD(btree, "Adding new prefix entry for key={} val={}", cur_key.to_string(), + new_val.to_string()); prefix_slot = add_prefix(cur_key, new_val); } write_suffix(idx, prefix_slot, cur_key, new_val); diff --git a/src/include/homestore/btree/detail/simple_node.hpp b/src/include/homestore/btree/detail/simple_node.hpp index 25a87c1c1..85dd88021 100644 --- a/src/include/homestore/btree/detail/simple_node.hpp +++ b/src/include/homestore/btree/detail/simple_node.hpp @@ -168,10 +168,8 @@ class SimpleNode : public VariantNode< K, V > { nentries = std::min(nentries, this->get_available_entries()); #ifdef _PRERELEASE const uint64_t max_keys = this->max_keys_in_node(); - if(max_keys){ - if(this->total_entries() + nentries > max_keys) { - nentries = max_keys - this->total_entries(); - } + if (max_keys) { + if (this->total_entries() + nentries > max_keys) { nentries = max_keys - this->total_entries(); } } #endif uint32_t sz = nentries * get_nth_obj_size(0); @@ -212,7 +210,7 @@ class SimpleNode : public VariantNode< K, V > { bool has_room_for_put(btree_put_type put_type, uint32_t key_size, uint32_t value_size) const override { #ifdef _PRERELEASE auto max_keys = max_keys_in_node(); - if(max_keys) {return (this->total_entries() < max_keys);} + if (max_keys) { return (this->total_entries() < max_keys); } #endif return ((put_type == btree_put_type::UPSERT) || (put_type == btree_put_type::INSERT)) ? (get_available_entries() > 0) @@ -221,10 +219,11 @@ class SimpleNode : public VariantNode< K, V > { std::string to_string(bool print_friendly = false) const override { auto snext = this->next_bnode() == empty_bnodeid ? "" : fmt::format("next_node={}", this->next_bnode()); - auto str = fmt::format("{}id={} level={} nEntries={} {} {} {}", - (print_friendly ? "------------------------------------------------------------\n" : ""), - this->node_id(), this->level(), this->total_entries(), - (this->is_leaf() ? "LEAF" : "INTERIOR"), snext, this->is_node_deleted()? " Deleted" : " LIVE"); + auto str = + fmt::format("{}id={} level={} nEntries={} {} {} {}", + (print_friendly ? "------------------------------------------------------------\n" : ""), + this->node_id(), this->level(), this->total_entries(), (this->is_leaf() ? "LEAF" : "INTERIOR"), + snext, this->is_node_deleted() ? 
" Deleted" : " LIVE"); if (this->has_valid_edge()) { fmt::format_to(std::back_inserter(str), " edge={}.{}", this->edge_info().m_bnodeid, this->edge_info().m_link_version); @@ -238,12 +237,10 @@ class SimpleNode : public VariantNode< K, V > { return str; } std::string to_dot_keys() const override { - return to_dot_keys_impl(std::is_same().key()), uint64_t>{}); + return to_dot_keys_impl(std::is_same< decltype(std::declval< K& >().key()), uint64_t >{}); } - std::string to_dot_keys_impl(std::false_type) const { - return ""; - } + std::string to_dot_keys_impl(std::false_type) const { return ""; } std::string to_dot_keys_impl(std::true_type) const { std::string str; diff --git a/src/include/homestore/checkpoint/cp.hpp b/src/include/homestore/checkpoint/cp.hpp index e88a9e4e2..761b2a2d2 100644 --- a/src/include/homestore/checkpoint/cp.hpp +++ b/src/include/homestore/checkpoint/cp.hpp @@ -72,8 +72,8 @@ class CPManager; VENUM(cp_consumer_t, uint8_t, // Sealer is a special consumer that provides information regarding where the cp is up to. // It will be the first one during cp switch over , as a conservative marker of everything - // before or equals to this point, should be in current cp, possibly some consumer are above this point which is fine. - // And Sealer is the last one during cp flush after all other services flushed successfully. + // before or equals to this point, should be in current cp, possibly some consumer are above this point which is + // fine. And Sealer is the last one during cp flush after all other services flushed successfully. SEALER = 3, HS_CLIENT = 0, // Client of the homestore module INDEX_SVC = 1, // Index service module diff --git a/src/include/homestore/index/index_internal.hpp b/src/include/homestore/index/index_internal.hpp index 4d5ce6afe..e079c0448 100644 --- a/src/include/homestore/index/index_internal.hpp +++ b/src/include/homestore/index/index_internal.hpp @@ -51,15 +51,18 @@ struct index_table_sb { uint64_t root_link_version{0}; // Link version to btree root node int64_t index_size{0}; // Size of the Index // seq_id_t last_seq_id{-1}; // TODO: See if this is needed + uint64_t total_leaf_nodes{0}; // Number of leaf nodes in the index + uint64_t total_interior_nodes{0}; // Number of internal nodes in the index + uint8_t btree_depth{0}; // Depth of the btree uint32_t ordinal{0}; // Ordinal of the Index uint32_t user_sb_size; // Size of the user superblk uint8_t user_sb_bytes[0]; uint32_t pdev_id; - uint32_t index_num_chunks {0}; + uint32_t index_num_chunks{0}; // List of chunk ids allocated for this index table are stored after this. 
- void init_chunks(std::vector<chunk_num_t> const& chunk_ids){ + void init_chunks(std::vector< chunk_num_t > const& chunk_ids) { index_num_chunks = chunk_ids.size(); auto chunk_id_ptr = get_chunk_ids_mutable(); for (auto& chunk_id : chunk_ids) { @@ -67,13 +70,10 @@ struct index_table_sb { chunk_id_ptr++; } } - chunk_num_t* get_chunk_ids_mutable() { - return r_cast<chunk_num_t*>(uintptr_cast(this) + sizeof(index_table_sb)); - } + chunk_num_t* get_chunk_ids_mutable() { return r_cast< chunk_num_t* >(uintptr_cast(this) + sizeof(index_table_sb)); } const chunk_num_t* get_chunk_ids() const { return r_cast< const chunk_num_t* >(reinterpret_cast< const uint8_t* >(this) + sizeof(index_table_sb)); } - }; #pragma pack() @@ -95,6 +95,8 @@ class IndexTableBase { virtual void repair_root_node(IndexBufferPtr const& buf) = 0; virtual void delete_stale_children(IndexBufferPtr const& buf) = 0; virtual void audit_tree() = 0; + virtual void update_sb() = 0; + virtual void load_metrics(uint64_t interior, uint64_t leaf, uint8_t depth) = 0; }; enum class index_buf_state_t : uint8_t { diff --git a/src/include/homestore/index/index_table.hpp b/src/include/homestore/index/index_table.hpp index 6a9f004bd..ace023ed5 100644 --- a/src/include/homestore/index/index_table.hpp +++ b/src/include/homestore/index/index_table.hpp @@ -36,7 +36,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { private: superblk< index_table_sb > m_sb; shared< MetaIndexBuffer > m_sb_buffer; - static constexpr uint32_t INVALID_ORDINAL = std::numeric_limits<uint32_t>::max(); + static constexpr uint32_t INVALID_ORDINAL = std::numeric_limits< uint32_t >::max(); // graceful shutdown private: @@ -60,13 +60,15 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } } - IndexTable(uuid_t uuid, uuid_t parent_uuid, uint32_t user_sb_size, const BtreeConfig& cfg, uint32_t ordinal= INVALID_ORDINAL, const std::vector< chunk_num_t >& chunk_ids ={}, uint32_t pdev_id = 0): + IndexTable(uuid_t uuid, uuid_t parent_uuid, uint32_t user_sb_size, const BtreeConfig& cfg, + uint32_t ordinal = INVALID_ORDINAL, const std::vector< chunk_num_t >& chunk_ids = {}, + uint32_t pdev_id = 0) : Btree< K, V >{cfg}, m_sb{"index"} { - auto ord_num = (ordinal == INVALID_ORDINAL)? (hs()->index_service().reserve_ordinal()) : ordinal; - BT_LOG_ASSERT(!hs()->index_service().get_index_table(ord_num), "table with ordinal {} already exists"); + auto ord_num = (ordinal == INVALID_ORDINAL) ?
(hs()->index_service().reserve_ordinal()) : ordinal; + BT_LOG_ASSERT(!hs()->index_service().get_index_table(ord_num), "table with ordinal {} already exists"); // Create a superblk for the index table and create MetaIndexBuffer corresponding to that - m_sb.create(sizeof(index_table_sb)+(chunk_ids.size() * sizeof(chunk_num_t))); + m_sb.create(sizeof(index_table_sb) + (chunk_ids.size() * sizeof(chunk_num_t))); m_sb->init_chunks(chunk_ids); m_sb->pdev_id = pdev_id; m_sb->ordinal = ord_num; @@ -113,8 +115,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { btree_status_t destroy() override { if (is_stopping()) return btree_status_t::stopping; incr_pending_request_num(); - auto chunk_selector {hs()->index_service().get_chunk_selector()}; - if(!chunk_selector){ + auto chunk_selector{hs()->index_service().get_chunk_selector()}; + if (!chunk_selector) { auto cpg = cp_mgr().cp_guard(); Btree< K, V >::destroy_btree(cpg.context(cp_consumer_t::INDEX_SVC)); } @@ -140,7 +142,10 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { auto cpg = cp_mgr().cp_guard(); put_req.m_op_context = (void*)cpg.context(cp_consumer_t::INDEX_SVC); ret = Btree< K, V >::put(put_req); - if (ret == btree_status_t::cp_mismatch) { LOGTRACEMOD(wbcache, "CP Mismatch, retrying put"); } + if (ret == btree_status_t::cp_mismatch) { + LOGTRACEMOD(wbcache, "CP Mismatch, retrying put"); + COUNTER_INCREMENT(this->m_metrics, btree_retry_count, 1); + } } while (ret == btree_status_t::cp_mismatch); decr_pending_request_num(); return ret; @@ -155,7 +160,10 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { auto cpg = cp_mgr().cp_guard(); remove_req.m_op_context = (void*)cpg.context(cp_consumer_t::INDEX_SVC); ret = Btree< K, V >::remove(remove_req); - if (ret == btree_status_t::cp_mismatch) { LOGTRACEMOD(wbcache, "CP Mismatch, retrying remove"); } + if (ret == btree_status_t::cp_mismatch) { + LOGTRACEMOD(wbcache, "CP Mismatch, retrying remove"); + COUNTER_INCREMENT(this->m_metrics, btree_retry_count, 1); + } } while (ret == btree_status_t::cp_mismatch); decr_pending_request_num(); return ret; @@ -348,6 +356,10 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { LOGTRACEMOD(wbcache, "root changed for index old_root={} new_root={}", m_sb->root_node, new_root->node_id()); m_sb->root_node = new_root->node_id(); m_sb->root_link_version = new_root->link_version(); + m_sb->btree_depth = new_root->level(); + m_sb->total_interior_nodes = this->m_total_interior_nodes; + m_sb->total_leaf_nodes = this->m_total_leaf_nodes; + std::tie(m_sb->total_interior_nodes, m_sb->total_leaf_nodes) = this->get_num_nodes(); if (!wb_cache().refresh_meta_buf(m_sb_buffer, r_cast< CPContext* >(context))) { LOGTRACEMOD(wbcache, "CP mismatch error - discard transact for meta node"); @@ -359,6 +371,22 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { return btree_status_t::success; } + void update_sb() override { + m_sb->total_interior_nodes = this->m_total_interior_nodes; + m_sb->total_leaf_nodes = this->m_total_leaf_nodes; + m_sb->btree_depth = this->m_btree_depth; + m_sb.write(); + } + + void load_metrics(uint64_t interior, uint64_t leaf, uint8_t depth) override { + this->m_total_leaf_nodes = leaf; + this->m_total_interior_nodes = interior; + this->m_btree_depth = depth; + COUNTER_INCREMENT(this->m_metrics, btree_int_node_count, interior); + COUNTER_INCREMENT(this->m_metrics, btree_leaf_node_count, leaf); + COUNTER_INCREMENT(this->m_metrics, btree_depth, depth); + } + btree_status_t 
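
update_sb() and load_metrics() close the persistence loop for these gauges: counters flow into the superblk when it is written, and back into the in-memory atomics when a table is recovered, so restarts do not reset the metrics to zero. The recovery half, as it appears later in IndexService::start() in this same patch:

    // Read the persisted values before the superblk is moved into the table,
    // then seed the in-memory gauges from them.
    auto const interior = sb->total_interior_nodes;
    auto const leaf = sb->total_leaf_nodes;
    auto const depth = sb->btree_depth;
    auto tbl = m_svc_cbs->on_index_table_found(std::move(sb));
    tbl->load_metrics(interior, leaf, depth);
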
delete_stale_links(BtreeNodePtr const& parent_node, void* cp_ctx) { LOGTRACEMOD(wbcache, "deleting stale links for parent node [{}]", parent_node->to_string()); BtreeNodeList free_nodes; diff --git a/src/include/homestore/index_service.hpp b/src/include/homestore/index_service.hpp index 6e5244543..9335e11c6 100644 --- a/src/include/homestore/index_service.hpp +++ b/src/include/homestore/index_service.hpp @@ -65,7 +65,7 @@ class IndexService { chunk_selector_type_t chunk_sel_type = chunk_selector_type_t::ROUND_ROBIN); // Open the existing vdev which is represnted by the vdev_info_block shared< VirtualDev > open_vdev(const vdev_info& vb, bool load_existing); - std::shared_ptr< ChunkSelector > get_chunk_selector(){ return m_custom_chunk_selector;}; + std::shared_ptr< ChunkSelector > get_chunk_selector() { return m_custom_chunk_selector; }; // for now, we don't support start after stop and there is no use case for this. // TODO: support start after stop if necessary @@ -81,6 +81,7 @@ class IndexService { bool remove_index_table(const std::shared_ptr< IndexTableBase >& tbl); std::shared_ptr< IndexTableBase > get_index_table(uuid_t uuid) const; std::shared_ptr< IndexTableBase > get_index_table(uint32_t ordinal) const; + void write_sb(uint32_t ordinal); // Reserve an ordinal for the index table uint32_t reserve_ordinal(); diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index 0217b66f5..70d564d88 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -43,11 +43,11 @@ VENUM(repl_req_state_t, uint32_t, ) VENUM(journal_type_t, uint16_t, - HS_DATA_LINKED = 0, // Linked data where each entry will store physical blkid where data reside - HS_DATA_INLINED = 1, // Data is inlined in the header of journal entry - HS_CTRL_DESTROY = 2, // Control message to destroy the repl_dev - HS_CTRL_START_REPLACE = 3, // Control message to start replace a member - HS_CTRL_COMPLETE_REPLACE = 4, // Control message to complete replace a member, + HS_DATA_LINKED = 0, // Linked data where each entry will store physical blkid where data reside + HS_DATA_INLINED = 1, // Data is inlined in the header of journal entry + HS_CTRL_DESTROY = 2, // Control message to destroy the repl_dev + HS_CTRL_START_REPLACE = 3, // Control message to start replace a member + HS_CTRL_COMPLETE_REPLACE = 4, // Control message to complete replace a member, HS_CTRL_UPDATE_TRUNCATION_BOUNDARY = 5, // Control message to update truncation boundary ) diff --git a/src/include/homestore/replication_service.hpp b/src/include/homestore/replication_service.hpp index b31541686..60eba96c4 100644 --- a/src/include/homestore/replication_service.hpp +++ b/src/include/homestore/replication_service.hpp @@ -43,17 +43,20 @@ class ReplicationService { /// @brief Replace one of the members with a new one. /// @param group_id Group where the replace member happens - /// @param task_id Id of the task which is going to be used for this operation. This is used to track the replace member. + /// @param task_id Id of the task which is going to be used for this operation. This is used to track the replace + /// member. /// @param member_out The member which is going to be replaced /// @param member_in The member which is going to be added in place of member_out /// @param commit_quorum Commit quorum to be used for this operation. If 0, it will use the default commit quorum. 
/// @return A Future on replace the member accepted or Future ReplServiceError upon error - virtual AsyncReplResult<> replace_member(group_id_t group_id, std::string& task_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum = 0, - uint64_t trace_id = 0) const = 0; - - virtual AsyncReplResult<> flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, uint32_t commit_quorum, - bool wait_and_verify = true, uint64_t trace_id = 0) const = 0; + virtual AsyncReplResult<> replace_member(group_id_t group_id, std::string& task_id, + const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum = 0, + uint64_t trace_id = 0) const = 0; + + virtual AsyncReplResult<> flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, + uint32_t commit_quorum, bool wait_and_verify = true, + uint64_t trace_id = 0) const = 0; /// @brief Get status of member replacement. /// @param group_id Group where the replace member happens diff --git a/src/lib/blkalloc/fixed_blk_allocator.h b/src/lib/blkalloc/fixed_blk_allocator.h index 01f1e1138..4d743f60b 100644 --- a/src/lib/blkalloc/fixed_blk_allocator.h +++ b/src/lib/blkalloc/fixed_blk_allocator.h @@ -41,7 +41,7 @@ class FixedBlkAllocator : public BitmapBlkAllocator { blk_num_t available_blks() const override; blk_num_t get_used_blks() const override; blk_num_t get_defrag_nblks() const override; - void reset() override{}; + void reset() override {}; bool is_blk_alloced(BlkId const& in_bid, bool use_lock = false) const override; std::string to_string() const override; diff --git a/src/lib/blkalloc/varsize_blk_allocator.h b/src/lib/blkalloc/varsize_blk_allocator.h index 03a507b03..04945ab52 100644 --- a/src/lib/blkalloc/varsize_blk_allocator.h +++ b/src/lib/blkalloc/varsize_blk_allocator.h @@ -222,7 +222,7 @@ class VarsizeBlkAllocator : public BitmapBlkAllocator { blk_num_t get_used_blks() const override; bool is_blk_alloced(BlkId const& in_bid, bool use_lock = false) const override; std::string to_string() const override; - void reset() override{}; + void reset() override {}; nlohmann::json get_metrics_in_json(); private: diff --git a/src/lib/common/crash_simulator.hpp b/src/lib/common/crash_simulator.hpp index 2afc5d30a..9ae7e5236 100644 --- a/src/lib/common/crash_simulator.hpp +++ b/src/lib/common/crash_simulator.hpp @@ -43,7 +43,7 @@ class CrashSimulator { private: std::function< void(void) > m_restart_cb{nullptr}; - std::atomic<bool> m_will_crash{false}; + std::atomic< bool > m_will_crash{false}; sisl::urcu_scoped_ptr< bool > m_crashed; }; } // namespace homestore diff --git a/src/lib/common/homestore_utils.cpp b/src/lib/common/homestore_utils.cpp index a3eacbfdc..937083012 100644 --- a/src/lib/common/homestore_utils.cpp +++ b/src/lib/common/homestore_utils.cpp @@ -51,7 +51,7 @@ bool hs_utils::mod_aligned_sz(size_t size_to_check, size_t align_sz) { bool hs_utils::is_ptr_aligned(void* ptr, std::size_t alignment) { // Cast the pointer to uintptr_t, which is an integer type capable of holding a pointer - auto intptr = reinterpret_cast<std::uintptr_t>(ptr); + auto intptr = reinterpret_cast< std::uintptr_t >(ptr); // Check if the pointer is a multiple of the alignment return (intptr % alignment) == 0; } diff --git a/src/lib/common/homestore_utils.hpp b/src/lib/common/homestore_utils.hpp index b6989ff48..d8980ae19 100644 --- a/src/lib/common/homestore_utils.hpp +++ b/src/lib/common/homestore_utils.hpp @@ -54,7 +54,6 @@ class hs_utils {
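
For orientation, a hedged sketch of calling the re-wrapped API; the uuids, task id and continuation are illustrative, and only the id field of replica_member_info is assumed here:

    replica_member_info member_out{}; member_out.id = out_uuid; // uuids illustrative
    replica_member_info member_in{};  member_in.id = in_uuid;
    std::string task_id = "replace-task-1";
    hs()->repl_service()
        .replace_member(group_id, task_id, member_out, member_in, 0 /* default quorum */)
        .via(&folly::InlineExecutor::instance())
        .thenValue([](auto&& e) {
            if (e.hasError()) { LOGERROR("replace_member failed: {}", enum_name(e.error())); }
        });
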
std::vector< std::string >& ordered_entries); }; -static bool wait_and_check(const std::function< bool() >& check_func, uint32_t timeout_ms, - uint32_t interval_ms = 100); +static bool wait_and_check(const std::function< bool() >& check_func, uint32_t timeout_ms, uint32_t interval_ms = 100); } // namespace homestore diff --git a/src/lib/device/chunk.cpp b/src/lib/device/chunk.cpp index 4962be386..ecbd293cf 100644 --- a/src/lib/device/chunk.cpp +++ b/src/lib/device/chunk.cpp @@ -30,7 +30,7 @@ std::string Chunk::to_string() const { } float Chunk::get_blk_usage() const { - return s_cast<float>(m_blk_allocator->get_used_blks()) / s_cast<float>(m_blk_allocator->get_total_blks()); + return s_cast< float >(m_blk_allocator->get_used_blks()) / s_cast< float >(m_blk_allocator->get_total_blks()); } void Chunk::set_user_private(const sisl::blob& data) { diff --git a/src/lib/device/journal_vdev.cpp b/src/lib/device/journal_vdev.cpp index bcddae40a..3e4dda2a0 100644 --- a/src/lib/device/journal_vdev.cpp +++ b/src/lib/device/journal_vdev.cpp @@ -46,15 +46,15 @@ JournalVirtualDev::JournalVirtualDev(DeviceManager& dmgr, const vdev_info& vinfo m_init_private_data = std::make_shared< JournalChunkPrivate >(); m_chunk_pool = std::make_unique< ChunkPool >( dmgr, - ChunkPool::Params{ - HS_DYNAMIC_CONFIG(generic.journal_chunk_pool_capacity), - [this]() { - m_init_private_data->created_at = get_time_since_epoch_ms(); - m_init_private_data->end_of_chunk = m_vdev_info.chunk_size; - sisl::blob private_blob{r_cast< uint8_t* >(m_init_private_data.get()), sizeof(JournalChunkPrivate)}; - return private_blob; - }, - m_vdev_info.hs_dev_type, m_vdev_info.vdev_id, m_vdev_info.chunk_size}); + ChunkPool::Params{HS_DYNAMIC_CONFIG(generic.journal_chunk_pool_capacity), + [this]() { + m_init_private_data->created_at = get_time_since_epoch_ms(); + m_init_private_data->end_of_chunk = m_vdev_info.chunk_size; + sisl::blob private_blob{r_cast< uint8_t* >(m_init_private_data.get()), + sizeof(JournalChunkPrivate)}; + return private_blob; + }, + m_vdev_info.hs_dev_type, m_vdev_info.vdev_id, m_vdev_info.chunk_size}); resource_mgr().register_journal_vdev_exceed_cb([this]([[maybe_unused]] int64_t dirty_buf_count, bool critical) { // either it is critical or non-critical, call cp_flush; diff --git a/src/lib/homestore.cpp b/src/lib/homestore.cpp index bec7c2e49..141ff2063 100644 --- a/src/lib/homestore.cpp +++ b/src/lib/homestore.cpp @@ -82,7 +82,8 @@ HomeStore& HomeStore::with_data_service(cshared< ChunkSelector >& custom_chunk_s return *this; } -HomeStore& HomeStore::with_index_service(std::unique_ptr< IndexServiceCallbacks > cbs, cshared< ChunkSelector >& custom_chunk_selector) { +HomeStore& HomeStore::with_index_service(std::unique_ptr< IndexServiceCallbacks > cbs, + cshared< ChunkSelector >& custom_chunk_selector) { m_services.svcs |= HS_SERVICE::INDEX; s_index_cbs = std::move(cbs); s_custom_index_chunk_selector = std::move(custom_chunk_selector); @@ -168,9 +169,9 @@ bool HomeStore::start(const hs_input_params& input, hs_before_services_starting_ LOGINFO("Homestore is loading with following services: {}", m_services.list()); if (has_meta_service()) { m_meta_service = std::make_unique< MetaBlkService >(); } if (has_index_service()) { - m_index_service = std::make_unique< IndexService >(std::move(s_index_cbs), - std::move(s_custom_index_chunk_selector)); - } + m_index_service = + std::make_unique< IndexService >(std::move(s_index_cbs), std::move(s_custom_index_chunk_selector)); + } if (has_repl_data_service()) { m_log_service = std::make_unique<
LogStoreService >(); m_data_service = std::make_unique< BlkDataService >(std::move(s_custom_data_chunk_selector)); diff --git a/src/lib/index/index_cp.hpp b/src/lib/index/index_cp.hpp index ad29fe1c4..8cca904f4 100644 --- a/src/lib/index/index_cp.hpp +++ b/src/lib/index/index_cp.hpp @@ -37,7 +37,7 @@ struct IndexCPContext : public VDevCPContext { uint8_t has_inplace_parent : 1; // Do we have parent_id in the list of ids. It will be first uint8_t has_inplace_child : 1; // Do we have child_id in the list of ids. It will be second uint8_t is_parent_meta : 1; // Is the parent buffer a meta buffer - uint8_t free_node_level : 4; // Free/created node level + uint8_t free_node_level : 4; // Free/created node level uint8_t reserved1 : 1; uint8_t num_new_ids; uint8_t num_freed_ids; @@ -101,8 +101,7 @@ struct IndexCPContext : public VDevCPContext { std::string child_id_string() const { auto const idx = (has_inplace_parent == 0x1) ? 1 : 0; - return (has_inplace_child == 0x1) ? fmt::format("{}", blk_id(idx).to_integer()) - : "empty"; + return (has_inplace_child == 0x1) ? fmt::format("{}", blk_id(idx).to_integer()) : "empty"; } std::string to_string() const; diff --git a/src/lib/index/index_service.cpp b/src/lib/index/index_service.cpp index 98744fb2d..e6bdf9dbc 100644 --- a/src/lib/index/index_service.cpp +++ b/src/lib/index/index_service.cpp @@ -29,7 +29,7 @@ namespace homestore { IndexService& index_service() { return hs()->index_service(); } IndexService::IndexService(std::unique_ptr< IndexServiceCallbacks > cbs, shared< ChunkSelector > chunk_selector) : - m_svc_cbs {std::move(cbs)}, m_custom_chunk_selector{std::move(chunk_selector)} { + m_svc_cbs{std::move(cbs)}, m_custom_chunk_selector{std::move(chunk_selector)} { m_ordinal_reserver = std::make_unique< sisl::IDReserver >(); meta_service().register_handler( "index", @@ -40,9 +40,7 @@ IndexService::IndexService(std::unique_ptr< IndexServiceCallbacks > cbs, shared< meta_service().register_handler( "wb_cache", - [this](meta_blk* mblk, sisl::byte_view buf, size_t size) { - m_wbcache_sb = std::pair{mblk, std::move(buf)}; - }, + [this](meta_blk* mblk, sisl::byte_view buf, size_t size) { m_wbcache_sb = std::pair{mblk, std::move(buf)}; }, nullptr); } @@ -80,7 +78,13 @@ void IndexService::start() { for (auto const& [meta_cookie, buf] : m_itable_sbs) { superblk< index_table_sb > sb; sb.load(buf, meta_cookie); - add_index_table(m_svc_cbs->on_index_table_found(std::move(sb))); + auto inode = sb->total_interior_nodes; + auto lnode = sb->total_leaf_nodes; + auto depth = sb->btree_depth; + LOGINFO("sb metrics interior {}, leaf: {} depth {}", inode, lnode, depth); + auto tbl = m_svc_cbs->on_index_table_found(std::move(sb)); + tbl->load_metrics(inode, lnode, depth); + add_index_table(tbl); } // Recover the writeback cache, which in-turns recovers any index table nodes @@ -99,6 +103,15 @@ void IndexService::start() { hs()->cp_mgr().trigger_cp_flush(true /* force */); } +void IndexService::write_sb(uint32_t ordinal) { + if (is_stopping()) return; + incr_pending_request_num(); + std::unique_lock lg(m_index_map_mtx); + auto const it = m_ordinal_index_map.find(ordinal); + if (it != m_ordinal_index_map.cend()) { it->second->update_sb(); } + decr_pending_request_num(); +} + IndexService::~IndexService() { m_wb_cache.reset(); } void IndexService::stop() { diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp index 1459a6361..c315d52af 100644 --- a/src/lib/index/wb_cache.cpp +++ b/src/lib/index/wb_cache.cpp @@ -92,13 +92,13 @@ void 
IndexWBCache::start_flush_threads() { } } -BtreeNodePtr IndexWBCache::alloc_buf(uint32_t ordinal,node_initializer_t&& node_initializer) { +BtreeNodePtr IndexWBCache::alloc_buf(uint32_t ordinal, node_initializer_t&& node_initializer) { auto cpg = cp_mgr().cp_guard(); auto cp_ctx = r_cast< IndexCPContext* >(cpg.context(cp_consumer_t::INDEX_SVC)); // Alloc a block of data from underlying vdev MultiBlkId blkid; - // Ordinal used as a hint in the case of custom chunk selector exists + // Ordinal used as a hint in the case of custom chunk selector exists blk_alloc_hints hints; hints.application_hint = ordinal; auto ret = m_vdev->alloc_contiguous_blks(1, hints, blkid); @@ -521,23 +521,19 @@ std::string IndexWBCache::to_string_dag_bufs(DagMap& dags, cp_id_t cp_id) { void IndexWBCache::prune_up_buffers(IndexBufferPtr const& buf, std::vector< IndexBufferPtr >& pruned_bufs_to_repair) { auto up_buf = buf->m_up_buffer; auto grand_up_buf = up_buf->m_up_buffer; - if (!up_buf || !up_buf->m_wait_for_down_buffers.testz()) { - return; - } + if (!up_buf || !up_buf->m_wait_for_down_buffers.testz()) { return; } // if up buffer has up buffer, then we need to decrement its wait_for_down_buffers - LOGINFOMOD(wbcache, - "\n\npruning up_buffer due to zero dependency of child\n up buffer {}\n buffer {}", - up_buf->to_string(), buf->to_string()); + LOGINFOMOD(wbcache, "\n\npruning up_buffer due to zero dependency of child\n up buffer {}\n buffer {}", + up_buf->to_string(), buf->to_string()); update_up_buffer_counters(up_buf); - pruned_bufs_to_repair.push_back(up_buf); + pruned_bufs_to_repair.push_back(up_buf); if (grand_up_buf && !grand_up_buf->is_meta_buf() && grand_up_buf->m_wait_for_down_buffers.testz()) { LOGTRACEMOD( wbcache, "\nadding grand_buffer to repair list due to zero dependency of child\n grand buffer {}\n buffer {}", - grand_up_buf->to_string(), - buf->to_string()); + grand_up_buf->to_string(), buf->to_string()); pruned_bufs_to_repair.push_back(grand_up_buf); } } @@ -706,9 +702,9 @@ void IndexWBCache::recover(sisl::byte_view sb) { recover_buf(buf); } - // When we prune a buffer due to zero down dependency, there is a case where the key range of the parent needs to be adjusted. - // This can happen when a child is merged and its right sibling is flushed before the parent is flushed. - // And during recovery, we prune the node and keep the deleted child and keep the parent as is. + // When we prune a buffer due to zero down dependency, there is a case where the key range of the parent needs to be + // adjusted. This can happen when a child is merged and its right sibling is flushed before the parent is flushed. + // And during recovery, we prune the node and keep the deleted child and keep the parent as is. // We need to call repair_links directly on them as the recovery_buf() path will not trigger it. 
for (auto const& buf : pruned_bufs_to_repair) { LOGTRACEMOD(wbcache, "pruned buf {} is repaired", buf->to_string()); @@ -829,7 +825,7 @@ folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) { } cp_ctx->prepare_flush_iteration(); - + m_updated_ordinals.clear(); for (auto& fiber : m_cp_flush_fibers) { iomanager.run_on_forget(fiber, [this, cp_ctx]() { IndexBufferPtrList buf_list; @@ -903,10 +899,16 @@ void IndexWBCache::process_write_completion(IndexCPContext* cp_ctx, IndexBufferP LOGTRACEMOD(wbcache, "cp {} buf {}", cp_ctx->id(), buf->to_string()); resource_mgr().dec_dirty_buf_size(m_node_size); + m_updated_ordinals.insert(buf->m_index_ordinal); auto [next_buf, has_more] = on_buf_flush_done(cp_ctx, buf); if (next_buf) { do_flush_one_buf(cp_ctx, next_buf, false); } else if (!has_more) { + for (const auto& ordinal : m_updated_ordinals) { + LOGTRACEMOD(wbcache, "Updating sb for ordinal {}", ordinal); + index_service().write_sb(ordinal); + } + // We are done flushing the buffers, We flush the vdev to persist the vdev bitmaps and free blks // Pick a CP Manager blocking IO fiber to execute the cp flush of vdev iomanager.run_on_forget(cp_mgr().pick_blocking_io_fiber(), [this, cp_ctx]() { diff --git a/src/lib/index/wb_cache.hpp b/src/lib/index/wb_cache.hpp index 684b3f35d..bf04dbc67 100644 --- a/src/lib/index/wb_cache.hpp +++ b/src/lib/index/wb_cache.hpp @@ -41,6 +41,7 @@ class IndexWBCache : public IndexWBCacheBase { std::mutex m_flush_mtx; void* m_meta_blk; bool m_in_recovery{false}; + std::unordered_set< uint32_t > m_updated_ordinals; public: IndexWBCache(const std::shared_ptr< VirtualDev >& vdev, std::pair< meta_blk*, sisl::byte_view > sb, diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 18f7c1eb4..154178737 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -159,8 +159,8 @@ AsyncReplResult<> RaftReplDev::start_replace_member(std::string& task_id, const // remediate it. Need to rollback the first task. And for the same task, it's reentrant and idempotent. auto existing_task_id = get_replace_member_task_id(); if (!existing_task_id.empty() && existing_task_id != task_id) { - RD_LOGE(trace_id, "Step1. Replace member, task_id={} is not the same as existing task_id={}", - task_id, existing_task_id); + RD_LOGE(trace_id, "Step1. Replace member, task_id={} is not the same as existing task_id={}", task_id, + existing_task_id); decr_pending_request_num(); return make_async_error<>(ReplServiceError::REPLACE_MEMBER_TASK_MISMATCH); } @@ -172,13 +172,11 @@ AsyncReplResult<> RaftReplDev::start_replace_member(std::string& task_id, const RD_LOGI(trace_id, "Step1. Replace member, the intent has already been fulfilled, ignore it, task_id={}, " "member_out={} member_in={}", - task_id, boost::uuids::to_string(member_out.id), - boost::uuids::to_string(member_in.id)); + task_id, boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); decr_pending_request_num(); return make_async_success<>(); } - RD_LOGE(trace_id, "Step1. Replace member invalid parameter, out member is not found, task_id={}", - task_id); + RD_LOGE(trace_id, "Step1. 
Replace member invalid parameter, out member is not found, task_id={}", task_id); decr_pending_request_num(); return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); } @@ -192,8 +190,7 @@ AsyncReplResult<> RaftReplDev::start_replace_member(std::string& task_id, const // until the successor finishes the catch-up of the latest log, and then resign. Return NOT_LEADER and let // client retry. raft_server()->yield_leadership(false /* immediate */, -1 /* successor */); - RD_LOGI(trace_id, "Step1. Replace member, leader is the member_out so yield leadership, task_id={}", - task_id); + RD_LOGI(trace_id, "Step1. Replace member, leader is the member_out so yield leadership, task_id={}", task_id); decr_pending_request_num(); return make_async_error<>(ReplServiceError::NOT_LEADER); } @@ -234,8 +231,7 @@ AsyncReplResult<> RaftReplDev::start_replace_member(std::string& task_id, const return make_async_error(ReplServiceError::FAILED); } #endif - RD_LOGI(trace_id, "Step2. Replace member, flip out member to learner, task_id={}", - task_id); + RD_LOGI(trace_id, "Step2. Replace member, flip out member to learner, task_id={}", task_id); auto learner_ret = do_flip_learner(member_out, true, true, trace_id); if (learner_ret != ReplServiceError::OK) { RD_LOGE(trace_id, "Step2. Replace member, failed to flip out member to learner {}, task_id={}", learner_ret, @@ -244,8 +240,7 @@ AsyncReplResult<> RaftReplDev::start_replace_member(std::string& task_id, const decr_pending_request_num(); return make_async_error(std::move(learner_ret)); } - RD_LOGI(trace_id, "Step2. Replace member, flip out member to learner and set priority to 0, task_id={}", - task_id); + RD_LOGI(trace_id, "Step2. Replace member, flip out member to learner and set priority to 0, task_id={}", task_id); // Step 3. Append log entry to mark the old member is out and new member is added. RD_LOGI(trace_id, "Step3. Replace member, propose to raft for HS_CTRL_START_REPLACE req, group_id={}, task_id={}", @@ -283,14 +278,13 @@ AsyncReplResult<> RaftReplDev::start_replace_member(std::string& task_id, const member_to_add.priority = out_srv_cfg.get()->get_priority(); auto ret = do_add_member(member_to_add, trace_id); if (ret != ReplServiceError::OK) { - RD_LOGE(trace_id, "Step4. Replace member, add member failed, err={}, task_id={}", ret, - task_id); + RD_LOGE(trace_id, "Step4. Replace member, add member failed, err={}, task_id={}", ret, task_id); reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_error<>(std::move(ret)); } - RD_LOGI(trace_id, "Step4. Replace member, proposed to raft to add member, task_id={}, member={}", - task_id, boost::uuids::to_string(member_in.id)); + RD_LOGI(trace_id, "Step4. Replace member, proposed to raft to add member, task_id={}, member={}", task_id, + boost::uuids::to_string(member_in.id)); reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_success<>(); @@ -309,9 +303,8 @@ AsyncReplResult<> RaftReplDev::complete_replace_member(std::string& task_id, con } incr_pending_request_num(); - RD_LOGI(trace_id, "Complete replace member, task_id={}, member_out={}, member_in={}", - task_id, boost::uuids::to_string(member_out.id), - boost::uuids::to_string(member_in.id)); + RD_LOGI(trace_id, "Complete replace member, task_id={}, member_out={}, member_in={}", task_id, + boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); if (commit_quorum >= 1) { // Two members are down and leader cant form the quorum. Reduce the quorum size. 
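For reference, a minimal free-standing sketch of the wait_and_check() polling helper relied on below (the signature matches the declaration added in this series; the in-tree implementation may differ in details):

#include <chrono>
#include <cstdint>
#include <functional>
#include <thread>

static bool wait_and_check(const std::function< bool() >& check_func, uint32_t timeout_ms,
                           uint32_t interval_ms = 100) {
    auto const deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds{timeout_ms};
    // Check immediately, then retry every interval_ms until the deadline passes.
    while (!check_func()) {
        if (std::chrono::steady_clock::now() >= deadline) { return false; }
        std::this_thread::sleep_for(std::chrono::milliseconds{interval_ms});
    }
    return true;
}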
@@ -319,8 +312,8 @@ AsyncReplResult<> RaftReplDev::complete_replace_member(std::string& task_id, con } // Step 5: Remove member - RD_LOGI(trace_id, "Step5. Replace member, remove old member, task_id={}, member={}", - task_id, boost::uuids::to_string(member_out.id)); + RD_LOGI(trace_id, "Step5. Replace member, remove old member, task_id={}, member={}", task_id, + boost::uuids::to_string(member_out.id)); #ifdef _PRERELEASE if (iomgr_flip::instance()->test_flip("replace_member_remove_member_failure")) { RD_LOGE(trace_id, "Simulating remove member failure"); @@ -329,14 +322,14 @@ AsyncReplResult<> RaftReplDev::complete_replace_member(std::string& task_id, con #endif auto ret = do_remove_member(member_out, trace_id); if (ret != ReplServiceError::OK) { - RD_LOGE(trace_id, "Step5. Replace member, failed to remove member, task_id={}, member={}, err={}", - task_id, boost::uuids::to_string(member_out.id), ret); + RD_LOGE(trace_id, "Step5. Replace member, failed to remove member, task_id={}, member={}, err={}", task_id, + boost::uuids::to_string(member_out.id), ret); reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_error<>(std::move(ret)); } - RD_LOGI(trace_id, "Step5. Replace member, proposed to raft to remove member, task_id={}, member={}", - task_id, boost::uuids::to_string(member_out.id)); + RD_LOGI(trace_id, "Step5. Replace member, proposed to raft to remove member, task_id={}, member={}", task_id, + boost::uuids::to_string(member_out.id)); auto timeout = HS_DYNAMIC_CONFIG(consensus.wait_for_config_change_ms); // TODO Move wait logic to nuraft_mesg if (!wait_and_check( @@ -357,13 +350,13 @@ AsyncReplResult<> RaftReplDev::complete_replace_member(std::string& task_id, con // leave_timeout=leave_limit_(default=5)*heart_beat_interval_, it's better for client to retry it. return make_async_error<>(ReplServiceError::RETRY_REQUEST); } - RD_LOGD(trace_id, "Step5. Replace member, old member is removed, task_id={}, member={}", - task_id, boost::uuids::to_string(member_out.id)); + RD_LOGD(trace_id, "Step5. Replace member, old member is removed, task_id={}, member={}", task_id, + boost::uuids::to_string(member_out.id)); // Step 2. Append log entry to complete replace member RD_LOGI(trace_id, - "Step6. Replace member, propose to raft for HS_CTRL_COMPLETE_REPLACE req, group_id={}, task_id={}", - task_id, group_id_str()); + "Step6. Replace member, propose to raft for HS_CTRL_COMPLETE_REPLACE req, group_id={}, task_id={}", task_id, + group_id_str()); auto rreq = repl_req_ptr_t(new repl_req_ctx{}); auto ctx = replace_member_ctx(task_id, member_out, member_in); @@ -387,8 +380,7 @@ AsyncReplResult<> RaftReplDev::complete_replace_member(std::string& task_id, con reset_quorum_size(0, trace_id); decr_pending_request_num(); RD_LOGI(trace_id, "Complete replace member done, group_id={}, task_id={}, member_out={} member_in={}", - group_id_str(), task_id, boost::uuids::to_string(member_out.id), - boost::uuids::to_string(member_in.id)); + group_id_str(), task_id, boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); return make_async_success<>(); } @@ -463,12 +455,10 @@ ReplaceMemberStatus RaftReplDev::get_replace_member_status(std::string& task_id, RD_LOGI(trace_id, "Member replacement fulfilled, but task still exists, wait for reaper thread to retry " "complete_replace_member. 
task_id={}, out_member={}, in_member={}", - persisted_task_id, boost::uuids::to_string(member_out.id), - boost::uuids::to_string(member_in.id)); + persisted_task_id, boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); } - RD_LOGD(trace_id, "Member replacement is in progress. task_id={}, out_member={}, in_member={}", - task_id, boost::uuids::to_string(member_out.id), - boost::uuids::to_string(member_in.id)); + RD_LOGD(trace_id, "Member replacement is in progress. task_id={}, out_member={}, in_member={}", task_id, + boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); decr_pending_request_num(); return ReplaceMemberStatus::IN_PROGRESS; } @@ -712,9 +702,7 @@ void RaftReplDev::on_create_snapshot(nuraft::snapshot& s, nuraft::async_result< HS_REL_ASSERT(result.hasError() == false, "Not expecting creating snapshot to return false. "); // propose truncate boundary on leader if needed - if (is_leader()) { - propose_truncate_boundary(); - } + if (is_leader()) { propose_truncate_boundary(); } auto ret_val{true}; if (when_done) { when_done(ret_val, null_except); } @@ -729,22 +717,22 @@ void RaftReplDev::propose_truncate_boundary() { if (p.id_ == m_my_repl_id) { continue; } RD_LOGD(NO_TRACE_ID, "peer_repl_idx={}, minimum_repl_idx={}", p.replication_idx_, minimum_repl_idx); minimum_repl_idx = std::min(minimum_repl_idx, static_cast< repl_lsn_t >(p.replication_idx_)); - } repl_lsn_t raft_logstore_reserve_threshold = HS_DYNAMIC_CONFIG(resource_limits.raft_logstore_reserve_threshold); repl_lsn_t truncation_upper_limit = std::max(leader_commit_idx - raft_logstore_reserve_threshold, minimum_repl_idx); - RD_LOGD(NO_TRACE_ID, "calculated truncation_upper_limit={}, " - "leader_commit_idx={}, raft_logstore_reserve_threshold={}, minimum_repl_idx={}", - truncation_upper_limit, leader_commit_idx, raft_logstore_reserve_threshold, minimum_repl_idx); + RD_LOGD(NO_TRACE_ID, + "calculated truncation_upper_limit={}, " + "leader_commit_idx={}, raft_logstore_reserve_threshold={}, minimum_repl_idx={}", + truncation_upper_limit, leader_commit_idx, raft_logstore_reserve_threshold, minimum_repl_idx); if (truncation_upper_limit > 0) { auto rreq = repl_req_ptr_t(new repl_req_ctx{}); auto ctx = truncate_ctx(truncation_upper_limit); sisl::blob header(r_cast< uint8_t* >(&ctx), sizeof(truncate_ctx)); rreq->init(repl_key{.server_id = server_id(), - .term = raft_server()->get_term(), - .dsn = m_next_dsn.fetch_add(1), - .traceID = std::numeric_limits< uint64_t >::max()}, + .term = raft_server()->get_term(), + .dsn = m_next_dsn.fetch_add(1), + .traceID = std::numeric_limits< uint64_t >::max()}, journal_type_t::HS_CTRL_UPDATE_TRUNCATION_BOUNDARY, true, header, sisl::blob{}, 0, m_listener); auto err = m_state_machine->propose_to_raft(std::move(rreq)); @@ -1559,8 +1547,7 @@ void RaftReplDev::start_replace_member(repl_req_ptr_t rreq) { auto ctx = r_cast< const replace_member_ctx* >(rreq->header().cbytes()); RD_LOGI(rreq->traceID(), "Raft repl start_replace_member commit, task_id={} member_out={} member_in={}", - ctx->task_id, boost::uuids::to_string(ctx->replica_out.id), - boost::uuids::to_string(ctx->replica_in.id)); + ctx->task_id, boost::uuids::to_string(ctx->replica_out.id), boost::uuids::to_string(ctx->replica_in.id)); m_listener->on_start_replace_member(ctx->task_id, ctx->replica_out, ctx->replica_in, rreq->traceID()); // record the replace_member intent @@ -1613,8 +1600,7 @@ void RaftReplDev::update_truncation_boundary(repl_req_ptr_t rreq) { // T4: F1 catches up and commits upto 
10000, this time truncation_upper_limit is updated as 10000 // T5: F1 doing incremental re-sync, applies the log with truncation_upper_limit=6000, which is less than 10000 if (exp_truncation_upper_limit <= cur_truncation_upper_limit) { - RD_LOGW(NO_TRACE_ID, - "exp_truncation_upper_limit {} is no larger than cur_truncation_upper_limit {}", + RD_LOGW(NO_TRACE_ID, "exp_truncation_upper_limit {} is no larger than cur_truncation_upper_limit {}", exp_truncation_upper_limit, cur_truncation_upper_limit); return; } @@ -1876,7 +1862,7 @@ bool RaftReplDev::is_destroyed() const { return (*m_stage.access().get() == repl repl_dev_stage_t RaftReplDev::get_stage() const { return *m_stage.access().get(); } void RaftReplDev::set_stage(repl_dev_stage_t stage) { - m_stage.update([stage](auto* s) { *s = stage; }); + m_stage.update([stage](auto* s) { *s = stage; }); } /////////////////////////////////// nuraft_mesg::mesg_state_mgr overrides //////////////////////////////////// @@ -2077,8 +2063,7 @@ void RaftReplDev::monitor_replace_member_replication_status() { if (!catch_up) { RD_LOGD(NO_TRACE_ID, "Checking replace member status, task_id={},replica_in={} with lsn={}, replica_out={} with lsn={}", - task_id, boost::uuids::to_string(replica_in), in_lsn, - boost::uuids::to_string(replica_out), out_lsn); + task_id, boost::uuids::to_string(replica_in), in_lsn, boost::uuids::to_string(replica_out), out_lsn); return; } @@ -2086,8 +2071,7 @@ void RaftReplDev::monitor_replace_member_replication_status() { "Checking replace member status, new member has caught up, task_id={}, replica_in={} with lsn={}, " "replica_out={} with " "lsn={}", - task_id, boost::uuids::to_string(replica_in), in_lsn, - boost::uuids::to_string(replica_out), out_lsn); + task_id, boost::uuids::to_string(replica_in), in_lsn, boost::uuids::to_string(replica_out), out_lsn); trace_id_t trace_id = generateRandomTraceId(); @@ -2504,9 +2488,7 @@ void RaftReplDev::pause_state_machine(size_t timeout) { raft_server()->pause_state_machine_execution(timeout); } -bool RaftReplDev::is_state_machine_paused() { - return raft_server()->is_state_machine_execution_paused(); -} +bool RaftReplDev::is_state_machine_paused() { return raft_server()->is_state_machine_execution_paused(); } void RaftReplDev::resume_state_machine() { RD_LOGI(NO_TRACE_ID, "Resume state machine execution for group_id={}", group_id_str()); diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 76144b236..60245dfa4 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -249,8 +249,9 @@ class RaftReplDev : public ReplDev, bool bind_data_service(); bool join_group(); - AsyncReplResult<> start_replace_member(std::string& task_id, const replica_member_info& member_out, const replica_member_info& member_in, - uint32_t commit_quorum = 0, uint64_t trace_id = 0); + AsyncReplResult<> start_replace_member(std::string& task_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum = 0, + uint64_t trace_id = 0); AsyncReplResult<> complete_replace_member(std::string& task_id, const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum = 0, uint64_t trace_id = 0); @@ -266,7 +267,7 @@ class RaftReplDev : public ReplDev, uint64_t trace_id = 0); ReplServiceError set_priority(const replica_id_t& member, int32_t priority, uint64_t trace_id = 0); nuraft::cmd_result_code retry_when_config_changing(const std::function< 
nuraft::cmd_result_code() >& func, - uint64_t trace_id = 0); + uint64_t trace_id = 0); bool wait_and_check(const std::function< bool() >& check_func, uint32_t timeout_ms, uint32_t interval_ms = 100); std::string get_replace_member_task_id() const { return {m_rd_sb->replace_member_task.task_id}; } @@ -317,7 +318,7 @@ class RaftReplDev : public ReplDev, repl_lsn_t get_last_commit_lsn() const override { return m_commit_upto_lsn.load(); } void set_last_commit_lsn(repl_lsn_t lsn) { m_commit_upto_lsn.store(lsn); } repl_lsn_t get_last_append_lsn() override { return raft_server()->get_last_log_idx(); } - repl_lsn_t get_truncation_upper_limit() const { return m_truncation_upper_limit.load(); } + repl_lsn_t get_truncation_upper_limit() const { return m_truncation_upper_limit.load(); } bool is_destroy_pending() const; bool is_destroyed() const; void set_stage(repl_dev_stage_t stage); @@ -405,7 +406,8 @@ class RaftReplDev : public ReplDev, void flush_durable_commit_lsn(); /** - * Monitor the replace_member replication status, if the new member is fully synced up and ready to take over, remove the old member. + * Monitor the replace_member replication status, if the new member is fully synced up and ready to take over, + * remove the old member. */ void monitor_replace_member_replication_status(); diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index cf961dabe..f4707124d 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -80,8 +80,7 @@ class SoloReplDev : public ReplDev { } bool is_ready_for_traffic() const override { return true; } void set_stage(repl_dev_stage_t stage) override {} - repl_dev_stage_t get_stage() const override { - return repl_dev_stage_t::ACTIVE; } + repl_dev_stage_t get_stage() const override { return repl_dev_stage_t::ACTIVE; } void purge() override {} void pause_state_machine(size_t timeout) override { m_paused.store(true); } diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index 155090411..b5966f239 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -79,7 +79,7 @@ hs_stats GenericReplService::get_cap_stats() const { ///////////////////// SoloReplService specializations and CP Callbacks ///////////////////////////// SoloReplService::SoloReplService(cshared< ReplApplication >& repl_app) : GenericReplService{repl_app} {} -SoloReplService::~SoloReplService(){}; +SoloReplService::~SoloReplService() {}; void SoloReplService::start() { for (auto const& [buf, mblk] : m_sb_bufs) { diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index ad67bf91d..3af6908bf 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -65,7 +65,8 @@ int32_t RaftReplService::compute_raft_follower_priority() { auto max_wait_round = std::min(raft_priority_election_round_upper_limit, HS_DYNAMIC_CONFIG(consensus.max_wait_rounds_of_priority_election)); if (max_wait_round == 0) { return raft_leader_priority; } - auto priority = 1 + static_cast< int32_t >( + auto priority = 1 + + static_cast< int32_t >( std::ceil(raft_leader_priority * std::pow(raft_priority_decay_coefficient, max_wait_round))); return priority; } @@ -368,12 +369,12 @@ AsyncReplResult< shared< ReplDev > > RaftReplService::create_repl_dev(group_id_t for (auto& member : members) { 
if (member == my_id) { continue; } // Skip myself do { - auto srv_config = nuraft::srv_config(nuraft_mesg::to_server_id(member), 0, boost::uuids::to_string(member), "", - false, follower_priority); + auto srv_config = nuraft::srv_config(nuraft_mesg::to_server_id(member), 0, + boost::uuids::to_string(member), "", false, follower_priority); auto const result = m_msg_mgr->add_member(group_id, srv_config).get(); if (result) { - LOGINFOMOD(replication, "Groupid={}, new member={} added with priority={}", boost::uuids::to_string(group_id), - boost::uuids::to_string(member), follower_priority); + LOGINFOMOD(replication, "Groupid={}, new member={} added with priority={}", + boost::uuids::to_string(group_id), boost::uuids::to_string(member), follower_priority); break; } else if (result.error() != nuraft::CONFIG_CHANGING) { LOGWARNMOD(replication, "Groupid={}, add member={} failed with error={}", @@ -489,7 +490,8 @@ void RaftReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki // In this function, it only invokes replDev start_replace_member. There is // a background reaper thread helps periodically check the member_in replication status, after in_member has caught up, // will trigger replDev complete_replace_member. -AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, std::string& task_id, const replica_member_info& member_out, +AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, std::string& task_id, + const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum, uint64_t trace_id) const { if (is_stopping()) return make_async_error<>(ReplServiceError::STOPPING); @@ -513,8 +515,9 @@ AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, std::stri }); } -AsyncReplResult<> RaftReplService::flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, uint32_t commit_quorum, - bool wait_and_verify, uint64_t trace_id) const { +AsyncReplResult<> RaftReplService::flip_learner_flag(group_id_t group_id, const replica_member_info& member, + bool target, uint32_t commit_quorum, bool wait_and_verify, + uint64_t trace_id) const { if (is_stopping()) return make_async_error<>(ReplServiceError::STOPPING); incr_pending_request_num(); auto rdev_result = get_repl_dev(group_id); @@ -578,12 +581,12 @@ void RaftReplService::start_reaper_thread() { HS_DYNAMIC_CONFIG(consensus.flush_durable_commit_interval_ms) * 1000 * 1000, true /* recurring */, nullptr, [this](void*) { flush_durable_commit_lsn(); }); - // Check replace_member sync status to see a new member is fully synced up and ready to remove the old member + // Check replace_member sync status to see a new member is fully synced up and ready to remove the old + // member m_replace_member_sync_check_timer_hdl = iomanager.schedule_thread_timer( HS_DYNAMIC_CONFIG(consensus.replace_member_sync_check_interval_ms) * 1000 * 1000, true /* recurring */, nullptr, [this](void*) { monitor_replace_member_replication_status(); }); - p.setValue(); } else { // Cancel all recurring timers started diff --git a/src/lib/replication/service/raft_repl_service.h b/src/lib/replication/service/raft_repl_service.h index 187bd5f74..2f8acfb2f 100644 --- a/src/lib/replication/service/raft_repl_service.h +++ b/src/lib/replication/service/raft_repl_service.h @@ -80,8 +80,8 @@ class RaftReplService : public GenericReplService, folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) override; void load_repl_dev(sisl::byte_view const& buf, void* 
meta_cookie) override; AsyncReplResult<> replace_member(group_id_t group_id, std::string& task_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum = 0, - uint64_t trace_id = 0) const override; + const replica_member_info& member_in, uint32_t commit_quorum = 0, + uint64_t trace_id = 0) const override; AsyncReplResult<> flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, uint32_t commit_quorum, bool wait_and_verify = true, @@ -119,7 +119,7 @@ class ReplSvcCPContext : public CPContext { std::map< ReplDev*, cshared< ReplDevCPContext > > m_cp_ctx_map; public: - ReplSvcCPContext(CP* cp) : CPContext(cp){}; + ReplSvcCPContext(CP* cp) : CPContext(cp) {}; virtual ~ReplSvcCPContext() = default; int add_repl_dev_ctx(ReplDev* dev, cshared< ReplDevCPContext > dev_ctx); cshared< ReplDevCPContext > get_repl_dev_ctx(ReplDev* dev); diff --git a/src/tests/btree_helpers/btree_test_helper.hpp b/src/tests/btree_helpers/btree_test_helper.hpp index 0f709291a..c2ba5780c 100644 --- a/src/tests/btree_helpers/btree_test_helper.hpp +++ b/src/tests/btree_helpers/btree_test_helper.hpp @@ -45,7 +45,7 @@ struct BtreeTestHelper { m_cfg.m_leaf_node_type = T::leaf_node_type; m_cfg.m_int_node_type = T::interior_node_type; m_cfg.m_max_merge_level = SISL_OPTIONS["max_merge_level"].as< uint8_t >(); - if (SISL_OPTIONS.count("disable_merge")){m_cfg.m_merge_turned_on = false;} + if (SISL_OPTIONS.count("disable_merge")) { m_cfg.m_merge_turned_on = false; } m_max_range_input = SISL_OPTIONS["num_entries"].as< uint32_t >(); @@ -327,8 +327,8 @@ struct BtreeTestHelper { auto req = BtreeSingleGetRequest{copy_key.get(), out_v.get()}; req.enable_route_tracing(); const auto ret = m_bt->get(req); - ASSERT_EQ(ret, btree_status_t::success) << "Missing key " << key << " in btree but present in shadow map" << - " - status=" << enum_name(ret); + ASSERT_EQ(ret, btree_status_t::success) + << "Missing key " << key << " in btree but present in shadow map" << " - status=" << enum_name(ret); ASSERT_EQ((const V&)req.value(), value) << "Found value in btree doesn't return correct data for key=" << key; }); @@ -381,6 +381,16 @@ struct BtreeTestHelper { run_in_parallel(op_list); } + std::tuple< uint64_t, uint64_t, uint8_t > get_btree_metrics(const nlohmann::json& metrics_json) { + const auto& counters = metrics_json.at("Counters"); + + uint64_t int_cnt = counters.at("Btree Interior node count").get< uint64_t >(); + uint64_t leaf_cnt = counters.at("Btree Leaf node count").get< uint64_t >(); + uint8_t depth = counters.at("Depth of btree").get< uint8_t >(); + + return std::make_tuple(int_cnt, leaf_cnt, depth); + } + void dump_to_file(const std::string& file = "") const { m_bt->dump_tree_to_file(file); } void print_keys(const std::string& preamble = "") const { auto print_key_range = [](std::vector< std::pair< K, V > > const& kvs) -> std::string { diff --git a/src/tests/test_blk_cache_queue.cpp b/src/tests/test_blk_cache_queue.cpp index 840c921af..e4b75ef26 100644 --- a/src/tests/test_blk_cache_queue.cpp +++ b/src/tests/test_blk_cache_queue.cpp @@ -50,8 +50,8 @@ struct BlkCacheQueueTest : public ::testing::Test { virtual ~BlkCacheQueueTest() override = default; protected: - virtual void SetUp() override{}; - virtual void TearDown() override{}; + virtual void SetUp() override {}; + virtual void TearDown() override {}; void SetUp(const uint32_t nslabs, const uint32_t count_per_slab) { m_nslabs = nslabs; diff --git a/src/tests/test_btree_node.cpp 
b/src/tests/test_btree_node.cpp index af803bfd4..0db9416e4 100644 --- a/src/tests/test_btree_node.cpp +++ b/src/tests/test_btree_node.cpp @@ -378,8 +378,8 @@ TYPED_TEST(NodeTest, SimpleInsert) { } TYPED_TEST(NodeTest, RangeChangeInsert) { - if (this->m_node1->get_node_type() != btree_node_type::PREFIX) {return;} - this->put_range(0xFFFFFFFF - 10,20); + if (this->m_node1->get_node_type() != btree_node_type::PREFIX) { return; } + this->put_range(0xFFFFFFFF - 10, 20); this->print(); } diff --git a/src/tests/test_common/homestore_test_common.hpp b/src/tests/test_common/homestore_test_common.hpp index 9ac750372..4e18b70d1 100644 --- a/src/tests/test_common/homestore_test_common.hpp +++ b/src/tests/test_common/homestore_test_common.hpp @@ -218,9 +218,7 @@ class HSTestHelper { #ifdef _PRERELEASE void wait_for_crash_recovery(bool check_will_crash = false) { - if(check_will_crash && !homestore::HomeStore::instance()->crash_simulator().will_crash()) { - return; - } + if (check_will_crash && !homestore::HomeStore::instance()->crash_simulator().will_crash()) { return; } LOGDEBUG("Waiting for m_crash_recovered future"); m_crash_recovered.getFuture().get(); m_crash_recovered = folly::Promise< folly::Unit >(); @@ -440,7 +438,8 @@ class HSTestHelper { if (svc == HS_SERVICE::DATA) { hsi->with_data_service(tp.custom_chunk_selector); } else if (svc == HS_SERVICE::INDEX) { - hsi->with_index_service(std::unique_ptr< IndexServiceCallbacks >(tp.index_svc_cbs), tp.index_chunk_selector); + hsi->with_index_service(std::unique_ptr< IndexServiceCallbacks >(tp.index_svc_cbs), + tp.index_chunk_selector); } else if ((svc == HS_SERVICE::LOG)) { hsi->with_log_service(); } else if (svc == HS_SERVICE::REPLICATION) { diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 4fa73c817..136aceb7f 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -755,8 +755,8 @@ class RaftReplDevTestBase : public testing::Test { replica_id_t member_in, uint32_t commit_quorum = 0, ReplServiceError error = ReplServiceError::OK) { this->run_on_leader(db, [this, error, db, &task_id, member_out, member_in, commit_quorum]() { - LOGINFO("Start replace member task_id={}, out={}, in={}", task_id, - boost::uuids::to_string(member_out), boost::uuids::to_string(member_in)); + LOGINFO("Start replace member task_id={}, out={}, in={}", task_id, boost::uuids::to_string(member_out), + boost::uuids::to_string(member_in)); replica_member_info out{member_out, ""}; replica_member_info in{member_in, ""}; @@ -773,8 +773,8 @@ class RaftReplDevTestBase : public testing::Test { ReplaceMemberStatus check_replace_member_status(std::shared_ptr< TestReplicatedDB > db, std::string& task_id, replica_id_t member_out, replica_id_t member_in) { - LOGINFO("check replace member status, task_id={}, out={} in={}", task_id, - boost::uuids::to_string(member_out), boost::uuids::to_string(member_in)); + LOGINFO("check replace member status, task_id={}, out={} in={}", task_id, boost::uuids::to_string(member_out), + boost::uuids::to_string(member_in)); replica_member_info out{member_out, ""}; replica_member_info in{member_in, ""}; diff --git a/src/tests/test_data_service.cpp b/src/tests/test_data_service.cpp index 0cb855752..dc72fa2fe 100644 --- a/src/tests/test_data_service.cpp +++ b/src/tests/test_data_service.cpp @@ -456,7 +456,7 @@ class BlkDataServiceTest : public testing::Test { // every piece in bid is a single block, e.g. 
nblks = 1 auto const nbids = bid.num_pieces(); auto sub_io_size = nbids * inst().get_blk_size(); - HS_REL_ASSERT_LE(sub_io_size, remaining_io_size, "not expecting sub_io_size to exceed remaining_io_size"); + HS_REL_ASSERT_LE(sub_io_size, remaining_io_size, "not expecting sub_io_size to exceed remaining_io_size"); // we pass crc from lambda becaues if there is any async_free_blk, the written blks in the blkcrc map will // be removed by the time read thenVlue is called; diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp index 06a2f26d2..ddd6e6a22 100644 --- a/src/tests/test_index_crash_recovery.cpp +++ b/src/tests/test_index_crash_recovery.cpp @@ -60,7 +60,8 @@ SISL_OPTION_GROUP( (save_to_file, "", "save_to_file", "save to file", ::cxxopts::value< bool >()->default_value("0"), ""), (cleanup_after_shutdown, "", "cleanup_after_shutdown", "cleanup after shutdown", ::cxxopts::value< bool >()->default_value("1"), ""), - (print_keys_verbose_logging, "", "print_keys_verbose_logging", "print_keys_verbose_logging", ::cxxopts::value< bool >()->default_value("0"), ""), + (print_keys_verbose_logging, "", "print_keys_verbose_logging", "print_keys_verbose_logging", + ::cxxopts::value< bool >()->default_value("0"), ""), (seed, "", "seed", "random engine seed, use random if not defined", ::cxxopts::value< uint64_t >()->default_value("0"), "number")) @@ -72,10 +73,8 @@ void log_obj_life_counter() { LOGINFO("Object Life Counter\n:{}", str); } -#define print_keys_logging(msg) \ - if (SISL_OPTIONS.count("print_keys_verbose_logging")) { \ - this->print_keys(msg); \ - } +#define print_keys_logging(msg) \ + if (SISL_OPTIONS.count("print_keys_verbose_logging")) { this->print_keys(msg); } enum class OperationType { Put, @@ -998,6 +997,56 @@ TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { } } +TYPED_TEST(IndexCrashTest, MetricsTest) { + const auto num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); + std::vector< uint32_t > vec(num_entries); + iota(vec.begin(), vec.end(), 0); + std::random_shuffle(vec.begin(), vec.end()); + for (auto key : vec) { + this->put(key, btree_put_type::INSERT, true /* expect_success */); + } + print_keys_logging("After populating"); + + auto log_btree_metrics = [this](std::string prompt) { + auto metrics = this->m_bt->get_metrics_in_json().dump(1, '\t'); + LOGDEBUG("metrics: \n{}", metrics); + auto metrics_json = this->m_bt->get_metrics_in_json(); + auto bt_cnts = this->m_bt->get_num_nodes(); + auto bt_d = this->m_bt->get_btree_depth(); + auto com_cnts = this->m_bt->compute_node_count(); + auto com_d = this->m_bt->compute_btree_depth(); + auto [int_cnt, leaf_cnt, depth] = this->get_btree_metrics(metrics_json); + + LOGDEBUG("\n{}:\nmetrics (interior, leaf, height):\ncompute ({}, {}, {})\nbtree ({}, {}, {})\nmetrics ({}, " + "{}, {})", + prompt, com_cnts.first, com_cnts.second, com_d, bt_cnts.first, bt_cnts.second, bt_d, int_cnt, leaf_cnt, + depth); + ASSERT_EQ(bt_cnts.first, com_cnts.first) << "btree interior count doesn't match the actual node counts"; + ASSERT_EQ(bt_cnts.first, int_cnt) << "btree interior count doesn't match the metrics node counts"; + ASSERT_EQ(bt_cnts.second, com_cnts.second) << "btree leaf count doesn't match the actual node counts"; + ASSERT_EQ(bt_cnts.second, leaf_cnt) << "btree leaf count doesn't match the metrics node counts"; + ASSERT_EQ(bt_d, com_d) << "btree depth doesn't match the actual btee depth"; + ASSERT_EQ(bt_d, depth) << "btree depth doesn't match the metrics depth report"; + }; + log_btree_metrics("node 
count before CP"); + + test_common::HSTestHelper::trigger_cp(true); + log_btree_metrics("node count after CP"); + + this->m_shadow_map.save(this->m_shadow_filename); + this->restart_homestore(); + print_keys_logging("After restart"); + log_btree_metrics("node count after restart"); + std::string flip = "crash_flush_on_merge_at_parent"; + for (auto key : vec) { + this->remove_one(key, true); + } + this->trigger_cp(false); + this->wait_for_crash_recovery(true); + log_btree_metrics("node count after crash recovery"); + print_keys_logging("after removing all keys"); +} + // // TYPED_TEST(IndexCrashTest, MergeCrash1) { // auto const num_entries = SISL_OPTIONS["num_entries"].as(); diff --git a/src/tests/test_meta_blk_mgr.cpp b/src/tests/test_meta_blk_mgr.cpp index f087f81c0..f4bbe2386 100644 --- a/src/tests/test_meta_blk_mgr.cpp +++ b/src/tests/test_meta_blk_mgr.cpp @@ -125,7 +125,7 @@ class VMetaBlkMgrTest : public ::testing::Test { protected: void SetUp() override { m_helper.start_homestore("test_meta_blk_mgr", {{HS_SERVICE::META, {.size_pct = 85.0}}}); } - void TearDown() override{}; + void TearDown() override {}; public: [[nodiscard]] uint64_t get_elapsed_time(const Clock::time_point& start) { @@ -402,7 +402,7 @@ class VMetaBlkMgrTest : public ::testing::Test { iomanager.iobuf_free(buf); } else { if (unaligned_addr) { - delete[](buf - unaligned_shift); + delete[] (buf - unaligned_shift); } else { delete[] buf; } diff --git a/src/tests/test_raft_repl_dev.cpp b/src/tests/test_raft_repl_dev.cpp index e52c5f00f..d705ef130 100644 --- a/src/tests/test_raft_repl_dev.cpp +++ b/src/tests/test_raft_repl_dev.cpp @@ -558,7 +558,6 @@ TEST_F(RaftReplDevTest, ComputePriority) { g_helper->sync_for_cleanup_start(); } - TEST_F(RaftReplDevTest, RaftLogTruncationTest) { LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); g_helper->sync_for_test_start(); From 47512e19df8816e123c8f951f5a58b9a19ecc908 Mon Sep 17 00:00:00 2001 From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com> Date: Tue, 22 Jul 2025 13:24:35 -0700 Subject: [PATCH 159/170] Fast sanity check of index table after recovery (#767) --- conanfile.py | 2 +- src/include/homestore/btree/btree.hpp | 3 + .../homestore/btree/detail/btree_common.ipp | 223 +++++++++++++++--- .../homestore/index/index_internal.hpp | 3 +- src/include/homestore/index/index_table.hpp | 15 +- src/include/homestore/index_service.hpp | 1 + src/lib/index/index_service.cpp | 8 + src/lib/index/wb_cache.cpp | 25 ++ 8 files changed, 245 insertions(+), 35 deletions(-) diff --git a/conanfile.py b/conanfile.py index bc9952b3f..c7160eeef 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.20.6" + version = "6.20.7" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/btree/btree.hpp b/src/include/homestore/btree/btree.hpp index bdc89c11c..154e3e651 100644 --- a/src/include/homestore/btree/btree.hpp +++ b/src/include/homestore/btree/btree.hpp @@ -211,6 +211,9 @@ class Btree { void to_dot_keys(bnodeid_t bnodeid, std::string& buf, std::map< uint32_t, std::vector< uint64_t > >& l_map, std::map< uint64_t, BtreeVisualizeVariables >& info_map) const; void sanity_sub_tree(bnodeid_t bnodeid = 0) const; + void validate_node(const bnodeid_t& bnodeid) const; + void validate_node_child_relation(BtreeNodePtr node, BtreeNodePtr& last_child_node) const; + void validate_next_node_relation(BtreeNodePtr node, BtreeNodePtr 
neighbor_node, BtreeNodePtr last_child_node) const;
     void validate_sanity_child(const BtreeNodePtr& parent_node, uint32_t ind) const;
     void validate_sanity_next_child(const BtreeNodePtr& parent_node, uint32_t ind) const;
     void print_node(const bnodeid_t& bnodeid) const;
diff --git a/src/include/homestore/btree/detail/btree_common.ipp b/src/include/homestore/btree/detail/btree_common.ipp
index e0954754b..b4e730b67 100644
--- a/src/include/homestore/btree/detail/btree_common.ipp
+++ b/src/include/homestore/btree/detail/btree_common.ipp
@@ -171,17 +171,15 @@ void Btree< K, V >::to_string(bnodeid_t bnodeid, std::string& buf) const {
 }
 
 template < typename K, typename V >
-void Btree< K, V >::to_custom_string_internal(bnodeid_t bnodeid, std::string& buf,
-                                              to_string_cb_t< K, V > const& cb, int nindent) const {
+void Btree< K, V >::to_custom_string_internal(bnodeid_t bnodeid, std::string& buf, to_string_cb_t< K, V > const& cb,
+                                              int nindent) const {
     BtreeNodePtr node;
     locktype_t acq_lock = locktype_t::READ;
     if (read_and_lock_node(bnodeid, node, acq_lock, acq_lock, nullptr) != btree_status_t::success) { return; }
-    if(nindent <0){
-        nindent = node->level();
-    }
-    std::string tabs(3*(nindent- node->level()), ' ');
+    if (nindent < 0) { nindent = node->level(); }
+    std::string tabs(3 * (nindent - node->level()), ' ');
     fmt::format_to(std::back_inserter(buf), "{}{}\n", tabs, node->to_custom_string(cb));
 
     if (!node->is_leaf()) {
@@ -250,30 +248,193 @@ uint64_t Btree< K, V >::count_keys(bnodeid_t bnodeid) const {
     return result;
 }
 
+template < typename K, typename V >
+void Btree< K, V >::validate_node_child_relation(BtreeNodePtr node, BtreeNodePtr& last_child_node) const {
+    if (node->is_leaf()) { return; }
+    uint32_t nentries = node->has_valid_edge() ? node->total_entries() + 1 : node->total_entries();
+    BtreeNodePtr previous_child = nullptr;
+    for (uint32_t ind = 0; ind < nentries; ++ind) {
+        BtreeLinkInfo child_info;
+        node->get_nth_value(ind, &child_info, false /* copy */);
+        if (child_info.bnode_id() == empty_bnodeid) {
+            throw std::runtime_error(fmt::format("{}-th child of node [{}] info has empty bnode_id", ind, node->to_string()));
+        }
+        BtreeNodePtr child_node;
+        if (auto ret = read_node_impl(child_info.bnode_id(), child_node); ret != btree_status_t::success) {
+            throw std::runtime_error(fmt::format("Failed to read child node [{}] of node [{}]", child_info.bnode_id(), node->to_string()));
+        }
+        if (ind == nentries - 1) { last_child_node = child_node; }
+        if (child_node->is_node_deleted()) {
+            throw std::runtime_error(fmt::format("Child node [{}] is deleted for parent [{}]", child_node->to_string(), node->to_string()));
+        }
+        if (child_node->level() != node->level() - 1) {
+            throw std::runtime_error(fmt::format("Child node level mismatch node [{}] child level: {}, expected: {}", child_node->to_string(), child_node->level(), node->level() - 1));
+        }
+
+        K child_first_key = child_node->get_first_key< K >();
+        K child_last_key = child_node->get_last_key< K >();
+        K parent_nth_key;
+
+        if (child_node->total_entries() > 0) {
+            if (ind < node->total_entries()) {
+                parent_nth_key = node->get_nth_key< K >(ind, false /* copy */);
+                if (child_first_key.compare(parent_nth_key) > 0) {
+                    throw std::runtime_error(fmt::format("{}-th Child node [{}] first key is greater than its corresponding parent node [{}] key", ind, child_node->to_string(), node->to_string()));
+                }
+                if (child_last_key.compare(parent_nth_key) > 0) {
+                    throw std::runtime_error(fmt::format("{}-th Child node [{}] last key is greater than its corresponding parent node [{}] key", ind, child_node->to_string(), node->to_string()));
+                }
+            }
+
+        } else if (!child_node->is_leaf() && !child_node->has_valid_edge()) {
+            throw std::runtime_error(fmt::format("Interior Child node [{}] cannot be empty", child_node->to_string()));
+        }
+
+        if (ind > 0) {
+            if (previous_child->next_bnode() != child_node->node_id()) {
+                throw std::runtime_error(fmt::format("Broken child linkage: {}-th child node [{}] id {} is not equal to previous child node [{}] next node", ind, child_node->to_string(), child_node->node_id(), previous_child->to_string()));
+            }
+            K last_parent_key = node->get_nth_key< K >(ind - 1, false /* copy */);
+            K previous_child_last_key = previous_child->get_last_key< K >();
+            if (child_node->total_entries()) {
+                if (previous_child->total_entries() && child_first_key.compare(previous_child_last_key) <= 0) {
+                    throw std::runtime_error(fmt::format("Child node [{}] first key is not greater than previous child node [{}] last key", child_node->to_string(), previous_child->to_string()));
+                }
+                if (child_first_key.compare(last_parent_key) <= 0) {
+                    throw std::runtime_error(fmt::format("Child node [{}] first key is not greater than previous key ({}-th) parent node [{}] key", child_node->to_string(), ind - 1, node->to_string()));
+                }
+            }
+        }
+
+        previous_child = child_node;
+    }
+    if (node->has_valid_edge() && last_child_node->is_leaf() && last_child_node->next_bnode() != empty_bnodeid) {
+        // If the last child node is a leaf and has a next_bnode, it cannot be a valid edge.
+        throw std::runtime_error(fmt::format("Last child node [{}] of node [{}] is the last child but has next_bnode",
+                                             last_child_node->to_string(), node->to_string()));
+    }
+    if (node->has_valid_edge() && !last_child_node->is_leaf() && !last_child_node->has_valid_edge()) {
+        throw std::runtime_error(fmt::format("Last child node [{}] of edge node [{}] is not edge",
+                                             last_child_node->to_string(), node->to_string()));
+    }
+    if (!node->has_valid_edge() && last_child_node->is_leaf() && last_child_node->next_bnode() == empty_bnodeid) {
+        throw std::runtime_error(fmt::format("node [{}] is not edge but last child node [{}] is leaf and has no next_bnode",
+                                             node->to_string(), last_child_node->to_string()));
+    }
+    if (!node->has_valid_edge() && !last_child_node->is_leaf() && last_child_node->has_valid_edge()) {
+        throw std::runtime_error(fmt::format("node [{}] is not edge but last child node [{}] has valid edge",
+                                             node->to_string(), last_child_node->to_string()));
+    }
+}
+
 template < typename K, typename V >
-void Btree< K, V >::sanity_sub_tree(bnodeid_t bnodeid) const {
-    if (bnodeid==0) {
-        bnodeid= m_root_node_info.bnode_id();
+void Btree< K, V >::validate_next_node_relation(BtreeNodePtr node, BtreeNodePtr neighbor_node,
+                                                BtreeNodePtr last_child_node) const {
+    K last_key = node->get_last_key< K >();
+
+    if (neighbor_node->total_entries() == 0 && !neighbor_node->has_valid_edge() && last_child_node && last_child_node->next_bnode() != empty_bnodeid) {
+        throw std::runtime_error(fmt::format("neighbor [{}] has no entries nor valid edge but the last child, [{}] of the parent [{}] has next node id {}", neighbor_node->to_string(), last_child_node->to_string(), node->to_string(), last_child_node->next_bnode()));
+    }
+    if ((neighbor_node->total_entries() != 0 || neighbor_node->has_valid_edge()) && last_child_node && last_child_node->next_bnode() == empty_bnodeid) {
+        throw std::runtime_error(fmt::format("neighbor [{}] has entries or valid edge but the last child, [{}] of the parent [{}] has no next node id", neighbor_node->to_string(),
last_child_node->to_string(), node->to_string()));
+    }
+
+    if (neighbor_node->is_node_deleted()) {
+        throw std::runtime_error(fmt::format("Neighbor node [{}] is deleted", neighbor_node->to_string()));
+    }
+    if (neighbor_node->level() != node->level()) {
+        throw std::runtime_error(fmt::format("Neighbor node [{}] level {} mismatch vs node [{}] level {}",
+                                             neighbor_node->to_string(), neighbor_node->level(), node->to_string(),
+                                             node->level()));
+    }
+    K neighbor_first_key = neighbor_node->get_first_key< K >();
+    auto neighbor_entities = neighbor_node->total_entries();
+    if (neighbor_entities && neighbor_first_key.compare(last_key) < 0) {
+        throw std::runtime_error(fmt::format("Neighbor's first key {} is less than node's last key {} (node=[{}], neighbor=[{}])",
+                                             neighbor_first_key.to_string(), last_key.to_string(), node->to_string(), neighbor_node->to_string()));
+    }
+    if (!node->is_leaf()) {
+        if (!neighbor_node->has_valid_edge() && !neighbor_entities) {
+            throw std::runtime_error(fmt::format("Interior neighbor node [{}] is empty", neighbor_node->to_string()));
+        }
+        BtreeLinkInfo first_neighbor_info;
+        neighbor_node->get_nth_value(0, &first_neighbor_info, false /* copy */);
+        if (last_child_node->next_bnode() != first_neighbor_info.bnode_id()) {
+            throw std::runtime_error(fmt::format("Last child node's next_bnode (child=[{}]) does not match neighbor's first bnode_id (neighbor=[{}])", last_child_node->to_string(), neighbor_node->to_string()));
+        }
+    }
+}
+
+template < typename K, typename V >
+void Btree< K, V >::validate_node(const bnodeid_t& bnodeid) const {
     BtreeNodePtr node;
-    if (
-        auto ret = read_node_impl(bnodeid, node); ret!=btree_status_t::success) {
-        LOGINFO("reading node failed for bnodeid: {} reason: {}", bnodeid, ret);
-    }else{
-        if(node->is_leaf()){
-            return;
+    if (auto ret = read_node_impl(bnodeid, node); ret != btree_status_t::success) {
+        throw std::runtime_error(fmt::format("node read failed for bnodeid: {} reason: {}", bnodeid, ret));
+    } else {
+        try {
+            if (node->is_node_deleted()) { return; }
+            auto nentities = node->total_entries();
+            if (!node->is_leaf() && !nentities && !node->has_valid_edge()) {
+                throw std::runtime_error(fmt::format("Node [{}] has no entries and no valid edge", node->to_string()));
+            }
+            if (node->is_leaf() && node->has_valid_edge()) {
+                throw std::runtime_error(fmt::format("node [{}] is leaf but has valid edge", node->to_string()));
+            }
+            if (!node->validate_key_order< K >()) {
+                throw std::runtime_error(fmt::format("unsorted node's entries [{}]", node->to_string()));
+            }
+
+            BtreeNodePtr last_child_node;
+            validate_node_child_relation(node, last_child_node);
+
+            auto neighbor_id = node->next_bnode();
+            if (neighbor_id != empty_bnodeid && node->has_valid_edge()) {
+                throw std::runtime_error(fmt::format("node [{}] has valid edge but next_bnode is not empty", node->to_string()));
+            }
+            if (!node->is_leaf() && neighbor_id == empty_bnodeid && !node->has_valid_edge()) {
+                throw std::runtime_error(fmt::format("node [{}] is interior but has no valid edge and next_bnode is empty", node->to_string()));
+            }
+            if (bnodeid == neighbor_id) {
+                throw std::runtime_error(fmt::format("node [{}] has next_bnode same as itself", node->to_string()));
+            }
+
+            if (neighbor_id != empty_bnodeid) {
+                BtreeNodePtr neighbor_node;
+                if (auto ret = read_node_impl(neighbor_id, neighbor_node); ret != btree_status_t::success) {
+                    throw std::runtime_error(fmt::format("reading neighbor node of [{}] failed for bnodeid: {} reason: {}", node->to_string(), neighbor_id, ret));
+                }
+                validate_next_node_relation(node, neighbor_node, last_child_node);
+            }
+        } catch (const std::exception& e) {
+            LOGERROR("Validation failed for bnodeid: {} error: {}", bnodeid, e.what());
+            throw;
+        }
+    }
+}
+
+template < typename K, typename V >
+void Btree< K, V >::sanity_sub_tree(bnodeid_t bnodeid) const {
+    if (bnodeid == 0) { bnodeid = m_root_node_info.bnode_id(); }
+    BtreeNodePtr node;
+    if (auto ret = read_node_impl(bnodeid, node); ret != btree_status_t::success) {
+        LOGINFO("reading node failed for bnodeid: {} reason: {}", bnodeid, ret);
+    } else {
+        node->validate_key_order< K >();
+        if (node->is_leaf()) { return; }
     uint32_t nentries = node->has_valid_edge() ? node->total_entries() + 1 : node->total_entries();
-    std::vector< bnodeid_t > child_id_list;
+    std::vector< bnodeid_t > child_id_list;
     child_id_list.reserve(nentries);
-    BT_REL_ASSERT_NE(node->has_valid_edge() && node->next_bnode() != empty_bnodeid, true, "node {} has valid edge and next id is not empty", node->to_string());
+    BT_REL_ASSERT_NE(node->has_valid_edge() && node->next_bnode() != empty_bnodeid, true,
+                     "node {} has valid edge and next id is not empty", node->to_string());
     for (uint32_t i = 0; i < nentries; ++i) {
         validate_sanity_child(node, i);
         BtreeLinkInfo child_info;
         node->get_nth_value(i, &child_info, false /* copy */);
         child_id_list.push_back(child_info.bnode_id());
     }
-    for (auto child_id: child_id_list){
+    for (auto child_id : child_id_list) {
         sanity_sub_tree(child_id);
     }
 }
@@ -298,20 +459,17 @@ void Btree< K, V >::validate_sanity_child(const BtreeNodePtr& parent_node, uint3
         return;
     }
     BT_REL_ASSERT_NE(child_node->is_node_deleted(), true, "child node {} is deleted", child_node->to_string());
-    if(ind >= parent_node->total_entries()){
+    if (ind >= parent_node->total_entries()) {
         BT_REL_ASSERT_EQ(parent_node->has_valid_edge(), true);
-        if( ind >0){
-            parent_key = parent_node->get_nth_key< K >(ind -1, false);
-        }
-    }else
-    {
+        if (ind > 0) { parent_key = parent_node->get_nth_key< K >(ind - 1, false); }
+    } else {
         parent_key = parent_node->get_nth_key< K >(ind, false);
     }
     K previous_parent_key;
-    if( ind >0 && parent_node->total_entries()>0){
+    if (ind > 0 && parent_node->total_entries() > 0) {
         previous_parent_key = parent_node->get_nth_key< K >(ind - 1, false);
     }
-    for (uint32_t i = 0; i < child_node->total_entries() ; ++i) {
+    for (uint32_t i = 0; i < child_node->total_entries(); ++i) {
         K cur_child_key = child_node->get_nth_key< K >(i, false);
         if(ind < parent_node->total_entries()){
             BT_REL_ASSERT_LE(cur_child_key.compare(parent_key), 0, " child {} {}-th key is greater than its parent's {} {}-th key", child_node->to_string(), i , parent_node->to_string(), ind);
@@ -320,15 +478,16 @@ void Btree< K, V >::validate_sanity_child(const BtreeNodePtr& parent_node, uint3
 // there can be a transient case where a key appears in two children. When the replay is done, it should be fixed
 // Consider the example Parent P, children C1, C2, C3, C4. A key is deleted resulting in a merge and C3 deleted, and the same key is inserted in the current cp
 // Our case is that P is dirtied, C3 deleted, C4 updated and flushed. During recovery, we will keep C3 and P remains the same.
- // Since C4 is flushed, the key that was removed and inserted will show up in C3 and C4.
+ // Since C4 is flushed, the key that was removed and inserted will show up in C3 and C4.
 // After the replay post recovery, C3 should be gone and the tree is valid again.
BT_LOG(DEBUG, "child {} {}-th key is less than or equal to its parent's {} {}-th key", child_node->to_string(), i, parent_node->to_string(), ind - 1); } } - }else - { - BT_REL_ASSERT_GT(cur_child_key.compare(parent_key), 0, " child {} {}-th key is greater than its parent {} {}-th key", child_node->to_string(), i , parent_node->to_string(), ind); + } else { + BT_REL_ASSERT_GT(cur_child_key.compare(parent_key), 0, + " child {} {}-th key is greater than its parent {} {}-th key", child_node->to_string(), i, + parent_node->to_string(), ind); } } } @@ -360,10 +519,10 @@ void Btree< K, V >::validate_sanity_next_child(const BtreeNodePtr& parent_node, } /* in case of merge next child will never have zero entries otherwise it would have been merged */ BT_NODE_REL_ASSERT_NE(child_node->total_entries(), 0, child_node); - child_node->get_first_key(&child_key); + child_key = child_node->get_first_key< K >(); parent_node->get_nth_key< K >(ind, &parent_key, false); BT_REL_ASSERT_GT(child_key.compare(&parent_key), 0) - BT_REL_ASSERT_LT(parent_key.compare_start(&child_key), 0) + BT_REL_ASSERT_LT(parent_key.compare_start(&child_key), 0); } template < typename K, typename V > diff --git a/src/include/homestore/index/index_internal.hpp b/src/include/homestore/index/index_internal.hpp index e079c0448..1d6dbd4e9 100644 --- a/src/include/homestore/index/index_internal.hpp +++ b/src/include/homestore/index/index_internal.hpp @@ -94,9 +94,10 @@ class IndexTableBase { virtual void repair_node(IndexBufferPtr const& buf) = 0; virtual void repair_root_node(IndexBufferPtr const& buf) = 0; virtual void delete_stale_children(IndexBufferPtr const& buf) = 0; - virtual void audit_tree() = 0; + virtual void audit_tree() const = 0; virtual void update_sb() = 0; virtual void load_metrics(uint64_t interior, uint64_t leaf, uint8_t depth) = 0; + virtual bool sanity_check(const IndexBufferPtrList& bufs) const = 0; }; enum class index_buf_state_t : uint8_t { diff --git a/src/include/homestore/index/index_table.hpp b/src/include/homestore/index/index_table.hpp index ace023ed5..abf76dc98 100644 --- a/src/include/homestore/index/index_table.hpp +++ b/src/include/homestore/index/index_table.hpp @@ -107,11 +107,24 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } } - void audit_tree() override { + void audit_tree() const override { cp_mgr().cp_guard(); Btree< K, V >::sanity_sub_tree(); } + bool sanity_check(const IndexBufferPtrList& bufs) const override { + for (auto& buf : bufs) { + if (buf->is_meta_buf()) { continue; } + try { + Btree< K, V >::validate_node(buf->blkid().to_integer()); + } catch (const std::exception& e) { + LOGERROR("Exception during validation of node {}", buf->blkid().to_integer()); + return false; + } + } + return true; + } + btree_status_t destroy() override { if (is_stopping()) return btree_status_t::stopping; incr_pending_request_num(); diff --git a/src/include/homestore/index_service.hpp b/src/include/homestore/index_service.hpp index 9335e11c6..431865848 100644 --- a/src/include/homestore/index_service.hpp +++ b/src/include/homestore/index_service.hpp @@ -82,6 +82,7 @@ class IndexService { std::shared_ptr< IndexTableBase > get_index_table(uuid_t uuid) const; std::shared_ptr< IndexTableBase > get_index_table(uint32_t ordinal) const; void write_sb(uint32_t ordinal); + bool sanity_check(const uint32_t index_ordinal, const IndexBufferPtrList& bufs) const; // Reserve an ordinal for the index table uint32_t reserve_ordinal(); diff --git a/src/lib/index/index_service.cpp 
b/src/lib/index/index_service.cpp index e6bdf9dbc..b117adb0f 100644 --- a/src/lib/index/index_service.cpp +++ b/src/lib/index/index_service.cpp @@ -114,6 +114,15 @@ void IndexService::write_sb(uint32_t ordinal) { IndexService::~IndexService() { m_wb_cache.reset(); } +bool IndexService::sanity_check(const uint32_t index_ordinal, const IndexBufferPtrList& bufs) const { + auto tbl = get_index_table(index_ordinal); + if (!tbl) { + HS_DBG_ASSERT(false, "Index corresponding to ordinal={} has not been loaded yet, unexpected", index_ordinal); + return false; + } + return tbl->sanity_check(bufs); +} + void IndexService::stop() { start_stopping(); while (true) { diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp index c315d52af..0e7f98e8d 100644 --- a/src/lib/index/wb_cache.cpp +++ b/src/lib/index/wb_cache.cpp @@ -716,6 +716,31 @@ void IndexWBCache::recover(sisl::byte_view sb) { m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(icp_ctx)); } + if (pending_bufs.empty()) { + LOGTRACEMOD(wbcache, "No buffers to repair, recovery completed"); + } else { + std::map< uint32_t, IndexBufferPtrList > changed_bufs; + for (auto const& [_, buf] : bufs) { + LOGTRACEMOD(wbcache, "{}", buf->to_string()); + if (!buf->m_node_freed) { changed_bufs[buf->m_index_ordinal].push_back(buf); } + } + for (auto const& [index_ordinal, bufs] : changed_bufs) { + LOGTRACEMOD(wbcache, "Sanity checking buffers for index ordinal {}: # of bufs {}", index_ordinal, + bufs.size()); + auto ret = index_service().sanity_check(index_ordinal, bufs); + if (ret) { + LOGTRACEMOD(wbcache, "Sanity check for index ordinal {} passed", index_ordinal); + } else { + LOGERRORMOD(wbcache, "Sanity check for index ordinal {} failed", index_ordinal); +#ifdef _PRERELEASE + HS_DBG_ASSERT(false, "sanity failed: {}", ret); +#else + // TODO: make this index table offline and let others work + HS_REL_ASSERT(0, "sanity failed: {}", ret); +#endif + } + } + } m_in_recovery = false; m_vdev->recovery_completed(); } From bb8d07aeb24b675eabe53dbe7eb3f2fd0c17e992 Mon Sep 17 00:00:00 2001 From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com> Date: Wed, 23 Jul 2025 11:03:53 -0700 Subject: [PATCH 160/170] Fix prefix merge (#742) --- conanfile.py | 2 +- .../btree/detail/btree_remove_impl.ipp | 29 ++++++++++++++++--- .../homestore/btree/detail/prefix_node.hpp | 7 +++-- src/tests/test_index_btree.cpp | 8 +++-- 4 files changed, 37 insertions(+), 9 deletions(-) diff --git a/conanfile.py b/conanfile.py index c7160eeef..e51105ca9 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.20.7" + version = "6.20.8" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/btree/detail/btree_remove_impl.ipp b/src/include/homestore/btree/detail/btree_remove_impl.ipp index 73b68e927..ce2954706 100644 --- a/src/include/homestore/btree/detail/btree_remove_impl.ipp +++ b/src/include/homestore/btree/detail/btree_remove_impl.ipp @@ -232,6 +232,10 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const uint32_t balanced_size{0}; int32_t available_size{0}; uint32_t num_nodes{0}; + uint32_t expected_holes{0}; + uint32_t expected_tail{0}; + uint32_t init_holes{0}; + uint32_t init_tail{0}; struct _leftmost_src_info { std::vector< uint32_t > ith_nodes; @@ -307,6 +311,13 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const // leftmost node as special case without moving, because
that is the only node which is modified in-place and hence // doing a dry run and if for some reason there is a problem in balancing the nodes, then it is easy to give up. available_size = static_cast< int32_t >(balanced_size) - leftmost_node->occupied_size(); + if (leftmost_node->get_node_type() == btree_node_type::PREFIX) { + auto cur_node = static_cast< FixedPrefixNode< K, V >* >(leftmost_node.get()); + expected_holes = cur_node->num_prefix_holes(); + init_holes = expected_holes; + expected_tail = cur_node->cprefix_header()->tail_slot; + init_tail = expected_tail; + } src_cursor.ith_node = old_nodes.size(); for (uint32_t i{0}; (i < old_nodes.size() && available_size >= 0); ++i) { leftmost_src.ith_nodes.push_back(i); @@ -326,12 +337,22 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const #endif if ((old_nodes[i]->total_entries() - nentries) == 0) { // Entire node goes in - available_size -= old_nodes[i]->occupied_size(); - // For prefix nodes, compaction will make the size smaller, so we can compact saving to available size; - // hence it cannot get negative. if (old_nodes[i]->get_node_type() == btree_node_type::PREFIX) { auto cur_node = static_cast< FixedPrefixNode< K, V >* >(old_nodes[i].get()); - available_size += cur_node->compact_saving(); + auto c_used_slot = cur_node->cprefix_header()->used_slots; + expected_holes = c_used_slot > init_holes ? 0 : (expected_holes - c_used_slot); + expected_tail = init_tail + (expected_holes > 0 ? 0 : (c_used_slot - init_holes)); + BT_NODE_DBG_ASSERT_EQ(expected_tail >= init_tail, true, leftmost_node, + "Expected tail {} is not greater than initial tail {}", expected_tail, init_tail); + auto prefix_increased_size = + (expected_tail - init_tail) * FixedPrefixNode< K, V >::prefix_entry::size(); + auto suffix_increased_size = cur_node->total_entries() * FixedPrefixNode< K, V >::suffix_entry::size(); + + available_size -= (prefix_increased_size + suffix_increased_size); + init_holes = expected_holes; + init_tail = expected_tail; + } else { + available_size -= old_nodes[i]->occupied_size(); } BT_NODE_DBG_ASSERT_EQ(available_size >= 0, true, leftmost_node, "negative available size"); if (i >= old_nodes.size() - 1) { diff --git a/src/include/homestore/btree/detail/prefix_node.hpp b/src/include/homestore/btree/detail/prefix_node.hpp index 43179477c..facdd2107 100644 --- a/src/include/homestore/btree/detail/prefix_node.hpp +++ b/src/include/homestore/btree/detail/prefix_node.hpp @@ -25,7 +25,8 @@ SISL_LOGGING_DECL(btree) namespace homestore { - +template < typename K, typename V > +class Btree; // Internal format of variable node: // [Persistent Header][prefix_node_header][prefix_area_bitset][KV Suffix][KV Suffix].. ... ... 
[KV Prefix][KV Prefix] // @@ -38,6 +39,7 @@ class FixedPrefixNode : public VariantNode< K, V > { using BtreeNode::get_nth_value_size; using BtreeNode::to_string; using VariantNode< K, V >::get_nth_value; + friend class Btree< K, V >; private: #pragma pack(1) @@ -596,6 +598,8 @@ class FixedPrefixNode : public VariantNode< K, V > { uint32_t copy_by_entries(BtreeConfig const& cfg, BtreeNode const& o, uint32_t start_idx, uint32_t nentries) override { + if (nentries == 0) { return 0; } + if (!has_room(nentries) && has_room_after_compaction(nentries)) { compact(); } return copy_internal(cfg, o, start_idx, false /* by_size*/, nentries); } @@ -691,7 +695,6 @@ class FixedPrefixNode : public VariantNode< K, V > { fmt::format_to(std::back_inserter(str), "{}Prefix_Hdr=[{}], Prefix_Bitmap = [{}] # of holes = {}\n", (print_friendly ? "\n\t" : " "), cprefix_header()->to_string(), this->compact_bitset(), this->num_prefix_holes()); - for (uint32_t i{0}; i < this->total_entries(); ++i) { fmt::format_to(std::back_inserter(str), "{}Entry{} [Key={} Val={} slot#={} ref_count={}]", (print_friendly ? "\n\t" : " "), i + 1, BtreeNode::get_nth_key< K >(i, false).to_string(), diff --git a/src/tests/test_index_btree.cpp b/src/tests/test_index_btree.cpp index 47fbd8f21..f02537281 100644 --- a/src/tests/test_index_btree.cpp +++ b/src/tests/test_index_btree.cpp @@ -112,6 +112,9 @@ struct BtreeTest : public BtreeTestHelper< TestType >, public ::testing::Test { void TearDown() override { BtreeTestHelper< TestType >::TearDown(); + auto [interior, leaf] = this->m_bt->compute_node_count(); + LOGINFO("Teardown with Root bnode_id {} tree size: {} btree node count (interior = {} leaf= {})", + this->m_bt->root_node_id(), this->m_bt->count_keys(this->m_bt->root_node_id()), interior, leaf); m_helper.shutdown_homestore(false); this->m_bt.reset(); log_obj_life_counter(); @@ -529,8 +532,9 @@ struct BtreeConcurrentTest : public BtreeTestHelper< TestType >, public ::testin LOGINFO("Error: failed to remove {}", m_shadow_filename); } } - LOGINFO("Teardown with Root bnode_id {} tree size: {}", this->m_bt->root_node_id(), - this->m_bt->count_keys(this->m_bt->root_node_id())); + auto [interior, leaf] = this->m_bt->compute_node_count(); + LOGINFO("Teardown with Root bnode_id {} tree size: {} btree node count (interior = {} leaf= {})", + this->m_bt->root_node_id(), this->m_bt->count_keys(this->m_bt->root_node_id()), interior, leaf); BtreeTestHelper< TestType >::TearDown(); m_helper.shutdown_homestore(false); this->m_bt.reset(); From b0482c0077641859d3e801cd5b310d51196f4664 Mon Sep 17 00:00:00 2001 From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com> Date: Thu, 24 Jul 2025 15:30:14 -0700 Subject: [PATCH 161/170] Fix table ordinal (#777) --- conanfile.py | 2 +- src/include/homestore/index/index_table.hpp | 16 +++++++++++--- src/include/homestore/index_service.hpp | 4 +++- src/lib/index/index_service.cpp | 23 +++++++++++++++++++++ src/lib/index/wb_cache.cpp | 11 +++++++--- src/tests/test_index_crash_recovery.cpp | 4 ++++ src/tests/test_scripts/index_test.py | 14 ++++++------- 7 files changed, 59 insertions(+), 15 deletions(-) diff --git a/conanfile.py b/conanfile.py index e51105ca9..f28c0329c 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.20.8" + version = "6.20.9" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/index/index_table.hpp 
b/src/include/homestore/index/index_table.hpp index abf76dc98..3997a3ca3 100644 --- a/src/include/homestore/index/index_table.hpp +++ b/src/include/homestore/index/index_table.hpp @@ -64,9 +64,15 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { uint32_t ordinal = INVALID_ORDINAL, const std::vector< chunk_num_t >& chunk_ids = {}, uint32_t pdev_id = 0) : Btree< K, V >{cfg}, m_sb{"index"} { - auto ord_num = (ordinal == INVALID_ORDINAL) ? (hs()->index_service().reserve_ordinal()) : ordinal; - BT_LOG_ASSERT(!hs()->index_service().get_index_table(ord_num), "table with ordinal {} already exists"); - + uint32_t ord_num = INVALID_ORDINAL; + if (ordinal != INVALID_ORDINAL) { + BT_LOG_ASSERT(!hs()->index_service().get_index_table(ordinal), "table with ordinal {} already exists", + ordinal); + hs()->index_service().reserve_ordinal(ordinal); + ord_num = ordinal; + } else { + ord_num = hs()->index_service().reserve_ordinal(); + } // Create a superblk for the index table and create MetaIndexBuffer corresponding to that m_sb.create(sizeof(index_table_sb) + (chunk_ids.size() * sizeof(chunk_num_t))); m_sb->init_chunks(chunk_ids); @@ -385,6 +391,10 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } void update_sb() override { + if (!this->m_sb_buffer || !this->m_sb_buffer->m_valid) { + LOGERROR("Attempting to update superblk when it is already invalid"); + return; + } m_sb->total_interior_nodes = this->m_total_interior_nodes; m_sb->total_leaf_nodes = this->m_total_leaf_nodes; m_sb->btree_depth = this->m_btree_depth; diff --git a/src/include/homestore/index_service.hpp b/src/include/homestore/index_service.hpp index 431865848..2cef7eb7e 100644 --- a/src/include/homestore/index_service.hpp +++ b/src/include/homestore/index_service.hpp @@ -84,8 +84,10 @@ class IndexService { void write_sb(uint32_t ordinal); bool sanity_check(const uint32_t index_ordinal, const IndexBufferPtrList& bufs) const; - // Reserve an ordinal for the index table + // Reserve/unreserve an ordinal for the index table uint32_t reserve_ordinal(); + bool reserve_ordinal(uint32_t ordinal); + bool unreserve_ordinal(uint32_t ordinal); uint64_t used_size() const; uint32_t node_size() const; diff --git a/src/lib/index/index_service.cpp b/src/lib/index/index_service.cpp index b117adb0f..db2e19172 100644 --- a/src/lib/index/index_service.cpp +++ b/src/lib/index/index_service.cpp @@ -69,6 +69,24 @@ shared< VirtualDev > IndexService::open_vdev(const vdev_info& vinfo, bool load_e uint32_t IndexService::reserve_ordinal() { return m_ordinal_reserver->reserve(); } +bool IndexService::reserve_ordinal(uint32_t ordinal) { + if (m_ordinal_reserver->is_reserved(ordinal)) { + LOGERROR("ordinal {} is already reserved", ordinal); + return false; + } + m_ordinal_reserver->reserve(ordinal); + return true; +} + +bool IndexService::unreserve_ordinal(uint32_t ordinal) { + if (!m_ordinal_reserver->is_reserved(ordinal)) { + LOGERROR("ordinal {} doesn't exist", ordinal); + return false; + } + m_ordinal_reserver->unreserve(ordinal); + return true; +} + void IndexService::start() { // Start Writeback cache m_wb_cache = std::make_unique< IndexWBCache >(m_vdev, m_wbcache_sb, hs()->evictor(), @@ -84,6 +102,7 @@ void IndexService::start() { LOGINFO("sb metrics interior {}, leaf: {} depth {}", inode, lnode, depth); auto tbl = m_svc_cbs->on_index_table_found(std::move(sb)); tbl->load_metrics(inode, lnode, depth); + reserve_ordinal(tbl->ordinal()); add_index_table(tbl); } @@ -152,6 +171,10 @@ bool IndexService::remove_index_table(const 
std::shared_ptr< IndexTableBase >& t if (is_stopping()) return false; incr_pending_request_num(); std::unique_lock lg(m_index_map_mtx); + if (!unreserve_ordinal(tbl->ordinal())) { + decr_pending_request_num(); + return false; + } m_index_map.erase(tbl->uuid()); m_ordinal_index_map.erase(tbl->ordinal()); decr_pending_request_num(); diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp index 0e7f98e8d..2336b61c6 100644 --- a/src/lib/index/wb_cache.cpp +++ b/src/lib/index/wb_cache.cpp @@ -885,12 +885,17 @@ void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const buf->set_state(index_buf_state_t::FLUSHING); if (buf->is_meta_buf()) { - LOGTRACEMOD(wbcache, "Flushing cp {} meta buf {} possibly because of root split", cp_ctx->id(), - buf->to_string()); + LOGTRACEMOD(wbcache, "Flushing cp {} meta buf {}", cp_ctx->id(), buf->to_string()); auto const sb_buf = r_cast< MetaIndexBuffer* >(buf.get()); if (sb_buf->m_valid) { auto const& sb = sb_buf->m_sb; - if (!sb.is_empty()) { meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk()); } + if (!sb.is_empty()) { + meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk()); + } else { + LOGTRACEMOD(wbcache, "Skipping flushing meta buf {} as sb is empty", buf->to_string()); + } + } else { + LOGTRACEMOD(wbcache, "Skipping flushing meta buf {} as it is not valid", buf->to_string()); } process_write_completion(cp_ctx, buf); } else if (buf->m_node_freed) { diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp index ddd6e6a22..a19dc15b7 100644 --- a/src/tests/test_index_crash_recovery.cpp +++ b/src/tests/test_index_crash_recovery.cpp @@ -351,14 +351,18 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT } void reset_btree() { + LOGINFO("Destroying index btree with uuid {} root id {}", boost::uuids::to_string(this->m_bt->uuid()), + this->m_bt->root_node_id()); hs()->index_service().remove_index_table(this->m_bt); this->m_bt->destroy(); this->trigger_cp(true); + ASSERT_EQ(hs()->index_service().num_tables(), 0) << "After destroying the index table, some table still exists"; auto uuid = boost::uuids::random_generator()(); auto parent_uuid = boost::uuids::random_generator()(); this->m_bt = std::make_shared< typename T::BtreeType >(uuid, parent_uuid, 0, this->m_cfg); hs()->index_service().add_index_table(this->m_bt); + auto num_keys = this->m_bt->count_keys(this->m_bt->root_node_id()); this->m_shadow_map.range_erase(0, SISL_OPTIONS["num_entries"].as< uint32_t >() - 1); this->m_shadow_map.save(m_shadow_filename); LOGINFO("Reset btree with uuid {} - erase shadow map {}", boost::uuids::to_string(uuid), m_shadow_filename); diff --git a/src/tests/test_scripts/index_test.py b/src/tests/test_scripts/index_test.py index 25060a89d..0cfb3f497 100755 --- a/src/tests/test_scripts/index_test.py +++ b/src/tests/test_scripts/index_test.py @@ -96,7 +96,7 @@ def long_running_crash_put(options): print("Long running crash put started") options['num_entries'] = 1310720 # 1280K options['init_device'] = True - options['run_time'] = 14400 # 4 hours + options['run_time'] = 7200 # 2 hours options['preload_size'] = 1024 print(f"options: {options}") run_crash_test(options, 'put', 0) @@ -104,9 +104,9 @@ def long_running_crash_put(options): def long_running_crash_remove(options): print("Long running crash remove started") - options['num_entries'] = 1000 + options['num_entries'] = 102400 # 100K options['init_device'] = True - options['run_time'] = 14400 # 4 hours + 
options['run_time'] = 7200 # 2 hours options['num_entries_per_rounds'] = 100 options['min_keys_in_node'] = 2 options['max_keys_in_node'] = 10 @@ -116,9 +116,9 @@ def long_running_crash_remove(options): def long_running_crash_put_remove(options): print("Long running crash put_remove started") - options['num_entries'] = 2000 # 1280K + options['num_entries'] = 102400 # 100K options['init_device'] = True - options['run_time'] = 14400 # 4 hours + options['run_time'] = 7200 # 2 hours options['preload_size'] = 1024 options['min_keys_in_node'] = 3 options['max_keys_in_node'] = 10 @@ -148,10 +148,10 @@ def long_running(*args): long_running_clean_shutdown(options, 0) # long_runnig_index(options, 1) # long_running_clean_shutdown(options, 1) - for i in range(20): + for i in range(5): print(f"Iteration {i + 1}") long_running_crash_put_remove(options) - for i in range(50): + for i in range(5): print(f"Iteration {i + 1}") long_running_crash_remove(options) for i in range(5): From 3054646e6ac6dc643ba9f60a8aa084923a10a3a1 Mon Sep 17 00:00:00 2001 From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com> Date: Fri, 25 Jul 2025 02:47:34 -0700 Subject: [PATCH 162/170] Adding more logs to check actual vdev cp flush and freeing blkids (#778) --- conanfile.py | 2 +- src/lib/device/virtual_dev.cpp | 6 ++++-- src/lib/index/index_cp.cpp | 9 +++++++++ src/lib/index/index_cp.hpp | 1 + src/lib/index/wb_cache.cpp | 14 ++++++++++++-- 5 files changed, 27 insertions(+), 5 deletions(-) diff --git a/conanfile.py b/conanfile.py index f28c0329c..04306c2e9 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.20.9" + version = "6.20.10" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/device/virtual_dev.cpp b/src/lib/device/virtual_dev.cpp index c6597577c..2a58886a3 100644 --- a/src/lib/device/virtual_dev.cpp +++ b/src/lib/device/virtual_dev.cpp @@ -726,8 +726,10 @@ void VirtualDev::cp_flush(VDevCPContext* v_cp_ctx) { CP* cp = v_cp_ctx->cp(); // pass down cp so that underlying components can get their customized CP context if needed; - m_chunk_selector->foreach_chunks( - [this, cp](cshared< Chunk >& chunk) { chunk->blk_allocator_mutable()->cp_flush(cp); }); + m_chunk_selector->foreach_chunks([this, cp](cshared< Chunk >& chunk) { + HS_LOG(TRACE, device, "Flushing chunk: {}, vdev: {}", chunk->chunk_id(), m_vdev_info.name); + chunk->blk_allocator_mutable()->cp_flush(cp); + }); // All of the blkids which were captured in the current vdev cp context will now be freed and hence available for // allocation on the new CP dirty collection session which is ongoing diff --git a/src/lib/index/index_cp.cpp b/src/lib/index/index_cp.cpp index c75b513a1..30e6ac02e 100644 --- a/src/lib/index/index_cp.cpp +++ b/src/lib/index/index_cp.cpp @@ -86,6 +86,15 @@ std::string IndexCPContext::to_string_small() { m_dirty_buf_count.get(), m_dirty_buf_list.size()); } +std::string IndexCPContext::to_string_free_list() { + std::string str{ + fmt::format("IndexCPContext cpid={} free_blkid_list_size={}\n[", m_cp->id(), m_free_blkid_list.size())}; + if (m_free_blkid_list.size() == 0) { return str + "empty]"; } + m_free_blkid_list.foreach_entry( + [&str](BlkId bid) { fmt::format_to(std::back_inserter(str), "{}:{}, ", bid.to_integer(), bid.to_string()); }); + return str + "]"; +} + std::string IndexCPContext::to_string() { std::string str{fmt::format("IndexCPContext cpid={} dirty_buf_count={} 
dirty_buf_list_size={}\n", m_cp->id(), m_dirty_buf_count.get(), m_dirty_buf_list.size())}; diff --git a/src/lib/index/index_cp.hpp b/src/lib/index/index_cp.hpp index 8cca904f4..b15cba892 100644 --- a/src/lib/index/index_cp.hpp +++ b/src/lib/index/index_cp.hpp @@ -163,6 +163,7 @@ struct IndexCPContext : public VDevCPContext { std::optional< IndexBufferPtr > next_dirty(); std::string to_string(); std::string to_string_small(); + std::string to_string_free_list(); std::string to_string_with_dags(); uint16_t num_dags(); void to_string_dot(const std::string& filename); diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp index 2336b61c6..c17c2b436 100644 --- a/src/lib/index/wb_cache.cpp +++ b/src/lib/index/wb_cache.cpp @@ -442,6 +442,8 @@ void IndexWBCache::free_buf(const IndexBufferPtr& buf, CPContext* cp_ctx) { buf->m_node_freed = true; resource_mgr().inc_free_blk(m_node_size); m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(cp_ctx)); + LOGTRACEMOD(wbcache, "Freeing blkid = {}. Remove from cache? (aka not recovery mode) = {}", buf->m_blkid.to_integer(), + !m_in_recovery); } //////////////////// Recovery Related section ///////////////////////////////// @@ -903,6 +905,10 @@ void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const buf->to_string()); process_write_completion(cp_ctx, buf); } else { + if (buf->m_created_cp_id == cp_ctx->id()) { + LOGTRACEMOD(wbcache, "Flushing cp {} new node buf {} blkid {}", cp_ctx->id(), buf->to_string(), + buf->blkid().to_string()); + } m_vdev->async_write(r_cast< const char* >(buf->raw_buffer()), m_node_size, buf->m_blkid, part_of_batch) .thenValue([buf, cp_ctx](auto) { try { @@ -927,7 +933,8 @@ void IndexWBCache::process_write_completion(IndexCPContext* cp_ctx, IndexBufferP } #endif - LOGTRACEMOD(wbcache, "cp {} buf {}", cp_ctx->id(), buf->to_string()); + LOGTRACEMOD(wbcache, "cp {} completed flush for buf {} blkid {}", cp_ctx->id(), buf->to_string(), + buf->blkid().to_string()); resource_mgr().dec_dirty_buf_size(m_node_size); m_updated_ordinals.insert(buf->m_index_ordinal); auto [next_buf, has_more] = on_buf_flush_done(cp_ctx, buf); @@ -942,9 +949,12 @@ void IndexWBCache::process_write_completion(IndexCPContext* cp_ctx, IndexBufferP // We are done flushing the buffers, We flush the vdev to persist the vdev bitmaps and free blks // Pick a CP Manager blocking IO fiber to execute the cp flush of vdev iomanager.run_on_forget(cp_mgr().pick_blocking_io_fiber(), [this, cp_ctx]() { - LOGTRACEMOD(wbcache, "Initiating CP flush"); + auto cp_id = cp_ctx->id(); + LOGTRACEMOD(wbcache, "Initiating CP {} flush", cp_id); m_vdev->cp_flush(cp_ctx); // This is a blocking io call + LOGTRACEMOD(wbcache, "CP {} freed blkids: \n{}", cp_id, cp_ctx->to_string_free_list()); cp_ctx->complete(true); + LOGTRACEMOD(wbcache, "Completed CP {} flush", cp_id); }); } } From f2b02c07450c9a897e1e3dd8bc69942da22cbec8 Mon Sep 17 00:00:00 2001 From: Ravi Nagarjun Akella Date: Sat, 26 Jul 2025 14:17:51 -0700 Subject: [PATCH 163/170] flush a buffer only if it is dirtied in the current cp --- conanfile.py | 2 +- src/lib/index/wb_cache.cpp | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/conanfile.py b/conanfile.py index 04306c2e9..5b63b86d1 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.20.10" + version = "6.20.11" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/index/wb_cache.cpp
b/src/lib/index/wb_cache.cpp index c17c2b436..7e86ccdf2 100644 --- a/src/lib/index/wb_cache.cpp +++ b/src/lib/index/wb_cache.cpp @@ -1023,7 +1023,8 @@ void IndexWBCache::get_next_bufs_internal(IndexCPContext* cp_ctx, uint32_t max_c std::optional< IndexBufferPtr > buf = cp_ctx->next_dirty(); if (!buf) { break; } // End of list - if ((*buf)->state() == index_buf_state_t::DIRTY && (*buf)->m_wait_for_down_buffers.testz()) { + if ((*buf)->state() == index_buf_state_t::DIRTY && (*buf)->m_dirtied_cp_id == cp_ctx->id() + && (*buf)->m_wait_for_down_buffers.testz()) { bufs.emplace_back(std::move(*buf)); ++count; } else { From 06e6c49303dcd523bdea427e700eb03b04c3553b Mon Sep 17 00:00:00 2001 From: Ravi Nagarjun Akella Date: Sat, 26 Jul 2025 15:25:00 -0700 Subject: [PATCH 164/170] add comment explaining the change --- conanfile.py | 2 +- src/lib/index/wb_cache.cpp | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index 5b63b86d1..b96435adc 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.20.11" + version = "6.22.1" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp index 7e86ccdf2..7040d5b15 100644 --- a/src/lib/index/wb_cache.cpp +++ b/src/lib/index/wb_cache.cpp @@ -1023,6 +1023,9 @@ void IndexWBCache::get_next_bufs_internal(IndexCPContext* cp_ctx, uint32_t max_c std::optional< IndexBufferPtr > buf = cp_ctx->next_dirty(); if (!buf) { break; } // End of list + // If a buffer is reused during overlapping cp, there is a possibility that + // the buffer which is already flushed in cp x is dirtied by cp x + 1 + // and is picked up again to flush by cp x through this code path. 
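+ // Editor's note (illustrative sketch, not part of the original patch): e.g. buf B is flushed by cp 5,
+ // then re-dirtied under cp 6 while cp 5 is still draining its dirty list; without the
+ // m_dirtied_cp_id == cp_ctx->id() guard below, cp 5 could pick B up again and flush content that
+ // belongs to cp 6 before cp 6's own flush begins.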
if ((*buf)->state() == index_buf_state_t::DIRTY && (*buf)->m_dirtied_cp_id == cp_ctx->id() && (*buf)->m_wait_for_down_buffers.testz()) { bufs.emplace_back(std::move(*buf)); From 9c39af4281db0768e2ca59869c81edbe43f1d874 Mon Sep 17 00:00:00 2001 From: Ravi Nagarjun Akella Date: Mon, 28 Jul 2025 11:55:09 -0700 Subject: [PATCH 165/170] skip sanity check for the new bufs which are not considered after recovery --- conanfile.py | 2 +- src/lib/index/wb_cache.cpp | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/conanfile.py b/conanfile.py index b96435adc..5b63b86d1 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.22.1" + version = "6.20.11" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp index 7040d5b15..9610b89ec 100644 --- a/src/lib/index/wb_cache.cpp +++ b/src/lib/index/wb_cache.cpp @@ -599,6 +599,7 @@ void IndexWBCache::recover(sisl::byte_view sb) { [](const IndexBufferPtr& a, const IndexBufferPtr& b) { return a->m_node_level < b->m_node_level; }); std::vector< IndexBufferPtr > pruned_bufs_to_repair; + std::set< IndexBufferPtr > bufs_to_skip_sanity_check; LOGTRACEMOD(wbcache, "\n\n\nRecovery processing begins\n\n\n"); for (auto const& [_, buf] : bufs) { load_buf(buf); @@ -657,6 +658,8 @@ void IndexWBCache::recover(sisl::byte_view sb) { buf->m_up_buffer->to_string(), buf->to_string()); buf->m_up_buffer->remove_down_buffer(buf); prune_up_buffers(buf, pruned_bufs_to_repair); + // Skip the sanity check on this buf as we do not keep it + bufs_to_skip_sanity_check.insert(buf); // buf->m_up_buffer = nullptr; } } @@ -724,7 +727,9 @@ void IndexWBCache::recover(sisl::byte_view sb) { std::map< uint32_t, IndexBufferPtrList > changed_bufs; for (auto const& [_, buf] : bufs) { LOGTRACEMOD(wbcache, "{}", buf->to_string()); - if (!buf->m_node_freed) { changed_bufs[buf->m_index_ordinal].push_back(buf); } + if (!buf->m_node_freed && !bufs_to_skip_sanity_check.contains(buf)) { + changed_bufs[buf->m_index_ordinal].push_back(buf); + } } for (auto const& [index_ordinal, bufs] : changed_bufs) { LOGTRACEMOD(wbcache, "Sanity checking buffers for index ordinal {}: # of bufs {}", index_ordinal, From 3ffa892baf4511842c5bbba16aefeb0ddeaa11a4 Mon Sep 17 00:00:00 2001 From: Sanal Date: Wed, 30 Jul 2025 11:40:15 -0700 Subject: [PATCH 166/170] Add metrics for cp and blk alloc latency. 
(#782) --- src/include/homestore/checkpoint/cp.hpp | 1 + src/lib/checkpoint/cp_mgr.cpp | 3 +++ src/lib/device/virtual_dev.cpp | 5 ++++- src/lib/device/virtual_dev.hpp | 1 + 4 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/include/homestore/checkpoint/cp.hpp b/src/include/homestore/checkpoint/cp.hpp index 761b2a2d2..266713c55 100644 --- a/src/include/homestore/checkpoint/cp.hpp +++ b/src/include/homestore/checkpoint/cp.hpp @@ -89,6 +89,7 @@ struct CP { cp_id_t m_cp_id; std::array< std::unique_ptr< CPContext >, (size_t)cp_consumer_t::SENTINEL > m_contexts; folly::SharedPromise< bool > m_comp_promise; + Clock::time_point m_cp_start_time; #ifdef _PRERELEASE std::atomic< bool > m_abrupt_cp{false}; #endif diff --git a/src/lib/checkpoint/cp_mgr.cpp b/src/lib/checkpoint/cp_mgr.cpp index 15c175bc0..960d885e2 100644 --- a/src/lib/checkpoint/cp_mgr.cpp +++ b/src/lib/checkpoint/cp_mgr.cpp @@ -111,6 +111,7 @@ void CPManager::create_first_cp() { m_cur_cp = new CP(this); m_cur_cp->m_cp_status = cp_status_t::cp_io_ready; m_cur_cp->m_cp_id = m_sb->m_last_flushed_cp + 1; + m_cur_cp->m_cp_start_time = Clock::now(); } void CPManager::shutdown() { @@ -220,6 +221,7 @@ folly::Future< bool > CPManager::do_trigger_cp_flush(bool force, bool flush_on_s // allocate a new cp and ask consumers to switchover to new cp auto new_cp = new CP(this); new_cp->m_cp_id = cur_cp->m_cp_id + 1; + new_cp->m_cp_start_time = Clock::now(); HS_PERIODIC_LOG(DEBUG, cp, "Create New CP session", new_cp->id()); // sealer should be the first one to switch over @@ -291,6 +293,7 @@ void CPManager::on_cp_flush_done(CP* cp) { ++(m_sb->m_last_flushed_cp); m_sb.write(); + HISTOGRAM_OBSERVE(*m_metrics, cp_latency, get_elapsed_time_us(cp->m_cp_start_time)); cleanup_cp(cp); // Setting promise will cause the CP manager destructor to cleanup before getting a chance to do the diff --git a/src/lib/device/virtual_dev.cpp b/src/lib/device/virtual_dev.cpp index 2a58886a3..e0a251c8c 100644 --- a/src/lib/device/virtual_dev.cpp +++ b/src/lib/device/virtual_dev.cpp @@ -230,6 +230,7 @@ BlkAllocStatus VirtualDev::alloc_blks(blk_count_t nblks, blk_alloc_hints const& BlkAllocStatus status; Chunk* chunk; size_t attempt{0}; + auto start_time = Clock::now(); if (hints.chunk_id_hint) { // this is a target-chunk allocation; chunk = m_dmgr.get_chunk_mutable(*(hints.chunk_id_hint)); @@ -257,6 +258,7 @@ BlkAllocStatus VirtualDev::alloc_blks(blk_count_t nblks, blk_alloc_hints const& COUNTER_INCREMENT(m_metrics, vdev_num_alloc_failure, 1); } + HISTOGRAM_OBSERVE(m_metrics, blk_alloc_latency, get_elapsed_time_us(start_time)); return status; } catch (const std::exception& e) { LOGERROR("exception happened {}", e.what()); @@ -274,7 +276,7 @@ BlkAllocStatus VirtualDev::alloc_blks(blk_count_t nblks, blk_alloc_hints const& h.is_contiguous = true; blk_count_t nblks_remain = nblks; BlkAllocStatus status; - + auto start_time = Clock::now(); do { MultiBlkId mbid; status = alloc_n_contiguous_blks(nblks_remain, h, mbid); @@ -296,6 +298,7 @@ BlkAllocStatus VirtualDev::alloc_blks(blk_count_t nblks, blk_alloc_hints const& } while (nblks_remain); + HISTOGRAM_OBSERVE(m_metrics, blk_alloc_latency, get_elapsed_time_us(start_time)); return status; } diff --git a/src/lib/device/virtual_dev.hpp b/src/lib/device/virtual_dev.hpp index 14ea1ee9a..d853acac2 100644 --- a/src/lib/device/virtual_dev.hpp +++ b/src/lib/device/virtual_dev.hpp @@ -54,6 +54,7 @@ class VirtualDevMetrics : public sisl::MetricsGroupWrapper { REGISTER_COUNTER(default_chunk_allocation_cnt, "default chunk allocation 
count"); REGISTER_COUNTER(random_chunk_allocation_cnt, "random chunk allocation count"); // ideally it should be zero for hdd + REGISTER_HISTOGRAM(blk_alloc_latency, "Blk allocation latency", "blk_alloc_latency"); register_me_to_farm(); } From f7570e071bcdf25ceb253a82e12fde56b8e723e2 Mon Sep 17 00:00:00 2001 From: Ravi Nagarjun Akella Date: Wed, 30 Jul 2025 16:41:59 -0700 Subject: [PATCH 167/170] set buf state clean after getting its down buf during cp flush --- conanfile.py | 2 +- src/lib/index/wb_cache.cpp | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/conanfile.py b/conanfile.py index 5b63b86d1..c865ee577 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.20.11" + version = "6.20.12" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp index 9610b89ec..6032ce1a7 100644 --- a/src/lib/index/wb_cache.cpp +++ b/src/lib/index/wb_cache.cpp @@ -982,12 +982,13 @@ std::pair< IndexBufferPtr, bool > IndexWBCache::on_buf_flush_done_internal(Index buf->m_down_buffers.clear(); } #endif - buf->set_state(index_buf_state_t::CLEAN); if (cp_ctx->m_dirty_buf_count.decrement_testz()) { + buf->set_state(index_buf_state_t::CLEAN); return std::make_pair(nullptr, false); } else { get_next_bufs_internal(cp_ctx, 1u, buf, buf_list); + buf->set_state(index_buf_state_t::CLEAN); return std::make_pair((buf_list.size() ? buf_list[0] : nullptr), true); } } From a4542bd32a466cdc0d5cbcdad223eb92704f02f3 Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Mon, 4 Aug 2025 21:49:32 +0800 Subject: [PATCH 168/170] return no-op if no chunk available (#785) --- conanfile.py | 2 +- src/lib/device/virtual_dev.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conanfile.py b/conanfile.py index c865ee577..63bacef43 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.20.12" + version = "6.20.13" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/device/virtual_dev.cpp b/src/lib/device/virtual_dev.cpp index e0a251c8c..b06859b0b 100644 --- a/src/lib/device/virtual_dev.cpp +++ b/src/lib/device/virtual_dev.cpp @@ -241,7 +241,7 @@ BlkAllocStatus VirtualDev::alloc_blks(blk_count_t nblks, blk_alloc_hints const& do { chunk = m_chunk_selector->select_chunk(nblks, hints).get(); if (chunk == nullptr) { - status = BlkAllocStatus::SPACE_FULL; + status = BlkAllocStatus::BLK_ALLOC_NONE; break; } From 5c82e7dceec7c58dcb627595a3ebb427f9adb7c1 Mon Sep 17 00:00:00 2001 From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com> Date: Wed, 6 Aug 2025 21:28:35 -0700 Subject: [PATCH 169/170] UTs for simulating tombstone and GC (#737) * UTs for simulating tombstone and GC * Remove retry for now and leave it for future decision --- conanfile.py | 2 +- .../homestore/btree/detail/prefix_node.hpp | 1 - src/lib/index/wb_cache.cpp | 6 +- src/tests/btree_helpers/btree_test_helper.hpp | 69 +++++++++- src/tests/btree_helpers/btree_test_kvs.hpp | 3 + src/tests/test_mem_btree.cpp | 125 ++++++++++++++++++ 6 files changed, 198 insertions(+), 8 deletions(-) diff --git a/conanfile.py b/conanfile.py index 63bacef43..f4f51a727 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.20.13" + version = "6.20.14" homepage = 
"https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/btree/detail/prefix_node.hpp b/src/include/homestore/btree/detail/prefix_node.hpp index facdd2107..cd75beca0 100644 --- a/src/include/homestore/btree/detail/prefix_node.hpp +++ b/src/include/homestore/btree/detail/prefix_node.hpp @@ -315,7 +315,6 @@ class FixedPrefixNode : public VariantNode< K, V > { auto x = cur_key.compare(keys.end_key()); if ((x > 0) || ((x == 0) && !keys.is_end_inclusive())) { break; } - bool remove{true}; if (!filter_cb || filter_cb(cur_key, get_nth_value(idx, false))) { suffix_entry* sentry = get_suffix_entry(idx); deref_remove_prefix(sentry->prefix_slot); diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp index 6032ce1a7..8cc2192c3 100644 --- a/src/lib/index/wb_cache.cpp +++ b/src/lib/index/wb_cache.cpp @@ -727,7 +727,7 @@ void IndexWBCache::recover(sisl::byte_view sb) { std::map< uint32_t, IndexBufferPtrList > changed_bufs; for (auto const& [_, buf] : bufs) { LOGTRACEMOD(wbcache, "{}", buf->to_string()); - if (!buf->m_node_freed && !bufs_to_skip_sanity_check.contains(buf)) { + if (!buf->m_node_freed && !bufs_to_skip_sanity_check.contains(buf)) { changed_bufs[buf->m_index_ordinal].push_back(buf); } } @@ -1032,8 +1032,8 @@ void IndexWBCache::get_next_bufs_internal(IndexCPContext* cp_ctx, uint32_t max_c // If a buffer is reused during overlapping cp, there is a possibility that // the buffer which is already flushed in cp x is dirtied by cp x + 1 // and is picked up again to flush by cp x through this code path. - if ((*buf)->state() == index_buf_state_t::DIRTY && (*buf)->m_dirtied_cp_id == cp_ctx->id() - && (*buf)->m_wait_for_down_buffers.testz()) { + if ((*buf)->state() == index_buf_state_t::DIRTY && (*buf)->m_dirtied_cp_id == cp_ctx->id() && + (*buf)->m_wait_for_down_buffers.testz()) { bufs.emplace_back(std::move(*buf)); ++count; } else { diff --git a/src/tests/btree_helpers/btree_test_helper.hpp b/src/tests/btree_helpers/btree_test_helper.hpp index c2ba5780c..04357afc0 100644 --- a/src/tests/btree_helpers/btree_test_helper.hpp +++ b/src/tests/btree_helpers/btree_test_helper.hpp @@ -181,8 +181,8 @@ struct BtreeTestHelper { K key = K{k}; V value = V::generate_rand(); auto sreq = BtreeSinglePutRequest{&key, &value, btree_put_type::UPSERT, existing_v.get()}; - sreq.enable_route_tracing(); + sreq.enable_route_tracing(); auto const ret = m_bt->put(sreq); ASSERT_EQ(ret, btree_status_t::success) << "Upsert key=" << k << " failed with error=" << enum_name(ret); m_shadow_map.force_put(k, value); @@ -249,6 +249,70 @@ struct BtreeTestHelper { do_range_remove(start_k, end_key.key(), true /* removing_all_existing */); } + void move_to_tombstone(uint64_t k, btree_status_t expected_status = btree_status_t::success) { + auto existing_v = std::make_unique< V >(); + K key = K{k}; + V value = V::zero(); + put_filter_cb_t filter_cb = [](BtreeKey const& key, BtreeValue const& existing_value, BtreeValue const& value) { + if (static_cast< const V& >(existing_value) == static_cast< const V& >(value)) { + return put_filter_decision::keep; + } + return put_filter_decision::replace; + }; + auto sreq = BtreeSinglePutRequest{&key, &value, btree_put_type::UPDATE, existing_v.get(), filter_cb}; + sreq.enable_route_tracing(); + + const auto ret = m_bt->put(sreq); + ASSERT_EQ(ret, expected_status) << "UPDATING key=" << k << " failed with error=" << enum_name(ret); + } + + void move_to_tombstone(uint64_t start_key, uint64_t end_key, std::vector< std::pair< K, V > >& 
previous_entities, + btree_status_t expected_status = btree_status_t::success) { + auto existing_v = std::make_unique< V >(); + V value = V::zero(); + previous_entities.clear(); + put_filter_cb_t filter_cb = [&previous_entities](BtreeKey const& key, BtreeValue const& existing_value, + BtreeValue const& value) { + if (static_cast< const V& >(existing_value) == static_cast< const V& >(value)) { + return put_filter_decision::keep; + } + previous_entities.push_back( + std::make_pair(static_cast< const K& >(key), static_cast< const V& >(existing_value))); + return put_filter_decision::replace; + }; + auto preq = BtreeRangePutRequest< K >{BtreeKeyRange< K >{start_key, true, end_key, true}, + btree_put_type::UPDATE, + &value, + nullptr, + std::numeric_limits< uint32_t >::max(), + filter_cb}; + preq.enable_route_tracing(); + const auto ret = m_bt->put(preq); + + ASSERT_EQ(ret, expected_status) << "UPDATING key=[" << start_key << ", " << end_key + << "] failed with error=" << enum_name(ret); + } + + void remove_tombstone(uint64_t start_key, uint64_t end_key, std::vector< std::pair< K, V > >& previous_entities, + btree_status_t expected_status = btree_status_t::success) { + previous_entities.clear(); + auto rreq = BtreeRangeRemoveRequest< K >{ + BtreeKeyRange< K >{start_key, true, end_key, true}, nullptr, std::numeric_limits< uint32_t >::max(), + [&previous_entities](BtreeKey const& key, BtreeValue const& value) mutable -> bool { + if (static_cast< const V& >(value) == V::zero()) { return true; } + previous_entities.push_back( + std::make_pair(static_cast< const K& >(key), static_cast< const V& >(value))); + return false; + }}; + + rreq.enable_route_tracing(); + const auto ret = m_bt->remove(rreq); + + LOGDEBUG("Range remove from {} to {} returned {}", start_key, end_key, enum_name(ret)); + ASSERT_EQ(ret, expected_status) << "GC key=[" << start_key << ", " << end_key + << "] failed with error=" << enum_name(ret); + } + void range_remove_existing_random() { static std::uniform_int_distribution< uint32_t > s_rand_range_generator{2, 50}; @@ -451,7 +515,6 @@ struct BtreeTestHelper { auto sreq = BtreeSinglePutRequest{&key, &value, put_type, existing_v.get()}; sreq.enable_route_tracing(); bool done = expect_success == (m_bt->put(sreq) == btree_status_t::success); - if (put_type == btree_put_type::INSERT) { ASSERT_EQ(done, !m_shadow_map.exists(key)); } else if (put_type == btree_put_type::UPDATE) { @@ -466,7 +529,7 @@ struct BtreeTestHelper { auto rreq = BtreeRangeRemoveRequest< K >{BtreeKeyRange< K >{start_key, true, end_key, true}}; rreq.enable_route_tracing(); - auto const ret = m_bt->remove(rreq); + const auto ret = m_bt->remove(rreq); if (all_existing) { m_shadow_map.range_erase(start_key, end_key); diff --git a/src/tests/btree_helpers/btree_test_kvs.hpp b/src/tests/btree_helpers/btree_test_kvs.hpp index fb87d1939..e4a8e39bb 100644 --- a/src/tests/btree_helpers/btree_test_kvs.hpp +++ b/src/tests/btree_helpers/btree_test_kvs.hpp @@ -406,6 +406,7 @@ class TestFixedValue : public BtreeValue { virtual ~TestFixedValue() = default; static TestFixedValue generate_rand() { return TestFixedValue{g_randval_generator(g_re)}; } + static TestFixedValue zero() { return TestFixedValue{uint32_t(0)}; } TestFixedValue& operator=(const TestFixedValue& other) { m_val = other.m_val; @@ -465,6 +466,7 @@ class TestVarLenValue : public BtreeValue { } static TestVarLenValue generate_rand() { return TestVarLenValue{gen_random_string(rand_val_size())}; } + static TestVarLenValue zero() { return TestVarLenValue{""}; } sisl::blob 
serialize() const override { sisl::blob b{r_cast< const uint8_t* >(m_val.c_str()), uint32_cast(m_val.size())}; @@ -518,6 +520,7 @@ class TestIntervalValue : public BtreeIntervalValue { static TestIntervalValue generate_rand() { return TestIntervalValue{g_randval_generator(g_re), s_cast< uint16_t >(0)}; } + static TestIntervalValue zero() { return TestIntervalValue{0, 0}; } ///////////////////////////// Overriding methods of BtreeValue ////////////////////////// TestIntervalValue& operator=(const TestIntervalValue& other) = default; diff --git a/src/tests/test_mem_btree.cpp b/src/tests/test_mem_btree.cpp index 14e81a2d9..680acb3bd 100644 --- a/src/tests/test_mem_btree.cpp +++ b/src/tests/test_mem_btree.cpp @@ -291,6 +291,131 @@ TYPED_TEST(BtreeTest, RandomRemoveRange) { this->query_all(); } +TYPED_TEST(BtreeTest, SimpleTombstone) { + const auto num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); + LOGINFO("Step 1: Do forward sequential insert for {} entries", num_entries); + for (uint32_t i{0}; i < 20; ++i) { + this->put(i, btree_put_type::INSERT); + } + this->move_to_tombstone(10, btree_status_t::success); + this->move_to_tombstone(10, btree_status_t::filtered_out); + this->move_to_tombstone(40, btree_status_t::not_found); +} + +TYPED_TEST(BtreeTest, SimpleMultiTombstone) { + if constexpr (std::is_same_v< TypeParam, PrefixIntervalBtreeTest >) { return; } + uint32_t start_key = 500; + uint32_t end_key = 1000; + LOGDEBUG("Step 1: Do forward sequential insert for [{},{}] entries", start_key, end_key); + for (uint32_t i{start_key}; i <= end_key; ++i) { + this->put(i, btree_put_type::INSERT); + } + std::vector< std::pair< typename TypeParam::KeyType, typename TypeParam::ValueType > > out; + auto format_tombstoned = [](const auto& out) { + std::stringstream ss; + for (const auto& [k, v] : out) { + ss << "[" << k.to_string() << "] =" << v.to_string() << std::endl; + } + return ss.str(); + }; + auto run_and_validate_tombstone = [&](auto s, auto e, auto expect_status, auto expected_size) { + this->move_to_tombstone(s, e, out, expect_status); + LOGDEBUG("Tombstoned {} keys:\n{}", out.size(), format_tombstoned(out)); + ASSERT_EQ(out.size(), expected_size) << "Tombstoned keys should be " << expected_size << ", but got " + << out.size() << " keys in range [" << s << ", " << e << "]"; + }; + auto sum_tombstoned = 0; + { + run_and_validate_tombstone(0, start_key - 100, btree_status_t::not_found, 0); + run_and_validate_tombstone(end_key + 100, end_key + 2000, btree_status_t::not_found, 0); + } + { + run_and_validate_tombstone(start_key - 100, start_key, btree_status_t::success, 1); + run_and_validate_tombstone(start_key - 100, start_key, btree_status_t::success, 0); + sum_tombstoned += 1; + } + { + run_and_validate_tombstone(start_key + 20, start_key + 40, btree_status_t::success, 21); + run_and_validate_tombstone(start_key + 20, start_key + 40, btree_status_t::success, 0); + run_and_validate_tombstone(start_key + 20, start_key + 41, btree_status_t::success, 1); + run_and_validate_tombstone(start_key + 45, start_key + 50, btree_status_t::success, 6); + run_and_validate_tombstone(start_key + 20, start_key + 60, btree_status_t::success, 41 - 28); + sum_tombstoned += 21 + 1 + 6 + (41 - 28); + } + + { + run_and_validate_tombstone(end_key, end_key + 1000, btree_status_t::success, 1); + run_and_validate_tombstone(end_key, end_key + 1000, btree_status_t::success, 0); + sum_tombstoned += 1; + } + { + run_and_validate_tombstone(0, end_key + 1000, btree_status_t::success, + end_key - start_key - 
sum_tombstoned + 1); + run_and_validate_tombstone(0, end_key + 1000, btree_status_t::success, 0); + } + this->range_remove_existing(start_key, end_key - start_key + 1); + ASSERT_EQ(this->m_bt->count_keys(), 0); + // creating two intervals + uint32_t start_key1 = 1000; + uint32_t end_key1 = 1999; + uint32_t start_key2 = 3000; + uint32_t end_key2 = 3999; + sum_tombstoned = 0; + for (uint32_t i{start_key1}; i <= end_key1; ++i) { + this->put(i, btree_put_type::INSERT); + } + for (uint32_t i{start_key2}; i <= end_key2; ++i) { + this->put(i, btree_put_type::INSERT); + } + { + run_and_validate_tombstone(start_key1 + 100, end_key2 + 100, btree_status_t::success, 1900); + run_and_validate_tombstone(start_key1 + 100, end_key2 + 100, btree_status_t::success, 0); + } +} + +TYPED_TEST(BtreeTest, SimpleGC) { + if constexpr (std::is_same_v< TypeParam, PrefixIntervalBtreeTest >) { return; } + uint32_t start_key1 = 1000; + uint32_t end_key1 = 1999; + uint32_t start_key2 = 3000; + uint32_t end_key2 = 3999; + std::vector< std::pair< typename TypeParam::KeyType, typename TypeParam::ValueType > > out; + for (uint32_t i{start_key1}; i <= end_key1; ++i) { + this->put(i, btree_put_type::INSERT); + } + for (uint32_t i{start_key2}; i <= end_key2; ++i) { + this->put(i, btree_put_type::INSERT); + } + this->print_keys(" Before tombstone "); + auto start_tombstone = start_key1 + 100; + auto end_tombstone = end_key1 - 100; + auto expected_size = end_key1 - 200 - start_key1 + 1; + this->move_to_tombstone(start_tombstone, end_tombstone, out, btree_status_t::success); + ASSERT_EQ(out.size(), expected_size) << "Tombstoned keys should be " << expected_size << ", but got " << out.size() + << " keys in range [" << start_tombstone << ", " << end_tombstone << "]"; + + this->print_keys(fmt::format(" After tombstone [{},{}] ", start_tombstone, end_tombstone)); + LOGINFO("Step 2: Do GC on the tree for keys in range [{}, {}]", start_key1, end_key2); + this->remove_tombstone(start_key1, end_key2, out, btree_status_t::success); + expected_size = end_key2 - start_key1 + 1 - 1000 - expected_size; + ASSERT_EQ(out.size(), expected_size) << "# of keys after GCs hould be " << expected_size << ", but got " + << out.size() << " keys in range [" << start_key1 << ", " << end_key2 << "]"; + auto format_tombstoned = [](const auto& out) { + std::stringstream ss; + for (const auto& [k, v] : out) { + ss << "[" << k.to_string() << "] =" << v.to_string() << std::endl; + } + return ss.str(); + }; + + this->print_keys(fmt::format(" After GC {} entries are still in range [{},{}] ", out.size(), start_key1, end_key2)); + LOGDEBUG("GC {} keys:\n{}", out.size(), format_tombstoned(out)); + this->remove_tombstone(start_key1, end_key2, out, btree_status_t::not_found); + ASSERT_EQ(out.size(), expected_size) << "After GC, no keys should be left in range [" << start_key1 << ", " + << end_key2 << "] but got " << out.size(); + LOGDEBUG("GC {} keys:\n{}", out.size(), format_tombstoned(out)); +} + template < typename TestType > struct BtreeConcurrentTest : public BtreeTestHelper< TestType >, public ::testing::Test { using T = TestType; From 1eff796f9cc4a10e3bd01ca2c1557a294c846478 Mon Sep 17 00:00:00 2001 From: Hari Kadayam Date: Fri, 8 Aug 2025 06:39:00 -0700 Subject: [PATCH 170/170] Next version of HomeStore, where following major changes are implemented 1. Introduce multiple index so that homestore can actually have different types of Index stores. 2. 
Introduce a new Btree called CopyOnWrite Btree, instead of inplace btree where the btree pages are not written in place, but on different location, but maintain a map. 3. Make the public interfaces to be very concise (having a BtreeBase and put that in the implementation) 4. Simplified the btree apis 5. Used latest sisl 13.x with REGISTER_LOG_MODS 6. Added cow btree crash test, updated other tests to ensure pass 7. Moved existing implementation of btree to inplace btree 8. Updated the build and dependency build github CI/CD pipeline 9. Made replication as an optional module --- .github/workflows/build_commit.yml | 80 +- .github/workflows/build_dependencies.yml | 178 ++-- .github/workflows/merge_build.yml | 15 +- .github/workflows/pr_build.yml | 6 +- CMakeLists.txt | 14 +- README.md | 6 +- cmake/test_mode.cmake | 6 + conanfile.py | 21 +- src/CMakeLists.txt | 44 +- src/docs/metablk_test_case.md | 59 ++ src/include/homestore/blk.h | 23 +- src/include/homestore/blkdata_service.hpp | 13 - src/include/homestore/btree/btree.hpp | 410 ++++---- src/include/homestore/btree/btree.ipp | 376 +++---- src/include/homestore/btree/btree_base.hpp | 171 +++ src/include/homestore/btree/btree_kv.hpp | 8 +- src/include/homestore/btree/btree_store.h | 39 + .../homestore/btree/detail/btree_common.ipp | 420 ++------ .../homestore/btree/detail/btree_get_impl.ipp | 31 +- .../homestore/btree/detail/btree_internal.hpp | 69 +- .../btree/detail/btree_mutate_impl.ipp | 111 +- .../homestore/btree/detail/btree_node.hpp | 421 +++++--- .../homestore/btree/detail/btree_node_mgr.ipp | 436 ++------ .../btree/detail/btree_query_impl.ipp | 62 +- .../btree/detail/btree_remove_impl.ipp | 333 ++++-- .../btree/{ => detail}/btree_req.hpp | 142 ++- src/include/homestore/btree/mem_btree.hpp | 86 -- .../btree/node_variant/mini_trie_node.hpp | 988 ++++++++++++++++++ .../{detail => node_variant}/prefix_node.hpp | 289 +++-- .../{detail => node_variant}/simple_node.hpp | 131 ++- .../{detail => node_variant}/variant_node.hpp | 45 +- .../{detail => node_variant}/varlen_node.hpp | 205 ++-- src/include/homestore/checkpoint/cp.hpp | 11 +- src/include/homestore/checkpoint/cp_mgr.hpp | 59 +- src/include/homestore/chunk_selector.h | 4 +- src/include/homestore/fault_cmt_service.hpp | 45 - src/include/homestore/homestore.hpp | 89 +- src/include/homestore/homestore_decl.hpp | 5 +- src/include/homestore/index/index_common.h | 19 + .../homestore/index/index_internal.hpp | 186 ---- src/include/homestore/index_service.hpp | 151 +-- src/include/homestore/logstore/log_store.hpp | 35 +- src/include/homestore/logstore_service.hpp | 14 - src/include/homestore/meta_service.hpp | 6 +- .../homestore/replication/repl_decls.h | 15 - src/include/homestore/replication/repl_dev.h | 63 +- src/include/homestore/replication_service.hpp | 28 +- src/include/homestore/vchunk.h | 2 - src/lib/blkalloc/append_blk_allocator.cpp | 12 +- src/lib/blkalloc/blk.cpp | 12 + src/lib/blkalloc/blk_allocator.h | 3 - src/lib/blkalloc/fixed_blk_allocator.h | 2 +- src/lib/blkalloc/ss_blk_allocator.h | 53 + src/lib/blkalloc/varsize_blk_allocator.cpp | 18 +- src/lib/blkalloc/varsize_blk_allocator.h | 2 +- src/lib/blkdata_svc/blkdata_service.cpp | 76 +- src/lib/checkpoint/cp_mgr.cpp | 185 ++-- src/lib/common/concurrent_vector.hpp | 97 ++ src/lib/common/crash_simulator.hpp | 12 +- src/lib/common/homestore_config.fbs | 39 +- src/lib/common/homestore_utils.cpp | 2 +- src/lib/common/homestore_utils.hpp | 3 +- src/lib/common/large_id_reserver.hpp | 56 + src/lib/common/resource_mgr.cpp | 88 +- 
src/lib/common/resource_mgr.hpp | 27 +- src/lib/device/chunk.cpp | 2 +- src/lib/device/device.h | 8 +- src/lib/device/device_manager.cpp | 373 ++----- src/lib/device/hs_super_blk.h | 9 +- src/lib/device/journal_vdev.cpp | 22 +- src/lib/device/physical_dev.hpp | 1 - src/lib/device/vchunk.cpp | 4 - src/lib/device/virtual_dev.cpp | 97 +- src/lib/device/virtual_dev.hpp | 10 +- src/lib/homestore.cpp | 170 +-- src/lib/index/CMakeLists.txt | 6 +- src/lib/index/btree_base.cpp | 455 ++++++++ src/lib/index/cow_btree/CMakeLists.txt | 13 + src/lib/index/cow_btree/cow_btree.cpp | 943 +++++++++++++++++ src/lib/index/cow_btree/cow_btree.h | 321 ++++++ src/lib/index/cow_btree/cow_btree_cp.cpp | 120 +++ src/lib/index/cow_btree/cow_btree_cp.h | 84 ++ src/lib/index/cow_btree/cow_btree_node.cpp | 79 ++ src/lib/index/cow_btree/cow_btree_node.h | 53 + src/lib/index/cow_btree/cow_btree_store.cpp | 326 ++++++ src/lib/index/cow_btree/cow_btree_store.h | 80 ++ src/lib/index/index_cp.cpp | 456 +------- src/lib/index/index_cp.h | 63 ++ src/lib/index/index_service.cpp | 390 +++---- src/lib/index/{ => inplace_btree}/README.md | 0 src/lib/index/inplace_btree/index.hpp | 64 ++ src/lib/index/inplace_btree/index_buffer.cpp | 66 ++ src/lib/index/inplace_btree/index_cp.cpp | 335 ++++++ .../index/{ => inplace_btree}/index_cp.hpp | 11 +- .../inplace_btree/inplace_btree_store.h} | 340 +++--- .../index/inplace_btree/ss_btree_cache.cpp | 697 ++++++++++++ .../index/inplace_btree/ss_btree_cache.hpp | 79 ++ .../index/{ => inplace_btree}/wb_cache.cpp | 262 ++--- .../index/{ => inplace_btree}/wb_cache.hpp | 14 +- .../index/inplace_btree}/wb_cache_base.hpp | 3 +- src/lib/index/mem_btree/CMakeLists.txt | 10 + src/lib/index/mem_btree/mem_btree_store.cpp | 54 + src/lib/index/mem_btree/mem_btree_store.h | 47 + src/lib/logging.cpp | 4 - src/lib/logstore/log_dev.cpp | 137 +-- src/lib/logstore/log_dev.hpp | 41 +- src/lib/logstore/log_group.cpp | 1 - src/lib/logstore/log_store.cpp | 103 +- src/lib/logstore/log_store_service.cpp | 77 +- src/lib/logstore/log_stream.cpp | 1 - src/lib/meta/meta_blk_service.cpp | 4 - .../log_store/home_raft_log_store.cpp | 2 - .../replication/log_store/repl_log_store.cpp | 10 +- src/lib/replication/repl_dev/common.cpp | 4 +- src/lib/replication/repl_dev/common.h | 1 - .../replication/repl_dev/raft_repl_dev.cpp | 543 +++------- src/lib/replication/repl_dev/raft_repl_dev.h | 58 +- .../repl_dev/raft_state_machine.cpp | 19 +- .../replication/repl_dev/solo_repl_dev.cpp | 82 +- src/lib/replication/repl_dev/solo_repl_dev.h | 26 +- .../replication/service/generic_repl_svc.cpp | 43 +- .../replication/service/generic_repl_svc.h | 31 +- .../replication/service/raft_repl_service.cpp | 113 +- .../replication/service/raft_repl_service.h | 15 +- src/tests/CMakeLists.txt | 95 +- src/tests/btree_helpers/btree_decls.h | 27 +- src/tests/btree_helpers/btree_test_helper.hpp | 367 +++---- src/tests/btree_helpers/btree_test_kvs.hpp | 19 +- src/tests/btree_helpers/shadow_map.hpp | 36 +- src/tests/index_btree_benchmark.cpp | 1 - src/tests/log_dev_benchmark.cpp | 2 +- src/tests/log_store_benchmark.cpp | 2 +- src/tests/test_append_blkalloc.cpp | 3 +- src/tests/test_blk_cache_queue.cpp | 6 +- src/tests/test_blk_read_tracker.cpp | 3 +- src/tests/test_blkalloc.cpp | 19 +- src/tests/test_blkid.cpp | 3 +- src/tests/test_btree.cpp | 532 ++++++++++ ...ndex_btree.cpp => test_btree_long_running} | 202 +++- src/tests/test_btree_node.cpp | 112 +- .../test_common/homestore_test_common.hpp | 249 +++-- src/tests/test_common/raft_repl_test_base.hpp | 53 +- 
src/tests/test_cow_btree_recovery.cpp | 574 ++++++++++ src/tests/test_cp_mgr.cpp | 3 +- src/tests/test_data_service.cpp | 5 +- src/tests/test_device_manager.cpp | 255 +---- src/tests/test_home_raft_logstore.cpp | 2 +- src/tests/test_index_crash_recovery.cpp | 162 +-- src/tests/test_journal_vdev.cpp | 4 +- src/tests/test_log_dev.cpp | 3 +- src/tests/test_log_store.cpp | 2 +- src/tests/test_log_store_long_run.cpp | 2 +- src/tests/test_mem_btree.cpp | 168 +-- src/tests/test_meta_blk_mgr.cpp | 12 +- src/tests/test_pdev.cpp | 2 +- src/tests/test_raft_repl_dev.cpp | 131 --- src/tests/test_raft_repl_dev_dynamic.cpp | 112 +- src/tests/test_scripts/index_test.py | 27 +- src/tests/test_solo_repl_dev.cpp | 93 +- 159 files changed, 10468 insertions(+), 6859 deletions(-) create mode 100644 src/docs/metablk_test_case.md create mode 100644 src/include/homestore/btree/btree_base.hpp create mode 100644 src/include/homestore/btree/btree_store.h rename src/include/homestore/btree/{ => detail}/btree_req.hpp (61%) delete mode 100644 src/include/homestore/btree/mem_btree.hpp create mode 100644 src/include/homestore/btree/node_variant/mini_trie_node.hpp rename src/include/homestore/btree/{detail => node_variant}/prefix_node.hpp (79%) rename src/include/homestore/btree/{detail => node_variant}/simple_node.hpp (78%) rename src/include/homestore/btree/{detail => node_variant}/variant_node.hpp (90%) rename src/include/homestore/btree/{detail => node_variant}/varlen_node.hpp (81%) delete mode 100644 src/include/homestore/fault_cmt_service.hpp create mode 100644 src/include/homestore/index/index_common.h delete mode 100644 src/include/homestore/index/index_internal.hpp create mode 100644 src/lib/blkalloc/ss_blk_allocator.h create mode 100644 src/lib/common/concurrent_vector.hpp create mode 100644 src/lib/common/large_id_reserver.hpp create mode 100644 src/lib/index/btree_base.cpp create mode 100644 src/lib/index/cow_btree/CMakeLists.txt create mode 100644 src/lib/index/cow_btree/cow_btree.cpp create mode 100644 src/lib/index/cow_btree/cow_btree.h create mode 100644 src/lib/index/cow_btree/cow_btree_cp.cpp create mode 100644 src/lib/index/cow_btree/cow_btree_cp.h create mode 100644 src/lib/index/cow_btree/cow_btree_node.cpp create mode 100644 src/lib/index/cow_btree/cow_btree_node.h create mode 100644 src/lib/index/cow_btree/cow_btree_store.cpp create mode 100644 src/lib/index/cow_btree/cow_btree_store.h create mode 100644 src/lib/index/index_cp.h rename src/lib/index/{ => inplace_btree}/README.md (100%) create mode 100644 src/lib/index/inplace_btree/index.hpp create mode 100644 src/lib/index/inplace_btree/index_buffer.cpp create mode 100644 src/lib/index/inplace_btree/index_cp.cpp rename src/lib/index/{ => inplace_btree}/index_cp.hpp (96%) rename src/{include/homestore/index/index_table.hpp => lib/index/inplace_btree/inplace_btree_store.h} (77%) create mode 100644 src/lib/index/inplace_btree/ss_btree_cache.cpp create mode 100644 src/lib/index/inplace_btree/ss_btree_cache.hpp rename src/lib/index/{ => inplace_btree}/wb_cache.cpp (80%) rename src/lib/index/{ => inplace_btree}/wb_cache.hpp (85%) rename src/{include/homestore/index => lib/index/inplace_btree}/wb_cache_base.hpp (93%) create mode 100644 src/lib/index/mem_btree/CMakeLists.txt create mode 100644 src/lib/index/mem_btree/mem_btree_store.cpp create mode 100644 src/lib/index/mem_btree/mem_btree_store.h delete mode 100644 src/lib/logging.cpp create mode 100644 src/tests/test_btree.cpp rename src/tests/{test_index_btree.cpp => test_btree_long_running} (72%) create 
mode 100644 src/tests/test_cow_btree_recovery.cpp diff --git a/.github/workflows/build_commit.yml b/.github/workflows/build_commit.yml index 36ebd4575..8f959775b 100644 --- a/.github/workflows/build_commit.yml +++ b/.github/workflows/build_commit.yml @@ -20,62 +20,26 @@ on: type: string jobs: - SislDeps: - uses: eBay/sisl/.github/workflows/build_dependencies.yml@master - with: - branch: master - platform: ${{ inputs.platform }} - build-type: ${{ inputs.build-type }} - malloc-impl: ${{ inputs.malloc-impl }} - prerelease: ${{ inputs.prerelease }} - tooling: None - if: ${{ github.event_name != 'pull_request' }} + HomestorePRBuild: + uses: ./.github/workflows/build_dependencies.yml + with: + branch: ${{ github.ref }} + platform: ${{ inputs.platform }} + build-type: ${{ inputs.build-type }} + malloc-impl: ${{ inputs.malloc-impl }} + prerelease: ${{ inputs.prerelease }} + tooling: ${{ inputs.tooling }} + testing : 'True' + if: ${{ github.event_name == 'pull_request' }} - NuraftMesgDeps: - needs: SislDeps - uses: eBay/nuraft_mesg/.github/workflows/build_dependencies.yml@main - with: - branch: main - platform: ${{ inputs.platform }} - build-type: ${{ inputs.build-type }} - malloc-impl: ${{ inputs.malloc-impl }} - prerelease: ${{ inputs.prerelease }} - tooling: None - if: ${{ github.event_name != 'pull_request' }} - - IOMgrDeps: - needs: SislDeps - uses: eBay/iomanager/.github/workflows/build_dependencies.yml@master - with: - branch: master - platform: ${{ inputs.platform }} - build-type: ${{ inputs.build-type }} - malloc-impl: ${{ inputs.malloc-impl }} - prerelease: ${{ inputs.prerelease }} - tooling: None - if: ${{ github.event_name != 'pull_request' }} - - HomestoreDeps: - needs: [IOMgrDeps, NuraftMesgDeps] - uses: ./.github/workflows/build_dependencies.yml - with: - branch: ${{ github.ref }} - platform: ${{ inputs.platform }} - build-type: ${{ inputs.build-type }} - malloc-impl: ${{ inputs.malloc-impl }} - prerelease: ${{ inputs.prerelease }} - tooling: ${{ inputs.tooling }} - testing: 'True' - if: ${{ github.event_name != 'pull_request' }} - - HomestoreBuild: - uses: ./.github/workflows/build_dependencies.yml - with: - branch: ${{ github.ref }} - platform: ${{ inputs.platform }} - build-type: ${{ inputs.build-type }} - malloc-impl: ${{ inputs.malloc-impl }} - prerelease: ${{ inputs.prerelease }} - tooling: ${{ inputs.tooling }} - testing: 'True' - if: ${{ github.event_name == 'pull_request' }} + HomestoreCommitBuild: + uses: ./.github/workflows/build_dependencies.yml + with: + branch: ${{ github.ref }} + platform: ${{ inputs.platform }} + build-type: ${{ inputs.build-type }} + malloc-impl: ${{ inputs.malloc-impl }} + prerelease: ${{ inputs.prerelease }} + tooling: ${{ inputs.tooling }} + testing: 'False' + if: ${{ github.event_name != 'pull_request' }} \ No newline at end of file diff --git a/.github/workflows/build_dependencies.yml b/.github/workflows/build_dependencies.yml index 122f825af..12d2093de 100644 --- a/.github/workflows/build_dependencies.yml +++ b/.github/workflows/build_dependencies.yml @@ -81,10 +81,18 @@ on: default: 'True' jobs: - BuildHomestoreDeps: + BuildHomestore: runs-on: ${{ inputs.platform }} timeout-minutes: 1440 steps: + - name: Recover space + run: | + df -h + sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL + sudo docker image prune --all --force + sudo docker builder prune -a + df -h + - name: Retrieve Code uses: actions/checkout@main with: @@ -94,160 +102,130 @@ jobs: - name: Retrieve Recipe uses: actions/checkout@main 
with: - repository: eBay/Homestore + repository: hkadayam/Homestore ref: ${{ inputs.branch }} if: ${{ inputs.testing == 'False' }} - - name: Load Homestore Cache + - name: Restore Dependency Cache id: restore-cache - uses: eBay/sisl/.github/actions/load_conan@master - with: - testing: ${{ inputs.testing }} - key_prefix: HomestoreDeps-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }}-${{ inputs.prerelease }} - - - name: Load Sisl Cache - uses: eBay/sisl/.github/actions/load_conan@master + uses: actions/cache/restore@v4 with: - load_any: 'True' - key_prefix: SislDeps-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }}-${{ inputs.prerelease }} - if: ${{ inputs.testing == 'True' || steps.restore-cache.outputs.cache-hit != 'true' }} + path: | + ~/.conan2/p + key: HomestoreDeps-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }}-${{ inputs.prerelease }} + fail-on-cache-miss: ${{ inputs.fail_on_cache_miss }} - - name: Retrieve Dependencies + - name: Retrieve IOManager code uses: actions/checkout@main with: - repository: eBay/iomanager + repository: hkadayam/iomanager path: import/iomgr ref: master - if: ${{ inputs.testing == 'True' || steps.restore-cache.outputs.cache-hit != 'true' }} - - name: Retrieve Dependencies + - name: Retrieve NuraftMesg code uses: actions/checkout@main with: - repository: eBay/nuraft_mesg + repository: hkadayam/nuraft_mesg path: import/nuraft_mesg ref: main - if: ${{ inputs.testing == 'True' || steps.restore-cache.outputs.cache-hit != 'true' }} - - name: Load IOMgr Cache - uses: eBay/sisl/.github/actions/load_conan@master - with: - testing: 'False' - path: import/iomgr - key_prefix: IOMgrDeps-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }}-${{ inputs.prerelease }} - fail_on_cache_miss: true - if: ${{ inputs.testing == 'True' || steps.restore-cache.outputs.cache-hit != 'true' }} - - - name: Load NuraftMesg Cache - uses: eBay/sisl/.github/actions/load_conan@master + - name: Retrieve Sisl code + uses: actions/checkout@main with: - testing: 'False' - path: import/nuraft_mesg - key_prefix: NuMesgDeps-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }}-${{ inputs.prerelease }} - fail_on_cache_miss: true - if: ${{ inputs.testing == 'True' || steps.restore-cache.outputs.cache-hit != 'true' }} + repository: hkadayam/sisl + path: import/sisl + ref: master - name: Setup Conan - uses: eBay/sisl/.github/actions/setup_conan@master + uses: hkadayam/sisl/.github/actions/setup_conan@master with: platform: ${{ inputs.platform }} - if: ${{ inputs.testing == 'True' || steps.restore-cache.outputs.cache-hit != 'true' }} - - name: Export Recipes + - name: Export Recipes and DepsBuild Env run: | - sudo apt-get install -y python3-pyelftools libaio-dev - sudo rm -rf $ANDROID_HOME - python -m pip install pyelftools - conan export import/iomgr oss/master - conan export import/nuraft_mesg oss/main - cached_pkgs=$(ls -1d ~/.conan/data/*/*/*/*/package | sed 's,.*data/,,' | cut -d'/' -f1,2 | paste -sd',' - -) - echo "::info:: Pre-cached: ${cached_pkgs}" - if: ${{ inputs.testing == 'True' || steps.restore-cache.outputs.cache-hit != 'true' }} - - - name: Build Cache + conan export import/sisl --user oss --channel master + cd import/sisl && ./prepare.sh && cd ../.. + conan export import/iomgr --user oss --channel master + cd import/iomgr && ./prepare.sh && cd ../.. + conan export import/nuraft_mesg --user oss --channel main + cd import/nuraft_mesg/ && ./prepare.sh && cd ../.. 
+ + - name: Build Homestore Dependencies + id : build-deps run: | - pre=$([[ "${{ inputs.build-type }}" != "Debug" ]] && echo "-o sisl:prerelease=${{ inputs.prerelease }}" || echo "") + du -sh ~/.conan2/p | awk '{printf("size=%d\n", $1)}' > $GITHUB_OUTPUT conan install \ + -o "sisl/*:prerelease=${{ inputs.prerelease }}" \ + -o "sisl/*:malloc_impl=${{ inputs.malloc-impl }}" \ -c tools.build:skip_test=True \ - ${pre} \ - -o sisl:malloc_impl=${{ inputs.malloc-impl }} \ - -o iomgr:testing=off \ - -o testing=off \ -s build_type=${{ inputs.build-type }} \ --build missing \ . - if: ${{ steps.restore-cache.outputs.cache-hit != 'true' }} - - - name: Save Conan Cache - uses: eBay/sisl/.github/actions/store_conan@master - with: - key_prefix: HomestoreDeps-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }}-${{ inputs.prerelease }} - if: ${{ github.event_name != 'pull_request' && steps.restore-cache.outputs.cache-hit != 'true' }} - - - name: Reload Sisl Cache - uses: eBay/sisl/.github/actions/load_conan@master - with: - load_any: 'True' - key_prefix: SislDeps-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }}-${{ inputs.prerelease }} - if: ${{ inputs.testing == 'True' && github.event_name != 'pull_request' && steps.restore-cache.outputs.cache-hit != 'true' }} - - - name: Reload IOMgr Cache - uses: eBay/sisl/.github/actions/load_conan@master - with: - testing: 'False' - path: import/iomgr - key_prefix: IOMgrDeps-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }}-${{ inputs.prerelease }} - fail_on_cache_miss: true - if: ${{ inputs.testing == 'True' && github.event_name != 'pull_request' && steps.restore-cache.outputs.cache-hit != 'true' }} - - name: Reload NuraftMesg Cache - uses: eBay/sisl/.github/actions/load_conan@master + - name: Remove Prior Dependencies + id : remove-prior-deps + run: | + conan remove -c -p "build_type=${{ inputs.build-type }}" '*:*#!latest' + rm -rf ~/.conan2/p/b/*/b + rm -rf ~/.conan2/p/*/b ~/.conan2/p/*/s + du -sh ~/.conan2/p | awk '{printf("size=%d\n", $1)}' > $GITHUB_OUTPUT + df -h + + - name: Delete Previous Cache + continue-on-error: true + run: | + gh extension install actions/gh-actions-cache + gh actions-cache delete "HomestoreDeps-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }}-${{ inputs.prerelease }}" --confirm + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + if: ${{ github.event_name != 'pull_request' && steps.build-deps.outputs.size != steps.remove-prior-deps.outputs.size }} + + - name: Save Dependency Cache + uses: actions/cache/save@v4 with: - testing: 'False' - path: import/nuraft_mesg - key_prefix: NuMesgDeps-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }}-${{ inputs.prerelease }} - fail_on_cache_miss: true - if: ${{ inputs.testing == 'True' && github.event_name != 'pull_request' && steps.restore-cache.outputs.cache-hit != 'true' }} + path: | + ~/.conan2/p + key: HomestoreDeps-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }}-${{ inputs.prerelease }} + if: ${{ github.event_name != 'pull_request' && steps.build-deps.outputs.size != steps.remove-prior-deps.outputs.size }} - - uses: actions/checkout@main - name: Setup tmate session uses: mxschmitt/action-tmate@v3 with: limit-access-to-actor: true detached: true - connect-timeout-seconds: 60 - if: ${{ inputs.testing == 'True' }} + if: ${{ inputs.testing == 'True' || inputs.tooling == 'Coverage' }} - name: Create and Test Package run: | sanitize=$([[ "${{ inputs.tooling }}" 
== "Sanitize" ]] && echo "True" || echo "False") - pre=$([[ "${{ inputs.build-type }}" != "Debug" ]] && echo "-o sisl:prerelease=${{ inputs.prerelease }}" || echo "") + skip_test=$([[ "${{ inputs.testing }}" == "True" ]] && echo "False" || echo "True") conan create \ - ${pre} \ - -o sisl:malloc_impl=${{ inputs.malloc-impl }} \ - -o iomgr:testing=off \ - -o homestore:sanitize=${sanitize} \ + -o "sisl/*:prerelease=${{ inputs.prerelease }}" \ + -o "sisl/*:malloc_impl=${{ inputs.malloc-impl }}" \ + -o "homestore/*:sanitize=${sanitize}" \ + -c tools.build:skip_test=${skip_test} \ -s build_type=${{ inputs.build-type }} \ --build missing \ . - if: ${{ inputs.testing == 'True' && inputs.tooling != 'Coverage' }} + if: ${{ inputs.tooling != 'Coverage' }} - name: Code Coverage Run run: | - pre=$([[ "${{ inputs.build-type }}" != "Debug" ]] && echo "-o sisl:prerelease=${{ inputs.prerelease }}" || echo "") - conan install \ - ${pre} \ - -o sisl:malloc_impl=${{ inputs.malloc-impl }} \ - -o iomgr:testing=off \ + du -sh ~/.conan2/p/* + df -h + conan build \ + -o "sisl/*:prerelease=${{ inputs.prerelease }}" \ + -o "sisl/*:malloc_impl=${{ inputs.malloc-impl }}" \ -o coverage=True \ + -c tools.build:skip_test=False \ -s build_type=${{ inputs.build-type }} \ --build missing \ . - conan build . - if: ${{ inputs.testing == 'True' && inputs.tooling == 'Coverage' }} + if: ${{ inputs.tooling == 'Coverage' }} - name: Upload coverage reports to Codecov uses: codecov/codecov-action@v4 with: token: ${{ secrets.CODECOV_TOKEN }} gcov: true - if: ${{ inputs.testing == 'True' && inputs.tooling == 'Coverage' }} + if: ${{ inputs.tooling == 'Coverage' }} diff --git a/.github/workflows/merge_build.yml b/.github/workflows/merge_build.yml index 3cc5f8c6d..0dc1430ce 100644 --- a/.github/workflows/merge_build.yml +++ b/.github/workflows/merge_build.yml @@ -4,7 +4,7 @@ on: workflow_dispatch: push: branches: - - master + - main jobs: Build: @@ -36,16 +36,3 @@ jobs: malloc-impl: ${{ matrix.malloc-impl }} prerelease: ${{ matrix.prerelease }} tooling: ${{ matrix.tooling }} - ChainBuild: - runs-on: "ubuntu-22.04" - steps: - - name: Start HomeObject Build - run: | - curl -L \ - -X POST \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer ${{ secrets.CHAIN_BUILD_TOKEN }}"\ - -H "X-GitHub-Api-Version: 2022-11-28" \ - https://api.github.com/repos/eBay/homeobject/actions/workflows/conan_build.yml/dispatches \ - -d '{"ref":"main","inputs":{}}' - if: ${{ github.ref == 'refs/heads/master' }} diff --git a/.github/workflows/pr_build.yml b/.github/workflows/pr_build.yml index 3bcd6cb35..7ed350aed 100644 --- a/.github/workflows/pr_build.yml +++ b/.github/workflows/pr_build.yml @@ -4,7 +4,7 @@ on: workflow_dispatch: pull_request: branches: - - master + - main jobs: Build: @@ -15,7 +15,7 @@ jobs: build-type: ["Debug", "Release"] malloc-impl: ["libc", "tcmalloc"] prerelease: ["True", "False"] - tooling: ["Sanitize", "Coverage", "None"] + tooling: ["Sanitize", "None"] exclude: - build-type: Debug prerelease: "False" @@ -27,8 +27,6 @@ jobs: malloc-impl: libc - build-type: Release tooling: Sanitize - - build-type: Release - tooling: Coverage uses: ./.github/workflows/build_commit.yml with: platform: ${{ matrix.platform }} diff --git a/CMakeLists.txt b/CMakeLists.txt index e1a0bbdc0..728a2bdbc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -87,6 +87,18 @@ endif () add_flags("-DPACKAGE_NAME=\\\"${PROJECT_NAME}\\\"") add_flags("-DPACKAGE_VERSION=\\\"${PACKAGE_REVISION}\\\"") +# add replication flag +if (DEFINED REPLICATION) + if 
(${REPLICATION} STREQUAL "ON")
+    add_flags("-DREPLICATION_SUPPORT")
+    message(STATUS "Building with REPLICATION enabled")
+  else()
+    message(STATUS "Building with REPLICATION disabled")
+  endif()
+else()
+  message(STATUS "Building with REPLICATION disabled")
+endif()
+
 if(UNIX)
     # enable proper pread/pwrite and large file
     add_flags("-D_POSIX_C_SOURCE=200809L -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE")
@@ -116,4 +128,4 @@ message(STATUS "C++ flags: ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_
 message(STATUS "Linker flags (executable): ${CMAKE_EXE_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS_${UC_CMAKE_BUILD_TYPE}}")
 message(STATUS "Linker flags (shared): ${CMAKE_SHARED_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS_${UC_CMAKE_BUILD_TYPE}}")
 message(STATUS "Linker flags (module): ${CMAKE_MODULE_LINKER_FLAGS} ${CMAKE_MODULE_LINKER_FLAGS_${UC_CMAKE_BUILD_TYPE}}")
-message(STATUS "Linker flags (static): ${CMAKE_STATIC_LINKER_FLAGS} ${CMAKE_STATIC_LINKER_FLAGS_${UC_CMAKE_BUILD_TYPE}}")
+message(STATUS "Linker flags (static): ${CMAKE_STATIC_LINKER_FLAGS} ${CMAKE_STATIC_LINKER_FLAGS_${UC_CMAKE_BUILD_TYPE}}")
\ No newline at end of file
diff --git a/README.md b/README.md
index 7808f6042..e400ccdb3 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # HomeStore
-[![Conan Build](https://github.com/eBay/HomeStore/actions/workflows/merge_build.yml/badge.svg?branch=master)](https://github.com/eBay/HomeStore/actions/workflows/merge_build.yml)
-[![CodeCov](https://codecov.io/gh/eBay/homestore/branch/master/graph/badge.svg)](https://codecov.io/gh/eBay/homestore)
+[![Conan Build](https://github.com/hkadayam/HomeStore/actions/workflows/merge_build.yml/badge.svg?branch=master)](https://github.com/hkadayam/HomeStore/actions/workflows/merge_build.yml)
+[![CodeCov](https://codecov.io/gh/hkadayam/homestore/branch/master/graph/badge.svg)](https://codecov.io/gh/hkadayam/homestore)
 
 Homestore is a generic *StorageEngine* upon which different *StorageSolution*s can be built.
 These Solutions can model Block, K/V, Object or Database *StorageInterface*s.
@@ -31,7 +31,7 @@ Free flat-allocation space. Hooks are provided if a particular allocation patter
 ### LogSvc (std::list)
 Random Access circular buffer. Typically not used directly but leveraged by other Services to provide crash-resiliency.
-## Application Diagram +## Architecture Diagram ![HomeObject Overview](docs/imgs/HomeStore.png) diff --git a/cmake/test_mode.cmake b/cmake/test_mode.cmake index 486186bd5..4195a68b1 100644 --- a/cmake/test_mode.cmake +++ b/cmake/test_mode.cmake @@ -39,6 +39,9 @@ if (DEFINED TEST_TARGET) set(${ret} true) endif() endmacro() + macro(can_build_repl_tests ret) + set(${ret} false) + endmacro() else() macro(can_build_io_tests ret) set(${ret} false) @@ -55,4 +58,7 @@ else() macro(can_build_epoll_io_tests ret) set(${ret} false) endmacro() + macro(can_build_repl_tests ret) + set(${ret} false) + endmacro() endif() diff --git a/conanfile.py b/conanfile.py index f4f51a727..fab1039da 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.20.14" + version = "5.3.1" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" @@ -25,6 +25,7 @@ class HomestoreConan(ConanFile): "coverage": ['True', 'False'], "sanitize": ['True', 'False'], "testing" : ['full', 'min', 'off', 'epoll_mode', 'spdk_mode'], + "replication" : ['off', 'on'], } default_options = { 'shared': False, @@ -32,6 +33,7 @@ class HomestoreConan(ConanFile): 'coverage': False, 'sanitize': False, 'testing': 'epoll_mode', + 'replication': 'off', } exports_sources = "cmake/*", "src/*", "CMakeLists.txt", "test_wrap.sh", "LICENSE" @@ -43,18 +45,19 @@ def configure(self): if self.settings.build_type == "Debug": if self.options.coverage and self.options.sanitize: raise ConanInvalidConfiguration("Sanitizer does not work with Code Coverage!") - if self.conf.get("tools.build:skip_test", default=False): - if self.options.coverage or self.options.sanitize: - raise ConanInvalidConfiguration("Coverage/Sanitizer requires Testing!") + #if self.conf.get("tools.build:skip_test", default=False): + #if self.options.coverage or self.options.sanitize: + # raise ConanInvalidConfiguration("Coverage/Sanitizer requires Testing!") def build_requirements(self): self.test_requires("benchmark/1.8.2") self.test_requires("gtest/1.14.0") def requirements(self): - self.requires("iomgr/[^11.3]@oss/master", transitive_headers=True) - self.requires("sisl/[^12.2]@oss/master", transitive_headers=True) - self.requires("nuraft_mesg/[~3.8.5]@oss/main", transitive_headers=True) + self.requires("iomgr/[^12.1]@oss/master", transitive_headers=True) + self.requires("sisl/[^13.3]@oss/master", transitive_headers=True) + if str(self.options.replication) == "on": + self.requires("nuraft_mesg/[^4.1]@oss/main", transitive_headers=True) self.requires("farmhash/cci.20190513@", transitive_headers=True) if self.settings.arch in ['x86', 'x86_64']: @@ -104,6 +107,10 @@ def generate(self): tc.variables['MEMORY_SANITIZER_ON'] = 'ON' tc.variables["CONAN_PACKAGE_NAME"] = self.name tc.variables["CONAN_PACKAGE_VERSION"] = self.version + if str(self.options.replication) == "on": + tc.variables["REPLICATION"] = "ON" + else: + tc.variables["REPLICATION"] = "OFF" tc.generate() # This generates "boost-config.cmake" and "grpc-config.cmake" etc in self.generators_folder diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index af80b11b4..7b33a68e8 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -8,15 +8,27 @@ find_package(isa-l QUIET) find_package(iomgr QUIET REQUIRED) find_package(farmhash QUIET REQUIRED) find_package(GTest QUIET REQUIRED) -find_package(NuraftMesg QUIET REQUIRED) +if (DEFINED REPLICATION) + if (${REPLICATION} STREQUAL "ON") + find_package(NuraftMesg QUIET REQUIRED) + endif() +endif() 
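+
+# How the flag travels (sketch): the conan option `replication=on` sets the CMake variable
+# REPLICATION=ON, which in turn adds the -DREPLICATION_SUPPORT compile flag in the top-level
+# CMakeLists. C++ code can then gate replication-only paths on that define (the guarded body
+# below is hypothetical, shown only to illustrate the pattern):
+#
+#   #ifdef REPLICATION_SUPPORT
+#   // replication-specific code, e.g. registering the replication service
+#   #endif
+#
+# A replication-enabled build can be requested via conan, e.g.:
+#   conan create -o "homestore/*:replication=on" -s build_type=Debug --build missing .
+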
 list(APPEND COMMON_DEPS
     iomgr::iomgr
     farmhash::farmhash
-    nuraft_mesg::proto
-    nuraft::nuraft
     sisl::sisl
 )
+
+if (DEFINED REPLICATION)
+  if (${REPLICATION} STREQUAL "ON")
+    list(APPEND COMMON_DEPS
+      nuraft_mesg::proto
+      nuraft::nuraft
+    )
+  endif()
+endif()
+
 if (${isa-l_FOUND})
     list(APPEND COMMON_DEPS isa-l::isa-l)
 else ()
@@ -42,9 +54,16 @@
 add_subdirectory(lib/logstore)
 add_subdirectory(lib/meta)
 add_subdirectory(lib/index)
 add_subdirectory(lib/blkdata_svc/)
-add_subdirectory(lib/replication/)
+if (DEFINED REPLICATION)
+  if (${REPLICATION} STREQUAL "ON")
+    add_subdirectory(lib/replication/)
+  endif()
+endif()
+
+if(NOT DEFINED BUILD_TESTING OR BUILD_TESTING)
+  add_subdirectory(tests)
+endif()
-add_subdirectory(tests)
 set(HOMESTORE_OBJECTS
     $
     $
@@ -53,17 +72,22 @@ set(HOMESTORE_OBJECTS
     $
     $
     $
+    $
+    $
     $
-    $
     lib/homestore.cpp
     lib/crc.cpp
-    lib/logging.cpp
-    #$
-    #$
 )
+
+if (DEFINED REPLICATION)
+  if (${REPLICATION} STREQUAL "ON")
+    list(APPEND HOMESTORE_OBJECTS $)
+  endif()
+endif()
 #target_link_libraries(homestore_objs ${COMMON_DEPS})
 add_library(homestore STATIC
     ${HOMESTORE_OBJECTS}
 )
-target_link_libraries(homestore ${COMMON_DEPS})
+target_compile_definitions (homestore PRIVATE LOG_MODS_V2_SUPPORT)
+target_link_libraries(homestore PRIVATE ${COMMON_DEPS})
diff --git a/src/docs/metablk_test_case.md b/src/docs/metablk_test_case.md
new file mode 100644
index 000000000..7d9e2c362
--- /dev/null
+++ b/src/docs/metablk_test_case.md
@@ -0,0 +1,59 @@
+# MetaBlk Manager Testing
+
+## Overall Purpose
+
+The primary goal of this test suite is to thoroughly validate the functionality and robustness of the MetaBlkMgr (Meta Block Manager) component within the HomeStore system. The MetaBlkMgr is responsible for managing metadata blocks, which are crucial for storing and retrieving information about data stored in HomeStore.
+
+## Key Functionality Under Test
+
+The tests cover a wide range of operations and scenarios related to metadata block management, including:
+
+### Basic Operations
+
+* Write: Writing new metadata blocks to the storage.
+* Read: Reading back previously written metadata blocks.
+* Update: Modifying existing metadata blocks.
+* Remove: Deleting metadata blocks.
+
+### Advanced Scenarios
+
+* Overflow Blocks: Testing the handling of metadata blocks that exceed a certain size and require overflow blocks.
+* Compression: Verifying that compression and decompression of metadata blocks work correctly, including scenarios where compression is initially used but later backed off due to a poor compression ratio.
+* Unaligned Writes: Testing the ability to handle writes to unaligned memory addresses.
+* Write to Full: Testing the behavior when the metadata storage space is completely filled.
+* Recovery: Simulating system restarts and ensuring that the MetaBlkMgr can correctly recover its state and data from persistent storage.
+* Random Load: Running a mix of write, update, and remove operations in a random order to simulate real-world usage patterns.
+* Dependency Chain: Testing the dependency chain of the meta sub-types.
+* Bad Data Recovery: Testing recovery from bad data.
+
+### Error Handling and Robustness
+
+* Data Integrity: Verifying that data written to metadata blocks is read back correctly, using MD5 checksums to detect corruption (a sketch of this pattern appears at the end of this document).
+* Resource Management: Ensuring that memory and other resources are properly allocated and deallocated.
+* Concurrency: Using mutexes to protect shared data structures and ensure thread safety.
+* Assertions: Using HS_DBG_ASSERT and HS_REL_ASSERT to detect unexpected conditions and failures.
+
+### Configuration and Settings
+
+* Dynamic Settings: Testing the ability to change settings at runtime, such as the compression ratio limit and whether to skip header size checks during recovery.
+* Command-Line Options: Using SISL_OPTIONS to configure test parameters such as the number of I/O operations, run time, write/update/remove percentages, and I/O sizes.
+
+## Test Breakdown
+
+Here's a more detailed look at the individual tests:
+
+* min_drive_size_test: Checks whether the minimum drive size requirement is met.
+* write_to_full_test: Tests the ability to write until the metadata storage is full.
+* single_read_test: Tests a single write and read operation.
+* random_dependency_test: Tests the dependency chain of the meta sub-types.
+* recovery_test: Tests the recovery process after a simulated restart. It writes a certain amount of data, restarts HomeStore, and then writes more data to ensure that the recovery process works correctly.
+* random_load_test: Performs a random mix of write, update, and remove operations, followed by a recovery test to ensure data integrity.
+* RecoveryFromBadData: (Only in prerelease builds.) Simulates a scenario where bad data is written to disk due to a bug, and then tests the ability to recover from it.
+* CompressionBackoff: Tests the scenario where compression is initially used but later backed off due to a poor compression ratio.
+
+## Test Structure
+
+* VMetaBlkMgrTest Class: The base class for all the tests. It sets up and tears down the HomeStore environment, provides helper functions for common operations, and defines the test logic.
+* Param Struct: Holds the parameters that can be configured via command-line options.
+* sb_info_t Struct: Stores information about a superblock (metadata block), including its cookie and MD5 checksum.
+* meta_op_type Enum: Defines the types of operations.
+
+In essence, this test suite is a comprehensive examination of the MetaBlkMgr's capabilities, designed to catch bugs, ensure data integrity, and validate its behavior under various conditions.
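+
+## Example: MD5-Based Integrity Check
+
+The data-integrity checks above reduce to hashing a payload at write time and comparing the digest after read-back. Below is a minimal sketch of that pattern, assuming OpenSSL's `MD5()` is available; the test suite itself uses its own helpers and the `sb_info_t` cookie/MD5 pairing described above.
+
+```cpp
+#include <openssl/md5.h> // MD5(); link with -lcrypto
+#include <cstdio>
+#include <cstring>
+#include <vector>
+
+// Compute an MD5 digest over a metadata payload.
+static void md5_of(std::vector< unsigned char > const& buf, unsigned char out[MD5_DIGEST_LENGTH]) {
+    MD5(buf.data(), buf.size(), out);
+}
+
+int main() {
+    std::vector< unsigned char > payload(4096, 0xAB); // stand-in for a meta blk payload
+    unsigned char written[MD5_DIGEST_LENGTH], readback[MD5_DIGEST_LENGTH];
+
+    md5_of(payload, written);  // digest recorded at write time
+    // ... write the payload, simulate a restart, read it back into `payload` ...
+    md5_of(payload, readback); // digest recomputed after read-back
+
+    if (std::memcmp(written, readback, MD5_DIGEST_LENGTH) != 0) {
+        std::fprintf(stderr, "meta blk corruption detected\n");
+        return 1;
+    }
+    return 0;
+}
+```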
\ No newline at end of file diff --git a/src/include/homestore/blk.h b/src/include/homestore/blk.h index beb88b69f..a3e0a7768 100644 --- a/src/include/homestore/blk.h +++ b/src/include/homestore/blk.h @@ -75,6 +75,7 @@ struct BlkId { blk_count_t blk_count() const { return s.m_nblks; } chunk_num_t chunk_num() const { return s.m_chunk_num; } bool is_multi() const { return s.m_is_multi; } + std::pair< BlkId, BlkId > split(blk_count_t count) const; void invalidate(); uint64_t to_integer() const; @@ -117,6 +118,7 @@ struct MultiBlkId : public BlkId { void add(blk_num_t blk_num, blk_count_t nblks, chunk_num_t chunk_num); void add(BlkId const&); + std::pair< MultiBlkId, MultiBlkId > split(blk_count_t count) const; uint16_t num_pieces() const; blk_count_t blk_count() const; std::string to_string() const; @@ -248,16 +250,17 @@ VENUM(BlkAllocStatus, uint32_t, ); struct blk_alloc_hints { - blk_temp_t desired_temp{0}; // Temperature hint for the device - std::optional< uint32_t > reserved_blks; // Reserved blks in a chunk - std::optional< uint32_t > pdev_id_hint; // which physical device to pick (hint if any) -1 for don't care - std::optional< chunk_num_t > chunk_id_hint; // any specific chunk id to pick for this allocation - std::optional< MultiBlkId > committed_blk_id; // blk id indicates the blk was already allocated and committed, - // don't allocate and commit again - std::optional< stream_id_t > stream_id_hint; // any specific stream to pick - std::optional< uint64_t > application_hint; // hints in uint64 what will be passed opaque to select_chunk - bool can_look_for_other_chunk{true}; // If alloc on device not available can I pick other device - bool is_contiguous{true}; // Should the entire allocation be one contiguous block + blk_temp_t desired_temp{0}; // Temperature hint for the device + std::optional< uint32_t > reserved_blks{std::nullopt}; // Reserved blks in a chunk + std::optional< uint32_t > pdev_id_hint{std::nullopt}; // which physical device to pick (hint if any) + std::optional< chunk_num_t > chunk_id_hint{std::nullopt}; // any specific chunk id to pick for this allocation + std::optional< MultiBlkId > committed_blk_id{ + std::nullopt}; // blk id indicates the blk was already allocated and committed, don't allocate and commit again + std::optional< stream_id_t > stream_id_hint{std::nullopt}; // any specific stream to pick + std::optional< uint64_t > application_hint{ + std::nullopt}; // hints in uint64 what will be passed opaque to select_chunk + bool can_look_for_other_chunk{true}; // If alloc on device not available can I pick other device + bool is_contiguous{true}; // Should the entire allocation be one contiguous block bool partial_alloc_ok{false}; // ok to allocate only portion of nblks? 
Mutually exclusive with is_contiguous uint32_t min_blks_per_piece{1}; // blks allocated in a blkid should be atleast this size per entry uint32_t max_blks_per_piece{max_blks_per_blkid()}; // Number of blks on every entry diff --git a/src/include/homestore/blkdata_service.hpp b/src/include/homestore/blkdata_service.hpp index 9671a3901..69b2f2ee4 100644 --- a/src/include/homestore/blkdata_service.hpp +++ b/src/include/homestore/blkdata_service.hpp @@ -280,19 +280,6 @@ class BlkDataService { std::unique_ptr< BlkReadTracker > m_blk_read_tracker; std::shared_ptr< ChunkSelector > m_custom_chunk_selector; uint32_t m_blk_size; - -private: - // graceful shutdown related - std::atomic_bool m_stopping{false}; - mutable std::atomic_uint64_t pending_request_num{0}; - - bool is_stopping() const { return m_stopping.load(); } - void start_stopping() { m_stopping = true; } - - uint64_t get_pending_request_num() const { return pending_request_num.load(); } - - void incr_pending_request_num() const { pending_request_num++; } - void decr_pending_request_num() const { pending_request_num--; } }; extern BlkDataService& data_service(); diff --git a/src/include/homestore/btree/btree.hpp b/src/include/homestore/btree/btree.hpp index 154e3e651..c159dc2f3 100644 --- a/src/include/homestore/btree/btree.hpp +++ b/src/include/homestore/btree/btree.hpp @@ -17,211 +17,208 @@ #include #include +#include #include #include #include -#include "btree_req.hpp" -#include "btree_kv.hpp" +#include +#include #include #include - -SISL_LOGGING_DECL(btree) +#include +#include namespace homestore { -using BtreeNodePtr = boost::intrusive_ptr< BtreeNode >; -using BtreeNodeList = folly::small_vector< BtreeNodePtr, 3 >; +class BtreeStore; -struct BtreeVisualizeVariables { - uint64_t parent; - uint64_t midPoint; - uint64_t index; -}; +template < typename K > +using PutPaginateCookie = unique< BtreeRangePutRequest< K > >; -struct BtreeThreadVariables { - std::vector< btree_locked_node_info > wr_locked_nodes; - std::vector< btree_locked_node_info > rd_locked_nodes; - BtreeNodePtr force_split_node{nullptr}; -}; +template < typename K > +using RemovePaginateCookie = unique< BtreeRangeRemoveRequest< K > >; -struct BTREE_FLIPS { - static constexpr uint32_t INDEX_PARENT_NON_ROOT = 1 << 0; - static constexpr uint32_t INDEX_PARENT_ROOT = 1 << 1; - static constexpr uint32_t INDEX_LEFT_SIBLING = 1 << 2; - static constexpr uint32_t INDEX_RIGHT_SIBLING = 1 << 3; - - uint32_t flips; - BTREE_FLIPS() : flips{0} {} - std::string list() const { - std::string str; - if (flips & INDEX_PARENT_NON_ROOT) { str += "index_parent_non_root,"; } - if (flips & INDEX_PARENT_ROOT) { str += "index_parent_root,"; } - if (flips & INDEX_LEFT_SIBLING) { str += "index_left_sibling,"; } - if (flips & INDEX_RIGHT_SIBLING) { str += "index_right_sibling,"; } - return str; - } - void set_flip(uint32_t flip) { flips |= flip; } - void set_flip(std::string flip) { - if (flip == "index_parent_non_root") { set_flip(INDEX_PARENT_NON_ROOT); } - if (flip == "index_parent_root") { set_flip(INDEX_PARENT_ROOT); } - if (flip == "index_left_sibling") { set_flip(INDEX_LEFT_SIBLING); } - if (flip == "index_right_sibling") { set_flip(INDEX_RIGHT_SIBLING); } - } -}; +template < typename K > +using QueryPaginateCookie = unique< BtreeQueryRequest< K > >; template < typename K, typename V > -class Btree { -protected: - mutable iomgr::FiberManagerLib::shared_mutex m_btree_lock; - BtreeLinkInfo m_root_node_info; - - BtreeMetrics m_metrics; - std::atomic< bool > m_destroyed{false}; - std::atomic< 
uint64_t > m_total_leaf_nodes{0};
-    std::atomic< uint64_t > m_total_interior_nodes{0};
-    std::atomic< uint8_t > m_btree_depth{0};
-    uint32_t m_node_size{4096};
-#ifndef NDEBUG
-    std::atomic< uint64_t > m_req_id{0};
-#endif
-#ifdef _PRERELEASE
-    BTREE_FLIPS m_flips;
-#endif
-    // This workaround of BtreeThreadVariables is needed instead of directly declaring statics
-    // to overcome the gcc bug, pointer here: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66944
-    static BtreeThreadVariables* bt_thread_vars() {
-        auto this_id(boost::this_fiber::get_id());
-        static thread_local std::map< boost::fibers::fiber::id, std::unique_ptr< BtreeThreadVariables > > fiber_map;
-        if (fiber_map.count(this_id)) { return fiber_map[this_id].get(); }
-        fiber_map[this_id] = std::make_unique< BtreeThreadVariables >();
-        return fiber_map[this_id].get();
-    }
-
-protected:
-    BtreeConfig m_bt_cfg;
-
+class Btree : public BtreeBase {
 public:
     /////////////////////////////////////// All External APIs /////////////////////////////
-    Btree(const BtreeConfig& cfg);
+    Btree(BtreeConfig const& cfg, uuid_t uuid = uuid_t{}, uuid_t parent_uuid = uuid_t{}, uint32_t user_sb_size = 0);
+    Btree(BtreeConfig const& cfg, superblk< IndexSuperBlock >&& sb);
     virtual ~Btree();
 
-    template < typename ReqT >
-    btree_status_t put(ReqT& put_req);
+    // Destroy the entire btree from persistent storage and from memory. Note that not all blocks are destroyed in
+    // one go; for a persistent btree, destruction may be a staged operation spanning multiple checkpoints.
+    folly::Future< folly::Unit > destroy() override;
+
+    // @brief Inserts or updates a key-value pair in the B-tree.
+    //
+    // This function inserts a new key-value pair or updates an existing key-value pair in the B-tree
+    // based on the specified put type. Optionally, it can return the existing value and apply a filter
+    // callback before insertion.
+    //
+    // @param key The key to be inserted or updated.
+    // @param value The value to be associated with the key.
+    // @param put_type The type of put operation (e.g., insert, update, upsert).
+    // @param existing_val Optional pointer to store the existing value prior to update if the key already exists.
+    // @param filter_cb Optional callback function to apply a filter before insertion. If provided and an existing
+    // key-value pair is found, the filter callback is called with the existing key, the existing value and the new
+    // value. The callback can return "replace", in which case the existing value is replaced with the new value, or
+    // "keep", in which case the key is left unmodified.
+    //
+    // @return The status of the put operation.
+    //
+    btree_status_t put_one(BtreeKey const& key, BtreeValue const& value, btree_put_type put_type,
+                           BtreeValue* existing_val = nullptr, put_filter_cb_t filter_cb = nullptr);
+
+    // @brief Inserts or updates a range of key-value pairs in the B-tree.
+    //
+    // This function inserts a new range of key-value pairs or updates existing key-value pairs in the B-tree
+    // based on the specified put type. Optionally, it can return the existing value and apply a filter
+    // callback before insertion.
+    //
+    // This is a versatile function whose behavior depends on the key type.
+    //
+    // Interval Key Behavior:
+    // If the key is an interval key (meaning next_key can be derived as prev_key + 1, as with integer keys) and an
+    // input range such as [1, 50) is provided, then it behaves as follows:
+    // 1. If the put_type is INSERT and a specific key in the interval range is not present in the btree, then it
+    // will insert that key.
+    //
+    // 2. If the put_type is UPSERT, then it will insert the keys within the range for which there is no entry in the
+    // btree. However, for keys that exist, it will call filter_cb(key, current_value, new_value) if provided and
+    // expects the callback to return the decision. The decision could be:
+    // a. replace - replace the existing value with the new value. Note that the new_value is applied at the same
+    // offset as the key. So if the key range is [1, 50) and the key is 10, then the value is written at offset 10
+    // of the original value provided (the caller can avoid this shifting by supplying a BtreeValue override which
+    // simply doesn't add the offset).
+    //
+    // b. remove - remove the key from the btree and don't add the new value. This feature is useful when the btree
+    // maintains multiple versions of a key: when the new version of the key is written, the older versions can be
+    // removed as part of the same write operation.
+    //
+    // c. keep - keep the existing value as is and don't add the new value.
+    //
+    // 3. If the put_type is UPDATE, then it only acts on keys which already exist, and the behavior is identical to
+    // the UPSERT case above when the key is present.
+    //
+    // Non-Interval Key Behavior:
+    // If the key is not an interval key, then only put_type = UPDATE is supported. It will walk through the keys
+    // within the range and call filter_cb(key, current_value, new_value) if provided, expecting the callback to
+    // return the decision. The decision could be:
+    // a. replace - replace the existing value with the new value for that key.
+    // b. remove - remove the key from the btree and don't update it with the new value.
+    // c. keep - keep the existing value as is and don't modify the key's value.
+    // In this non-interval key case, the keys in the range are all updated with the same value.
+    //
+    // About batch size:
+    // The batch size is the number of keys that will be processed in one go. If more keys remain after a batch, the
+    // call returns btree_status_t::has_more and the caller is expected to call the put_range_next() method with the
+    // returned cookie to resume the next batch, until it returns btree_status_t::success (see the usage sketch
+    // below). Note that the batch size is a best effort from the btree: at any iteration it may put between 1 and
+    // batch_size keys (it will put at least one key and at most batch_size keys per iteration).
+    //
+    // @param inp_range The range of keys to insert, upsert or update.
+    // @param put_type The type of put operation (e.g., insert, update, upsert).
+    // @param value The value to be associated with the key. Behavior is different for interval and non-interval keys
+    // (see above).
+    // @param batch_size The number of keys to process in one go. Default is to attempt to process all keys in one go.
+    // Please see the note above about the batch size.
+    // @param filter_cb Optional callback function to apply a filter before insertion. (See above for details.)
+    //
+    // @return The status of the put operation and a cookie; if the status is btree_status_t::has_more, the caller is
+    // expected to call put_range_next().
+    std::pair< btree_status_t, PutPaginateCookie< K > >
+    put_range(BtreeKeyRange< K >&& inp_range, btree_put_type put_type, BtreeValue const& value,
+              uint32_t batch_size = std::numeric_limits< uint32_t >::max(), put_filter_cb_t filter_cb = nullptr);
+
+    // @brief Continuation of the put_range call for the next batch of keys. Calling this method without calling
+    // put_range first returns an error.
+    //
+    // @param cookie The cookie returned by the put_range call.
+    //
+    // @return The status of the put operation and a cookie; if the status is btree_status_t::has_more, the caller is
+    // expected to call put_range_next() again. Failing to do so will result in a memory leak.
+    btree_status_t put_range_next(PutPaginateCookie< K >& cookie);
+
+    // @brief Gets the value associated with the specified key from the B-tree.
+    //
+    // @param key The key to search for.
+    // @param out_val A pointer to store the value associated with the key. (Should be non-nullptr.)
+    //
+    // @return The status of the get operation.
+    btree_status_t get_one(BtreeKey const& key, BtreeValue* out_val);
+
+    // @brief Gets any one value associated with the given key range. If the key range matches multiple keys, then
+    // the btree will randomly pick one key and return the value associated with it.
+    //
+    // @param inp_range The range of keys to search for.
+    // @param out_key A pointer to store the picked key of the entry found. (Should be non-nullptr.)
+    // @param out_val A pointer to store the value associated with the picked key. (Should be non-nullptr.)
+    //
+    // @return The status of the get_any operation.
+    btree_status_t get_any(BtreeKeyRange< K >&& inp_range, BtreeKey* out_key, BtreeValue* out_val);
+
+    // @brief Removes the key-value pair associated with the specified key from the B-tree.
+    //
+    // @param key The key to remove.
+    // @param out_val An optional pointer to store the value associated with the key before removal.
+    //
+    // @return The status of the remove operation.
+    btree_status_t remove_one(BtreeKey const& key, BtreeValue* out_val);
+
+    // @brief Removes any one key-value pair associated with the given key range. If the key range matches multiple
+    // keys, then the btree will randomly pick one key and remove the key-value pair associated with it.
+    //
+    // @param inp_range The range of keys to search for.
+    // @param out_key A pointer to store the picked key within the range. (Should be non-nullptr.) Valid only if the
+    // return status is btree_status_t::success.
+    // @param out_val A pointer to store the value associated with the picked key. (Should be non-nullptr.) Valid
+    // only if the return status is btree_status_t::success.
+    //
+    // @return The status of the remove_any operation.
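+
+    // Usage sketch for the paginated put_range()/put_range_next() APIs above. This is illustrative caller
+    // code only: the BtreeKeyRange construction is shown schematically, and the exact btree_put_type
+    // enumerator spelling may differ.
+    //
+    //     auto [status, cookie] = bt.put_range(std::move(range) /* e.g. keys [1, 50) */,
+    //                                          btree_put_type::UPSERT, value, 128 /* batch_size */);
+    //     while (status == btree_status_t::has_more) {
+    //         status = bt.put_range_next(cookie); // resume the next batch with the same cookie
+    //     }
+    //     // put_range_next() resets the cookie once the status is no longer has_more; abandoning a
+    //     // has_more cookie leaks the pending request.
+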
+ btree_status_t remove_any(BtreeKeyRange< K >&& inp_range, BtreeKey* out_key, BtreeValue* out_val); + + std::pair< btree_status_t, RemovePaginateCookie< K > > + remove_range(BtreeKeyRange< K >&& inp_range, uint32_t batch_size = std::numeric_limits< uint32_t >::max(), + remove_filter_cb_t filter_cb = nullptr); + + btree_status_t remove_range_next(RemovePaginateCookie< K >& cookie); + + std::pair< btree_status_t, QueryPaginateCookie< K > > + query(BtreeKeyRange< K >&& inp_range, std::vector< std::pair< K, V > >& out_kvs, + uint32_t batch_size = std::numeric_limits< uint32_t >::max(), + BtreeQueryType query_type = BtreeQueryType::SWEEP_NON_INTRUSIVE_PAGINATION_QUERY, + get_filter_cb_t filter_cb = nullptr); + + btree_status_t query_next(QueryPaginateCookie< K >& cookie, std::vector< std::pair< K, V > >& out_kvs); - template < typename ReqT > - btree_status_t get(ReqT& get_req) const; - - template < typename ReqT > - btree_status_t remove(ReqT& rreq); - - btree_status_t query(BtreeQueryRequest< K >& query_req, std::vector< std::pair< K, V > >& out_values) const; - - // bool verify_tree(bool update_debug_bm) const; - virtual std::pair< btree_status_t, uint64_t > destroy_btree(void* context); nlohmann::json get_status(int log_level) const; - void dump_tree_to_file(const std::string& file = "") const; - std::string to_custom_string(to_string_cb_t< K, V > const& cb) const; - std::string visualize_tree_keys(const std::string& file) const; - uint64_t count_keys(bnodeid_t bnodeid = 0) const; - std::pair< uint64_t, uint64_t > compute_node_count(); - std::pair< uint64_t, uint64_t > get_num_nodes() const; - uint16_t compute_btree_depth(); - uint16_t get_btree_depth() const; + nlohmann::json get_metrics_in_json(bool updated); - nlohmann::json get_metrics_in_json(bool updated = true); - bnodeid_t root_node_id() const; + std::string to_string() const; - uint64_t root_link_version() const; - void set_root_node_info(const BtreeLinkInfo& info); - - // static void set_io_flip(); - // static void set_error_flip(); -#ifdef _PRERELEASE - void set_flip_point(std::string flip) { m_flips.set_flip(flip); } - void set_flips(std::vector< std::string > flips) { - for (const auto& flip : flips) { - set_flip_point(flip); - } - } - std::string flip_list() const { return m_flips.list(); } -#endif + std::string to_custom_string(BtreeNode::ToStringCallback< K, V > cb) const; -protected: - /////////////////////////// Methods the underlying store is expected to handle /////////////////////////// - virtual BtreeNodePtr alloc_node(bool is_leaf) = 0; - virtual BtreeNode* init_node(uint8_t* node_buf, bnodeid_t id, bool init_buf, bool is_leaf) const; - virtual btree_status_t read_node_impl(bnodeid_t id, BtreeNodePtr& node) const = 0; - virtual btree_status_t write_node_impl(const BtreeNodePtr& node, void* context) = 0; - virtual btree_status_t refresh_node(const BtreeNodePtr& node, bool for_read_modify_write, void* context) const = 0; - virtual void free_node_impl(const BtreeNodePtr& node, void* context) = 0; - virtual btree_status_t transact_nodes(const BtreeNodeList& new_nodes, const BtreeNodeList& freed_nodes, - const BtreeNodePtr& left_child_node, const BtreeNodePtr& parent_node, - void* context) = 0; - virtual btree_status_t on_root_changed(BtreeNodePtr const& root, void* context) = 0; - virtual std::string btree_store_type() const = 0; - - /////////////////////////// Methods the application use case is expected to handle /////////////////////////// + std::string to_digraph_visualize_format() const; -protected: - btree_status_t 
create_root_node(void* op_context); - - /////////////////////////////// Internal Node Management Methods //////////////////////////////////// - btree_status_t read_and_lock_node(bnodeid_t id, BtreeNodePtr& node_ptr, locktype_t int_lock_type, - locktype_t leaf_lock_type, void* context) const; - void read_node_or_fail(bnodeid_t id, BtreeNodePtr& node) const; - btree_status_t write_node(const BtreeNodePtr& node, void* context); - void free_node(const BtreeNodePtr& node, locktype_t cur_lock, void* context); - BtreeNodePtr alloc_leaf_node(); - BtreeNodePtr alloc_interior_node(); - - btree_status_t get_child_and_lock_node(const BtreeNodePtr& node, uint32_t index, BtreeLinkInfo& child_info, - BtreeNodePtr& child_node, locktype_t int_lock_type, - locktype_t leaf_lock_type, void* context) const; - btree_status_t upgrade_node_locks(const BtreeNodePtr& parent_node, const BtreeNodePtr& child_node, - locktype_t& parent_cur_lock, locktype_t& child_cur_lock, void* context); - btree_status_t upgrade_node_lock(const BtreeNodePtr& node, locktype_t& cur_lock, void* context); - btree_status_t _lock_node(const BtreeNodePtr& node, locktype_t type, void* context, const char* fname, - int line) const; - void unlock_node(const BtreeNodePtr& node, locktype_t type) const; - - std::pair< btree_status_t, uint64_t > do_destroy(); - void observe_lock_time(const BtreeNodePtr& node, locktype_t type, uint64_t time_spent) const; - - static void _start_of_lock(const BtreeNodePtr& node, locktype_t ltype, const char* fname, int line); - static bool remove_locked_node(const BtreeNodePtr& node, locktype_t ltype, btree_locked_node_info* out_info); - static uint64_t end_of_lock(const BtreeNodePtr& node, locktype_t ltype); - bool can_extents_auto_merge() const { return true; } // TODO: Make this rcu and dynamically settable - -#ifndef NDEBUG - static void check_lock_debug(); -#endif + void dump(const std::string& file, std::string format = "string", + BtreeNode::ToStringCallback< K, V > cb = nullptr) const; - /////////////////////////////////// Helper Methods /////////////////////////////////////// - btree_status_t post_order_traversal(locktype_t acq_lock, const auto& cb); - btree_status_t post_order_traversal(const BtreeNodePtr& node, locktype_t acq_lock, const auto& cb); - void get_all_kvs(std::vector< std::pair< K, V > >& kvs) const; - btree_status_t do_destroy(uint64_t& n_freed_nodes, void* context); - void get_child_node_count(bnodeid_t bnodeid, uint64_t& interior_cnt, uint64_t& leaf_cnt) const; - void to_string(bnodeid_t bnodeid, std::string& buf) const; - void to_custom_string_internal(bnodeid_t bnodeid, std::string& buf, to_string_cb_t< K, V > const& cb, - int nindent = -1) const; - void to_dot_keys(bnodeid_t bnodeid, std::string& buf, std::map< uint32_t, std::vector< uint64_t > >& l_map, - std::map< uint64_t, BtreeVisualizeVariables >& info_map) const; - void sanity_sub_tree(bnodeid_t bnodeid = 0) const; - void validate_node(const bnodeid_t& bnodeid) const; - void validate_node_child_relation(BtreeNodePtr node, BtreeNodePtr& last_child_node) const; - void validate_next_node_relation(BtreeNodePtr node, BtreeNodePtr neighbor_node, BtreeNodePtr last_child_node) const; - void validate_sanity_child(const BtreeNodePtr& parent_node, uint32_t ind) const; - void validate_sanity_next_child(const BtreeNodePtr& parent_node, uint32_t ind) const; - void print_node(const bnodeid_t& bnodeid) const; + bnodeid_t root_node_id() const; - void append_route_trace(BtreeRequest& req, const BtreeNodePtr& node, btree_event_t event, uint32_t 
start_idx = 0, - uint32_t end_idx = 0) const; + uint64_t count_keys(bnodeid_t start_bnodeid = empty_bnodeid) const; - //////////////////////////////// Impl Methods ////////////////////////////////////////// +private: + /////////////////////////////////// Mutate Impl methods ///////////////////////// + template < typename ReqT > + btree_status_t put(ReqT& put_req); ///////// Mutate Impl Methods template < typename ReqT > @@ -237,25 +234,37 @@ class Btree { bool is_split_needed(const BtreeNodePtr& node, ReqT& req) const; btree_status_t split_node(const BtreeNodePtr& parent_node, const BtreeNodePtr& child_node, uint32_t parent_ind, - K* out_split_key, void* context); - btree_status_t mutate_extents_in_leaf(const BtreeNodePtr& my_node, BtreeRangePutRequest< K >& rpreq); + K* out_split_key, CPContext* context); - ///////// Remove Impl Methods + ///////////////////////////////// Get Impl Methods ///////////////////////////////// template < typename ReqT > - btree_status_t check_collapse_root(ReqT& rreq); + btree_status_t get(ReqT& get_req); + + template < typename ReqT > + btree_status_t do_get(const BtreeNodePtr& my_node, ReqT& greq); + + ///////////////////////////////// Remove Impl Methods ///////////////////////////////// + template < typename ReqT > + btree_status_t remove(ReqT& rreq); template < typename ReqT > btree_status_t do_remove(const BtreeNodePtr& my_node, locktype_t curlock, ReqT& rreq); + template < typename ReqT > + btree_status_t check_collapse_root(ReqT& rreq); + btree_status_t merge_nodes(const BtreeNodePtr& parent_node, const BtreeNodePtr& leftmost_node, uint32_t start_indx, - uint32_t end_indx, void* context); - bool remove_extents_in_leaf(const BtreeNodePtr& node, BtreeRangeRemoveRequest< K >& rrreq); + uint32_t end_indx, CPContext* context); + + ///////////////////////////////// Query Impl Methods ///////////////////////////////// + btree_status_t query(BtreeQueryRequest< K >& query_req, std::vector< std::pair< K, V > >& out_values); - ///////// Query Impl Methods btree_status_t do_sweep_query(BtreeNodePtr& my_node, BtreeQueryRequest< K >& qreq, - std::vector< std::pair< K, V > >& out_values) const; + std::vector< std::pair< K, V > >& out_values); + btree_status_t do_traversal_query(const BtreeNodePtr& my_node, BtreeQueryRequest< K >& qreq, - std::vector< std::pair< K, V > >& out_values) const; + std::vector< std::pair< K, V > >& out_values); + #ifdef SERIALIZABLE_QUERY_IMPLEMENTATION btree_status_t do_serialzable_query(const BtreeNodePtr& my_node, BtreeSerializableQueryRequest& qreq, std::vector< std::pair< K, V > >& out_values); @@ -264,8 +273,33 @@ class Btree { std::vector< std::pair< K, V > >& out_values); #endif - ///////// Get Impl Methods - template < typename ReqT > - btree_status_t do_get(const BtreeNodePtr& my_node, ReqT& greq) const; +private: + /////////////////////////////// Internal Node Management Methods //////////////////////////////////// + // BtreeNode* init_node(uint8_t* node_buf, bnodeid_t id, bool init_buf, bool is_leaf, + // BtreeNode::Allocator::Token token) const override; + virtual BtreeNodePtr new_node(bnodeid_t id, bool is_leaf, BtreeNode::Allocator::Token token) const override; + virtual BtreeNodePtr load_node(uint8_t* node_buf, bnodeid_t id, BtreeNode::Allocator::Token token) const override; + + /////////////////////////////////// Helper Methods /////////////////////////////////////// + btree_status_t post_order_traversal(locktype_t acq_lock, const auto& cb); + btree_status_t post_order_traversal(const BtreeNodePtr& node, locktype_t 
acq_lock, const auto& cb); + void get_all_kvs(std::vector< std::pair< K, V > >& kvs) const; + uint64_t get_btree_node_cnt() const; + uint64_t get_child_node_cnt(bnodeid_t bnodeid) const; + void to_string_internal(bnodeid_t bnodeid, std::string& buf) const; + void to_custom_string_internal(bnodeid_t bnodeid, std::string& buf, + BtreeNode::ToStringCallback< K, V > const& cb) const; + void to_dot_keys(bnodeid_t bnodeid, std::string& buf, std::map< uint32_t, std::vector< uint64_t > >& l_map, + std::map< uint64_t, BtreeVisualizeVariables >& info_map) const; + void validate_sanity_child(const BtreeNodePtr& parent_node, uint32_t ind) const; + void validate_sanity_next_child(const BtreeNodePtr& parent_node, uint32_t ind) const; + void print_node(const bnodeid_t& bnodeid) const; + + void append_route_trace(BtreeRequest& req, const BtreeNodePtr& node, btree_event_t event, uint32_t start_idx = 0, + uint32_t end_idx = 0) const; + +protected: + mutable iomgr::FiberManagerLib::shared_mutex m_btree_lock; + std::atomic< bool > m_destroyed{false}; }; } // namespace homestore diff --git a/src/include/homestore/btree/btree.ipp b/src/include/homestore/btree/btree.ipp index ca7a18294..6be5d8988 100644 --- a/src/include/homestore/btree/btree.ipp +++ b/src/include/homestore/btree/btree.ipp @@ -36,252 +36,117 @@ namespace homestore { template < typename K, typename V > -Btree< K, V >::Btree(const BtreeConfig& cfg) : - m_metrics{cfg.name().c_str()}, m_node_size{cfg.node_size()}, m_bt_cfg{cfg} { - m_bt_cfg.set_node_data_size(cfg.node_size() - sizeof(persistent_hdr_t)); +Btree< K, V >::Btree(BtreeConfig const& cfg, uuid_t uuid, uuid_t parent_uuid, uint32_t user_sb_size) : + BtreeBase::BtreeBase(cfg, uuid, parent_uuid, user_sb_size) { + create_root_node(); } template < typename K, typename V > -Btree< K, V >::~Btree() = default; +Btree< K, V >::Btree(BtreeConfig const& cfg, superblk< IndexSuperBlock >&& sb) : + BtreeBase::BtreeBase(cfg, std::move(sb)) { + if (m_root_node_info.bnode_id() == empty_bnodeid) { + BT_LOG(INFO, "Loaded an empty btree, we are creating a new root node"); + create_root_node(); + } +} + +template < typename K, typename V > +Btree< K, V >::~Btree() { + if (is_ephemeral()) { destroy(); } +} +#if 0 template < typename K, typename V > void Btree< K, V >::set_root_node_info(const BtreeLinkInfo& info) { m_root_node_info = info; } +#endif template < typename K, typename V > -uint16_t Btree< K, V >::get_btree_depth() const {return m_btree_depth;} +btree_status_t Btree< K, V >::put_one(BtreeKey const& key, BtreeValue const& value, btree_put_type put_type, + BtreeValue* existing_val, put_filter_cb_t filter_cb) { + BtreeSinglePutRequest req{*this, &key, &value, put_type, existing_val, std::move(filter_cb)}; + auto const status = put(req); + return status; +} template < typename K, typename V > -std::pair Btree< K, V >::get_num_nodes() const { - return {m_total_interior_nodes, m_total_leaf_nodes}; +std::pair< btree_status_t, PutPaginateCookie< K > > +Btree< K, V >::put_range(BtreeKeyRange< K >&& inp_range, btree_put_type put_type, BtreeValue const& value, + uint32_t batch_size, put_filter_cb_t filter_cb) { + auto req_ptr = std::make_unique< BtreeRangePutRequest< K > >(*this, std::move(inp_range), put_type, &value, + batch_size, std::move(filter_cb)); + auto const status = put(*req_ptr); + return std::pair(status, std::move(req_ptr)); } - template < typename K, typename V > -std::pair< btree_status_t, uint64_t > Btree< K, V >::destroy_btree(void* context) { - btree_status_t ret{btree_status_t::success}; - 
uint64_t n_freed_nodes{0}; - - bool expected = false; - if (!m_destroyed.compare_exchange_strong(expected, true)) { - BT_LOG(DEBUG, "Btree is already being destroyed, ignorining this request"); - return std::make_pair(btree_status_t::not_found, 0); - } - ret = do_destroy(n_freed_nodes, context); - if (ret == btree_status_t::success) { - BT_LOG(DEBUG, "btree(root: {}) {} nodes destroyed successfully", m_root_node_info.bnode_id(), n_freed_nodes); - } else { - m_destroyed = false; - BT_LOG(ERROR, "btree(root: {}) nodes destroyed failed, ret: {}", m_root_node_info.bnode_id(), ret); - } - - return std::make_pair(ret, n_freed_nodes); +btree_status_t Btree< K, V >::put_range_next(PutPaginateCookie< K >& cookie) { + auto const status = put(*cookie); + if (status != btree_status_t::has_more) { cookie.reset(); } + return status; } template < typename K, typename V > -template < typename ReqT > -btree_status_t Btree< K, V >::put(ReqT& put_req) { - static_assert(std::is_same_v< ReqT, BtreeSinglePutRequest > || std::is_same_v< ReqT, BtreeRangePutRequest< K > >, - "put api is called with non put request type"); - COUNTER_INCREMENT(m_metrics, btree_write_ops_count, 1); - auto acq_lock = locktype_t::READ; - bool is_leaf = false; - - m_btree_lock.lock_shared(); - btree_status_t ret = btree_status_t::success; - -retry: -#ifndef NDEBUG - check_lock_debug(); -#endif - BT_LOG_ASSERT_EQ(bt_thread_vars()->rd_locked_nodes.size(), 0); - BT_LOG_ASSERT_EQ(bt_thread_vars()->wr_locked_nodes.size(), 0); - - BtreeNodePtr root; - ret = read_and_lock_node(m_root_node_info.bnode_id(), root, acq_lock, acq_lock, put_req.m_op_context); - if (ret != btree_status_t::success) { goto out; } - is_leaf = root->is_leaf(); - - if (is_split_needed(root, put_req)) { - // Time to do the split of root. - unlock_node(root, acq_lock); - m_btree_lock.unlock_shared(); - ret = check_split_root(put_req); - BT_LOG_ASSERT_EQ(bt_thread_vars()->rd_locked_nodes.size(), 0); - BT_LOG_ASSERT_EQ(bt_thread_vars()->wr_locked_nodes.size(), 0); - - // We must have gotten a new root, need to start from scratch. 
- m_btree_lock.lock_shared(); - if (ret != btree_status_t::success) { - LOGERROR("root split failed btree name {}", m_bt_cfg.name()); - goto out; - } - - goto retry; - } else if ((is_leaf) && (acq_lock != locktype_t::WRITE)) { - // Root is a leaf, need to take write lock, instead of read, retry - unlock_node(root, acq_lock); - acq_lock = locktype_t::WRITE; - goto retry; - } else { - ret = do_put(root, acq_lock, put_req); - if ((ret == btree_status_t::retry) || (ret == btree_status_t::has_more)) { - // Need to start from top down again, since there was a split or we have more to insert in case of range put - acq_lock = locktype_t::READ; - BT_LOG(TRACE, "retrying put operation"); - BT_LOG_ASSERT_EQ(bt_thread_vars()->rd_locked_nodes.size(), 0); - BT_LOG_ASSERT_EQ(bt_thread_vars()->wr_locked_nodes.size(), 0); - goto retry; - } - } - -out: - m_btree_lock.unlock_shared(); -#ifndef NDEBUG - check_lock_debug(); -#endif - if (ret != btree_status_t::success && ret != btree_status_t::cp_mismatch) { - BT_LOG(ERROR, "btree put failed {}", ret); - COUNTER_INCREMENT(m_metrics, write_err_cnt, 1); - } - - return ret; +btree_status_t Btree< K, V >::get_one(BtreeKey const& key, BtreeValue* out_val) { + BtreeSingleGetRequest req{*this, &key, out_val}; + return get(req); } template < typename K, typename V > -template < typename ReqT > -btree_status_t Btree< K, V >::get(ReqT& greq) const { - static_assert(std::is_same_v< BtreeSingleGetRequest, ReqT > || std::is_same_v< BtreeGetAnyRequest< K >, ReqT >, - "get api is called with non get request type"); - COUNTER_INCREMENT(m_metrics, btree_query_ops_count, 1); - btree_status_t ret = btree_status_t::success; - - m_btree_lock.lock_shared(); - BtreeNodePtr root; - - ret = read_and_lock_node(m_root_node_info.bnode_id(), root, locktype_t::READ, locktype_t::READ, greq.m_op_context); - if (ret != btree_status_t::success) { goto out; } - - ret = do_get(root, greq); -out: - m_btree_lock.unlock_shared(); - -#ifndef NDEBUG - check_lock_debug(); -#endif - return ret; +btree_status_t Btree< K, V >::get_any(BtreeKeyRange< K >&& inp_range, BtreeKey* out_key, BtreeValue* out_val) { + BtreeGetAnyRequest< K > req{*this, std::move(inp_range), out_key, out_val}; + return get(req); } template < typename K, typename V > -template < typename ReqT > -btree_status_t Btree< K, V >::remove(ReqT& req) { - static_assert(std::is_same_v< ReqT, BtreeSingleRemoveRequest > || - std::is_same_v< ReqT, BtreeRangeRemoveRequest< K > > || - std::is_same_v< ReqT, BtreeRemoveAnyRequest< K > >, - "remove api is called with non remove request type"); - COUNTER_INCREMENT(m_metrics, btree_remove_ops_count, 1); - locktype_t acq_lock = locktype_t::READ; - m_btree_lock.lock_shared(); - -retry: - btree_status_t ret = btree_status_t::success; - BtreeNodePtr root; - ret = read_and_lock_node(m_root_node_info.bnode_id(), root, acq_lock, acq_lock, req.m_op_context); - if (ret != btree_status_t::success) { goto out; } - - if (root->total_entries() == 0) { - if (root->is_leaf()) { - // There are no entries in btree. 
- unlock_node(root, acq_lock); - m_btree_lock.unlock_shared(); - ret = btree_status_t::not_found; - goto out; - } - - BT_NODE_LOG_ASSERT_EQ(root->has_valid_edge(), true, root, "Orphaned root with no entries and no edge"); - unlock_node(root, acq_lock); - m_btree_lock.unlock_shared(); - - ret = check_collapse_root(req); - if (ret != btree_status_t::success && ret != btree_status_t::merge_not_required) { - LOGERROR("check collapse read failed btree name {}", m_bt_cfg.name()); - goto out; - } - - // We must have gotten a new root, need to start from scratch. - m_btree_lock.lock_shared(); - goto retry; - } else if (root->is_leaf() && (acq_lock != locktype_t::WRITE)) { - // Root is a leaf, need to take write lock, instead of read, retry - unlock_node(root, acq_lock); - acq_lock = locktype_t::WRITE; - goto retry; - } else { - ret = do_remove(root, acq_lock, req); - if (ret == btree_status_t::retry) { - // Need to start from top down again, since there was a merge nodes in-between - acq_lock = locktype_t::READ; - goto retry; - } - } - m_btree_lock.unlock_shared(); - -out: -#ifndef NDEBUG - check_lock_debug(); -#endif - return ret; +btree_status_t Btree< K, V >::remove_one(BtreeKey const& key, BtreeValue* out_val) { + BtreeSingleRemoveRequest req{*this, &key, out_val}; + return remove(req); } template < typename K, typename V > -btree_status_t Btree< K, V >::query(BtreeQueryRequest< K >& qreq, std::vector< std::pair< K, V > >& out_values) const { - COUNTER_INCREMENT(m_metrics, btree_query_ops_count, 1); +btree_status_t Btree< K, V >::remove_any(BtreeKeyRange< K >&& inp_range, BtreeKey* out_key, BtreeValue* out_val) { + BtreeRemoveAnyRequest< K > req{*this, std::move(inp_range), out_key, out_val}; + return remove(req); +} - btree_status_t ret = btree_status_t::success; - if (qreq.batch_size() == 0) { return ret; } +template < typename K, typename V > +std::pair< btree_status_t, RemovePaginateCookie< K > > +Btree< K, V >::remove_range(BtreeKeyRange< K >&& inp_range, uint32_t batch_size, remove_filter_cb_t filter_cb) { + auto req_ptr = + std::make_unique< BtreeRangeRemoveRequest< K > >(*this, std::move(inp_range), batch_size, std::move(filter_cb)); + auto status = remove(*req_ptr); + return std::pair(status, std::move(req_ptr)); +} - m_btree_lock.lock_shared(); - BtreeNodePtr root = nullptr; - ret = read_and_lock_node(m_root_node_info.bnode_id(), root, locktype_t::READ, locktype_t::READ, qreq.m_op_context); - if (ret != btree_status_t::success) { goto out; } - - switch (qreq.query_type()) { - case BtreeQueryType::SWEEP_NON_INTRUSIVE_PAGINATION_QUERY: - ret = do_sweep_query(root, qreq, out_values); - break; - - case BtreeQueryType::TREE_TRAVERSAL_QUERY: - ret = do_traversal_query(root, qreq, out_values); - break; - - default: - unlock_node(root, locktype_t::READ); - LOGERROR("Query type {} is not supported yet", qreq.query_type()); - break; - } +template < typename K, typename V > +btree_status_t Btree< K, V >::remove_range_next(RemovePaginateCookie< K >& cookie) { + auto const status = remove(*cookie); + if (status != btree_status_t::has_more) { cookie.reset(); } + return status; +} - if ((qreq.query_type() == BtreeQueryType::SWEEP_NON_INTRUSIVE_PAGINATION_QUERY || - qreq.query_type() == BtreeQueryType::TREE_TRAVERSAL_QUERY)) { - if (out_values.size()) { - K out_last_key = out_values.back().first; - if (out_last_key.compare(qreq.input_range().end_key()) >= 0) { ret = btree_status_t::success; } - qreq.shift_working_range(std::move(out_last_key), false /* non inclusive*/); - } else { - 
DEBUG_ASSERT_NE(ret, btree_status_t::has_more, "Query returned has_more, but no values added") - } - } +template < typename K, typename V > +std::pair< btree_status_t, QueryPaginateCookie< K > > +Btree< K, V >::query(BtreeKeyRange< K >&& inp_range, // Input range to query for + std::vector< std::pair< K, V > >& out_kvs, // Results will be appended + uint32_t batch_size, // Batch size, default the whole set + BtreeQueryType query_type, // See query_impl for more details + get_filter_cb_t filter_cb // Any filtering condition while picking the result set +) { + auto req_ptr = std::make_unique< BtreeQueryRequest< K > >(*this, std::move(inp_range), query_type, batch_size, + std::move(filter_cb)); + auto status = query(*req_ptr, out_kvs); + return std::pair(status, std::move(req_ptr)); +} -out: - m_btree_lock.unlock_shared(); -#ifndef NDEBUG - check_lock_debug(); -#endif - if ((ret != btree_status_t::success) && (ret != btree_status_t::has_more)) { - BT_LOG(ERROR, "btree query failed {}", ret); - COUNTER_INCREMENT(m_metrics, query_err_cnt, 1); - } - return ret; +template < typename K, typename V > +btree_status_t Btree< K, V >::query_next(QueryPaginateCookie< K >& cookie, std::vector< std::pair< K, V > >& out_kvs) { + if (cookie == nullptr) { return btree_status_t::success; } + auto const status = query(*cookie, out_kvs); + if (status != btree_status_t::has_more) { cookie.reset(); } + return status; } #if 0 @@ -317,39 +182,40 @@ nlohmann::json Btree< K, V >::get_status(int log_level) const { } template < typename K, typename V > -void Btree< K, V >::dump_tree_to_file(const std::string& file) const { +nlohmann::json Btree< K, V >::get_metrics_in_json(bool updated) { + return m_metrics.get_result_in_json(updated); +} + +template < typename K, typename V > +std::string Btree< K, V >::to_string() const { std::string buf; m_btree_lock.lock_shared(); - to_string(m_root_node_info.bnode_id(), buf); + to_string_internal(m_root_node_info.bnode_id(), buf); m_btree_lock.unlock_shared(); + BT_LOG(DEBUG, "Pre order traversal of tree:\n<{}>", buf); - BT_LOG(INFO, "Pre order traversal of tree:\n<{}>", buf); - if (!file.empty()) { - std::ofstream o(file); - o.write(buf.c_str(), buf.size()); - o.flush(); - } + return buf; } template < typename K, typename V > -std::string Btree< K, V >::to_custom_string(to_string_cb_t< K, V > const& cb) const { +std::string Btree< K, V >::to_custom_string(BtreeNode::ToStringCallback< K, V > cb) const { std::string buf; m_btree_lock.lock_shared(); - to_custom_string_internal(m_root_node_info.bnode_id(), buf, cb); + to_custom_string_internal(m_root_node_info.bnode_id(), buf, std::move(cb)); m_btree_lock.unlock_shared(); return buf; } template < typename K, typename V > -std::string Btree< K, V >::visualize_tree_keys(const std::string& file) const { +std::string Btree< K, V >::to_digraph_visualize_format() const { std::map< uint32_t, std::vector< uint64_t > > level_map; std::map< uint64_t, BtreeVisualizeVariables > info_map; std::string buf = "digraph G\n" "{ \n" "ranksep = 3.0;\n" R"(graph [splines="polyline"]; -)"; + )"; m_btree_lock.lock_shared(); to_dot_keys(m_root_node_info.bnode_id(), buf, level_map, info_map); @@ -373,17 +239,38 @@ std::string Btree< K, V >::visualize_tree_keys(const std::string& file) const { } buf += "\n" + result + " }\n"; - if (!file.empty()) { - std::ofstream o(file); - o.write(buf.c_str(), buf.size()); - o.flush(); - } return buf; } template < typename K, typename V > -nlohmann::json Btree< K, V >::get_metrics_in_json(bool updated) { - return 
m_metrics.get_result_in_json(updated); +void Btree< K, V >::dump(const std::string& file, std::string format, BtreeNode::ToStringCallback< K, V > cb) const { + if (file.empty()) { + BT_LOG(ERROR, "Wrong file name to dump btree"); + return; + } + + std::string buf; + if (format == "string") { + BT_LOG(DEBUG, "Dumping btree in string format"); + buf = to_string(); + } else if (format == "dot") { + BT_LOG(DEBUG, "Dumping btree to dot format"); + buf = to_digraph_visualize_format(); + } else if (format == "custom") { + if (cb == nullptr) { + BT_LOG(WARN, "Custom format requested but no callback provided, dumping as string"); + buf = to_string(); + } else { + buf = to_custom_string(std::move(cb)); + } + } else { + BT_LOG(ERROR, "Invalid format={} to dump btree", format); + return; + } + + std::ofstream o(file); + o.write(buf.c_str(), buf.size()); + o.flush(); } template < typename K, typename V > @@ -392,8 +279,25 @@ bnodeid_t Btree< K, V >::root_node_id() const { } template < typename K, typename V > -uint64_t Btree< K, V >::root_link_version() const { - return m_root_node_info.link_version(); +uint64_t Btree< K, V >::count_keys(bnodeid_t bnodeid) const { + BtreeNodePtr node; + locktype_t acq_lock = locktype_t::READ; + if (read_and_lock_node(bnodeid, node, acq_lock, acq_lock, nullptr) != btree_status_t::success) { return 0; } + uint64_t result = 0; + if (!node->is_leaf()) { + uint32_t i = 0; + while (i < node->total_entries()) { + BtreeLinkInfo p; + node->get_nth_value(i, &p, false); + result += count_keys(p.bnode_id()); + ++i; + } + if (node->has_valid_edge()) { result += count_keys(node->edge_id()); } + } else { + result = node->total_entries(); + } + unlock_node(node, acq_lock); + return result; } // TODO: Commenting out flip till we figure out how to move flip dependency inside sisl package. 
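The paginated entry points added above (put_range()/put_range_next(), remove_range()/remove_range_next(), query()/query_next()) share one calling convention: the first call returns a status together with a cookie (the moved request object), and the caller keeps invoking the *_next variant while the status is has_more. A minimal usage sketch of the query side, where MyKey/MyValue and the two-key range construction are illustrative assumptions rather than part of this patch:

    // Drains keys [1, 1000] in batches of 100 using the cookie-based pagination above.
    void drain_range(Btree< MyKey, MyValue >& bt) {
        std::vector< std::pair< MyKey, MyValue > > kvs;
        auto [status, cookie] = bt.query(BtreeKeyRange< MyKey >{MyKey{1}, MyKey{1000}}, kvs,
                                         100 /* batch_size */,
                                         BtreeQueryType::SWEEP_NON_INTRUSIVE_PAGINATION_QUERY,
                                         nullptr /* no filter */);
        while (status == btree_status_t::has_more) {
            // Each call appends up to batch_size more results; query_next() resets the
            // cookie internally once the final batch has been returned.
            status = bt.query_next(cookie, kvs);
        }
    }

put_range_next() and remove_range_next() follow the same loop shape, so a long-running range mutation can yield between batches.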
diff --git a/src/include/homestore/btree/btree_base.hpp b/src/include/homestore/btree/btree_base.hpp new file mode 100644 index 000000000..937a5ed46 --- /dev/null +++ b/src/include/homestore/btree/btree_base.hpp @@ -0,0 +1,171 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace homestore { +class UnderlyingBtree { +public: + virtual ~UnderlyingBtree() = default; + + virtual BtreeNodePtr create_node(bool is_leaf, CPContext* context) = 0; + virtual btree_status_t write_node(BtreeNodePtr const& node, CPContext* context) = 0; + virtual btree_status_t read_node(bnodeid_t id, BtreeNodePtr& node) const = 0; + virtual btree_status_t refresh_node(BtreeNodePtr const& node, bool for_read_modify_write, CPContext* context) = 0; + virtual void remove_node(BtreeNodePtr const& node, CPContext* context) = 0; + virtual btree_status_t transact_nodes(const BtreeNodeList& new_nodes, const BtreeNodeList& removed_nodes, + const BtreeNodePtr& left_child_node, const BtreeNodePtr& parent_node, + CPContext* context) = 0; + virtual BtreeLinkInfo load_root_node_id() = 0; + virtual btree_status_t on_root_changed(BtreeNodePtr const& root, CPContext* context) = 0; + virtual uint64_t space_occupied() const = 0; +}; + +// Btree based implementations superblock area +struct BtreeSuperBlock { + static constexpr size_t underlying_btree_sb_size = + IndexSuperBlock::index_impl_sb_size - sizeof(bnodeid_t) - sizeof(uint64_t) - sizeof(uint32_t); + + bnodeid_t root_node_id{empty_bnodeid}; // Btree Root Node ID + uint64_t root_link_version{0}; + uint32_t node_size{0}; // Node size used for this btree + std::array< uint8_t, underlying_btree_sb_size > underlying_btree_sb; +}; + +class BtreeBase; + +struct BtreeRouteTracer { + SCOPED_ENUM_DECL(Op, uint8_t); + std::vector< bool > m_enabled_ops; + std::vector< std::string > m_ops_routes; + uint32_t m_max_buf_size_per_op; // Max size after which the buffer is rolled over + bool m_log_if_rolled; + mutable iomgr::FiberManagerLib::shared_mutex m_append_mtx; + + BtreeRouteTracer(uint32_t buf_size_per_op = 1 * 1024 * 1024, bool log_if_buf_rolled = false); + void enable(Op op) { m_enabled_ops[uint32_cast(op)] = true; } + void disable(Op op) { m_enabled_ops[uint32_cast(op)] = false; } + void enable_all() { m_enabled_ops.assign(m_enabled_ops.size(), true); } + void disable_all() { m_enabled_ops.assign(m_enabled_ops.size(), false); } + bool is_enabled_for(Op op) const { return m_enabled_ops[uint32_cast(op)]; } + + void append_to(Op op, std::string const& route); + std::string get(Op op) const; + std::vector< std::string > get_all() const; +}; + +SCOPED_ENUM_DEF(BtreeRouteTracer, Op, uint8_t, PUT, GET, REMOVE, QUERY); + +class BtreeStore; + +class BtreeBase : public Index { +public: + BtreeBase(BtreeConfig const& cfg, uuid_t uuid = uuid_t{}, uuid_t parent_uuid = uuid_t{}, uint32_t user_sb_size = 0); + BtreeBase(BtreeConfig const& cfg, superblk< IndexSuperBlock >&& sb); + virtual ~BtreeBase(); + + UnderlyingBtree const* underlying_btree() const { return m_bt_private.get(); } + UnderlyingBtree* underlying_btree() { + return const_cast< UnderlyingBtree* >(s_cast< const BtreeBase* >(this)->underlying_btree()); + } + + BtreeSuperBlock const& bt_super_blk() const { + return *(r_cast< BtreeSuperBlock const* >(super_blk()->underlying_index_sb.data())); + } + + BtreeSuperBlock& bt_super_blk() { + return const_cast< BtreeSuperBlock& >(s_cast< const BtreeBase* >(this)->bt_super_blk()); + } + + virtual BtreeNodePtr new_node(bnodeid_t id, bool is_leaf, BtreeNode::Allocator::Token 
token) const = 0; + virtual BtreeNodePtr load_node(uint8_t* node_buf, bnodeid_t id, BtreeNode::Allocator::Token token) const = 0; + + // virtual BtreeNode* init_node(uint8_t* node_buf, bnodeid_t id, bool init_buf, bool is_leaf, + // BtreeNode::Allocator::Token token) const = 0; + + uint64_t space_occupied() const override; + uint32_t ordinal() const override; + + virtual uint32_t node_size() const; + std::string name() const; + BtreeRouteTracer& route_tracer(); + BtreeConfig const& bt_config() const { return m_bt_cfg; } + [[nodiscard]] CPGuard bt_cp_guard(); + +public: + virtual btree_status_t write_node(const BtreeNodePtr& node, CPContext* context); + virtual void read_node_or_fail(bnodeid_t id, BtreeNodePtr& node) const; + virtual BtreeNodePtr create_leaf_node(CPContext* context); + virtual BtreeNodePtr create_interior_node(CPContext* context); + virtual void remove_node(const BtreeNodePtr& node, locktype_t cur_lock, CPContext* context); + +protected: + virtual btree_status_t create_root_node(); + virtual BtreeNodePtr clone_temp_node(BtreeNode const& node); + virtual btree_status_t read_and_lock_node(bnodeid_t id, BtreeNodePtr& node_ptr, locktype_t int_lock_type, + locktype_t leaf_lock_type, CPContext* context) const; + virtual btree_status_t get_child_and_lock_node(const BtreeNodePtr& node, uint32_t index, BtreeLinkInfo& child_info, + BtreeNodePtr& child_node, locktype_t int_lock_type, + locktype_t leaf_lock_type, CPContext* context) const; + + virtual btree_status_t upgrade_node_locks(const BtreeNodePtr& parent_node, const BtreeNodePtr& child_node, + locktype_t& parent_cur_lock, locktype_t& child_cur_lock, + CPContext* context); + virtual btree_status_t upgrade_node_lock(const BtreeNodePtr& node, locktype_t& cur_lock, CPContext* context); + virtual btree_status_t _lock_node(const BtreeNodePtr& node, locktype_t type, CPContext* context, const char* fname, + int line) const; + virtual void unlock_node(const BtreeNodePtr& node, locktype_t type) const; + +#ifdef _DEBUG +public: + struct NodeLockInfo { + BtreeNode* node; + Clock::time_point start_time; + const char* fname; + int line; + + void dump() const { LOGINFO("node locked by file: {}, line: {}", fname, line); } + }; + + struct BtreeThreadVariables { + std::vector< BtreeBase::NodeLockInfo > wr_locked_nodes; + std::vector< BtreeBase::NodeLockInfo > rd_locked_nodes; + }; + + // This workaround of BtreeThreadVariables is needed instead of directly declaring statics + // to overcome the gcc bug, pointer here: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66944 + static BtreeThreadVariables* thread_vars() { + auto this_id(boost::this_fiber::get_id()); + static thread_local std::map< boost::fibers::fiber::id, std::unique_ptr< BtreeThreadVariables > > fiber_map; + if (fiber_map.count(this_id)) { return fiber_map[this_id].get(); } + fiber_map[this_id] = std::make_unique< BtreeThreadVariables >(); + return fiber_map[this_id].get(); + } + virtual void observe_lock_time(const BtreeNodePtr& node, locktype_t type, uint64_t time_spent) const; + virtual void check_lock_debug(); + + static void _start_of_lock(const BtreeNodePtr& node, locktype_t ltype, const char* fname, int line); + static bool remove_locked_node(const BtreeNodePtr& node, locktype_t ltype, NodeLockInfo* out_info); + static uint64_t end_of_lock(const BtreeNodePtr& node, locktype_t ltype); +#endif + +protected: + shared< BtreeStore > m_store; + unique< UnderlyingBtree > m_bt_private; + BtreeLinkInfo m_root_node_info; + + BtreeConfig m_bt_cfg; + BtreeMetrics m_metrics; + 
BtreeRouteTracer m_route_tracer;
+    std::atomic< uint64_t > m_total_nodes{0};
+};
+
+struct BtreeVisualizeVariables {
+    uint64_t parent;
+    uint64_t midPoint;
+    uint64_t index;
+};
+} // namespace homestore
\ No newline at end of file
diff --git a/src/include/homestore/btree/btree_kv.hpp b/src/include/homestore/btree/btree_kv.hpp
index cbadc3830..642fa5e96 100644
--- a/src/include/homestore/btree/btree_kv.hpp
+++ b/src/include/homestore/btree/btree_kv.hpp
@@ -61,7 +61,7 @@ class BtreeKey {
 // integers, but it needs to be able to get next or prev key from a given key in the key range
 class BtreeIntervalKey : public BtreeKey {
 public:
-    virtual void shift(int n, void* app_ctx) = 0;
+    virtual void shift(int n) = 0;
     virtual int distance(BtreeKey const& from) const = 0;
 
     bool is_interval_key() const override { return true; }
@@ -142,7 +142,7 @@ class BtreeValue {
 
 class BtreeIntervalValue : public BtreeValue {
 public:
-    virtual void shift(int n, void* app_ctx) = 0;
+    virtual void shift(int n) = 0;
 
     virtual sisl::blob serialize_prefix() const = 0;
     virtual sisl::blob serialize_suffix() const = 0;
@@ -277,4 +277,8 @@ class BtreeLinkInfo : public BtreeValue {
     }
 };
 
+ENUM(put_filter_decision, uint8_t, keep, replace, remove);
+using put_filter_cb_t = std::function< put_filter_decision(BtreeKey const&, BtreeValue const&, BtreeValue const&) >;
+using remove_filter_cb_t = std::function< bool(BtreeKey const&, BtreeValue const&) >;
+using get_filter_cb_t = std::function< bool(BtreeKey const&, BtreeValue const&) >;
 } // namespace homestore
diff --git a/src/include/homestore/btree/btree_store.h b/src/include/homestore/btree/btree_store.h
new file mode 100644
index 000000000..1c1b349c3
--- /dev/null
+++ b/src/include/homestore/btree/btree_store.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+namespace homestore {
+
+class BtreeBase;
+class UnderlyingBtree;
+class CPContext;
+
+class BtreeStore : public IndexStore {
+public:
+    BtreeStore() = default;
+    virtual ~BtreeStore() = default;
+
+    // All Btree related operations
+    virtual unique< UnderlyingBtree > create_underlying_btree(BtreeBase& btree, bool load_existing) = 0;
+    virtual folly::Future< folly::Unit > destroy_underlying_btree(BtreeBase& btree) = 0;
+
+#if 0
+    // Called whenever a particular btree node has been freed. The underlying implementation could use this opportunity
+    // to free any contexts stored for this node.
+    virtual void on_node_freed(BtreeNode* node) = 0;
+#endif
+
+    // When a particular btree is to be destroyed, some stores can support a fast-destroy mechanism, where all the
+    // btree nodes can be freed in one go (in a single Checkpoint) without merging nodes and collapsing the tree. This
+    // saves a lot of IOs while destroying a btree. The requirement on the store is that it should be able to destroy
+    // and free all nodes within a single checkpoint. If the store doesn't support this, the btree library itself will
+    // keep merging nodes and collapsing the tree.
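    // As an illustration of the capability probes declared just below, a purely
    // in-memory store might answer them roughly like this (MemBtreeStore is a
    // hypothetical name, not part of this patch):
    //
    //     class MemBtreeStore : public BtreeStore {
    //     public:
    //         bool is_fast_destroy_supported() const override { return true; } // all nodes dropped in one checkpoint
    //         bool is_ephemeral() const override { return true; }              // nothing is persisted
    //         uint32_t max_node_size() const override { return 8192; }         // illustrative limit
    //         // create_underlying_btree()/destroy_underlying_btree() omitted for brevity
    //     };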
+    virtual bool is_fast_destroy_supported() const = 0;
+
+    virtual bool is_ephemeral() const = 0;
+
+    virtual uint32_t max_node_size() const = 0;
+};
+} // namespace homestore
\ No newline at end of file
diff --git a/src/include/homestore/btree/detail/btree_common.ipp b/src/include/homestore/btree/detail/btree_common.ipp
index b4e730b67..cabb85c46 100644
--- a/src/include/homestore/btree/detail/btree_common.ipp
+++ b/src/include/homestore/btree/detail/btree_common.ipp
@@ -15,6 +15,7 @@
  *********************************************************************************/
 #pragma once
 #include 
+#include 
 
 namespace homestore {
 
@@ -86,70 +87,100 @@ void Btree< K, V >::get_all_kvs(std::vector< std::pair< K, V > >& kvs) const {
 }
 
 template < typename K, typename V >
-btree_status_t Btree< K, V >::do_destroy(uint64_t& n_freed_nodes, void* context) {
-    return post_order_traversal(locktype_t::WRITE,
-                                [this, &n_freed_nodes, context](const auto& node, bool is_leaf) -> btree_status_t {
-                                    free_node(node, locktype_t::WRITE, context);
-                                    ++n_freed_nodes;
-                                    return btree_status_t::node_freed;
-                                });
+folly::Future< folly::Unit > Btree< K, V >::destroy() {
+    bool expected = false;
+    if (!m_destroyed.compare_exchange_strong(expected, true)) {
+        BT_LOG(DEBUG, "Btree is already being destroyed, ignoring this request");
+        return folly::makeFuture< folly::Unit >(folly::Unit{});
+    }
+
+    if (m_store->is_ephemeral()) {
+        post_order_traversal(locktype_t::WRITE, [this](const auto& node, bool is_leaf) -> btree_status_t {
+            // On an ephemeral btree we can remove the node directly; on a non-ephemeral btree this may happen
+            // only at checkpoint time, which the store itself is expected to handle.
+            remove_node(node, locktype_t::WRITE, nullptr);
+            return btree_status_t::node_freed;
+        });
+    } else if (!m_store->is_fast_destroy_supported()) {
+        // TODO: Needs to be implemented. We need to create a BtreeRangeRemoveRequest and put the entire range in the
+        // request, which should naturally collapse the tree and remove all the nodes. To generate the entire range we
+        // have two choices:
+        // a) Traverse to the leftmost and rightmost leaves, implement a btree node method to get the first and last
+        // key from a leaf node, put those in the range and traverse again.
+        // b) Generate magical BtreeKeys called "min" and "max" and put those in the range. However, the user of the
+        // Btree should understand this and handle it in their compare function.
+    } else {
+        // Let the store handle the fast delete of the btree as part of the destroy_underlying_btree() call.
+    }
+
+    BT_LOG(DEBUG, "btree(root: {}) destroyed successfully", m_root_node_info.bnode_id());
+    return m_store->destroy_underlying_btree(*this);
 }
 
+#if 0
 template < typename K, typename V >
-std::pair Btree< K, V >::compute_node_count() {
-    uint64_t leaf_cnt = 0;
-    uint64_t interior_cnt = 0;
-    m_btree_lock.lock_shared();
-    get_child_node_count(m_root_node_info.bnode_id(), interior_cnt, leaf_cnt);
-    m_total_leaf_nodes = leaf_cnt;
-    m_total_interior_nodes= interior_cnt;
-    m_btree_lock.unlock_shared();
-    return {interior_cnt, leaf_cnt};
+btree_status_t Btree< K, V >::do_destroy(std::function< void(BtreeKey const&, BtreeValue const&) > cb) {
+    if (m_store->is_fast_destroy_supported()) {
+        return post_order_traversal(locktype_t::WRITE, [this, &cb](const auto& node, bool is_leaf) -> btree_status_t {
+            // If a callback is defined, call it for each key-value pair before deleting. It is typically used in
+            // case the index stores some indirect data that needs to be freed.
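            // As a hedged sketch of such a callback (the value type and the free helper
            // are assumptions, not part of this patch), a caller could pass:
            //
            //     bt.do_destroy([](BtreeKey const&, BtreeValue const& v) {
            //         auto const& val = static_cast< MyIndexValue const& >(v); // hypothetical value type
            //         release_indirect_blks(val.blkid());                      // hypothetical free helper
            //     });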
+            if (cb != nullptr) {
+                // hand each of the node's key-value pairs to the callback
+                node->get_all_kvs([&cb](const auto& kvs) {
+                    for (const auto& kv : kvs) {
+                        cb(kv.first, kv.second);
+                    }
+                });
+            }
+
+            // On an ephemeral btree we can remove the node directly; on a non-ephemeral btree this may happen
+            // only at checkpoint time, which the store itself is expected to handle.
+            if (m_store->is_ephemeral()) { remove_node(node, locktype_t::WRITE, nullptr /* no CP context here */); }
+            return btree_status_t::success;
+        });
+    } else {
+        // TODO: Needs to be implemented. We need to create a BtreeRangeRemoveRequest and put the entire range in the
+        // request, which should naturally collapse the tree and remove all the nodes. To generate the entire range we
+        // have two choices:
+        // a) Traverse to the leftmost and rightmost leaves, implement a btree node method to get the first and last
+        // key from a leaf node, put those in the range and traverse again.
+        // b) Generate magical BtreeKeys called "min" and "max" and put those in the range. However, the user of the
+        // Btree should understand this and handle it in their compare function.
+    }
 }
+#endif
 
 template < typename K, typename V >
-uint16_t Btree< K, V >::compute_btree_depth() {
+uint64_t Btree< K, V >::get_btree_node_cnt() const {
+    uint64_t cnt = 1; /* increment it for root */
     m_btree_lock.lock_shared();
-    BtreeNodePtr root;
-    locktype_t acq_lock = locktype_t::READ;
-    if (read_and_lock_node(m_root_node_info.bnode_id(), root, acq_lock, acq_lock, nullptr) != btree_status_t::success){ return -1; }
-    m_btree_depth = root->level();
-    unlock_node(root, acq_lock);
+    cnt += get_child_node_cnt(m_root_node_info.bnode_id());
     m_btree_lock.unlock_shared();
-    return m_btree_depth;
+    return cnt;
 }
 
 template < typename K, typename V >
-void Btree< K, V >::get_child_node_count(bnodeid_t bnodeid, uint64_t& interior_cnt, uint64_t& leaf_cnt) const {
+uint64_t Btree< K, V >::get_child_node_cnt(bnodeid_t bnodeid) const {
+    uint64_t cnt{0};
     BtreeNodePtr node;
     locktype_t acq_lock = locktype_t::READ;
-    if (read_and_lock_node(bnodeid, node, acq_lock, acq_lock, nullptr) != btree_status_t::success) { return ; }
-    if(node->is_leaf()) {
-        ++leaf_cnt;
-    } else {
-        ++interior_cnt;
-    }
+    if (read_and_lock_node(bnodeid, node, acq_lock, acq_lock, nullptr) != btree_status_t::success) { return cnt; }
     if (!node->is_leaf()) {
-        if(node->level()==1){
-            leaf_cnt += node->total_entries() + (node->has_valid_edge()?1:0);
-        }else{
-            uint32_t i = 0;
-            while (i < node->total_entries()) {
-                BtreeLinkInfo p;
-                node->get_nth_value(i, &p, false);
-                get_child_node_count(p.bnode_id(), interior_cnt, leaf_cnt);
-                ++i;
-            }
-            if (node->has_valid_edge()) {get_child_node_count(node->edge_id(), interior_cnt, leaf_cnt); }
+        uint32_t i = 0;
+        while (i < node->total_entries()) {
+            BtreeLinkInfo p; node->get_nth_value(i, &p, false); // child links are values, not keys
+            cnt += get_child_node_cnt(p.bnode_id()) + 1;
+            ++i;
         }
+        if (node->has_valid_edge()) { cnt += get_child_node_cnt(node->edge_id()) + 1; }
     }
     unlock_node(node, acq_lock);
-    return ;
+    return cnt;
 }
 
 template < typename K, typename V >
-void Btree< K, V >::to_string(bnodeid_t bnodeid, std::string& buf) const {
+void Btree< K, V >::to_string_internal(bnodeid_t bnodeid, std::string& buf) const {
     BtreeNodePtr node;
     locktype_t acq_lock = locktype_t::READ;
 
@@ -162,35 +193,33 @@ void Btree< K, V >::to_string(bnodeid_t bnodeid, std::string& buf) const {
     while (i < node->total_entries()) {
         BtreeLinkInfo p;
         node->get_nth_value(i, &p, false);
-        to_string(p.bnode_id(), buf);
+        to_string_internal(p.bnode_id(), buf);
++i; } - if (node->has_valid_edge()) { to_string(node->edge_id(), buf); } + if (node->has_valid_edge()) { to_string_internal(node->edge_id(), buf); } } unlock_node(node, acq_lock); } template < typename K, typename V > -void Btree< K, V >::to_custom_string_internal(bnodeid_t bnodeid, std::string& buf, to_string_cb_t< K, V > const& cb, - int nindent) const { +void Btree< K, V >::to_custom_string_internal(bnodeid_t bnodeid, std::string& buf, + BtreeNode::ToStringCallback< K, V > const& cb) const { BtreeNodePtr node; locktype_t acq_lock = locktype_t::READ; if (read_and_lock_node(bnodeid, node, acq_lock, acq_lock, nullptr) != btree_status_t::success) { return; } - if (nindent < 0) { nindent = node->level(); } - std::string tabs(3 * (nindent - node->level()), ' '); - fmt::format_to(std::back_inserter(buf), "{}{}\n", tabs, node->to_custom_string(cb)); + fmt::format_to(std::back_inserter(buf), "{}\n", node->to_custom_string(cb)); if (!node->is_leaf()) { uint32_t i = 0; while (i < node->total_entries()) { BtreeLinkInfo p; node->get_nth_value(i, &p, false); - to_custom_string_internal(p.bnode_id(), buf, cb, nindent); + to_custom_string_internal(p.bnode_id(), buf, cb); ++i; } - if (node->has_valid_edge()) { to_custom_string_internal(node->edge_id(), buf, cb, nindent); } + if (node->has_valid_edge()) { to_custom_string_internal(node->edge_id(), buf, cb); } } unlock_node(node, acq_lock); } @@ -225,221 +254,6 @@ void Btree< K, V >::to_dot_keys(bnodeid_t bnodeid, std::string& buf, unlock_node(node, acq_lock); } -template < typename K, typename V > -uint64_t Btree< K, V >::count_keys(bnodeid_t bnodeid) const { - if (bnodeid == 0) { bnodeid = this->root_node_id(); } - BtreeNodePtr node; - locktype_t acq_lock = locktype_t::READ; - if (read_and_lock_node(bnodeid, node, acq_lock, acq_lock, nullptr) != btree_status_t::success) { return 0; } - uint64_t result = 0; - if (!node->is_leaf()) { - uint32_t i = 0; - while (i < node->total_entries()) { - BtreeLinkInfo p; - node->get_nth_value(i, &p, false); - result += count_keys(p.bnode_id()); - ++i; - } - if (node->has_valid_edge()) { result += count_keys(node->edge_id()); } - } else { - result = node->total_entries(); - } - unlock_node(node, acq_lock); - return result; -} - -template -void Btree::validate_node_child_relation(BtreeNodePtr node, BtreeNodePtr& last_child_node) const { - if (node->is_leaf()) { return; } - uint32_t nentries = node->has_valid_edge() ? 
node->total_entries() + 1 : node->total_entries(); - BtreeNodePtr previous_child = nullptr; - for (uint32_t ind = 0; ind < nentries; ++ind) { - BtreeLinkInfo child_info; - node->get_nth_value(ind, &child_info, false /* copy */); - if (child_info.bnode_id() == empty_bnodeid) { - throw std::runtime_error(fmt::format("{}-th child of node [{}] info has empty bnode_id", ind, node->to_string())); - } - BtreeNodePtr child_node; - if (auto ret = read_node_impl(child_info.bnode_id(), child_node); ret != btree_status_t::success) { - throw std::runtime_error(fmt::format("Failed to read child node [{}] of node [{}]", child_info.bnode_id(), node->to_string())); - } - if (ind == nentries - 1) { last_child_node = child_node; } - if (child_node->is_node_deleted()) { - throw std::runtime_error(fmt::format("Child node [{}] is deleted for parent [{}]", child_node->to_string(), node->to_string())); - } - if (child_node->level() != node->level() - 1) { - throw std::runtime_error(fmt::format("Child node level mismatch node [{}] child level: {}, expected: {}",child_node->to_string(), child_node->level(), node->level() - 1)); - } - - K child_first_key = child_node->get_first_key< K >(); - K child_last_key = child_node->get_last_key< K >(); - K parent_nth_key; - - if(child_node->total_entries() >0) { - if(ind< node->total_entries()){ - parent_nth_key= node->get_nth_key(ind, false /* copy */); - if(child_first_key.compare(parent_nth_key) > 0) { - throw std::runtime_error(fmt::format("{}-th Child node [{}] first key is less than its corresponding parent node [{}] key",ind,child_node->to_string(),node->to_string())); - } - if(child_last_key.compare(parent_nth_key) > 0) { - throw std::runtime_error(fmt::format("{}-th Child node [{}] last key is greater than its corresponding parent node [{}] key",ind, child_node->to_string(), node->to_string())); - } - } - - } else if (!child_node->is_leaf() && !child_node->has_valid_edge()) { - throw std::runtime_error(fmt::format("Interior Child node [{}] cannot be empty", child_node->to_string())); - } - - if(ind > 0){ - if (previous_child->next_bnode()!= child_node->node_id()) { - throw std::runtime_error(fmt::format("Broken child linkage: {}-th Child node [{}] node id is not equal to previous child node [{}] next node",ind, child_node->to_string(), child_node->node_id(), previous_child->to_string())); - } - K last_parent_key = node->get_nth_key< K >(ind-1, false /* copy */); - K previous_child_last_key = previous_child->get_last_key< K >(); - if(child_node->total_entries()){ - if (previous_child->total_entries() && child_first_key.compare(previous_child_last_key) <= 0) { - throw std::runtime_error(fmt::format("Child node [{}] first key is not greater than previous child node [{}] last key",child_node->to_string(), previous_child->to_string())); - } - if(child_first_key.compare(last_parent_key) <= 0) { - throw std::runtime_error(fmt::format("Child node [{}] first key is not greater than previous key ({}-th) parent node [{}] key ",child_node->to_string(),ind-1, node->to_string())); - } - } - } - - previous_child = child_node; - } - if(node->has_valid_edge() && last_child_node->is_leaf() && last_child_node->next_bnode()!=empty_bnodeid) { - // If the last child node is a leaf and has a next_bnode, it cannot be a valid edge. 
- throw std::runtime_error(fmt::format("Last child node [{}] of node [{}] is the last child but has next_bnode", - last_child_node->to_string(), node->to_string())); - } - if(node->has_valid_edge() && !last_child_node->is_leaf() && !last_child_node->has_valid_edge()) { - throw std::runtime_error(fmt::format("Last child node [{}] of edge node [{}] is not edge", - last_child_node->to_string(), node->to_string())); - } - if(!node->has_valid_edge() && last_child_node->is_leaf() && last_child_node->next_bnode()==empty_bnodeid){ - throw std::runtime_error(fmt::format("node [{}] is not edge but last child node [{}] is leaf and has no next_bnode", - node->to_string(),last_child_node->to_string())); - } - if(!node->has_valid_edge() && !last_child_node->is_leaf() && last_child_node->has_valid_edge()){ - throw std::runtime_error(fmt::format("node [{}] is not edge but last child node [{}] has valid edge", - node->to_string(), last_child_node->to_string())); - } -} - -template < typename K, typename V > -void Btree< K, V >::validate_next_node_relation(BtreeNodePtr node, BtreeNodePtr neighbor_node, - BtreeNodePtr last_child_node) const { - K last_key = node->get_last_key< K >(); - - if (neighbor_node->total_entries() == 0 && !neighbor_node->has_valid_edge() && last_child_node &&last_child_node->next_bnode() != empty_bnodeid) { - throw std::runtime_error(fmt::format("neighbor [{}] has no entries nor valid edge but the last child, [{}] of the parent [{}] has next node id {}",neighbor_node->to_string(), last_child_node->to_string(), node->to_string(), last_child_node->next_bnode())); - } - if ((neighbor_node->total_entries() != 0 || neighbor_node->has_valid_edge()) && last_child_node &&last_child_node->next_bnode() == empty_bnodeid) { - throw std::runtime_error(fmt::format("neighbor [{}] has entries or valid edge but the last child, [{}] of the parent [{}] has no next node id",neighbor_node->to_string(), last_child_node->to_string(), node->to_string())); - } - - if (neighbor_node->is_node_deleted()) { - throw std::runtime_error(fmt::format("Neighbor node [{}] is deleted " , neighbor_node->to_string())); - } - if (neighbor_node->level() != node->level()) { - throw std::runtime_error(fmt::format("Neighbor node [{}] level {} mismatch vs node [{}] level {}", - neighbor_node->to_string(), neighbor_node->level(), node->to_string(), - node->level())); - } - K neighbor_first_key = neighbor_node->get_first_key< K >(); - auto neighbor_entities = neighbor_node->total_entries(); - if (neighbor_entities && neighbor_first_key.compare(last_key) < 0) { - throw std::runtime_error(fmt::format("Neighbor's first key {} is not greater than node's last key {} (node=[{}], neighbor=[{}])", - neighbor_first_key.to_string(), last_key.to_string(), node->to_string(), neighbor_node->to_string())); - } - if (!node->is_leaf()) { - if (!neighbor_node->has_valid_edge() && !neighbor_entities) { - throw std::runtime_error(fmt::format("Interior neighbor node [{}] is empty ", neighbor_node->to_string())); - } - BtreeLinkInfo first_neighbor_info; - neighbor_node->get_nth_value(0, &first_neighbor_info, false /* copy */); - if (last_child_node->next_bnode() != first_neighbor_info.bnode_id()) { - throw std::runtime_error(fmt::format("Last child node's next_bnode (child=[{}]) does not match neighbor's first bnode_id (neighbor=[{}])", last_child_node->to_string(), neighbor_node->to_string())); - - } - } -} - -template -void Btree::validate_node(const bnodeid_t& bnodeid) const { - BtreeNodePtr node; - if (auto ret = read_node_impl(bnodeid, node); 
ret != btree_status_t::success) { - throw std::runtime_error(fmt::format("node read failed for bnodeid: {} reason: {}", bnodeid, ret)); - } else { - try { - if (node->is_node_deleted()) { return; } - auto nentities = node->total_entries(); - if (!node->is_leaf() && !nentities && !node->has_valid_edge()) { - throw std::runtime_error(fmt::format("Node [{}] has no entries and no valid edge", node->to_string())); - } - if (node->is_leaf() && node->has_valid_edge()) { - throw std::runtime_error(fmt::format("node [{}] is leaf but has valid edge", node->to_string())); - } - if(!node->validate_key_order()){ - throw std::runtime_error(fmt::format("unsorted node's entries [{}]", node->to_string())); - } - - BtreeNodePtr last_child_node; - validate_node_child_relation(node, last_child_node); - - auto neighbor_id = node->next_bnode(); - if (neighbor_id != empty_bnodeid && node->has_valid_edge()) { - throw std::runtime_error(fmt::format("node [{}] has valid edge but next_bnode is not empty", node->to_string())); - } - if (!node->is_leaf() && neighbor_id == empty_bnodeid && !node->has_valid_edge()) { - throw std::runtime_error(fmt::format("node [{}] is interior but has no valid edge and next_bnode is empty", node->to_string())); - } - if (bnodeid == neighbor_id) { - throw std::runtime_error(fmt::format("node [{}] has next_bnode same as itself", node->to_string())); - } - - if (neighbor_id != empty_bnodeid) { - BtreeNodePtr neighbor_node; - if (auto ret = read_node_impl(neighbor_id, neighbor_node); ret != btree_status_t::success) { - throw std::runtime_error(fmt::format("reading neighbor node of [{}] failed for bnodeid: {} reason : {}", node->to_string(), neighbor_id, ret)); - } - validate_next_node_relation(node, neighbor_node, last_child_node); - } - } catch (const std::exception& e) { - LOGERROR("Validation failed for bnodeid: {} error: {}", bnodeid, e.what()); - throw; - } - } -} - - -template < typename K, typename V > -void Btree< K, V >::sanity_sub_tree(bnodeid_t bnodeid) const { - if (bnodeid == 0) { bnodeid = m_root_node_info.bnode_id(); } - BtreeNodePtr node; - if (auto ret = read_node_impl(bnodeid, node); ret != btree_status_t::success) { - LOGINFO("reading node failed for bnodeid: {} reason: {}", bnodeid, ret); - } else { - node->validate_key_order< K >(); - if (node->is_leaf()) { return; } - uint32_t nentries = node->has_valid_edge() ? 
node->total_entries() + 1 : node->total_entries(); - std::vector< bnodeid_t > child_id_list; - child_id_list.reserve(nentries); - BT_REL_ASSERT_NE(node->has_valid_edge() && node->next_bnode() != empty_bnodeid, true, - "node {} has valid edge and next id is not empty", node->to_string()); - for (uint32_t i = 0; i < nentries; ++i) { - validate_sanity_child(node, i); - BtreeLinkInfo child_info; - node->get_nth_value(i, &child_info, false /* copy */); - child_id_list.push_back(child_info.bnode_id()); - } - for (auto child_id : child_id_list) { - sanity_sub_tree(child_id); - } - } -} - template < typename K, typename V > void Btree< K, V >::validate_sanity_child(const BtreeNodePtr& parent_node, uint32_t ind) const { BtreeLinkInfo child_info; @@ -449,7 +263,7 @@ void Btree< K, V >::validate_sanity_child(const BtreeNodePtr& parent_node, uint3 parent_node->get_nth_value(ind, &child_info, false /* copy */); BtreeNodePtr child_node = nullptr; - auto ret = read_node_impl(child_info.bnode_id(), child_node); + auto ret = m_bt_private->read_node(child_info.bnode_id(), child_node); BT_REL_ASSERT_EQ(ret, btree_status_t::success, "read failed, reason: {}", ret); if (child_node->total_entries() == 0) { auto parent_entries = parent_node->total_entries(); @@ -458,36 +272,26 @@ void Btree< K, V >::validate_sanity_child(const BtreeNodePtr& parent_node, uint3 } return; } - BT_REL_ASSERT_NE(child_node->is_node_deleted(), true, "child node {} is deleted", child_node->to_string()); - if (ind >= parent_node->total_entries()) { + child_node->get_first_key(&child_first_key); + child_node->get_last_key(&child_last_key); + BT_REL_ASSERT_LE(child_first_key.compare(&child_last_key), 0); + if (ind == parent_node->total_entries()) { BT_REL_ASSERT_EQ(parent_node->has_valid_edge(), true); - if (ind > 0) { parent_key = parent_node->get_nth_key< K >(ind - 1, false); } + if (ind > 0) { + parent_node->get_nth_key< K >(ind - 1, &parent_key, false); + BT_REL_ASSERT_GT(child_first_key.compare(&parent_key), 0); + BT_REL_ASSERT_LT(parent_key.compare_start(&child_first_key), 0); + } } else { - parent_key = parent_node->get_nth_key< K >(ind, false); - } - K previous_parent_key; - if (ind > 0 && parent_node->total_entries() > 0) { - previous_parent_key = parent_node->get_nth_key< K >(ind - 1, false); - } - for (uint32_t i = 0; i < child_node->total_entries(); ++i) { - K cur_child_key = child_node->get_nth_key< K >(i, false); - if(ind < parent_node->total_entries()){ - BT_REL_ASSERT_LE(cur_child_key.compare(parent_key), 0, " child {} {}-th key is greater than its parent's {} {}-th key", child_node->to_string(), i , parent_node->to_string(), ind); - if(ind>0) { - if(cur_child_key.compare(previous_parent_key) <= 0){ - // there can be a transient case where a key appears in two children. When the replay is done, it should be fixed - // Consider the example Parent P, children C1, C2, C3, C4. A key is deleted resulting in a merge and C3 deleted, and the same key is inserted in the current cp - // Our case is that P is dirtied, C3 deleted, C4 updated and flushed. During recover, we will keep C3 and P remains the same. - // Since C4 is flushed, the key that was removd and inserted will showup in C3 and C4. - // After the replay post recovery, C3 should be gone and the tree is valid again. 
-                    BT_LOG(DEBUG, "child {} {}-th key is less than or equal to its parent's {} {}-th key", child_node->to_string(), i, parent_node->to_string(), ind - 1);
-                }
-            }
-
-        } else {
-            BT_REL_ASSERT_GT(cur_child_key.compare(parent_key), 0,
-                             " child {} {}-th key is greater than its parent {} {}-th key", child_node->to_string(), i,
-                             parent_node->to_string(), ind);
+        parent_node->get_nth_key< K >(ind, &parent_key, false);
+        BT_REL_ASSERT_LE(child_first_key.compare(&parent_key), 0)
+        BT_REL_ASSERT_LE(child_last_key.compare(&parent_key), 0)
+        BT_REL_ASSERT_GE(parent_key.compare_start(&child_first_key), 0)
+        BT_REL_ASSERT_GE(parent_key.compare_start(&child_last_key), 0)
+        if (ind != 0) {
+            parent_node->get_nth_key< K >(ind - 1, &parent_key, false);
+            BT_REL_ASSERT_GT(child_first_key.compare(&parent_key), 0)
+            BT_REL_ASSERT_LT(parent_key.compare_start(&child_first_key), 0)
         }
     }
 }
@@ -506,7 +310,7 @@ void Btree< K, V >::validate_sanity_next_child(const BtreeNodePtr& parent_node,
     parent_node->get_nth_value(ind + 1, &child_info, false /* copy */);
 
     BtreeNodePtr child_node = nullptr;
-    auto ret = read_node_impl(child_info.bnode_id(), child_node);
+    auto ret = m_bt_private->read_node(child_info.bnode_id(), child_node);
     BT_REL_ASSERT_EQ(ret, btree_status_t::success, "read failed, reason: {}", ret);
 
     if (child_node->total_entries() == 0) {
@@ -519,10 +323,10 @@ void Btree< K, V >::validate_sanity_next_child(const BtreeNodePtr& parent_node,
     }
     /* in case of merge next child will never have zero entries otherwise it would have been merged */
     BT_NODE_REL_ASSERT_NE(child_node->total_entries(), 0, child_node);
-    child_key = child_node->get_first_key< K >();
+    child_node->get_first_key(&child_key);
     parent_node->get_nth_key< K >(ind, &parent_key, false);
     BT_REL_ASSERT_GT(child_key.compare(&parent_key), 0)
-    BT_REL_ASSERT_LT(parent_key.compare_start(&child_key), 0);
+    BT_REL_ASSERT_LT(parent_key.compare_start(&child_key), 0)
 }
 
 template < typename K, typename V >
@@ -545,15 +349,15 @@ done:
 
 template < typename K, typename V >
 void Btree< K, V >::append_route_trace(BtreeRequest& req, const BtreeNodePtr& node, btree_event_t event,
                                        uint32_t start_idx, uint32_t end_idx) const {
-    if (req.route_tracing) {
-        req.route_tracing->emplace_back(trace_route_entry{.node_id = node->node_id(),
-                                                          .node = node.get(),
-                                                          .start_idx = start_idx,
-                                                          .end_idx = end_idx,
-                                                          .num_entries = node->total_entries(),
-                                                          .level = node->level(),
-                                                          .is_leaf = node->is_leaf(),
-                                                          .event = event});
+    if (req.m_route_tracing) {
+        req.m_route_tracing->emplace_back(trace_route_entry{.node_id = node->node_id(),
+                                                            .node = node.get(),
+                                                            .start_idx = start_idx,
+                                                            .end_idx = end_idx,
+                                                            .num_entries = node->total_entries(),
+                                                            .level = node->level(),
+                                                            .is_leaf = node->is_leaf(),
+                                                            .event = event});
     }
 }
 } // namespace homestore
diff --git a/src/include/homestore/btree/detail/btree_get_impl.ipp b/src/include/homestore/btree/detail/btree_get_impl.ipp
index 4f0c09732..c692b6beb 100644
--- a/src/include/homestore/btree/detail/btree_get_impl.ipp
+++ b/src/include/homestore/btree/detail/btree_get_impl.ipp
@@ -17,9 +17,34 @@
 #include 
 namespace homestore {
+
+template < typename K, typename V >
+template < typename ReqT >
+btree_status_t Btree< K, V >::get(ReqT& greq) {
+    static_assert(std::is_same_v< BtreeSingleGetRequest, ReqT > || std::is_same_v< BtreeGetAnyRequest< K >, ReqT >,
+                  "get api is called with non get request type");
+
+    btree_status_t ret = btree_status_t::success;
+
+    m_btree_lock.lock_shared();
+    BtreeNodePtr root;
+
+    ret = 
read_and_lock_node(m_root_node_info.bnode_id(), root, locktype_t::READ, locktype_t::READ, greq.m_op_context); + if (ret != btree_status_t::success) { goto out; } + + ret = do_get(root, greq); +out: + m_btree_lock.unlock_shared(); + +#ifndef NDEBUG + check_lock_debug(); +#endif + return ret; +} + template < typename K, typename V > template < typename ReqT > -btree_status_t Btree< K, V >::do_get(const BtreeNodePtr& my_node, ReqT& greq) const { +btree_status_t Btree< K, V >::do_get(const BtreeNodePtr& my_node, ReqT& greq) { btree_status_t ret{btree_status_t::success}; bool found{false}; uint32_t idx; @@ -34,7 +59,7 @@ btree_status_t Btree< K, V >::do_get(const BtreeNodePtr& my_node, ReqT& greq) co if (!found) { ret = btree_status_t::not_found; } else { - if (greq.route_tracing) { append_route_trace(greq, my_node, btree_event_t::READ, idx, idx); } + if (greq.m_route_tracing) { append_route_trace(greq, my_node, btree_event_t::READ, idx, idx); } } unlock_node(my_node, locktype_t::READ); return ret; @@ -47,7 +72,7 @@ btree_status_t Btree< K, V >::do_get(const BtreeNodePtr& my_node, ReqT& greq) co std::tie(found, idx) = my_node->find(greq.key(), &child_info, true); } - if (greq.route_tracing) { append_route_trace(greq, my_node, btree_event_t::READ, idx, idx); } + if (greq.m_route_tracing) { append_route_trace(greq, my_node, btree_event_t::READ, idx, idx); } ASSERT_IS_VALID_INTERIOR_CHILD_INDX(found, idx, my_node); BtreeNodePtr child_node; diff --git a/src/include/homestore/btree/detail/btree_internal.hpp b/src/include/homestore/btree/detail/btree_internal.hpp index 44ba95828..d93ff507c 100644 --- a/src/include/homestore/btree/detail/btree_internal.hpp +++ b/src/include/homestore/btree/detail/btree_internal.hpp @@ -18,9 +18,12 @@ #include #include #include +#include +#include #include #include #include +#include namespace homestore { @@ -51,6 +54,8 @@ namespace homestore { #define BT_NODE_LOG(level, node, msg, ...) \ { LOG##level##MOD_FMT(btree, (_BT_LOG_METHOD_IMPL(, this->m_bt_cfg, node)), msg, ##__VA_ARGS__); } +#define SPECIFIC_BT_LOG(level, bt, msg, ...) \ + { LOG##level##MOD_FMT(btree, (_BT_LOG_METHOD_IMPL(, bt.bt_config(), )), msg, ##__VA_ARGS__); } #if 0 #define THIS_BT_LOG(level, req, msg, ...) \ { \ @@ -195,29 +200,17 @@ using bnodeid_t = uint64_t; static constexpr bnodeid_t empty_bnodeid = std::numeric_limits< bnodeid_t >::max(); static constexpr uint16_t bt_init_crc_16 = 0x8005; -VENUM(btree_node_type, uint32_t, FIXED = 0, VAR_VALUE = 1, VAR_KEY = 2, VAR_OBJECT = 3, PREFIX = 4, COMPACT = 5) +VENUM(btree_node_type, uint32_t, FIXED = 0, VAR_VALUE = 1, VAR_KEY = 2, VAR_OBJECT = 3, FIXED_PREFIX = 4, COMPACT = 5) -#ifdef USE_STORE_TYPE -VENUM(btree_store_type, uint8_t, MEM = 0, SSD = 1) -#endif - -ENUM(btree_status_t, uint32_t, success, not_found, retry, has_more, node_read_failed, already_exists, filtered_out, - space_not_avail, cp_mismatch, merge_not_required, merge_failed, crc_mismatch, not_supported, node_freed, stopping) - -/*ENUM(btree_node_write_type, uint8_t, - new_node, // Node write whenever a new node is created. - inplace_leaf, // Node write after an entry is updated/added in leaf without changing btree structure, most common - inplace_interior, // Node write after a structure change, but this interior node is changed in-place only. - after_shift // Node write after a structure change, but this node has its keys shifted to other node. 
-);*/ +ENUM(btree_status_t, uint32_t, success, not_found, retry, has_more, node_read_failed, put_failed, space_not_avail, + cp_mismatch, merge_not_required, merge_failed, crc_mismatch, not_supported, node_freed) class BtreeNode; +using BtreeNodePtr = boost::intrusive_ptr< BtreeNode >; +using BtreeNodeList = folly::small_vector< BtreeNodePtr, 3 >; void intrusive_ptr_add_ref(BtreeNode* node); void intrusive_ptr_release(BtreeNode* node); -template < typename K, typename V > -using to_string_cb_t = std::function< std::string(std::vector< std::pair< K, V > > const&) >; - ENUM(btree_event_t, uint8_t, READ, MUTATE, REMOVE, SPLIT, REPAIR, MERGE); struct trace_route_entry { @@ -238,62 +231,38 @@ struct trace_route_entry { }; struct BtreeConfig { - uint32_t m_node_size; - uint32_t m_node_data_size; + uint32_t m_node_size{0}; uint8_t m_ideal_fill_pct{90}; uint8_t m_suggested_min_pct{30}; uint8_t m_split_pct{50}; uint32_t m_max_merge_nodes{3}; -#ifdef _PRERELEASE - // These are for testing purpose only - uint64_t m_max_keys_in_node{0}; - uint64_t m_min_keys_in_node{0}; -#endif bool m_rebalance_turned_on{false}; + bool m_merge_turned_on{true}; btree_node_type m_leaf_node_type{btree_node_type::VAR_OBJECT}; btree_node_type m_int_node_type{btree_node_type::VAR_KEY}; - std::string m_btree_name; // Unique name for the btree - bool m_merge_turned_on{true}; - uint8_t m_max_merge_level{1}; + IndexStore::Type m_store_type{IndexStore::Type::COPY_ON_WRITE_BTREE}; + std::string m_btree_name{""}; // Unique name for the btree private: uint32_t m_suggested_min_size; // Precomputed values uint32_t m_ideal_fill_size; public: - BtreeConfig(uint32_t node_size, const std::string& btree_name = "") : - m_node_size{node_size}, m_btree_name{btree_name.empty() ? std::string("btree") : btree_name} { - set_node_data_size(node_size - 512); // Just put estimate at this point of time. 
+ void finalize(uint32_t node_header_size) { + m_ideal_fill_size = (uint32_t)((m_node_size - node_header_size) * m_ideal_fill_pct) / 100; + m_suggested_min_size = (uint32_t)((m_node_size - node_header_size) * m_suggested_min_pct) / 100; } - virtual ~BtreeConfig() = default; uint32_t node_size() const { return m_node_size; }; - - void set_node_data_size(uint32_t data_size) { - m_node_data_size = data_size; - m_ideal_fill_size = (uint32_t)(m_node_data_size * m_ideal_fill_pct) / 100; // Recompute the values - m_suggested_min_size = (uint32_t)(m_node_data_size * m_suggested_min_pct) / 100; - } - uint32_t split_size(uint32_t filled_size) const { return uint32_cast(filled_size * m_split_pct) / 100; } uint32_t ideal_fill_size() const { return m_ideal_fill_size; } uint32_t suggested_min_size() const { return m_suggested_min_size; } - uint32_t node_data_size() const { return m_node_data_size; } - - void set_ideal_fill_pct(uint8_t pct) { - m_ideal_fill_pct = pct; - m_ideal_fill_size = (uint32_t)(node_data_size() * m_ideal_fill_pct) / 100; - } - - void set_suggested_min_size(uint8_t pct) { - m_suggested_min_pct = pct; - m_suggested_min_size = (uint32_t)(node_data_size() * m_suggested_min_pct) / 100; - } const std::string& name() const { return m_btree_name; } btree_node_type leaf_node_type() const { return m_leaf_node_type; } btree_node_type interior_node_type() const { return m_int_node_type; } + IndexStore::Type store_type() const { return m_store_type; } }; class BtreeMetrics : public sisl::MetricsGroup { @@ -321,6 +290,8 @@ class BtreeMetrics : public sisl::MetricsGroup { REGISTER_COUNTER(btree_retry_count, "number of retries"); REGISTER_COUNTER(write_err_cnt, "number of errors in write"); REGISTER_COUNTER(query_err_cnt, "number of errors in query"); + REGISTER_COUNTER(read_node_count_in_write_ops, "number of nodes read in write_op"); + REGISTER_COUNTER(read_node_count_in_query_ops, "number of nodes read in query_op"); REGISTER_COUNTER(btree_write_ops_count, "number of btree operations"); REGISTER_COUNTER(btree_query_ops_count, "number of btree operations"); REGISTER_COUNTER(btree_remove_ops_count, "number of btree operations"); diff --git a/src/include/homestore/btree/detail/btree_mutate_impl.ipp b/src/include/homestore/btree/detail/btree_mutate_impl.ipp index 441a3fed0..0a8f57686 100644 --- a/src/include/homestore/btree/detail/btree_mutate_impl.ipp +++ b/src/include/homestore/btree/detail/btree_mutate_impl.ipp @@ -18,6 +18,80 @@ namespace homestore { +template < typename K, typename V > +template < typename ReqT > +btree_status_t Btree< K, V >::put(ReqT& put_req) { + static_assert(std::is_same_v< ReqT, BtreeSinglePutRequest > || std::is_same_v< ReqT, BtreeRangePutRequest< K > >, + "put api is called with non put request type"); + COUNTER_INCREMENT(m_metrics, btree_write_ops_count, 1); + auto acq_lock = locktype_t::READ; + bool is_leaf = false; + + m_btree_lock.lock_shared(); + btree_status_t ret = btree_status_t::success; + +retry: + auto cpg = bt_cp_guard(); + put_req.m_op_context = cpg.context(cp_consumer_t::INDEX_SVC); + +#ifdef _DEBUG + check_lock_debug(); + BT_LOG_ASSERT_EQ(BtreeBase::thread_vars()->rd_locked_nodes.size(), 0); + BT_LOG_ASSERT_EQ(BtreeBase::thread_vars()->wr_locked_nodes.size(), 0); +#endif + + BtreeNodePtr root; + ret = read_and_lock_node(m_root_node_info.bnode_id(), root, acq_lock, acq_lock, put_req.m_op_context); + if (ret != btree_status_t::success) { goto out; } + is_leaf = root->is_leaf(); + + if (is_split_needed(root, put_req)) { + // Time to do the split of root. 
+ unlock_node(root, acq_lock); + m_btree_lock.unlock_shared(); + ret = check_split_root(put_req); + BT_DBG_ASSERT_EQ(BtreeBase::thread_vars()->rd_locked_nodes.size(), 0); + BT_DBG_ASSERT_EQ(BtreeBase::thread_vars()->wr_locked_nodes.size(), 0); + + // We must have gotten a new root, need to start from scratch. + m_btree_lock.lock_shared(); + if (ret != btree_status_t::success) { + LOGERROR("root split failed btree name {}", m_bt_cfg.name()); + goto out; + } + + goto retry; + } else if ((is_leaf) && (acq_lock != locktype_t::WRITE)) { + // Root is a leaf, need to take write lock, instead of read, retry + unlock_node(root, acq_lock); + acq_lock = locktype_t::WRITE; + goto retry; + } else { + ret = do_put(root, acq_lock, put_req); + if ((ret == btree_status_t::retry) || (ret == btree_status_t::has_more) || + (ret == btree_status_t::cp_mismatch)) { + // Need to start from top down again, since there was a split or we have more to insert in case of range put + acq_lock = locktype_t::READ; + BT_LOG(TRACE, "retrying put operation because btree reported retriable status {}", ret); + BT_DBG_ASSERT_EQ(BtreeBase::thread_vars()->rd_locked_nodes.size(), 0); + BT_DBG_ASSERT_EQ(BtreeBase::thread_vars()->wr_locked_nodes.size(), 0); + goto retry; + } + } + +out: + m_btree_lock.unlock_shared(); +#ifndef NDEBUG + check_lock_debug(); +#endif + if (ret != btree_status_t::success) { + BT_LOG(ERROR, "btree put failed {}", ret); + COUNTER_INCREMENT(m_metrics, write_err_cnt, 1); + } + + return ret; +} + /* This function does the heavy lifiting of co-ordinating inserts. It is a recursive function which walks * down the tree. * @@ -56,7 +130,7 @@ retry: const auto matched = my_node->match_range(req.working_range(), start_idx, end_idx); if (!matched) { BT_NODE_LOG_ASSERT(false, my_node, "match_range returns 0 entries for interior node is not valid pattern"); - ret = btree_status_t::not_found; + ret = btree_status_t::put_failed; goto out; } } else if constexpr (std::is_same_v< ReqT, BtreeSinglePutRequest >) { @@ -68,7 +142,7 @@ retry: BT_NODE_DBG_ASSERT((curlock == locktype_t::READ || curlock == locktype_t::WRITE), my_node, "unexpected locktype {}", curlock); - if (req.route_tracing) { append_route_trace(req, my_node, btree_event_t::READ, start_idx, end_idx); } + if (req.m_route_tracing) { append_route_trace(req, my_node, btree_event_t::READ, start_idx, end_idx); } curr_idx = start_idx; while (curr_idx <= end_idx) { // iterate all matched childrens @@ -105,7 +179,7 @@ retry: unlock_lambda(child_node, child_cur_lock); if (ret != btree_status_t::success) { goto out; } - if (req.route_tracing) { append_route_trace(req, child_node, btree_event_t::SPLIT); } + if (req.m_route_tracing) { append_route_trace(req, child_node, btree_event_t::SPLIT); } COUNTER_INCREMENT(m_metrics, btree_split_count, 1); goto retry; // After split, retry search and walk down. 
} @@ -175,20 +249,22 @@ btree_status_t Btree< K, V >::mutate_write_leaf_node(const BtreeNodePtr& my_node if constexpr (std::is_same_v< ReqT, BtreeRangePutRequest< K > >) { K last_failed_key; ret = to_variant_node(my_node)->multi_put(req.working_range(), req.input_range().start_key(), *req.m_newval, - req.m_put_type, &last_failed_key, req.m_filter_cb, req.m_app_context); + req.m_put_type, &last_failed_key, req.m_filter_cb); if (ret == btree_status_t::has_more) { req.shift_working_range(std::move(last_failed_key), true /* make it including last_failed_key */); } else if (ret == btree_status_t::success) { req.shift_working_range(); } } else if constexpr (std::is_same_v< ReqT, BtreeSinglePutRequest >) { - ret = - to_variant_node(my_node)->put(req.key(), req.value(), req.m_put_type, req.m_existing_val, req.m_filter_cb); + if (!to_variant_node(my_node)->put(req.key(), req.value(), req.m_put_type, req.m_existing_val, + req.m_filter_cb)) { + ret = btree_status_t::put_failed; + } COUNTER_INCREMENT(m_metrics, btree_obj_count, 1); } if ((ret == btree_status_t::success) || (ret == btree_status_t::has_more)) { - if (req.route_tracing) { append_route_trace(req, my_node, btree_event_t::MUTATE); } + if (req.m_route_tracing) { append_route_trace(req, my_node, btree_event_t::MUTATE); } write_node(my_node, req.m_op_context); } return ret; @@ -212,7 +288,7 @@ btree_status_t Btree< K, V >::check_split_root(ReqT& req) { goto done; } - new_root = alloc_interior_node(); + new_root = create_interior_node(req.m_op_context); if (new_root == nullptr) { ret = btree_status_t::space_not_avail; unlock_node(root, locktype_t::WRITE); @@ -225,23 +301,22 @@ btree_status_t Btree< K, V >::check_split_root(ReqT& req) { root = std::move(new_root); // We need to notify about the root change, before splitting the node, so that correct dependencies are set - ret = on_root_changed(root, req.m_op_context); + ret = m_bt_private->on_root_changed(root, req.m_op_context); if (ret != btree_status_t::success) { - free_node(root, locktype_t::WRITE, req.m_op_context); + remove_node(root, locktype_t::WRITE, req.m_op_context); unlock_node(child_node, locktype_t::WRITE); goto done; } ret = split_node(root, child_node, root->total_entries(), &split_key, req.m_op_context); if (ret != btree_status_t::success) { - free_node(root, locktype_t::WRITE, req.m_op_context); + remove_node(root, locktype_t::WRITE, req.m_op_context); root = std::move(child_node); - on_root_changed(root, req.m_op_context); // Revert it back + m_bt_private->on_root_changed(root, req.m_op_context); // Revert it back unlock_node(root, locktype_t::WRITE); } else { - if (req.route_tracing) { append_route_trace(req, child_node, btree_event_t::SPLIT); } + if (req.m_route_tracing) { append_route_trace(req, child_node, btree_event_t::SPLIT); } m_root_node_info = BtreeLinkInfo{root->node_id(), root->link_version()}; - this->m_btree_depth = root->level(); unlock_node(child_node, locktype_t::WRITE); COUNTER_INCREMENT(m_metrics, btree_depth, 1); } @@ -253,10 +328,10 @@ done: template < typename K, typename V > btree_status_t Btree< K, V >::split_node(const BtreeNodePtr& parent_node, const BtreeNodePtr& child_node, - uint32_t parent_ind, K* out_split_key, void* context) { + uint32_t parent_ind, K* out_split_key, CPContext* context) { BtreeNodePtr child_node1 = child_node; BtreeNodePtr child_node2; - child_node2.reset(child_node1->is_leaf() ? alloc_leaf_node().get() : alloc_interior_node().get()); + child_node2.reset(child_node1->is_leaf() ? 
create_leaf_node(context).get() : create_interior_node(context).get()); if (child_node2 == nullptr) { return (btree_status_t::space_not_avail); } @@ -268,7 +343,7 @@ btree_status_t Btree< K, V >::split_node(const BtreeNodePtr& parent_node, const uint32_t child1_filled_size = child_node1->node_data_size() - child_node1->available_size(); auto split_size = m_bt_cfg.split_size(child1_filled_size); - uint32_t res = child_node1->move_out_to_right_by_size(m_bt_cfg, *child_node2, split_size); + uint32_t res = child_node1->move_out_to_right_by_size(*child_node2, split_size); BT_NODE_REL_ASSERT_GT(res, 0, child_node1, "Unable to split entries in the child node"); // means cannot split entries @@ -296,7 +371,7 @@ btree_status_t Btree< K, V >::split_node(const BtreeNodePtr& parent_node, const BT_NODE_LOG(DEBUG, child_node1, "Left child"); BT_NODE_LOG(DEBUG, child_node2, "Right child"); - ret = transact_nodes({child_node2}, {}, child_node1, parent_node, context); + ret = m_bt_private->transact_nodes({child_node2}, {}, child_node1, parent_node, context); // NOTE: Do not access parentInd after insert, since insert would have // shifted parentNode to the right. diff --git a/src/include/homestore/btree/detail/btree_node.hpp b/src/include/homestore/btree/detail/btree_node.hpp index 5cdaa94c5..8bf83966c 100644 --- a/src/include/homestore/btree/detail/btree_node.hpp +++ b/src/include/homestore/btree/detail/btree_node.hpp @@ -15,143 +15,202 @@ *********************************************************************************/ #pragma once -#include -#include +#include +#include #include #include #include #include -#include "btree_internal.hpp" +#include #include #include namespace homestore { ENUM(locktype_t, uint8_t, NONE, READ, WRITE) -#pragma pack(1) -struct transient_hdr_t { - mutable iomgr::FiberManagerLib::shared_mutex lock; - sisl::atomic_counter< uint16_t > upgraders{0}; +class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { + using node_find_result_t = std::pair< bool, uint32_t >; - /* these variables are accessed without taking lock and are not expected to change after init */ - uint8_t leaf_node{0}; - uint64_t max_keys_in_node{0}; - uint64_t min_keys_in_node{0}; // to specify the threshold for triggering merge +public: + static constexpr uint8_t BTREE_NODE_VERSION = 1; + static constexpr uint8_t BTREE_NODE_MAGIC = 0xab; - bool is_leaf() const { return (leaf_node != 0); } -}; +#pragma pack(1) + struct PersistentHeader { + uint8_t magic{BTREE_NODE_MAGIC}; // offset=0 + uint8_t version{BTREE_NODE_VERSION}; // offset=1 + uint16_t checksum{0}; // offset=2 + + uint32_t nentries : 30; // offset 4 + uint32_t leaf : 1; + uint32_t node_deleted : 1; + + bnodeid_t node_id{empty_bnodeid}; // offset=8 + bnodeid_t next_node{empty_bnodeid}; // offset=16 + + uint64_t node_gen{0}; // offset=24: Generation of this node, incremented on every update + uint64_t link_version{0}; // offset=32: Version of the link between its parent, updated if structure changes + BtreeLinkInfo::bnode_link_info edge_info; // offset=40: Edge entry information + + int64_t modified_cp_id{-1}; // offset=56: Checkpoint ID of the last modification of this node + uint16_t level; // offset=64: Level of the node within the tree + uint16_t node_size; // offset=66: Size of node, max 64K + uint8_t node_type; // offset=68: Type of the node (simple vs varlen etc..) 
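+        // With pack(1), the fields above plus the reserved bytes below give the header a fixed 72-byte
+        // layout; node_data_area() begins immediately past it.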
+ uint8_t reserved[3]{0, 0, 0}; // offset=69-72: Reserved + + PersistentHeader() : nentries{0}, leaf{0}, node_deleted{0} {} + std::string to_string() const { + auto snext = (next_node == empty_bnodeid) ? "" : " next=" + std::to_string(next_node); + auto sedge = (edge_info.m_bnodeid == empty_bnodeid) + ? "" + : fmt::format(" edge={}.{}", edge_info.m_bnodeid, edge_info.m_link_version); + return fmt::format("magic={} version={} csum={} node_id={}{} nentries={} node_type={} is_leaf={} " + "node_deleted={} node_gen={} modified_cp_id={} link_version={}{} level={} ", + magic, version, checksum, node_id, snext, nentries, node_type, leaf, node_deleted, + node_gen, modified_cp_id, link_version, sedge, level); + } + + std::string to_compact_string() const { + auto snext = (next_node == empty_bnodeid) ? "" : " next=" + std::to_string(next_node); + auto sedge = (edge_info.m_bnodeid == empty_bnodeid) + ? "" + : fmt::format(" edge={}.{}", edge_info.m_bnodeid, edge_info.m_link_version); + return fmt::format("id={}{}{} {} level={} nentries={}{} mod_cp={}", node_id, snext, sedge, + leaf ? "LEAF" : "INTERIOR", level, nentries, (node_deleted == 0x1) ? " Deleted" : "", + modified_cp_id); + } + }; #pragma pack() -static constexpr uint8_t BTREE_NODE_VERSION = 1; -static constexpr uint8_t BTREE_NODE_MAGIC = 0xab; + struct Allocator { + using Token = uint8_t; + std::function< uint8_t*(uint32_t size) > alloc_btree_node; + std::function< void(BtreeNode* node) > free_btree_node; + std::function< uint8_t*(uint32_t size) > alloc_node_buf; + std::function< void(uint8_t*) > free_node_buf; + static constexpr Token default_token = 0; + + Allocator() : + alloc_btree_node{[](uint32_t size) -> uint8_t* { return new uint8_t[size]; }}, + free_btree_node{[](BtreeNode* node) { + node->~BtreeNode(); + delete[] uintptr_cast(node); + }}, + alloc_node_buf{[](uint32_t size) { return new uint8_t[size]; }}, + free_node_buf{[](uint8_t* buf) { delete[] buf; }} {} + + Allocator(std::function< uint8_t*(uint32_t size) > alloc_node_cb, + std::function< void(BtreeNode* node) > free_node_cb, + std::function< uint8_t*(uint32_t size) > alloc_buf_cb, std::function< void(uint8_t*) > free_buf_cb) : + alloc_btree_node{std::move(alloc_node_cb)}, + free_btree_node{std::move(free_node_cb)}, + alloc_node_buf{std::move(alloc_buf_cb)}, + free_node_buf{std::move(free_buf_cb)} {} + Allocator(const Allocator& b) = default; + Allocator(Allocator&& b) = default; + Allocator& operator=(const Allocator& b) = default; + Allocator& operator=(Allocator&& b) = default; + ~Allocator() = default; + + struct List { + std::vector< Allocator > vec; + std::mutex mtx; + List() : vec{1, Allocator{}} {} + }; + + static List& allocators() { + static List s_allocators; + return s_allocators; + } -#pragma pack(1) -struct persistent_hdr_t { - uint8_t magic{BTREE_NODE_MAGIC}; // offset=0 - uint8_t version{BTREE_NODE_VERSION}; // offset=1 - uint16_t checksum{0}; // offset=2 - - uint32_t nentries : 30; // offset 4 - uint32_t leaf : 1; - uint32_t node_deleted : 1; - - bnodeid_t node_id{empty_bnodeid}; // offset=8 - bnodeid_t next_node{empty_bnodeid}; // offset=16 - - uint64_t node_gen{0}; // offset=24: Generation of this node, incremented on every update - uint64_t link_version{0}; // offset=32: Version of the link between its parent, updated if structure changes - BtreeLinkInfo::bnode_link_info edge_info; // offset=40: Edge entry information - - int64_t modified_cp_id{-1}; // offset=56: Checkpoint ID of the last modification of this node - uint16_t level; // offset=64: Level of the 
node within the tree - uint16_t node_size; // offset=66: Size of node, max 64K - uint8_t node_type; // offset=68: Type of the node (simple vs varlen etc..) - uint8_t reserved[3]{0, 0, 0}; // offset=69-72: Reserved - - persistent_hdr_t() : nentries{0}, leaf{0}, node_deleted{0} {} - std::string to_string() const { - auto snext = (next_node == empty_bnodeid) ? "" : " next=" + std::to_string(next_node); - auto sedge = (edge_info.m_bnodeid == empty_bnodeid) - ? "" - : fmt::format(" edge={}.{}", edge_info.m_bnodeid, edge_info.m_link_version); - return fmt::format("magic={} version={} csum={} node_id={}{} nentries={} node_type={} is_leaf={} " - "node_deleted={} node_gen={} modified_cp_id={} link_version={}{} level={} ", - magic, version, checksum, node_id, snext, nentries, node_type, leaf, node_deleted, node_gen, - modified_cp_id, link_version, sedge, level); - } - - std::string to_compact_string() const { - auto snext = (next_node == empty_bnodeid) ? "" : " next=" + std::to_string(next_node); - auto sedge = (edge_info.m_bnodeid == empty_bnodeid) - ? "" - : fmt::format(" edge={}.{}", edge_info.m_bnodeid, edge_info.m_link_version); - return fmt::format("id={}{}{} {} level={} nentries={} mod_cp={}{}", node_id, snext, sedge, - leaf ? "LEAF" : "INTERIOR", level, nentries, modified_cp_id, - node_deleted == 0x1 ? " Deleted" : " LIVE"); - } -}; -#pragma pack() + static Allocator& get(Allocator::Token token) { return allocators().vec[token]; } + static Token add(Allocator a) { + std::unique_lock lg{allocators().mtx}; + allocators().vec.emplace_back(std::move(a)); + return s_cast< Allocator::Token >(allocators().vec.size() - 1); + } -class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { - using node_find_result_t = std::pair< bool, uint32_t >; + static void remove(Token t) { + std::unique_lock lg{allocators().mtx}; + if (t == allocators().vec.size() - 1) { + allocators().vec.erase(allocators().vec.end() - 1); + } else { + allocators().vec[t] = Allocator{}; + } + } + }; -public: - sisl::atomic_counter< int32_t > m_refcount{0}; - transient_hdr_t m_trans_hdr; - uint8_t* m_phys_node_buf; + uint8_t* m_phys_node_buf; // Pointer to the physical node buffer + sisl::atomic_counter< int32_t > m_refcount{0}; // Refcount of the node + + Allocator::Token m_token; + std::atomic< uint8_t > m_phys_buf_share_count{0}; + uint16_t m_variant_private_data{0}; // Data specific to variant (to reuse this wasted 16 bit space) + + mutable iomgr::FiberManagerLib::shared_mutex m_lock; public: - BtreeNode(uint8_t* node_buf, bnodeid_t id, bool init_buf, bool is_leaf, BtreeConfig const& cfg) : - m_phys_node_buf{node_buf} { - if (init_buf) { - new (node_buf) persistent_hdr_t{}; - set_node_id(id); - set_leaf(is_leaf); - set_node_size(cfg.node_size()); - } else { - DEBUG_ASSERT_EQ(node_id(), id); - DEBUG_ASSERT_EQ(magic(), BTREE_NODE_MAGIC); - DEBUG_ASSERT_EQ(version(), BTREE_NODE_VERSION); - } - m_trans_hdr.leaf_node = is_leaf; -#ifdef _PRERELEASE - m_trans_hdr.max_keys_in_node = cfg.m_max_keys_in_node; - m_trans_hdr.min_keys_in_node = cfg.m_min_keys_in_node; -#endif + BtreeNode(bnodeid_t id, bool is_leaf, uint32_t node_size, Allocator::Token token) : + m_phys_node_buf{Allocator::get(token).alloc_node_buf(node_size)}, m_token{token} { + new (m_phys_node_buf) PersistentHeader{}; + set_node_id(id); + set_leaf(is_leaf); + set_node_size(node_size); + } + + BtreeNode(uint8_t* node_buf, bnodeid_t id, Allocator::Token token) : m_phys_node_buf{node_buf}, m_token{token} { + DEBUG_ASSERT_EQ(node_id(), id); + DEBUG_ASSERT_EQ(magic(), 
BTREE_NODE_MAGIC); + DEBUG_ASSERT_EQ(version(), BTREE_NODE_VERSION); } - virtual ~BtreeNode() = default; - // Identify if a node is a leaf node or not, from raw buffer, by just reading persistent_hdr_t - static bool identify_leaf_node(uint8_t* buf) { return (r_cast< persistent_hdr_t* >(buf))->leaf; } - static std::string to_string_buf(uint8_t* buf) { return (r_cast< persistent_hdr_t* >(buf))->to_compact_string(); } + virtual ~BtreeNode() { + DEBUG_ASSERT_EQ(m_phys_buf_share_count.load(), 0, + "We are being asked to destruct node while its buffer is still shared"); + Allocator::get(m_token).free_node_buf(m_phys_node_buf); + } + + // Identify if a node is a leaf node or not, from raw buffer, by just reading PersistentHeader + static bool identify_leaf_node(uint8_t* buf) { return (r_cast< PersistentHeader* >(buf))->leaf; } + static std::string to_string_buf(uint8_t* buf) { return (r_cast< PersistentHeader* >(buf))->to_compact_string(); } static BtreeLinkInfo::bnode_link_info identify_edge_info(uint8_t* buf) { - return (r_cast< persistent_hdr_t* >(buf))->edge_info; + return (r_cast< PersistentHeader* >(buf))->edge_info; } static bool is_valid_node(sisl::blob const& buf) { - auto phdr = r_cast< persistent_hdr_t const* >(buf.cbytes()); + auto phdr = r_cast< PersistentHeader const* >(buf.cbytes()); if ((phdr->magic != BTREE_NODE_MAGIC) || (phdr->version != BTREE_NODE_VERSION)) { return false; } if ((uint32_cast(phdr->node_size) + 1) != buf.size()) { return false; } if (phdr->node_id == empty_bnodeid) { return false; } - auto const exp_checksum = crc16_t10dif(bt_init_crc_16, (buf.cbytes() + sizeof(persistent_hdr_t)), - buf.size() - sizeof(persistent_hdr_t)); + auto const exp_checksum = crc16_t10dif(bt_init_crc_16, (buf.cbytes() + sizeof(PersistentHeader)), + buf.size() - sizeof(PersistentHeader)); if (phdr->checksum != exp_checksum) { return false; } return true; } - static void revert_node_delete(uint8_t* buf) { - auto phdr = r_cast< persistent_hdr_t* >(buf); - phdr->node_deleted = 0x0; + static void set_modified_cp_id(uint8_t* buf, int64_t cp_id) { + auto phdr = r_cast< PersistentHeader* >(buf); + phdr->modified_cp_id = cp_id; } - static int64_t get_modified_cp_id(uint8_t* buf) { - auto phdr = r_cast< persistent_hdr_t const* >(buf); + static int64_t get_modified_cp_id(uint8_t const* buf) { + auto phdr = r_cast< PersistentHeader const* >(buf); return phdr->modified_cp_id; } + static bool is_node_deleted(uint8_t const* buf) { + auto phdr = r_cast< PersistentHeader const* >(buf); + return phdr->node_deleted == 0x1; + } + + static bnodeid_t get_node_id(uint8_t const* buf) { + auto phdr = r_cast< PersistentHeader const* >(buf); + return phdr->node_id; + } + /// @brief Finds the index of the entry with the specified key in the node. /// /// This method performs a binary search on the node to find the index of the entry with the specified key. 
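 ///
 /// Illustrative call shape (a sketch only; `search_key` is a placeholder name):
 ///   BtreeLinkInfo child_info;
 ///   auto const [found, idx] = node->find(search_key, &child_info, false /* copy */);
 ///   // On interior nodes, idx identifies the child slot to descend into even when found is false.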
@@ -260,6 +319,11 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { return found; } + virtual void overwrite(const BtreeNode& other_node) { + DEBUG_ASSERT_EQ(node_size(), other_node.node_size(), "{}", get_persistent_header_const()->to_string()); + std::memcpy(m_phys_node_buf, other_node.m_phys_node_buf, other_node.node_size()); + } + void get_adjacent_indicies(uint32_t cur_ind, std::vector< uint32_t >& indices_list, uint32_t max_indices) const { uint32_t i = 0; uint32_t start_ind; @@ -304,6 +368,15 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { return get_nth_key< K >(0, true); } + template < typename K, typename V > + void get_all_kvs(std::vector< std::pair< K, V > >& kvs) const { + for (uint32_t i{0}; i < total_entries(); ++i) { + V v; + get_nth_value(i, &v, true); + kvs.emplace_back(std::make_pair(get_nth_key< K >(i, true), v)); + } + } + template < typename K > bool validate_key_order() const { for (auto i = 1u; i < total_entries(); ++i) { @@ -334,43 +407,38 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { uint16_t level() const { return get_persistent_header_const()->level; } // uint32_t total_entries() const { return (has_valid_edge() ? total_entries() + 1 : total_entries()); } - uint64_t max_keys_in_node() const { return m_trans_hdr.max_keys_in_node; } - uint64_t min_keys_in_node() const { return m_trans_hdr.min_keys_in_node; } void lock(locktype_t l) const { if (l == locktype_t::READ) { - m_trans_hdr.lock.lock_shared(); + m_lock.lock_shared(); } else if (l == locktype_t::WRITE) { - m_trans_hdr.lock.lock(); + m_lock.lock(); } } void unlock(locktype_t l) const { if (l == locktype_t::READ) { - m_trans_hdr.lock.unlock_shared(); + m_lock.unlock_shared(); } else if (l == locktype_t::WRITE) { - m_trans_hdr.lock.unlock(); + m_lock.unlock(); } } void lock_upgrade() { - m_trans_hdr.upgraders.increment(1); this->unlock(locktype_t::READ); this->lock(locktype_t::WRITE); - m_trans_hdr.upgraders.decrement(1); } - void lock_acknowledge() { m_trans_hdr.upgraders.decrement(1); } - bool any_upgrade_waiters() const { return (!m_trans_hdr.upgraders.testz()); } + template < typename K, typename V > + using ToStringCallback = std::function< std::string(std::vector< std::pair< K, V > > const&) >; template < typename K, typename V > - std::string to_custom_string(to_string_cb_t< K, V > const& cb) const { + std::string to_custom_string(ToStringCallback< K, V > const& cb) const { std::string snext = (this->next_bnode() == empty_bnodeid) ? "" : fmt::format(" next_node={}", this->next_bnode()); - auto str = - fmt::format("id={}.{} level={} nEntries={} {}{} node_gen={} {} ", this->node_id(), this->link_version(), - this->level(), this->total_entries(), (this->is_leaf() ? "LEAF" : "INTERIOR"), snext, - this->node_gen(), this->is_node_deleted() ? " **DELETED**" : ""); + auto str = fmt::format("id={}.{} level={} nEntries={} {}{} node_gen={} ", this->node_id(), this->link_version(), + this->level(), this->total_entries(), (this->is_leaf() ? 
"LEAF" : "INTERIOR"), snext, + this->node_gen()); if (this->has_valid_edge()) { fmt::format_to(std::back_inserter(str), " edge={}.{}", this->edge_info().m_bnodeid, this->edge_info().m_link_version); @@ -396,6 +464,12 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { } fmt::format_to(std::back_inserter(str), "]"); } + + // Should not happen + if (this->is_node_deleted()) { + fmt::format_to(std::back_inserter(str), " **DELETED** "); + } + return str; } @@ -404,23 +478,40 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { virtual btree_status_t insert(uint32_t ind, const BtreeKey& key, const BtreeValue& val) = 0; virtual void remove(uint32_t ind) { remove(ind, ind); } virtual void remove(uint32_t ind_s, uint32_t ind_e) = 0; - virtual void remove_all(const BtreeConfig& cfg) = 0; + virtual void remove_all() = 0; virtual void update(uint32_t ind, const BtreeValue& val) = 0; virtual void update(uint32_t ind, const BtreeKey& key, const BtreeValue& val) = 0; - virtual uint32_t move_out_to_right_by_entries(const BtreeConfig& cfg, BtreeNode& other_node, uint32_t nentries) = 0; - virtual uint32_t move_out_to_right_by_size(const BtreeConfig& cfg, BtreeNode& other_node, uint32_t size) = 0; - virtual uint32_t copy_by_size(const BtreeConfig& cfg, const BtreeNode& other_node, uint32_t start_idx, - uint32_t size) = 0; - virtual uint32_t copy_by_entries(const BtreeConfig& cfg, const BtreeNode& other_node, uint32_t start_idx, - uint32_t nentries) = 0; - /*virtual uint32_t move_in_from_right_by_entries(const BtreeConfig& cfg, BtreeNode& other_node, - uint32_t nentries) = 0; - virtual uint32_t move_in_from_right_by_size(const BtreeConfig& cfg, BtreeNode& other_node, uint32_t size) = 0;*/ + virtual uint32_t move_out_to_right_by_entries(BtreeNode& other_node, uint32_t nentries) = 0; + virtual uint32_t move_out_to_right_by_size(BtreeNode& other_node, uint32_t size) = 0; + + /// @brief Appends entries copied from another SimpleNode into this node, up to a specified size limit. + /// + /// Copies entries starting from the `other_cursor` index in `other` node and appends them + /// to the current node (`this`). Copying stops when either the source node runs out of entries + /// starting from the cursor, or the occupied size of the current node reaches `upto_size`, + /// or the current node runs out of available entry slots. + /// + /// @param o The source BtreeNode (expected to be the same variant as this) to copy entries from. + /// @param other_cursor [in, out] The starting index within `other` node to begin copying. + /// This cursor is advanced by the number of entries successfully copied. + /// @param upto_size The target maximum occupied size for the current node after appending. + /// @param copy_only_if_fits Should the copy happen only if all entries from cursor till end fits to `this` node. + /// + /// @return If any entries have been copied. + /// @note Assumes appropriate node locks are held externally. 
+    virtual bool append_copy_in_upto_size(const BtreeNode& other_node, uint32_t& other_cursor, uint32_t upto_size,
+                                          bool copy_only_if_fits) = 0;
+
+#if 0
+    virtual uint32_t copy_by_size(const BtreeNode& other_node, uint32_t start_idx, uint32_t size) = 0;
+    virtual uint32_t copy_by_entries(const BtreeNode& other_node, uint32_t start_idx, uint32_t nentries) = 0;
+    virtual uint32_t num_entries_by_size(uint32_t start_idx, uint32_t size) const = 0;
+#endif

     virtual uint32_t available_size() const = 0;
     virtual bool has_room_for_put(btree_put_type put_type, uint32_t key_size, uint32_t value_size) const = 0;
-    virtual uint32_t num_entries_by_size(uint32_t start_idx, uint32_t size) const = 0;
+    virtual uint32_t get_entries_size(uint32_t start_idx, uint32_t end_idx) const = 0;

     virtual int compare_nth_key(const BtreeKey& cmp_key, uint32_t ind) const = 0;
     virtual void get_nth_key_internal(uint32_t ind, BtreeKey& out_key, bool copykey) const = 0;
@@ -472,19 +563,44 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > {
     }

 public:
-    void update_phys_buf(uint8_t* buf) {
-        m_phys_node_buf = buf;
-        on_update_phys_buf();
+    uint8_t* share_phys_node_buf() {
+        uint8_t* old_phys_buf{nullptr};
+        auto share_count = m_phys_buf_share_count.load();
+        if (share_count != 0) {
+            // Buffer was already shared with another party, we need to make a copy and share the new one
+            auto new_buf = Allocator::get(m_token).alloc_node_buf(node_size());
+            std::memcpy(new_buf, m_phys_node_buf, node_size());
+            old_phys_buf = m_phys_node_buf;
+            m_phys_node_buf = new_buf;
+        }
+        share_count = m_phys_buf_share_count.fetch_add(1);
+        if (old_phys_buf && (share_count == 0)) {
+            // We checked that the buffer was shared and actually copied it, but before we incremented the
+            // counter, release_phys_node_buf() was called and reduced the count to 1. If that's the case, we
+            // have 2 unshared buffers and one of them has to be freed
+            Allocator::get(m_token).free_node_buf(old_phys_buf);
+        }
+        return m_phys_node_buf;
+    }
+
+    void release_phys_node_buf(uint8_t* buf) {
+        auto const cur_count = m_phys_buf_share_count.fetch_sub(1);
+        if (cur_count > 1) {
+            // After sharing, the phys_node_buf was copied and modified, so this release is not for the buf that
+            // is currently held and we have to free the buf
+            DEBUG_ASSERT_NE((void*)buf, (void*)m_phys_node_buf,
+                            "We are asked to release current version buf, but with shared count more than 1, which "
+                            "means there is some out-of-order release going on");
+            Allocator::get(m_token).free_node_buf(buf);
+        }
     }
-    // This method is called when the physical buffer is updated.
-    // Derived classes can override this method to perform additional actions.
- virtual void on_update_phys_buf() = 0; - persistent_hdr_t* get_persistent_header() { return r_cast< persistent_hdr_t* >(m_phys_node_buf); } - const persistent_hdr_t* get_persistent_header_const() const { - return r_cast< const persistent_hdr_t* >(m_phys_node_buf); + + PersistentHeader* get_persistent_header() { return r_cast< PersistentHeader* >(m_phys_node_buf); } + const PersistentHeader* get_persistent_header_const() const { + return r_cast< const PersistentHeader* >(m_phys_node_buf); } - uint8_t* node_data_area() { return (m_phys_node_buf + sizeof(persistent_hdr_t)); } - const uint8_t* node_data_area_const() const { return (m_phys_node_buf + sizeof(persistent_hdr_t)); } + uint8_t* node_data_area() { return (m_phys_node_buf + sizeof(PersistentHeader)); } + const uint8_t* node_data_area_const() const { return (m_phys_node_buf + sizeof(PersistentHeader)); } uint8_t magic() const { return get_persistent_header_const()->magic; } void set_magic() { get_persistent_header()->magic = BTREE_NODE_MAGIC; } @@ -495,6 +611,7 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { void set_node_id(bnodeid_t id) { get_persistent_header()->node_id = id; } bnodeid_t node_id() const { return get_persistent_header_const()->node_id; } + int64_t get_modified_cp_id() const { return get_persistent_header_const()->modified_cp_id; } void set_checksum() { get_persistent_header()->checksum = crc16_t10dif(bt_init_crc_16, node_data_area_const(), node_data_size()); @@ -511,18 +628,15 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { } void set_total_entries(uint32_t n) { get_persistent_header()->nentries = n; } - void inc_entries() { ++get_persistent_header()->nentries; } - void dec_entries() { --get_persistent_header()->nentries; } - - void add_entries(uint32_t addn) { get_persistent_header()->nentries += addn; } - void sub_entries(uint32_t subn) { get_persistent_header()->nentries -= subn; } + void add_entries(uint32_t addn = 1u) { get_persistent_header()->nentries += addn; } + void sub_entries(uint32_t subn = 1u) { get_persistent_header()->nentries -= subn; } void set_leaf(bool leaf) { get_persistent_header()->leaf = leaf; } void set_node_type(btree_node_type t) { get_persistent_header()->node_type = uint32_cast(t); } void set_node_size(uint32_t size) { get_persistent_header()->node_size = s_cast< uint16_t >(size - 1); } uint64_t node_gen() const { return get_persistent_header_const()->node_gen; } uint32_t node_size() const { return s_cast< uint32_t >(get_persistent_header_const()->node_size) + 1; } - uint32_t node_data_size() const { return node_size() - sizeof(persistent_hdr_t); } + uint32_t node_data_size() const { return node_size() - sizeof(PersistentHeader); } void inc_gen() { get_persistent_header()->node_gen++; } void set_gen(uint64_t g) { get_persistent_header()->node_gen = g; } @@ -536,13 +650,7 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { BtreeLinkInfo link_info() const { return BtreeLinkInfo{node_id(), link_version()}; } virtual uint32_t occupied_size() const { return (node_data_size() - available_size()); } - bool is_merge_needed(const BtreeConfig& cfg) const { - if (level() > cfg.m_max_merge_level) { return false; } -#ifdef _PRERELEASE - if (min_keys_in_node()) { return total_entries() < min_keys_in_node(); } -#endif - return (occupied_size() < cfg.suggested_min_size()); - } + bool is_merge_needed(const BtreeConfig& cfg) const { return (occupied_size() < cfg.suggested_min_size()); } bnodeid_t next_bnode() const { return 
get_persistent_header_const()->next_node; } void set_next_bnode(bnodeid_t b) { get_persistent_header()->next_node = b; } @@ -566,17 +674,16 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { friend void intrusive_ptr_add_ref(BtreeNode* node) { node->m_refcount.increment(1); } friend void intrusive_ptr_release(BtreeNode* node) { - if (node->m_refcount.decrement_testz(1)) { delete node; } + if (node->m_refcount.decrement_testz(1)) { + // Do not delete it here, since node is generally an offset inside actual allocation and delete will fail + // here (with asan). So let the on_node_freed from the underlying store delete the allocation. + if (Allocator::get(node->m_token).free_btree_node) { + Allocator::get(node->m_token).free_btree_node(node); + } else { + delete node; + } + } } }; -struct btree_locked_node_info { - BtreeNode* node; - Clock::time_point start_time; - const char* fname; - int line; - - void dump() const { LOGINFO("node locked by file: {}, line: {}", fname, line); } -}; - } // namespace homestore diff --git a/src/include/homestore/btree/detail/btree_node_mgr.ipp b/src/include/homestore/btree/detail/btree_node_mgr.ipp index e5e74e2b0..688d49003 100644 --- a/src/include/homestore/btree/detail/btree_node_mgr.ipp +++ b/src/include/homestore/btree/detail/btree_node_mgr.ipp @@ -16,406 +16,162 @@ #pragma once #include -#include -#include -#include +#include +#include +#include #include // #include #include namespace homestore { - -#define lock_node(a, b, c) _lock_node(a, b, c, __FILE__, __LINE__) - -template < typename K, typename V > -btree_status_t Btree< K, V >::create_root_node(void* op_context) { - // Assign one node as root node and also create a child leaf node and set it as edge - BtreeNodePtr root = alloc_leaf_node(); - if (root == nullptr) { return btree_status_t::space_not_avail; } - - root->set_level(0u); - auto ret = write_node(root, op_context); - if (ret != btree_status_t::success) { - free_node(root, locktype_t::NONE, op_context); - return btree_status_t::space_not_avail; - } - - m_root_node_info = BtreeLinkInfo{root->node_id(), root->link_version()}; - ret = on_root_changed(root, op_context); - if (ret != btree_status_t::success) { - free_node(root, locktype_t::NONE, op_context); - m_root_node_info = BtreeLinkInfo{}; - } - return ret; -} - -/* - * It reads the node and take a lock of the node. - */ -template < typename K, typename V > -btree_status_t Btree< K, V >::read_and_lock_node(bnodeid_t id, BtreeNodePtr& node_ptr, locktype_t int_lock_type, - locktype_t leaf_lock_type, void* context) const { - auto ret = read_node_impl(id, node_ptr); - if (node_ptr == nullptr) { - BT_LOG(ERROR, "read failed, reason: {}", ret); - return ret; - } - - auto acq_lock = (node_ptr->is_leaf()) ? 
leaf_lock_type : int_lock_type; - ret = lock_node(node_ptr, acq_lock, context); - if (ret != btree_status_t::success) { BT_LOG(ERROR, "Node lock and refresh failed"); } - - return ret; -} - -template < typename K, typename V > -btree_status_t Btree< K, V >::get_child_and_lock_node(const BtreeNodePtr& node, uint32_t index, - BtreeLinkInfo& child_info, BtreeNodePtr& child_node, - locktype_t int_lock_type, locktype_t leaf_lock_type, - void* context) const { - if (index == node->total_entries()) { - if (!node->has_valid_edge()) { - BT_NODE_LOG_ASSERT(false, node, "Child index {} does not have valid bnode_id", index); - return btree_status_t::not_found; - } - child_info = node->get_edge_value(); - } else { - BT_NODE_LOG_ASSERT_LT(index, node->total_entries(), node); - node->get_nth_value(index, &child_info, false /* copy */); - } - - return (read_and_lock_node(child_info.bnode_id(), child_node, int_lock_type, leaf_lock_type, context)); -} - -template < typename K, typename V > -btree_status_t Btree< K, V >::write_node(const BtreeNodePtr& node, void* context) { - COUNTER_INCREMENT_IF_ELSE(m_metrics, node->is_leaf(), btree_leaf_node_writes, btree_int_node_writes, 1); - HISTOGRAM_OBSERVE_IF_ELSE(m_metrics, node->is_leaf(), btree_leaf_node_occupancy, btree_int_node_occupancy, - ((m_node_size - node->available_size()) * 100) / m_node_size); - - return (write_node_impl(node, context)); -} - -/* Caller of this api doesn't expect read to fail in any circumstance */ -template < typename K, typename V > -void Btree< K, V >::read_node_or_fail(bnodeid_t id, BtreeNodePtr& node) const { - BT_NODE_REL_ASSERT_EQ(read_node_impl(id, node), btree_status_t::success, node); +template < typename T, typename... Args > +static BtreeNode* do_create_node(BtreeNode::Allocator::Token token, Args&&... args) { + uint8_t* ptr = BtreeNode::Allocator::get(token).alloc_btree_node(sizeof(T)); + T* node = new (ptr) T(std::forward< Args >(args)..., token); + return dynamic_cast< BtreeNode* >(node); } -/* - * This function upgrades the parent node and child node locks from read lock to write lock and take required steps if - * things have changed during the upgrade. - * - * Inputs: - * parent_node - Parent Node to upgrade - * child_node - Child Node to upgrade - * child_cur_lock - Current child node which is held - * context - Context to pass down - * - * Returns - If successfully able to upgrade both the nodes, return success, else return status of upgrade_node. - * In case of not success, all nodes locks are released. - * - * NOTE: This function expects both the parent_node and child_node to be already locked. Parent node is - * expected to be read locked and child node could be either read or write locked. - */ -template < typename K, typename V > -btree_status_t Btree< K, V >::upgrade_node_locks(const BtreeNodePtr& parent_node, const BtreeNodePtr& child_node, - locktype_t& parent_cur_lock, locktype_t& child_cur_lock, - void* context) { - btree_status_t ret = btree_status_t::success; - - auto const parent_prev_gen = parent_node->node_gen(); - auto const child_prev_gen = child_node->node_gen(); +template < typename K, typename V, typename... Args > +static BtreeNode* do_form_node(btree_node_type node_type, BtreeNode::Allocator::Token token, Args&&... 
args) { + BtreeNode* n{nullptr}; + switch (node_type) { + case btree_node_type::VAR_OBJECT: + n = do_create_node< VarObjSizeNode< K, V > >(token, std::forward< Args >(args)...); + break; - unlock_node(child_node, child_cur_lock); - unlock_node(parent_node, parent_cur_lock); + case btree_node_type::FIXED: + n = do_create_node< SimpleNode< K, V > >(token, std::forward< Args >(args)...); + break; - ret = lock_node(parent_node, locktype_t::WRITE, context); - if (ret != btree_status_t::success) { - parent_cur_lock = child_cur_lock = locktype_t::NONE; - return ret; - } + case btree_node_type::VAR_VALUE: + n = do_create_node< VarValueSizeNode< K, V > >(token, std::forward< Args >(args)...); + break; - ret = lock_node(child_node, locktype_t::WRITE, context); - if (ret != btree_status_t::success) { - unlock_node(parent_node, locktype_t::WRITE); - parent_cur_lock = child_cur_lock = locktype_t::NONE; - return ret; - } + case btree_node_type::VAR_KEY: + n = do_create_node< VarKeySizeNode< K, V > >(token, std::forward< Args >(args)...); + break; - // If the node things have been changed between unlock and lock example, it has been made invalid (probably by merge - // nodes) ask caller to start over again. - if(parent_prev_gen != parent_node->node_gen() || child_prev_gen != child_node->node_gen()) { - COUNTER_INCREMENT(m_metrics, btree_num_pc_gen_mismatch, 1); - } - if (parent_node->is_node_deleted() || (parent_prev_gen != parent_node->node_gen()) || - child_node->is_node_deleted() || (child_prev_gen != child_node->node_gen())) { - unlock_node(child_node, locktype_t::WRITE); - unlock_node(parent_node, locktype_t::WRITE); - parent_cur_lock = child_cur_lock = locktype_t::NONE; - return btree_status_t::retry; - } + case btree_node_type::FIXED_PREFIX: + n = do_create_node< FixedPrefixNode< K, V > >(token, std::forward< Args >(args)...); + break; - parent_cur_lock = child_cur_lock = locktype_t::WRITE; -#if 0 -#ifdef _PRERELEASE - { - auto time = iomgr_flip::instance()->get_test_flip< uint64_t >("btree_upgrade_delay"); - if (time) { std::this_thread::sleep_for(std::chrono::microseconds{time.get()}); } + default: + RELEASE_ASSERT(false, "Unsupported node type {}", node_type); + break; } -#endif -#endif + return n; +} #if 0 -#ifdef _PRERELEASE - { - int is_leaf = 0; - - if (child_node && child_node->is_leaf()) { is_leaf = 1; } - if (iomgr_flip::instance()->test_flip("btree_upgrade_node_fail", is_leaf)) { - unlock_node(my_node, cur_lock); - cur_lock = locktype_t::NONE; - if (child_node) { - unlock_node(child_node, child_cur_lock); - child_cur_lock = locktype_t::NONE; - } - ret = btree_status_t::retry; - } - } -#endif -#endif - - return ret; -} +template < typename V, typename... Args > +static BtreeNode* do_form_node(btree_node_type node_type, BtreeNode::Allocator::Token token, Args&&... args) { + BtreeNode* n{nullptr}; + switch (node_type) { + case btree_node_type::VAR_OBJECT: + n = is_leaf ? do_create_node< VarObjSizeNode< K, V > >(token, std::forward< Args >(args)...) + : do_create_node< VarObjSizeNode< K, BtreeLinkInfo > >(token, std::forward< Args >(args)...); + break; -template < typename K, typename V > -btree_status_t Btree< K, V >::upgrade_node_lock(const BtreeNodePtr& node, locktype_t& cur_lock, void* context) { - auto const prev_gen = node->node_gen(); + case btree_node_type::FIXED: + n = is_leaf ? do_create_node< SimpleNode< K, V > >(token, std::forward< Args >(args)...) 
+ : do_create_node< SimpleNode< K, BtreeLinkInfo > >(token, std::forward< Args >(args)...); + break; - unlock_node(node, cur_lock); - cur_lock = locktype_t::NONE; + case btree_node_type::VAR_VALUE: + n = is_leaf ? do_create_node< VarValueSizeNode< K, V > >(token, std::forward< Args >(args)...) + : do_create_node< VarValueSizeNode< K, BtreeLinkInfo > >(token, std::forward< Args >(args)...); + break; - auto ret = lock_node(node, locktype_t::WRITE, context); - if (ret != btree_status_t::success) { return ret; } - if(prev_gen != node->node_gen()) { - COUNTER_INCREMENT(m_metrics, btree_num_gen_mismatch, 1); - } - if (node->is_node_deleted() || (prev_gen != node->node_gen())) { - unlock_node(node, locktype_t::WRITE); - return btree_status_t::retry; - } - cur_lock = locktype_t::WRITE; - return ret; -} + case btree_node_type::VAR_KEY: + n = is_leaf ? do_create_node< VarKeySizeNode< K, V > >(token, std::forward< Args >(args)...) + : do_create_node< VarKeySizeNode< K, BtreeLinkInfo > >(token, std::forward< Args >(args)...); + break; -template < typename K, typename V > -btree_status_t Btree< K, V >::_lock_node(const BtreeNodePtr& node, locktype_t type, void* context, const char* fname, - int line) const { - _start_of_lock(node, type, fname, line); - node->lock(type); + case btree_node_type::PREFIX: + n = is_leaf ? do_create_node< FixedPrefixNode< K, V > >(token, std::forward< Args >(args)...) + : do_create_node< FixedPrefixNode< K, BtreeLinkInfo > >(token, std::forward< Args >(args)...); + break; - auto ret = refresh_node(node, (type == locktype_t::WRITE), context); - if (ret != btree_status_t::success) { - node->unlock(type); - end_of_lock(node, type); - return ret; + default: + RELEASE_ASSERT(false, "Unsupported node type {}", node_type); + break; } - - return btree_status_t::success; -} - -template < typename K, typename V > -void Btree< K, V >::unlock_node(const BtreeNodePtr& node, locktype_t type) const { - node->unlock(type); - auto time_spent = end_of_lock(node, type); - observe_lock_time(node, type, time_spent); + return n; } +#endif template < typename K, typename V > -BtreeNodePtr Btree< K, V >::alloc_leaf_node() { - BtreeNodePtr n = alloc_node(true /* is_leaf */); - if (n) { - COUNTER_INCREMENT(m_metrics, btree_leaf_node_count, 1); - ++m_total_leaf_nodes; +BtreeNodePtr Btree< K, V >::new_node(bnodeid_t id, bool is_leaf, BtreeNode::Allocator::Token token) const { + BtreeNodePtr n; + if (is_leaf) { + n = BtreeNodePtr{do_form_node< K, V >(m_bt_cfg.leaf_node_type(), token, id, is_leaf, m_bt_cfg.node_size())}; + } else { + n = BtreeNodePtr{ + do_form_node< K, BtreeLinkInfo >(m_bt_cfg.interior_node_type(), token, id, is_leaf, m_bt_cfg.node_size())}; } return n; } template < typename K, typename V > -BtreeNodePtr Btree< K, V >::alloc_interior_node() { - BtreeNodePtr n = alloc_node(false /* is_leaf */); - if (n) { - COUNTER_INCREMENT(m_metrics, btree_int_node_count, 1); - ++m_total_interior_nodes; +BtreeNodePtr Btree< K, V >::load_node(uint8_t* node_buf, bnodeid_t id, BtreeNode::Allocator::Token token) const { + BtreeNodePtr n; + if (BtreeNode::identify_leaf_node(node_buf)) { + n = BtreeNodePtr{do_form_node< K, V >(m_bt_cfg.leaf_node_type(), token, node_buf, id)}; + } else { + n = BtreeNodePtr{do_form_node< K, BtreeLinkInfo >(m_bt_cfg.interior_node_type(), token, node_buf, id)}; } return n; } -template < typename T, typename... Args > -static BtreeNode* create_node(Args&&... 
args) { - return dynamic_cast< BtreeNode* >(new T(std::forward< Args >(args)...)); -} - +#if 0 template < typename K, typename V > -BtreeNode* Btree< K, V >::init_node(uint8_t* node_buf, bnodeid_t id, bool init_buf, bool is_leaf) const { +BtreeNode* Btree< K, V >::load_node(uint8_t* node_buf, bnodeid_t id, bool init_buf, bool is_leaf, + BtreeNode::Allocator::Token token) const { BtreeNode* n{nullptr}; btree_node_type node_type = is_leaf ? m_bt_cfg.leaf_node_type() : m_bt_cfg.interior_node_type(); switch (node_type) { case btree_node_type::VAR_OBJECT: - n = is_leaf ? create_node< VarObjSizeNode< K, V > >(node_buf, id, init_buf, true, this->m_bt_cfg) - : create_node< VarObjSizeNode< K, BtreeLinkInfo > >(node_buf, id, init_buf, false, this->m_bt_cfg); + n = is_leaf + ? do_create_node< VarObjSizeNode< K, V > >(token, node_buf, id, init_buf, true, m_bt_cfg.node_size()) + : do_create_node< VarObjSizeNode< K, BtreeLinkInfo > >(token, node_buf, id, init_buf, false, + m_bt_cfg.node_size()); break; case btree_node_type::FIXED: - n = is_leaf ? create_node< SimpleNode< K, V > >(node_buf, id, init_buf, true, this->m_bt_cfg) - : create_node< SimpleNode< K, BtreeLinkInfo > >(node_buf, id, init_buf, false, this->m_bt_cfg); + n = is_leaf ? do_create_node< SimpleNode< K, V > >(token, node_buf, id, init_buf, true, m_bt_cfg.node_size()) + : do_create_node< SimpleNode< K, BtreeLinkInfo > >(token, node_buf, id, init_buf, false, + m_bt_cfg.node_size()); break; case btree_node_type::VAR_VALUE: n = is_leaf - ? create_node< VarValueSizeNode< K, V > >(node_buf, id, init_buf, true, this->m_bt_cfg) - : create_node< VarValueSizeNode< K, BtreeLinkInfo > >(node_buf, id, init_buf, false, this->m_bt_cfg); + ? do_create_node< VarValueSizeNode< K, V > >(token, node_buf, id, init_buf, true, m_bt_cfg.node_size()) + : do_create_node< VarValueSizeNode< K, BtreeLinkInfo > >(token, node_buf, id, init_buf, false, + m_bt_cfg.node_size()); break; case btree_node_type::VAR_KEY: - n = is_leaf ? create_node< VarKeySizeNode< K, V > >(node_buf, id, init_buf, true, this->m_bt_cfg) - : create_node< VarKeySizeNode< K, BtreeLinkInfo > >(node_buf, id, init_buf, false, this->m_bt_cfg); + n = is_leaf + ? do_create_node< VarKeySizeNode< K, V > >(token, node_buf, id, init_buf, true, m_bt_cfg.node_size()) + : do_create_node< VarKeySizeNode< K, BtreeLinkInfo > >(token, node_buf, id, init_buf, false, + m_bt_cfg.node_size()); break; case btree_node_type::PREFIX: - n = is_leaf ? create_node< FixedPrefixNode< K, V > >(node_buf, id, init_buf, true, this->m_bt_cfg) - : create_node< FixedPrefixNode< K, BtreeLinkInfo > >(node_buf, id, init_buf, false, this->m_bt_cfg); + n = is_leaf + ? do_create_node< FixedPrefixNode< K, V > >(token, node_buf, id, init_buf, true, m_bt_cfg.node_size()) + : do_create_node< FixedPrefixNode< K, BtreeLinkInfo > >(token, node_buf, id, init_buf, false, + m_bt_cfg.node_size()); break; default: BT_REL_ASSERT(false, "Unsupported node type {}", node_type); break; } - return n; -} - -/* Note:- This function assumes that access of this node is thread safe. 
*/ -template < typename K, typename V > -void Btree< K, V >::free_node(const BtreeNodePtr& node, locktype_t cur_lock, void* context) { - BT_NODE_LOG(TRACE, node, "Freeing node"); - - COUNTER_DECREMENT_IF_ELSE(m_metrics, node->is_leaf(), btree_leaf_node_count, btree_int_node_count, 1); - if (cur_lock != locktype_t::NONE) { - BT_NODE_DBG_ASSERT_NE(cur_lock, locktype_t::READ, node, "We can't free a node with read lock type right?"); - node->set_node_deleted(); - unlock_node(node, cur_lock); - } - node->is_leaf()?--m_total_leaf_nodes:--m_total_interior_nodes; - free_node_impl(node, context); - // intrusive_ptr_release(node.get()); -} - -template < typename K, typename V > -void Btree< K, V >::observe_lock_time(const BtreeNodePtr& node, locktype_t type, uint64_t time_spent) const { - if (time_spent == 0) { return; } - - if (type == locktype_t::READ) { - HISTOGRAM_OBSERVE_IF_ELSE(m_metrics, node->is_leaf(), btree_inclusive_time_in_leaf_node, - btree_inclusive_time_in_int_node, time_spent); - } else { - HISTOGRAM_OBSERVE_IF_ELSE(m_metrics, node->is_leaf(), btree_exclusive_time_in_leaf_node, - btree_exclusive_time_in_int_node, time_spent); - } -} - -template < typename K, typename V > -void Btree< K, V >::_start_of_lock(const BtreeNodePtr& node, locktype_t ltype, const char* fname, int line) { - btree_locked_node_info info; - -#ifndef NDEBUG - info.fname = fname; - info.line = line; -#endif - - info.start_time = Clock::now(); - info.node = node.get(); - if (ltype == locktype_t::WRITE) { - bt_thread_vars()->wr_locked_nodes.push_back(info); - LOGTRACEMOD(btree, "ADDING node {} to write locked nodes list, its size={}", info.node->node_id(), - bt_thread_vars()->wr_locked_nodes.size()); - } else if (ltype == locktype_t::READ) { - bt_thread_vars()->rd_locked_nodes.push_back(info); - LOGTRACEMOD(btree, "ADDING node {} to read locked nodes list, its size={}", info.node->node_id(), - bt_thread_vars()->rd_locked_nodes.size()); - } else { - DEBUG_ASSERT(false, "Invalid locktype_t {}", ltype); - } -} - -template < typename K, typename V > -bool Btree< K, V >::remove_locked_node(const BtreeNodePtr& node, locktype_t ltype, btree_locked_node_info* out_info) { - auto pnode_infos = - (ltype == locktype_t::WRITE) ? &bt_thread_vars()->wr_locked_nodes : &bt_thread_vars()->rd_locked_nodes; - - if (!pnode_infos->empty()) { - auto info = pnode_infos->back(); - if (info.node == node.get()) { - *out_info = info; - pnode_infos->pop_back(); - LOGTRACEMOD(btree, "REMOVING node {} from {} locked nodes list, its size = {}",info.node->node_id(), - (ltype == locktype_t::WRITE) ? "write" : "read", pnode_infos->size()); - return true; - } else if (pnode_infos->size() > 1) { - info = pnode_infos->at(pnode_infos->size() - 2); - if (info.node == node.get()) { - *out_info = info; - pnode_infos->at(pnode_infos->size() - 2) = pnode_infos->back(); - pnode_infos->pop_back(); - LOGTRACEMOD(btree, "REMOVING node {} from {} locked nodes list, its size = {}", info.node->node_id(), - (ltype == locktype_t::WRITE) ? 
"write" : "read", pnode_infos->size()); - return true; - } - } - } - -#ifndef NDEBUG - if (pnode_infos->empty()) { - LOGERRORMOD(btree, "locked_node_list: node = {} not found, locked node list empty", (void*)node.get()); - } else if (pnode_infos->size() == 1) { - LOGERRORMOD(btree, "locked_node_list: node = {} not found, total list count = 1, Expecting node = {}", - (void*)node.get(), (void*)pnode_infos->back().node); - } else { - LOGERRORMOD(btree, "locked_node_list: node = {} not found, total list count = {}, Expecting nodes = {} or {}", - (void*)node.get(), pnode_infos->size(), (void*)pnode_infos->back().node, - (void*)pnode_infos->at(pnode_infos->size() - 2).node); - } -#endif - return false; -} - -template < typename K, typename V > -uint64_t Btree< K, V >::end_of_lock(const BtreeNodePtr& node, locktype_t ltype) { - btree_locked_node_info info; - if (!remove_locked_node(node, ltype, &info)) { - DEBUG_ASSERT(false, "Expected node = {} is not there in locked_node_list", node->node_id()); - return 0; - } - // DEBUG_ASSERT_EQ(node.get(), info.node); - return get_elapsed_time_ns(info.start_time); -} - -#ifndef NDEBUG -template < typename K, typename V > -void Btree< K, V >::check_lock_debug() { - // both wr_locked_nodes and rd_locked_nodes are thread_local; - // nothing will be dumpped if there is no assert failure; - for (const auto& x : bt_thread_vars()->wr_locked_nodes) { - x.dump(); - } - for (const auto& x : bt_thread_vars()->rd_locked_nodes) { - x.dump(); - } - DEBUG_ASSERT_EQ(bt_thread_vars()->wr_locked_nodes.size(), 0); - DEBUG_ASSERT_EQ(bt_thread_vars()->rd_locked_nodes.size(), 0); + return n; } #endif - } // namespace homestore diff --git a/src/include/homestore/btree/detail/btree_query_impl.ipp b/src/include/homestore/btree/detail/btree_query_impl.ipp index 8d21c26b7..55196ca4c 100644 --- a/src/include/homestore/btree/detail/btree_query_impl.ipp +++ b/src/include/homestore/btree/detail/btree_query_impl.ipp @@ -18,9 +18,59 @@ namespace homestore { +template < typename K, typename V > +btree_status_t Btree< K, V >::query(BtreeQueryRequest< K >& qreq, std::vector< std::pair< K, V > >& out_values) { + COUNTER_INCREMENT(m_metrics, btree_query_ops_count, 1); + + btree_status_t ret = btree_status_t::success; + if (qreq.batch_size() == 0) { return ret; } + + m_btree_lock.lock_shared(); + BtreeNodePtr root = nullptr; + ret = read_and_lock_node(m_root_node_info.bnode_id(), root, locktype_t::READ, locktype_t::READ, qreq.m_op_context); + if (ret != btree_status_t::success) { goto out; } + + switch (qreq.query_type()) { + case BtreeQueryType::SWEEP_NON_INTRUSIVE_PAGINATION_QUERY: + ret = do_sweep_query(root, qreq, out_values); + break; + + case BtreeQueryType::TREE_TRAVERSAL_QUERY: + ret = do_traversal_query(root, qreq, out_values); + break; + + default: + unlock_node(root, locktype_t::READ); + LOGERROR("Query type {} is not supported yet", qreq.query_type()); + break; + } + + if ((qreq.query_type() == BtreeQueryType::SWEEP_NON_INTRUSIVE_PAGINATION_QUERY || + qreq.query_type() == BtreeQueryType::TREE_TRAVERSAL_QUERY)) { + if (out_values.size()) { + K out_last_key = out_values.back().first; + if (out_last_key.compare(qreq.input_range().end_key()) >= 0) { ret = btree_status_t::success; } + qreq.shift_working_range(std::move(out_last_key), false /* non inclusive*/); + } else { + DEBUG_ASSERT_NE(ret, btree_status_t::has_more, "Query returned has_more, but no values added") + } + } + +out: + m_btree_lock.unlock_shared(); +#ifndef NDEBUG + check_lock_debug(); +#endif + if ((ret != 
btree_status_t::success) && (ret != btree_status_t::has_more)) { + BT_LOG(ERROR, "btree query failed {}", ret); + COUNTER_INCREMENT(m_metrics, query_err_cnt, 1); + } + return ret; +} + template < typename K, typename V > btree_status_t Btree< K, V >::do_sweep_query(BtreeNodePtr& my_node, BtreeQueryRequest< K >& qreq, - std::vector< std::pair< K, V > >& out_values) const { + std::vector< std::pair< K, V > >& out_values) { btree_status_t ret = btree_status_t::success; if (my_node->is_leaf()) { BT_NODE_DBG_ASSERT_GT(qreq.batch_size(), 0, my_node); @@ -40,7 +90,7 @@ btree_status_t Btree< K, V >::do_sweep_query(BtreeNodePtr& my_node, BtreeQueryRe start_ind, end_ind, &out_values, qreq.filter()); count += cur_count; - if (qreq.route_tracing) { + if (qreq.m_route_tracing) { append_route_trace(qreq, my_node, btree_event_t::READ, start_ind, start_ind + cur_count); } @@ -69,7 +119,7 @@ btree_status_t Btree< K, V >::do_sweep_query(BtreeNodePtr& my_node, BtreeQueryRe BtreeLinkInfo start_child_info; [[maybe_unused]] const auto [isfound, idx] = my_node->find(qreq.first_key(), &start_child_info, false); ASSERT_IS_VALID_INTERIOR_CHILD_INDX(isfound, idx, my_node); - if (qreq.route_tracing) { append_route_trace(qreq, my_node, btree_event_t::READ, idx, idx); } + if (qreq.m_route_tracing) { append_route_trace(qreq, my_node, btree_event_t::READ, idx, idx); } BtreeNodePtr child_node; ret = read_and_lock_node(start_child_info.bnode_id(), child_node, locktype_t::READ, locktype_t::READ, @@ -81,7 +131,7 @@ btree_status_t Btree< K, V >::do_sweep_query(BtreeNodePtr& my_node, BtreeQueryRe template < typename K, typename V > btree_status_t Btree< K, V >::do_traversal_query(const BtreeNodePtr& my_node, BtreeQueryRequest< K >& qreq, - std::vector< std::pair< K, V > >& out_values) const { + std::vector< std::pair< K, V > >& out_values) { btree_status_t ret = btree_status_t::success; uint32_t idx; @@ -93,7 +143,7 @@ btree_status_t Btree< K, V >::do_traversal_query(const BtreeNodePtr& my_node, Bt auto cur_count = to_variant_node(my_node)->multi_get(qreq.working_range(), qreq.batch_size() - uint32_cast(out_values.size()), start_ind, end_ind, &out_values, qreq.filter()); - if (qreq.route_tracing) { + if (qreq.m_route_tracing) { append_route_trace(qreq, my_node, btree_event_t::READ, start_ind, start_ind + cur_count); } unlock_node(my_node, locktype_t::READ); @@ -117,7 +167,7 @@ btree_status_t Btree< K, V >::do_traversal_query(const BtreeNodePtr& my_node, Bt BT_NODE_LOG_ASSERT_LE(start_idx, end_idx, my_node); idx = start_idx; - if (qreq.route_tracing) { append_route_trace(qreq, my_node, btree_event_t::READ, start_idx, end_idx); } + if (qreq.m_route_tracing) { append_route_trace(qreq, my_node, btree_event_t::READ, start_idx, end_idx); } while (idx <= end_idx) { BtreeLinkInfo child_info; my_node->get_nth_value(idx, &child_info, false); diff --git a/src/include/homestore/btree/detail/btree_remove_impl.ipp b/src/include/homestore/btree/detail/btree_remove_impl.ipp index ce2954706..04e483377 100644 --- a/src/include/homestore/btree/detail/btree_remove_impl.ipp +++ b/src/include/homestore/btree/detail/btree_remove_impl.ipp @@ -17,6 +17,75 @@ #include namespace homestore { +template < typename K, typename V > +template < typename ReqT > +btree_status_t Btree< K, V >::remove(ReqT& req) { + static_assert(std::is_same_v< ReqT, BtreeSingleRemoveRequest > || + std::is_same_v< ReqT, BtreeRangeRemoveRequest< K > > || + std::is_same_v< ReqT, BtreeRemoveAnyRequest< K > >, + "remove api is called with non remove request type"); + + locktype_t 
acq_lock = locktype_t::READ;
+    m_btree_lock.lock_shared();
+
+retry:
+    btree_status_t ret = btree_status_t::success;
+    auto cpg = bt_cp_guard();
+    req.m_op_context = cpg.context(cp_consumer_t::INDEX_SVC);
+
+    BtreeNodePtr root;
+    ret = read_and_lock_node(m_root_node_info.bnode_id(), root, acq_lock, acq_lock, req.m_op_context);
+    if (ret == btree_status_t::cp_mismatch) {
+        goto retry;
+    } else if (ret != btree_status_t::success) {
+        m_btree_lock.unlock_shared(); // Release the shared btree lock taken above before bailing out
+        goto out;
+    }
+
+    if (root->total_entries() == 0) {
+        if (root->is_leaf()) {
+            // There are no entries in btree.
+            unlock_node(root, acq_lock);
+            m_btree_lock.unlock_shared();
+            ret = btree_status_t::not_found;
+            goto out;
+        }
+
+        BT_NODE_LOG_ASSERT_EQ(root->has_valid_edge(), true, root, "Orphaned root with no entries and no edge");
+        unlock_node(root, acq_lock);
+        m_btree_lock.unlock_shared();
+
+        ret = check_collapse_root(req);
+        if (ret != btree_status_t::success && ret != btree_status_t::merge_not_required &&
+            ret != btree_status_t::cp_mismatch) {
+            LOGERROR("check_collapse_root failed btree name {}", m_bt_cfg.name());
+            goto out;
+        }
+
+        // We must have gotten a new root, need to start from scratch.
+        m_btree_lock.lock_shared();
+        goto retry;
+    } else if (root->is_leaf() && (acq_lock != locktype_t::WRITE)) {
+        // Root is a leaf, need to take a write lock instead of a read lock, so retry
+        unlock_node(root, acq_lock);
+        acq_lock = locktype_t::WRITE;
+        goto retry;
+    } else {
+        ret = do_remove(root, acq_lock, req);
+        if ((ret == btree_status_t::retry) || (ret == btree_status_t::cp_mismatch)) {
+            // Need to start from top down again, since nodes were merged in-between
+            acq_lock = locktype_t::READ;
+            goto retry;
+        }
+    }
+    m_btree_lock.unlock_shared();
+
+out:
+#ifndef NDEBUG
+    check_lock_debug();
+#endif
+    return ret;
+}
+
 template < typename K, typename V >
 template < typename ReqT >
 btree_status_t Btree< K, V >::do_remove(const BtreeNodePtr& my_node, locktype_t curlock, ReqT& req) {
@@ -34,8 +103,7 @@ btree_status_t Btree< K, V >::do_remove(const BtreeNodePtr& my_node, locktype_t
     if constexpr (std::is_same_v< ReqT, BtreeSingleRemoveRequest >) {
         if ((modified = my_node->remove_one(req.key(), nullptr, req.m_outval))) { ++removed_count; }
     } else if constexpr (std::is_same_v< ReqT, BtreeRangeRemoveRequest< K > >) {
-        removed_count =
-            to_variant_node(my_node)->multi_remove(req.working_range(), req.m_filter_cb, req.m_app_context);
+        removed_count = to_variant_node(my_node)->multi_remove(req.working_range(), req.m_filter_cb);
         modified = (removed_count != 0);
         req.shift_working_range();
     } else if constexpr (std::is_same_v< ReqT, BtreeRemoveAnyRequest< K > >) {
@@ -47,7 +115,7 @@ btree_status_t Btree< K, V >::do_remove(const BtreeNodePtr& my_node, locktype_t
     if (modified) {
         write_node(my_node, req.m_op_context);
         COUNTER_DECREMENT(m_metrics, btree_obj_count, removed_count);
-        if (req.route_tracing) { append_route_trace(req, my_node, btree_event_t::REMOVE); }
+        if (req.m_route_tracing) { append_route_trace(req, my_node, btree_event_t::REMOVE); }
     }

     unlock_node(my_node, curlock);
@@ -86,7 +154,7 @@ retry:
            end_idx = start_idx = (end_idx - start_idx) / 2; // Pick the middle, TODO: Ideally we need to pick random
        }

-        if (req.route_tracing) { append_route_trace(req, my_node, btree_event_t::READ, start_idx, end_idx); }
+        if (req.m_route_tracing) { append_route_trace(req, my_node, btree_event_t::READ, start_idx, end_idx); }

        curr_idx = start_idx;
        while (curr_idx <= end_idx) {
            BtreeLinkInfo child_info;
@@ -114,7 +182,7 @@ retry:
                unlock_lambda(child_node, child_cur_lock);
                goto 
out_return; } else if (ret == btree_status_t::success) { - if (req.route_tracing) { append_route_trace(req, child_node, btree_event_t::MERGE); } + if (req.m_route_tracing) { append_route_trace(req, child_node, btree_event_t::MERGE); } unlock_lambda(child_node, child_cur_lock); COUNTER_INCREMENT(m_metrics, btree_merge_count, 1); goto retry; @@ -200,18 +268,17 @@ btree_status_t Btree< K, V >::check_collapse_root(ReqT& req) { goto done; } - ret = on_root_changed(child, req.m_op_context); + ret = m_bt_private->on_root_changed(child, req.m_op_context); if (ret != btree_status_t::success) { unlock_node(child, locktype_t::WRITE); unlock_node(root, locktype_t::WRITE); goto done; } - if (req.route_tracing) { append_route_trace(req, root, btree_event_t::MERGE); } + if (req.m_route_tracing) { append_route_trace(req, root, btree_event_t::MERGE); } - free_node(root, locktype_t::WRITE, req.m_op_context); + remove_node(root, locktype_t::WRITE, req.m_op_context); m_root_node_info = child->link_info(); - this->m_btree_depth = child->level(); unlock_node(child, locktype_t::WRITE); COUNTER_DECREMENT(m_metrics, btree_depth, 1); @@ -222,7 +289,146 @@ done: template < typename K, typename V > btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const BtreeNodePtr& leftmost_node, - uint32_t start_idx, uint32_t end_idx, void* context) { + uint32_t start_idx, uint32_t end_idx, CPContext* context) { + if (!m_bt_cfg.m_merge_turned_on) { return btree_status_t::merge_not_required; } + + auto read_child_node = [this, context](BtreeNodePtr const& parent_node, uint32_t idx, + BtreeNodePtr& child_node) -> btree_status_t { + if (idx == parent_node->total_entries()) { + BT_NODE_LOG_ASSERT(parent_node->has_valid_edge(), parent_node, + "Assertion failure, expected valid edge for parent node"); + } + + BtreeLinkInfo child_info; + parent_node->get_nth_value(idx, &child_info, false /* copy */); + + auto const ret = + read_and_lock_node(child_info.bnode_id(), child_node, locktype_t::WRITE, locktype_t::WRITE, context); + if (ret == btree_status_t::success) { BT_NODE_LOG_ASSERT_EQ(child_node->is_node_deleted(), false, child_node); } + return ret; + }; + + auto erase_last_node_in_list = [this, &leftmost_node](BtreeNodeList& list, bool node_removal, CPContext* context) { + auto& node = list.back(); + if (node_removal) { + remove_node(node, locktype_t::NONE, context); + } else { + unlock_node(node, locktype_t::WRITE); + } + list.erase(list.end() - 1); + }; + + btree_status_t ret{btree_status_t::success}; + BtreeNodeList old_nodes; + BtreeNodeList new_nodes; + old_nodes.reserve(3); + new_nodes.reserve(3); + + // Loop variables + BtreeNodePtr old_node{nullptr}; + BtreeNodePtr cloned_new_node{clone_temp_node(*leftmost_node)}; + BtreeNodePtr new_node{cloned_new_node}; + uint32_t idx = start_idx + 1; + uint32_t src_cursor{0}; + bool dst_filled{false}; + BtreeNodePtr last_new_node{nullptr}; + bnodeid_t next_node_id; + + while (idx <= end_idx) { + if (old_node == nullptr) { + ret = read_child_node(parent_node, idx, old_node); + if (ret != btree_status_t::success) { goto out; } + + old_nodes.push_back(old_node); + src_cursor = 0; + } + + if (new_node == nullptr) { + new_node = leftmost_node->is_leaf() ? create_leaf_node(context) : create_interior_node(context); + new_nodes.emplace_back(new_node); + } + + if (idx == end_idx) { + // Special handling for the last node, we will do the merge of last old node, only if that node can be + // completely placed/appended into the new node. 
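+            // A hedged walk-through (the exact contract of append_copy_in_upto_size is assumed here: it copies
+            // entries from the source starting at src_cursor, advances src_cursor, stops at the given byte limit,
+            // and with copy_only_if_fits=true copies nothing unless every remaining source entry fits). For example:
+            //   new_node holds [a b c] at 70% of ideal_fill_size; old_node holds [x y z] needing another 40%
+            //   -> copied == false: old_node is left intact and merely unlocked, and if the trailing new_node is
+            //      still empty it is discarded instead of being committed.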
+            auto const copied = new_node->append_copy_in_upto_size(*old_node, src_cursor, m_bt_cfg.ideal_fill_size(),
+                                                                   /*copy_only_if_fits=*/true);
+            if (!copied) {
+                // The last old node doesn't fit fully into the new node. It is possible that the previous old nodes
+                // fit into one new node and we created a second new node, but the last old node doesn't completely
+                // fit into that last new node and hence nothing was moved. In that case, we can skip the empty new
+                // node.
+                if (new_node->total_entries() == 0) {
+                    erase_last_node_in_list(new_nodes, /*node_removal=*/true, context);
+                }
+                erase_last_node_in_list(old_nodes, /*node_removal=*/false, context);
+            }
+            break;
+        } else {
+            new_node->append_copy_in_upto_size(*old_node, src_cursor, m_bt_cfg.ideal_fill_size(),
+                                               /*copy_only_if_fits=*/false);
+            if (src_cursor == old_node->total_entries()) {
+                // We have copied all the entries from old_node, so we can move onto the next old node
+                old_node = nullptr;
+                ++idx;
+            } else {
+                // Looks like we have filled the new node, so we need to create a new one
+                new_node = nullptr;
+            }
+        }
+    }
+
+    // We commit the merge only if we actually remove at least 1 node by merging.
+    if (new_nodes.size() >= old_nodes.size()) {
+        ret = btree_status_t::merge_not_required;
+        goto out;
+    }
+
+    // Remove excess entries from the parent node
+    parent_node->remove(start_idx + new_nodes.size() + 1, start_idx + old_nodes.size());
+
+    // Update all the new node entries to parent and while iterating update their node links.
+    idx = start_idx + new_nodes.size();
+    next_node_id = old_nodes.back()->next_bnode();
+    for (auto it = new_nodes.rbegin(); it != new_nodes.rend(); ++it) {
+        (*it)->set_next_bnode(next_node_id);
+        auto this_node_id = (*it)->node_id();
+        if ((*it)->total_entries()) {
+            parent_node->update(idx--, (*it)->get_last_key< K >(), BtreeLinkInfo{this_node_id, 0});
+        }
+        next_node_id = this_node_id;
+    }
+
+    // We need to copy the cloned node back to leftmost_node and update it with the latest next node
+    leftmost_node->overwrite(*cloned_new_node);
+    leftmost_node->set_next_bnode(next_node_id);
+    if (leftmost_node->total_entries()) {
+        leftmost_node->inc_link_version();
+        parent_node->update(start_idx, leftmost_node->get_last_key< K >(), leftmost_node->link_info());
+    }
+
+    ret = m_bt_private->transact_nodes(new_nodes, old_nodes, leftmost_node, parent_node, context);
+
+out:
+    // Do free/unlock based on success/failure in reverse order
+    if (ret != btree_status_t::success) {
+        for (auto it = old_nodes.rbegin(); it != old_nodes.rend(); ++it) {
+            unlock_node(*it, locktype_t::WRITE);
+        }
+
+        for (auto it = new_nodes.rbegin(); it != new_nodes.rend(); ++it) {
+            remove_node(*it, locktype_t::NONE, context);
+        }
+    }
+
+    return ret;
+}
+
+#if 0
+template < typename K, typename V >
+btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const BtreeNodePtr& leftmost_node,
+                                          uint32_t start_idx, uint32_t end_idx, CPContext* context) {
     if (!m_bt_cfg.m_merge_turned_on) { return btree_status_t::merge_not_required; }
     btree_status_t ret{btree_status_t::success};
     BtreeNodeList old_nodes;
@@ -232,10 +438,6 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const
     uint32_t balanced_size{0};
     int32_t available_size{0};
     uint32_t num_nodes{0};
-    uint32_t expected_holes{0};
-    uint32_t expected_tail{0};
-    uint32_t init_holes{0};
-    uint32_t init_tail{0};
 
     struct _leftmost_src_info {
         std::vector< uint32_t > ith_nodes;
@@ -252,11 +454,6 @@ 
btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const _src_cursor_info src_cursor; total_size = leftmost_node->occupied_size(); - uint32_t expected_entities = leftmost_node->total_entries(); -#ifdef _PRERELEASE - const uint64_t max_keys = leftmost_node->max_keys_in_node(); -#endif - for (auto indx = start_idx + 1; indx <= end_idx; ++indx) { if (indx == parent_node->total_entries()) { BT_NODE_LOG_ASSERT(parent_node->has_valid_edge(), parent_node, @@ -285,10 +482,6 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const // Only option is to rebalance the nodes across. If we are asked not to do so, skip it. if (!m_bt_cfg.m_rebalance_turned_on) { ret = btree_status_t::merge_not_required; - BT_NODE_LOG( - DEBUG, parent_node, - "MERGE disqualified for parent node {} leftmost_node {}! num_nodes {} is more than old_nodes.size() {}", - parent_node->to_string(), leftmost_node->to_string(), num_nodes, old_nodes.size()); goto out; } } @@ -297,10 +490,6 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const if (leftmost_node->occupied_size() > balanced_size) { // If for some reason balancing increases the current size, give up. // TODO: Is this a real case, isn't happening would mean some sort of bug in calculation of is_merge_needed? - BT_NODE_LOG( - DEBUG, parent_node, - "MERGE disqualified for parent node {} leftmost_node {}! current size {} is more than balanced size {}", - parent_node->to_string(), leftmost_node->to_string(), leftmost_node->occupied_size(), balanced_size); BT_NODE_DBG_ASSERT(false, leftmost_node, "Didn't expect current size is more than balanced size without rebalancing"); ret = btree_status_t::merge_not_required; @@ -308,51 +497,23 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const } // First try to see how many entries you can fit in the leftmost node within the balanced size. We are checking - // leftmost node as special case without moving, because that is the only node which is modified in-place and hence - // doing a dry run and if for some reason there is a problem in balancing the nodes, then it is easy to give up. + // leftmost node as special case without moving, because that is the only node which is modified in-place and + // hence doing a dry run and if for some reason there is a problem in balancing the nodes, then it is easy to + // give up. available_size = static_cast< int32_t >(balanced_size) - leftmost_node->occupied_size(); - if (leftmost_node->get_node_type() == btree_node_type::PREFIX) { - auto cur_node = static_cast< FixedPrefixNode< K, V >* >(leftmost_node.get()); - expected_holes = cur_node->num_prefix_holes(); - init_holes = expected_holes; - expected_tail = cur_node->cprefix_header()->tail_slot; - init_tail = expected_tail; - } src_cursor.ith_node = old_nodes.size(); for (uint32_t i{0}; (i < old_nodes.size() && available_size >= 0); ++i) { leftmost_src.ith_nodes.push_back(i); - // TODO: check whether value size of the node is greater than available_size? If so nentries is 0. Suppose if a - // node contains one entry and the value size is much bigger than available size - auto nentries = old_nodes[i]->num_entries_by_size(0, available_size); - -#ifdef _PRERELEASE - if (max_keys) { - if (expected_entities + nentries > max_keys) { - nentries = max_keys - expected_entities; - expected_entities = max_keys; - } else { - expected_entities += nentries; - } - } -#endif - + // TODO: check whether value size of the node is greater than available_size? 
If so nentries is 0. Suppose
+        // if a node contains one entry and the value size is much bigger than available size
+        auto const nentries = old_nodes[i]->num_entries_by_size(0, available_size);
         if ((old_nodes[i]->total_entries() - nentries) == 0) { // Entire node goes in
+            available_size -= old_nodes[i]->occupied_size();
+            // For prefix nodes, compaction will shrink the node, so we can add the compaction saving back to the
+            // available size; hence it cannot go negative.
             if (old_nodes[i]->get_node_type() == btree_node_type::PREFIX) {
                 auto cur_node = static_cast< FixedPrefixNode< K, V >* >(old_nodes[i].get());
-                auto c_used_slot = cur_node->cprefix_header()->used_slots;
-                expected_holes = c_used_slot > init_holes ? 0 : (expected_holes - c_used_slot);
-                expected_tail = init_tail + (expected_holes > 0 ? 0 : (c_used_slot - init_holes));
-                BT_NODE_DBG_ASSERT_EQ(expected_tail >= init_tail, true, leftmost_node,
-                                      "Expected tail {} is not greater than initial tail {}", expected_tail, init_tail);
-                auto prefix_increased_size =
-                    (expected_tail - init_tail) * FixedPrefixNode< K, V >::prefix_entry::size();
-                auto suffix_increased_size = cur_node->total_entries() * FixedPrefixNode< K, V >::suffix_entry::size();
-
-                available_size -= (prefix_increased_size + suffix_increased_size);
-                init_holes = expected_holes;
-                init_tail = expected_tail;
-            } else {
-                available_size -= old_nodes[i]->occupied_size();
+                available_size += cur_node->compact_saving();
             }
             BT_NODE_DBG_ASSERT_EQ(available_size >= 0, true, leftmost_node, "negative available size");
             if (i >= old_nodes.size() - 1) {
@@ -374,7 +535,7 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const
     available_size = 0;
     while (src_cursor.ith_node < old_nodes.size()) {
         if (available_size == 0) {
-            new_node.reset(leftmost_node->is_leaf() ? alloc_leaf_node().get() : alloc_interior_node().get());
+            new_node = leftmost_node->is_leaf() ? create_leaf_node(context) : create_interior_node(context);
             if (new_node == nullptr) {
                 ret = btree_status_t::merge_failed;
                 goto out;
@@ -385,7 +546,7 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const
         }
 
         auto& old_ith_node = old_nodes[src_cursor.ith_node];
-        auto const nentries = new_node->copy_by_size(m_bt_cfg, *old_ith_node, src_cursor.nth_entry, available_size);
+        auto const nentries = new_node->copy_by_size(*old_ith_node, src_cursor.nth_entry, available_size);
         total_size -= new_node->occupied_size();
         if (old_ith_node->total_entries() == (src_cursor.nth_entry + nentries)) {
             // Copied entire node
@@ -393,9 +554,9 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const
             src_cursor.nth_entry = 0;
             available_size = balanced_size - new_node->occupied_size();
         } else {
-            // If it is the last node supposed to be, check if the remaining entries can be copied and not creating a
-            // new nodes. This will make the last new node a little skewed from balanced size due to large key/values
-            // but avoid making extra new node.
+            // If this is supposed to be the last node, check if the remaining entries can be copied without
+            // creating a new node. This will make the last new node a little skewed from the balanced size due to
+            // large key/values, but it avoids making an extra new node.
             if (new_nodes.size() == num_nodes - 1 && total_size < new_node->available_size()) {
                 available_size = new_node->available_size();
                 src_cursor.nth_entry += nentries;
@@ -411,37 +572,28 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const
     // better merge next time.
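+    // A small hypothetical illustration of the bail-out below: merging three 30%-full old nodes into one 90%-full
+    // new node makes new_nodes.size() (1) < old_nodes.size() (3), so the merge proceeds; whereas rebalancing three
+    // 80%-full nodes would again produce three nodes, freeing nothing, and we return merge_not_required.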
if (new_nodes.size() > old_nodes.size()) {
         ret = btree_status_t::merge_not_required;
-        BT_NODE_LOG(
-            DEBUG, parent_node,
-            "MERGE disqualified for parent node {} leftmost_node {}! new nodes size {} is more than old nodes size {}",
-            parent_node->to_string(), leftmost_node->to_string(), new_nodes.size(), old_nodes.size());
         goto out;
     }
 
-    // There is a case where we are rebalancing and the second node which rebalanced didn't move any size, in that case
-    // the first node is going to be exactly same and we will do again merge, so bail out here.
-    if ((new_nodes.size() == old_nodes.size()) && (old_nodes[0]->occupied_size() == new_nodes[0]->occupied_size())) {
+    // There is a case where we are rebalancing and the second node which was rebalanced didn't move any size; in
+    // that case the first node is going to be exactly the same and we would merge again, so bail out here.
+    if ((new_nodes.size() == old_nodes.size()) && (old_nodes[0]->occupied_size() >= new_nodes[0]->occupied_size())) {
         ret = btree_status_t::merge_not_required;
-        BT_NODE_LOG(DEBUG, parent_node,
-                    "MERGE disqualified for parent node {} leftmost_node {}! old nodes occupied size {} is more than "
-                    "as new nodes occupied size {}",
-                    parent_node->to_string(), leftmost_node->to_string(), old_nodes[0]->occupied_size(),
-                    new_nodes[0]->occupied_size());
         goto out;
     }
 
     if (!K::is_fixed_size()) {
         // we first calculate the least amount of space being released after removing excess children. the key size
-        // cannot be taken account; so we know for sure that value (i.e., linkinfo) and also its record will be freed.
-        // If the end_idx is the parent's edge, the space is not released eventually.
+        // cannot be taken into account; so we know for sure that the value (i.e., linkinfo) and also its record
+        // will be freed. If the end_idx is the parent's edge, the space is not released eventually.
         auto excess_releasing_nodes =
            old_nodes.size() - new_nodes.size() - (parent_node->total_entries() == end_idx) ? 1 : 0;
         auto minimum_releasing_excess_size = excess_releasing_nodes * (BtreeLinkInfo::get_fixed_size());
 
         // aside from releasing size due to excess node, K::get_max_size is needed for each updating element
-        // at worst case (linkinfo and record remain the same for old and new nodes). The number of updating elements
-        // are the size of the new nodes (the last key of the last new node is not getting updated; hence excluded) plus
-        // the leftmost node.
+        // at worst case (linkinfo and record remain the same for old and new nodes). The number of updating
+        // elements is the size of the new nodes (the last key of the last new node is not getting updated; hence
+        // excluded) plus the leftmost node.
         if (parent_node->available_size() + minimum_releasing_excess_size <
             (1 + new_nodes.size() ? new_nodes.size() - 1 : 0) * K::get_max_size()) {
             BT_NODE_LOG(DEBUG, parent_node,
@@ -456,7 +608,7 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const
     {
         for (uint32_t i{0}; i < leftmost_src.ith_nodes.size(); ++i) {
             auto const idx = leftmost_src.ith_nodes[i];
-            leftmost_node->copy_by_entries(m_bt_cfg, *old_nodes[idx], 0,
+            leftmost_node->copy_by_entries(*old_nodes[idx], 0,
                                            (i == leftmost_src.ith_nodes.size() - 1) ? 
leftmost_src.last_node_upto : std::numeric_limits< uint32_t >::max()); @@ -544,7 +696,7 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const } #endif - ret = transact_nodes(new_nodes, old_nodes, leftmost_node, parent_node, context); + ret = m_bt_private->transact_nodes(new_nodes, old_nodes, leftmost_node, parent_node, context); } out: @@ -556,9 +708,10 @@ out: } for (auto it = new_nodes.rbegin(); it != new_nodes.rend(); ++it) { BT_NODE_LOG(DEBUG, (*it).get(), "Freeing this new node as part of unsuccessful merge"); - free_node(*it, locktype_t::NONE, context); + remove_node(*it, locktype_t::NONE, context); } } return ret; } +#endif } // namespace homestore diff --git a/src/include/homestore/btree/btree_req.hpp b/src/include/homestore/btree/detail/btree_req.hpp similarity index 61% rename from src/include/homestore/btree/btree_req.hpp rename to src/include/homestore/btree/detail/btree_req.hpp index 4e28dec8e..1a4668db2 100644 --- a/src/include/homestore/btree/btree_req.hpp +++ b/src/include/homestore/btree/detail/btree_req.hpp @@ -16,36 +16,37 @@ #pragma once #include #include +#include namespace homestore { struct BtreeRequest; +class CPContext; typedef std::pair< BtreeKey, BtreeValue > btree_kv_t; // Base class for any btree operations struct BtreeRequest { - BtreeRequest() = default; - BtreeRequest(void* app_ctx, void* op_ctx) : m_app_context{app_ctx}, m_op_context{op_ctx} {} - - void enable_route_tracing() { - route_tracing = std::make_unique< std::vector< trace_route_entry > >(); - route_tracing->reserve(8); + BtreeRequest(BtreeBase& btree, bool enable_tracing) : m_btree{btree} { + if (enable_tracing) { + m_route_tracing = std::make_unique< std::vector< trace_route_entry > >(); + m_route_tracing->reserve(8); + } } std::string route_string() const { std::string out; - if (route_tracing) { - fmt::format_to(std::back_inserter(out), "Route size={}\n", route_tracing->size()); - for (const auto& r : *route_tracing) { + if (m_route_tracing) { + fmt::format_to(std::back_inserter(out), "Route size={}\n", m_route_tracing->size()); + for (const auto& r : *m_route_tracing) { fmt::format_to(std::back_inserter(out), "{}\n", r.to_string()); } } return out; } - void* m_app_context{nullptr}; - void* m_op_context{nullptr}; - std::unique_ptr< std::vector< trace_route_entry > > route_tracing{nullptr}; + BtreeBase& m_btree; + CPContext* m_op_context{nullptr}; + std::unique_ptr< std::vector< trace_route_entry > > m_route_tracing{nullptr}; }; // Base class for all range related operations @@ -71,8 +72,9 @@ struct BtreeRangeRequest : public BtreeRequest { } protected: - BtreeRangeRequest(BtreeKeyRange< K >&& input_range, void* app_context = nullptr, uint32_t batch_size = UINT32_MAX) : - BtreeRequest{app_context, nullptr}, m_search_state{std::move(input_range)}, m_batch_size{batch_size} {} + BtreeRangeRequest(BtreeBase& btree, BtreeKeyRange< K >&& input_range, uint32_t batch_size = UINT32_MAX, + bool enable_tracing = false) : + BtreeRequest{btree, enable_tracing}, m_search_state{std::move(input_range)}, m_batch_size{batch_size} {} private: BtreeTraversalState< K > m_search_state; @@ -80,14 +82,22 @@ struct BtreeRangeRequest : public BtreeRequest { }; /////////////////////////// 1: Put Operations ///////////////////////////////////// -ENUM(put_filter_decision, uint8_t, keep, replace, remove); -using put_filter_cb_t = std::function< put_filter_decision(BtreeKey const&, BtreeValue const&, BtreeValue const&) >; - struct BtreeSinglePutRequest : public BtreeRequest { public: - 
BtreeSinglePutRequest(const BtreeKey* k, const BtreeValue* v, btree_put_type put_type, + BtreeSinglePutRequest(BtreeBase& btree, const BtreeKey* k, const BtreeValue* v, btree_put_type put_type, BtreeValue* existing_val = nullptr, put_filter_cb_t filter_cb = nullptr) : - m_k{k}, m_v{v}, m_put_type{put_type}, m_existing_val{existing_val}, m_filter_cb{std::move(filter_cb)} {} + BtreeRequest{btree, btree.route_tracer().is_enabled_for(BtreeRouteTracer::Op::PUT)}, + m_k{k}, + m_v{v}, + m_put_type{put_type}, + m_existing_val{existing_val}, + m_filter_cb{std::move(filter_cb)} {} + + ~BtreeSinglePutRequest() { + if (this->m_route_tracing) { + this->m_btree.route_tracer().append_to(BtreeRouteTracer::Op::PUT, this->route_string()); + } + } const BtreeKey& key() const { return *m_k; } const BtreeValue& value() const { return *m_v; } @@ -102,14 +112,21 @@ struct BtreeSinglePutRequest : public BtreeRequest { template < typename K > struct BtreeRangePutRequest : public BtreeRangeRequest< K > { public: - BtreeRangePutRequest(BtreeKeyRange< K >&& inp_range, btree_put_type put_type, const BtreeValue* value, - void* app_context = nullptr, uint32_t batch_size = std::numeric_limits< uint32_t >::max(), + BtreeRangePutRequest(BtreeBase& btree, BtreeKeyRange< K >&& inp_range, btree_put_type put_type, + const BtreeValue* value, uint32_t batch_size = std::numeric_limits< uint32_t >::max(), put_filter_cb_t filter_cb = nullptr) : - BtreeRangeRequest< K >(std::move(inp_range), app_context, batch_size), + BtreeRangeRequest< K >{btree, std::move(inp_range), batch_size, + btree.route_tracer().is_enabled_for(BtreeRouteTracer::Op::PUT)}, m_put_type{put_type}, m_newval{value}, m_filter_cb{std::move(filter_cb)} {} + ~BtreeRangePutRequest() { + if (this->m_route_tracing) { + this->m_btree.route_tracer().append_to(BtreeRouteTracer::Op::PUT, this->route_string()); + } + } + const btree_put_type m_put_type{btree_put_type::UPDATE}; const BtreeValue* m_newval; put_filter_cb_t m_filter_cb; @@ -118,7 +135,16 @@ struct BtreeRangePutRequest : public BtreeRangeRequest< K > { /////////////////////////// 2: Remove Operations ///////////////////////////////////// struct BtreeSingleRemoveRequest : public BtreeRequest { public: - BtreeSingleRemoveRequest(const BtreeKey* k, BtreeValue* out_val) : m_k{k}, m_outval{out_val} {} + BtreeSingleRemoveRequest(BtreeBase& btree, const BtreeKey* k, BtreeValue* out_val) : + BtreeRequest{btree, btree.route_tracer().is_enabled_for(BtreeRouteTracer::Op::REMOVE)}, + m_k{k}, + m_outval{out_val} {} + + ~BtreeSingleRemoveRequest() { + if (this->m_route_tracing) { + this->m_btree.route_tracer().append_to(BtreeRouteTracer::Op::REMOVE, this->route_string()); + } + } const BtreeKey& key() const { return *m_k; } const BtreeValue& value() const { return *m_outval; } @@ -130,32 +156,56 @@ struct BtreeSingleRemoveRequest : public BtreeRequest { template < typename K > struct BtreeRemoveAnyRequest : public BtreeRequest { public: - BtreeRemoveAnyRequest(BtreeKeyRange< K >&& inp_range, BtreeKey* out_key, BtreeValue* out_val) : - m_range{std::move(inp_range)}, m_outkey{out_key}, m_outval{out_val} {} + BtreeRemoveAnyRequest(BtreeBase& btree, BtreeKeyRange< K >&& inp_range, BtreeKey* out_key, BtreeValue* out_val) : + BtreeRequest{btree, btree.route_tracer().is_enabled_for(BtreeRouteTracer::Op::REMOVE)}, + m_range{std::move(inp_range)}, + m_outkey{out_key}, + m_outval{out_val} {} + + ~BtreeRemoveAnyRequest() { + if (this->m_route_tracing) { + this->m_btree.route_tracer().append_to(BtreeRouteTracer::Op::REMOVE, 
this->route_string()); + } + } BtreeKeyRange< K > m_range; BtreeKey* m_outkey; BtreeValue* m_outval; }; -using remove_filter_cb_t = std::function< bool(BtreeKey const&, BtreeValue const&) >; - template < typename K > struct BtreeRangeRemoveRequest : public BtreeRangeRequest< K > { public: remove_filter_cb_t m_filter_cb; public: - BtreeRangeRemoveRequest(BtreeKeyRange< K >&& inp_range, void* app_context = nullptr, + BtreeRangeRemoveRequest(BtreeBase& btree, BtreeKeyRange< K >&& inp_range, uint32_t batch_size = std::numeric_limits< uint32_t >::max(), remove_filter_cb_t filter_cb = nullptr) : - BtreeRangeRequest< K >(std::move(inp_range), app_context, batch_size), m_filter_cb{std::move(filter_cb)} {} + BtreeRangeRequest< K >(btree, std::move(inp_range), batch_size, + btree.route_tracer().is_enabled_for(BtreeRouteTracer::Op::REMOVE)), + m_filter_cb{std::move(filter_cb)} {} + + ~BtreeRangeRemoveRequest() { + if (this->m_route_tracing) { + this->m_btree.route_tracer().append_to(BtreeRouteTracer::Op::REMOVE, this->route_string()); + } + } }; /////////////////////////// 3: Get Operations ///////////////////////////////////// struct BtreeSingleGetRequest : public BtreeRequest { public: - BtreeSingleGetRequest(const BtreeKey* k, BtreeValue* out_val) : m_k{k}, m_outval{out_val} {} + BtreeSingleGetRequest(BtreeBase& btree, const BtreeKey* k, BtreeValue* out_val) : + BtreeRequest{btree, btree.route_tracer().is_enabled_for(BtreeRouteTracer::Op::GET)}, + m_k{k}, + m_outval{out_val} {} + + ~BtreeSingleGetRequest() { + if (this->m_route_tracing) { + this->m_btree.route_tracer().append_to(BtreeRouteTracer::Op::GET, this->route_string()); + } + } const BtreeKey& key() const { return *m_k; } const BtreeValue& value() const { return *m_outval; } @@ -167,8 +217,17 @@ struct BtreeSingleGetRequest : public BtreeRequest { template < typename K > struct BtreeGetAnyRequest : public BtreeRequest { public: - BtreeGetAnyRequest(BtreeKeyRange< K >&& range, BtreeKey* out_key, BtreeValue* out_val) : - m_range{std::move(range)}, m_outkey{out_key}, m_outval{out_val} {} + BtreeGetAnyRequest(BtreeBase& btree, BtreeKeyRange< K >&& range, BtreeKey* out_key, BtreeValue* out_val) : + BtreeRequest{btree, btree.route_tracer().is_enabled_for(BtreeRouteTracer::Op::GET)}, + m_range{std::move(range)}, + m_outkey{out_key}, + m_outval{out_val} {} + + ~BtreeGetAnyRequest() { + if (this->m_route_tracing) { + this->m_btree.route_tracer().append_to(BtreeRouteTracer::Op::GET, this->route_string()); + } + } BtreeKeyRange< K > m_range; BtreeKey* m_outkey; @@ -195,21 +254,24 @@ ENUM(BtreeQueryType, uint8_t, // This is both inefficient and quiet intrusive/unsafe query, where it locks the range // that is being queried for and do not allow any insert or update within that range. It // essentially create a serializable level of isolation. 
- SERIALIZABLE_QUERY) - -using get_filter_cb_t = std::function< bool(BtreeKey const&, BtreeValue const&) >; + SERIALIZABLE_QUERY); template < typename K > struct BtreeQueryRequest : public BtreeRangeRequest< K > { public: - BtreeQueryRequest(BtreeKeyRange< K >&& inp_range, + BtreeQueryRequest(BtreeBase& btree, BtreeKeyRange< K >&& inp_range, BtreeQueryType query_type = BtreeQueryType::SWEEP_NON_INTRUSIVE_PAGINATION_QUERY, - uint32_t batch_size = UINT32_MAX, get_filter_cb_t filter_cb = nullptr, - void* app_context = nullptr) : - BtreeRangeRequest< K >{std::move(inp_range), app_context, batch_size}, + uint32_t batch_size = UINT32_MAX, get_filter_cb_t filter_cb = nullptr) : + BtreeRangeRequest< K >{btree, std::move(inp_range), batch_size, + btree.route_tracer().is_enabled_for(BtreeRouteTracer::Op::QUERY)}, m_query_type{query_type}, m_filter_cb{std::move(filter_cb)} {} - ~BtreeQueryRequest() = default; + + ~BtreeQueryRequest() { + if (this->m_route_tracing) { + this->m_btree.route_tracer().append_to(BtreeRouteTracer::Op::QUERY, this->route_string()); + } + } // virtual bool is_serializable() const = 0; BtreeQueryType query_type() const { return m_query_type; } diff --git a/src/include/homestore/btree/mem_btree.hpp b/src/include/homestore/btree/mem_btree.hpp deleted file mode 100644 index ce606fc5a..000000000 --- a/src/include/homestore/btree/mem_btree.hpp +++ /dev/null @@ -1,86 +0,0 @@ -/********************************************************************************* - * Modifications Copyright 2017-2019 eBay Inc. - * - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software distributed - * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR - * CONDITIONS OF ANY KIND, either express or implied. See the License for the - * specific language governing permissions and limitations under the License. 
- * - *********************************************************************************/ -#pragma once -#ifdef StoreSpecificBtreeNode -#undef StoreSpecificBtreeNode -#endif - -#define StoreSpecificBtreeNode BtreeNode - -#include "btree.ipp" - -namespace homestore { -template < typename K, typename V > -class MemBtree : public Btree< K, V > { -private: - std::vector< std::shared_ptr< uint8_t[] > > node_buf_ptr_vec; - -public: - MemBtree(const BtreeConfig& cfg) : Btree< K, V >(cfg) { - BT_LOG(INFO, "New {} being created: Node size {}", btree_store_type(), cfg.node_size()); - auto const status = this->create_root_node(nullptr); - if (status != btree_status_t::success) { throw std::runtime_error(fmt::format("Unable to create root node")); } - } - - virtual ~MemBtree() { - const auto [ret, free_node_cnt] = this->destroy_btree(nullptr); - BT_LOG_ASSERT_EQ(ret, btree_status_t::success, "btree destroy failed"); - } - - std::string btree_store_type() const override { return "MEM_BTREE"; } - -private: - BtreeNodePtr alloc_node(bool is_leaf) override { - std::shared_ptr< uint8_t[] > ptr(new uint8_t[this->m_bt_cfg.node_size()]); - node_buf_ptr_vec.emplace_back(ptr); - - auto new_node = this->init_node(ptr.get(), bnodeid_t{0}, true, is_leaf); - new_node->set_node_id(bnodeid_t{r_cast< std::uintptr_t >(new_node)}); - new_node->m_refcount.increment(); - return BtreeNodePtr{new_node}; - } - - btree_status_t write_node_impl(const BtreeNodePtr& node, void* context) { return btree_status_t::success; } - - btree_status_t read_node_impl(bnodeid_t id, BtreeNodePtr& node) const override { - node.reset(r_cast< BtreeNode* >(id)); - return btree_status_t::success; - } - - btree_status_t refresh_node(const BtreeNodePtr& node, bool for_read_modify_write, void* context) const override { - return btree_status_t::success; - } - - void free_node_impl(const BtreeNodePtr& node, void* context) override { intrusive_ptr_release(node.get()); } - - btree_status_t transact_nodes(const BtreeNodeList& new_nodes, const BtreeNodeList& freed_nodes, - const BtreeNodePtr& left_child_node, const BtreeNodePtr& parent_node, - void* context) override { - for (const auto& node : new_nodes) { - this->write_node(node, context); - } - this->write_node(left_child_node, context); - this->write_node(parent_node, context); - - for (const auto& node : freed_nodes) { - this->free_node(node, locktype_t::WRITE, context); - } - return btree_status_t::success; - } - - btree_status_t on_root_changed(BtreeNodePtr const&, void*) override { return btree_status_t::success; } -}; -} // namespace homestore diff --git a/src/include/homestore/btree/node_variant/mini_trie_node.hpp b/src/include/homestore/btree/node_variant/mini_trie_node.hpp new file mode 100644 index 000000000..1bdae91f9 --- /dev/null +++ b/src/include/homestore/btree/node_variant/mini_trie_node.hpp @@ -0,0 +1,988 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. 
See the License for the
+ * specific language governing permissions and limitations under the License.
+ *
+ *********************************************************************************/
+
+#pragma once
+
+#include 
+#include 
+#include 
+
+namespace homestore {
+// Internal format of variable node:
+// [Persistent Header][MiniTrieNode header][ArenaBitset][Entry1].. ... [Prefix1][Value][Suffix1][value][Suffix2][value]
+//
+template < typename K, typename V >
+class MiniTrieNode : public VariantNode< K, V > {
+public:
+    using BtreeNode::get_nth_key_internal;
+    using BtreeNode::get_nth_key_size;
+    using BtreeNode::get_nth_obj_size;
+    using BtreeNode::get_nth_value;
+    using BtreeNode::get_nth_value_size;
+    using BtreeNode::node_data_area;
+    using BtreeNode::node_data_area_size;
+    using BtreeNode::node_data_size;
+    using BtreeNode::node_gen;
+    using BtreeNode::to_string;
+    using BtreeNode::total_entries;
+    using VariantNode< K, V >::get_nth_value;
+
+private:
+#pragma pack(1)
+    struct Header {
+        uint16_t tail_offset;  // Offset where kv can be inserted
+        uint16_t hole_size{0}; // How much space are holes
+
+        Header(uint16_t node_data_size) : tail_offset{node_data_size} {}
+        std::string to_string() const { return fmt::format("tail_offset={} hole_size={}", tail_offset, hole_size); }
+    };
+
+    struct Entry {
+        uint16_t prefix_offset; // Where prefix key starts
+        uint16_t prefix_size;   // Size of the prefix key
+        uint16_t suffix_size;   // Size of the suffix key
+        uint16_t value_size;    // Size of the value
+        uint16_t kv_offset;     // Offset of where both suffix key and value is present
+    };
+
+    struct PrefixInfo {
+        uint16_t refcount{1}; // Number of entries pointing to this prefix
+        uint8_t key[1];       // Prefix key
+
+        static uint16_t size(uint16_t key_size) { return key_size + sizeof(PrefixInfo) - 1; }
+
+        PrefixInfo(sisl::blob const& k) { std::memcpy(&key[0], k.cbytes(), k.size()); }
+        std::string to_string() const { return fmt::format("refcount={} key={}", refcount, key); }
+    };
+#pragma pack()
+
+    MiniTrieNode(uint8_t* node_buf, bnodeid_t id, bool init, bool is_leaf, uint32_t node_size,
+                 bool is_temp_node = false) :
+            VariantNode< K, V >(node_buf, id, init, is_leaf, node_size, is_temp_node) {
+        if (init) { new (this->node_data_area()) Header(this->node_data_size()); }
+    }
+
+    virtual ~MiniTrieNode() = default;
+
+    ////////////////////////////////////// All overrides of BtreeNode ///////////////////////////////////
+    btree_status_t insert(uint32_t idx, const BtreeKey& key, const BtreeValue& val) override {
+        uint32_t prev_match_size{0};
+        uint32_t next_match_size{0};
+
+        auto kblob = key.serialize();
+        auto vblob = val.serialize();
+        if (idx > 0) { prev_match_size = check_prefix_match(kblob, nth_entry(idx - 1)); }
+        if ((idx + 1) < total_entries()) { next_match_size = check_prefix_match(kblob, nth_entry(idx + 1)); }
+
+        if (prev_match_size == 0 && next_match_size == 0) {
+            insert_standalone(idx, kblob, vblob);
+        } else if (prev_match_size >= next_match_size) {
+            insert_dependent(idx, idx - 1, prev_match_size, extract_suffix_blob(kblob, prev_match_size), vblob);
+        } else {
+            insert_dependent(idx, idx + 1, next_match_size, extract_suffix_blob(kblob, next_match_size), vblob);
+        }
+        inc_gen();
+        return btree_status_t::success;
+    }
+
+    void update(uint32_t idx, const BtreeValue& val) override {
+        if (update_if_edge(idx, val)) { return; }
+
+        auto new_val_blob = val.serialize();
+        auto const gen{node_gen() + 1};
+
+        Entry* e = nth_entry(idx);
+        auto cur_val_blob = value(e);
+        if (new_val_blob.size() <= 
cur_val_blob.size()) {
+            // Same or smaller size value update, we can just update in-place and free up anything remaining
+            std::memcpy(offset_to_ptr(e->kv_offset + e->suffix_size), new_val_blob.cbytes(), new_val_blob.size());
+            e->value_size = new_val_blob.size();
+            free_space(e->kv_offset + e->suffix_size + e->value_size, cur_val_blob.size() - new_val_blob.size());
+        } else if (new_val_blob.size() <= immediate_available_space()) {
+            // We have enough room at the tail, so we can write into the additional space and free up the
+            // existing one.
+            auto kv_offset = alloc_space(new_val_blob.size() + e->suffix_size);
+            std::memcpy(offset_to_ptr(kv_offset), offset_to_ptr(e->kv_offset), e->suffix_size);
+            std::memcpy(offset_to_ptr(kv_offset + e->suffix_size), new_val_blob.cbytes(), new_val_blob.size());
+            free_space(e->kv_offset, e->suffix_size + e->value_size);
+            e->kv_offset = kv_offset;
+            e->value_size = new_val_blob.size();
+        } else {
+            // We could possibly compact and try the above step, but there is no guarantee that compaction will
+            // generate enough space to first insert an additional value and then remove. So we instead do a get
+            // with copy, remove and insert to make sure there is enough room.
+            K k = get_nth_key(idx, true /* copy */);
+            remove(idx);
+            insert(idx, k, val);
+        }
+        set_gen(gen);
+    }
+
+    void update(uint32_t idx, BtreeKey const& key, BtreeValue const& val) override {
+        DEBUG_ASSERT_LT(idx, this->total_entries(), "Using wrong update method to update edge?");
+        auto const gen{this->node_gen() + 1};
+        remove(idx);
+        insert(idx, key, val);
+        set_gen(gen);
+    }
+
+    void remove(uint32_t idx) override { remove(idx, idx); }
+
+    void remove(uint32_t idx_s, uint32_t idx_e) override {
+        auto const gen{this->node_gen() + 1};
+        if (idx_e == total_entries()) {
+            remove_if_edge(idx_e);
+            if (idx_e-- == 0) { goto done; }
+        }
+
+        for (uint32_t idx{idx_s}; idx <= idx_e; ++idx) {
+            auto e = nth_entry(idx);
+            if (--get_prefix_info(e).refcount == 0) { free_space(e->prefix_offset, PrefixInfo::size(e->prefix_size)); }
+            free_space(e->kv_offset, e->suffix_size + e->value_size);
+        }
+
+        std::memmove(uintptr_cast(nth_entry(idx_s)), uintptr_cast(nth_entry(idx_e + 1)),
+                     (total_entries() - idx_e - 1) * sizeof(Entry));
+        sub_entries(idx_e - idx_s + 1);
+    done:
+        set_gen(gen);
+    }
+
+    void remove_all() override {
+        new (node_data_area()) Header(this->node_data_size()); // Reset the header
+        sub_entries(this->total_entries());
+        invalidate_edge();
+        inc_gen();
+    }
+
+    uint32_t available_size() const override { return immediate_available_space() + header()->hole_size; }
+
+    bool has_room_for_put(btree_put_type put_type, uint32_t key_size, uint32_t value_size) const override {
+        uint32_t needed_size{0};
+        if (put_type == btree_put_type::UPDATE) {
+            needed_size = value_size;
+        } else {
+            needed_size = sizeof(Entry) + PrefixInfo::size(key_size) + value_size;
+        }
+        return (available_size() >= needed_size);
+    }
+
+    void get_nth_key_internal(uint32_t idx, BtreeKey& out_key, bool copy) const override {
+        DEBUG_ASSERT_LT(idx, this->total_entries(), "node={}", to_string());
+        auto prefix_blob = nth_prefix(idx);
+        auto suffix_blob = nth_suffix(idx);
+
+        if (suffix_blob.size() == 0) {
+            out_key.deserialize(prefix_blob, copy);
+        } else {
+            out_key.deserialize(prefix_blob, suffix_blob, copy);
+        }
+    }
+
+    uint32_t get_nth_key_size(uint32_t idx) const override {
+        DEBUG_ASSERT_LT(idx, this->total_entries(), "node={}", to_string());
+        auto e = nth_entry(idx);
+        return e->prefix_size + e->suffix_size;
+    }
+
+    void get_nth_value(uint32_t 
idx, BtreeValue* out_val, bool copy) const override {
+        DEBUG_ASSERT_LT(idx, this->total_entries(), "node={}", to_string());
+        out_val->deserialize(nth_value(idx), copy);
+    }
+
+    uint32_t get_nth_value_size(uint32_t idx) const override {
+        DEBUG_ASSERT_LT(idx, this->total_entries(), "node={}", to_string());
+        return nth_entry(idx)->value_size;
+    }
+
+    uint32_t move_out_to_right_by_entries(BtreeNode& o, uint32_t nentries) override {
+        auto& other = static_cast< MiniTrieNode& >(o);
+        auto const this_gen{this->node_gen() + 1};
+        auto const other_gen{other.node_gen() + 1};
+
+        if (nentries == 0) { return 0; /* Nothing to move */ }
+
+        uint32_t nmoved{0};
+        if (!this->is_leaf() && this->has_valid_edge()) {
+            other.set_edge_info(this->edge_info());
+            this->invalidate_edge();
+            ++nmoved;
+        }
+
+        while ((nmoved < nentries) && (this->total_entries() > 0)) {
+            auto const idx = this->total_entries() - 1;
+            other.insert(0, get_nth_key(idx, false), get_nth_value(idx, false));
+            remove(idx);
+            ++nmoved;
+        }
+
+        this->set_gen(this_gen);
+        other.set_gen(other_gen);
+        return nmoved;
+    }
+
+    uint32_t move_out_to_right_by_size(BtreeNode& o, uint32_t size) override {
+        auto& other = static_cast< MiniTrieNode& >(o);
+        auto const this_gen{this->node_gen() + 1};
+        auto const other_gen{other.node_gen() + 1};
+
+        uint32_t nmoved{0};
+        uint32_t idx{0}; // Declared before the goto, so the jump doesn't cross an initialization
+        if (!this->is_leaf() && this->has_valid_edge()) {
+            other.set_edge_info(this->edge_info());
+            this->invalidate_edge();
+            ++nmoved;
+        }
+
+        if (total_entries() == 0) { goto done; }
+        idx = this->total_entries() - 1;
+        while (idx > 0) {
+            if (available_size() >= size) {
+                // We have enough space on current node after moving some entries
+                break;
+            }
+            other.insert(0, get_nth_key(idx, false), get_nth_value(idx, false));
+            remove(idx--);
+            ++nmoved;
+        }
+
+    done:
+        set_gen(this_gen);
+        other.set_gen(other_gen);
+        return nmoved;
+    }
+
+    uint32_t copy_by_size(BtreeNode const& o, uint32_t start_idx, uint32_t size) override {
+        auto& other = static_cast< const MiniTrieNode& >(o);
+        auto const this_gen{this->node_gen()};
+
+        uint32_t idx = start_idx;
+        uint32_t n = 0;
+        while (idx < other.total_entries()) {
+            auto kblob = other.get_nth_key(idx, false);
+            auto vblob = other.get_nth_value(idx, false);
+
+            // We reached the threshold of how much we could move
+            if ((kblob.size() + vblob.size() + sizeof(Entry)) > size) { break; }
+            size -= (kblob.size() + vblob.size() + sizeof(Entry));
+
+            insert(this->total_entries(), kblob, vblob);
+            ++n;
+            ++idx;
+        }
+        set_gen(this_gen + 1);
+        return n;
+    }
+};
+} // namespace homestore
+
+#if 0
+
+    /* Insert the key and value in provided index
+     * Assumption: Node lock is already taken */
+    btree_status_t insert(uint32_t ind, const BtreeKey& key, const BtreeValue& val) override {
+        LOGTRACEMOD(btree, "{}:{}", key.to_string(), val.to_string());
+        auto sz = insert(ind, key.serialize(), val.serialize());
+#ifndef NDEBUG
+        validate_sanity();
+#endif
+        return (sz == 0) ? btree_status_t::space_not_avail : btree_status_t::success;
+    }
+
+#ifndef NDEBUG
+    void validate_sanity() {
+        uint32_t i{0};
+        // validate if keys are in ascending order
+        K prevKey;
+        while (i < this->total_entries()) {
+            K key = BtreeNode::get_nth_key< K >(i, false);
+            uint64_t kp = *(uint64_t*)key.serialize().bytes();
+            if (i > 0 && prevKey.compare(key) > 0) {
+                DEBUG_ASSERT(false, "Found non sorted entry: {} -> {}", kp, to_string());
+            }
+            prevKey = key;
+            ++i;
+        }
+    }
+#endif
+
+    /* Update a value in a given index to the provided value. It will support change in size of the new value. 
+ * Assumption: Node lock is already taken, size check for the node to support new value is already done */ + void update(uint32_t ind, const BtreeValue& val) override { + // If we are updating the edge value, none of the other logic matter. Just update edge value and move on + if (ind == this->total_entries()) { + DEBUG_ASSERT_EQ(this->is_leaf(), false); + this->set_edge_value(val); + this->inc_gen(); + } else { + K key = BtreeNode::get_nth_key< K >(ind, true); + update(ind, key, val); + } + } + + // TODO - currently we do not support variable size key + void update(uint32_t ind, const BtreeKey& key, const BtreeValue& val) override { + LOGTRACEMOD(btree, "Update called:{}", to_string()); + DEBUG_ASSERT_LE(ind, this->total_entries()); + + // If we are updating the edge value, none of the other logic matter. Just update edge value and move on + if (ind == this->total_entries()) { + DEBUG_ASSERT_EQ(this->is_leaf(), false); + this->set_edge_value(val); + this->inc_gen(); + return; + } + + // Determine if we are doing same size update or smaller size update, in that case, reuse the space. + uint16_t new_obj_size = key.serialized_size() + val.serialized_size(); + uint16_t cur_obj_size = get_nth_obj_size(ind); + + if (cur_obj_size >= new_obj_size) { + uint8_t* key_ptr = (uint8_t*)get_nth_obj(ind); + uint8_t* val_ptr = key_ptr + key.serialized_size(); + sisl::blob kblob = key.serialize(); + sisl::blob vblob = val.serialize(); + + DEBUG_ASSERT_EQ(kblob.size(), key.serialized_size(), + "Key Serialized size returned different after serialization"); + DEBUG_ASSERT_EQ(vblob.size(), val.serialized_size(), + "Value Serialized size returned different after serialization"); + + // we can avoid memcpy if addresses of val_ptr and vblob.bytes is same. In place update + if (key_ptr != kblob.cbytes()) { std::memcpy(key_ptr, kblob.cbytes(), kblob.size()); } + if (val_ptr != vblob.cbytes()) { std::memcpy(val_ptr, vblob.cbytes(), vblob.size()); } + set_nth_key_len(get_nth_record_mutable(ind), kblob.size()); + set_nth_value_len(get_nth_record_mutable(ind), vblob.size()); + get_var_node_header()->m_available_space += cur_obj_size - new_obj_size; + this->inc_gen(); + } else { + remove(ind, ind); + insert(ind, key, val); + LOGTRACEMOD(btree, "Size changed for either key or value. 
Had to delete and insert :{}", to_string()); + } + } + + // ind_s and ind_e are inclusive + void remove(uint32_t ind_s, uint32_t ind_e) override { + uint32_t total_entries = this->total_entries(); + assert(total_entries >= ind_s); + assert(total_entries >= ind_e); + uint32_t recSize = this->get_record_size(); + uint32_t no_of_elem = ind_e - ind_s + 1; + if (ind_e == this->total_entries()) { + assert(!this->is_leaf() && this->has_valid_edge()); + + V last_1_val; + get_nth_value(ind_s - 1, &last_1_val, false); + this->set_edge_value(last_1_val); + + for (uint32_t i = ind_s - 1; i < total_entries; i++) { + get_var_node_header()->m_available_space += get_nth_key_size(i) + get_nth_value_size(i) + recSize; + } + this->sub_entries(total_entries - ind_s + 1); + } else { + // claim available memory + for (uint32_t i = ind_s; i <= ind_e; i++) { + get_var_node_header()->m_available_space += get_nth_key_size(i) + get_nth_value_size(i) + recSize; + } + uint8_t* rec_ptr = get_nth_record_mutable(ind_s); + memmove(rec_ptr, rec_ptr + recSize * no_of_elem, (this->total_entries() - ind_e - 1) * recSize); + + this->sub_entries(no_of_elem); + } + this->inc_gen(); + } + + void remove_all() override { + this->sub_entries(this->total_entries()); + this->invalidate_edge(); + this->inc_gen(); + get_var_node_header()->m_init_available_space = this->node_data_size(); + get_var_node_header()->m_tail_arena_offset = this->node_data_size(); + get_var_node_header()->m_available_space = get_var_node_header()->m_tail_arena_offset - sizeof(var_node_header); +#ifndef NDEBUG + validate_sanity(); +#endif + } + + /*V get(uint32_t ind, bool copy) const { + // Need edge index + if (ind == this->total_entries()) { + assert(!this->is_leaf()); + assert(this->has_valid_edge()); + return this->get_edge_value(); + } else { + return get_nth_value(ind, copy); + } + }*/ + + uint32_t move_out_to_right_by_entries(const BtreeConfig& cfg, BtreeNode& o, uint32_t nentries) override { + auto& other = static_cast< VariableNode& >(o); + const auto this_gen = this->node_gen(); + const auto other_gen = other.node_gen(); + + const auto this_nentries = this->total_entries(); + nentries = std::min(nentries, this_nentries); + if (nentries == 0) { return 0; /* Nothing to move */ } + + const uint32_t start_ind = this_nentries - 1; + const uint32_t end_ind = this_nentries - nentries; + uint32_t ind = start_ind; + bool full_move{false}; + while (ind >= end_ind) { + // Get the ith key and value blob and then remove the entry from here and insert to the other node + sisl::blob const kb{get_nth_obj(ind), get_nth_key_size(ind)}; + sisl::blob const vb{kb.cbytes() + kb.size(), get_nth_value_size(ind)}; + + auto sz = other.insert(0, kb, vb); + if (!sz) { break; } + if (ind == 0) { + full_move = true; + break; + } + --ind; + } + + if (!this->is_leaf() && (other.total_entries() != 0)) { + // Incase this node is an edge node, move the stick to the right hand side node + other.set_edge_info(this->edge_info()); + this->invalidate_edge(); + } + remove(full_move ? 0u : ind + 1, start_ind); // Remove all entries in bulk + + // Remove and insert would have set the gen multiple increments, just reset it to increment only by 1 + // TODO: This is bit ugly but needed in-order to avoid repeat the same code again, but see if we can produce + // interface around it. 
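+        // (For example, if node_gen started at 7, the bulk remove+insert above may have bumped it to 12; pinning
+        // it back to 8 makes the whole move read as a single logical mutation to anyone comparing generations.)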
+ this->set_gen(this_gen + 1); + other.set_gen(other_gen + 1); + + return (start_ind - ind); + } + + uint32_t move_out_to_right_by_size(const BtreeConfig& cfg, BtreeNode& o, uint32_t size_to_move) override { + auto& other = static_cast< VariableNode& >(o); + auto this_gen = this->node_gen(); + auto other_gen = other.node_gen(); + uint32_t nmoved{0}; + + uint32_t ind = this->total_entries() - 1; + while (ind > 0) { + sisl::blob const kb{get_nth_obj(ind), get_nth_key_size(ind)}; + sisl::blob const vb{kb.cbytes() + kb.size(), get_nth_value_size(ind)}; + + if ((kb.size() + vb.size() + this->get_record_size()) > size_to_move) { + // We reached threshold of how much we could move + break; + } + + auto sz = other.insert(0, kb, vb); // Keep on inserting on the first index, thus moving everything to right + + --ind; + ++nmoved; + size_to_move -= sz; + } + remove(ind + 1, this->total_entries() - 1); + + if (!this->is_leaf() && (other.total_entries() != 0)) { + // Incase this node is an edge node, move the stick to the right hand side node + other.set_edge_info(this->edge_info()); + this->invalidate_edge(); + } + + // Remove and insert would have set the gen multiple increments, just reset it to increment only by 1 + // TODO: This is bit ugly but needed in-order to avoid repeat the same code again, but see if we can produce + // interface around it. + this->set_gen(this_gen + 1); + other.set_gen(other_gen + 1); + + return nmoved; + } + + uint32_t num_entries_by_size(uint32_t start_idx, uint32_t size) const override { + auto idx = start_idx; + uint32_t cum_size{0}; + + while (idx < this->total_entries()) { + uint32_t const rec_size = this->get_record_size() + get_nth_key_size(idx) + get_nth_value_size(idx); + cum_size += rec_size; + if (cum_size > size) { break; } + ++idx; + } + + return idx - start_idx; + } + + uint32_t copy_by_size(const BtreeConfig& cfg, const BtreeNode& o, uint32_t start_idx, uint32_t copy_size) override { + auto& other = static_cast< const VariableNode& >(o); + auto this_gen = this->node_gen(); + + auto idx = start_idx; + uint32_t n = 0; + while (idx < other.total_entries()) { + sisl::blob const kb{(uint8_t*)other.get_nth_obj(idx), other.get_nth_key_size(idx)}; + sisl::blob const vb{kb.cbytes() + kb.size(), other.get_nth_value_size(idx)}; + + // We reached threshold of how much we could move + if ((kb.size() + vb.size() + other.get_record_size()) > copy_size) { break; } + + auto sz = insert(this->total_entries(), kb, vb); + if (sz == 0) { break; } + ++n; + ++idx; + copy_size -= sz; + } + this->set_gen(this_gen + 1); + + // If we copied everything from start_idx till end and if its an edge node, need to copy the edge id as well. 
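+        // (Illustration: an interior node with keys [k1 k2] plus a valid edge also covers everything greater than
+        // k2 through the edge pointer; copying the entries alone would silently drop that right-most subtree,
+        // hence the edge info is carried over below.)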
+ if (other.has_valid_edge() && ((start_idx + n) == other.total_entries())) { + this->set_edge_info(other.edge_info()); + } + return n; + } + + uint32_t copy_by_entries(const BtreeConfig& cfg, const BtreeNode& o, uint32_t start_idx, + uint32_t nentries) override { + auto& other = static_cast< const VariableNode& >(o); + auto this_gen = this->node_gen(); + + nentries = std::min(nentries, other.total_entries() - start_idx); + auto idx = start_idx; + uint32_t n = 0; + while (n < nentries) { + sisl::blob const kb{other.get_nth_obj(idx), other.get_nth_key_size(idx)}; + sisl::blob const vb{kb.cbytes() + kb.size(), other.get_nth_value_size(idx)}; + + auto sz = insert(this->total_entries(), kb, vb); + if (sz == 0) { break; } + ++n; + ++idx; + } + this->set_gen(this_gen + 1); + + // If we copied everything from start_idx till end and if its an edge node, need to copy the edge id as well. + if (other.has_valid_edge() && ((start_idx + n) == other.total_entries())) { + this->set_edge_info(other.edge_info()); + } + return n; + } + + /*uint32_t move_in_from_right_by_entries(const BtreeConfig& cfg, BtreeNode& o, uint32_t nentries) override { + auto& other = static_cast< VariableNode& >(o); + auto this_gen = this->node_gen(); + auto other_gen = other.node_gen(); + nentries = std::min(nentries, other.total_entries()); + + if (nentries == 0) { return 0; } + uint32_t other_ind = 0; + while (nentries) { + // Get the ith key and value blob and then remove the entry from here and insert to the other node + sisl::blob kb; + kb.bytes = (uint8_t*)other.get_nth_obj(other_ind); + kb.size = other.get_nth_key_size(other_ind); + + sisl::blob vb; + vb.bytes = kb.bytes + kb.size; + vb.size = other.get_nth_value_size(other_ind); + + auto sz = insert(this->total_entries(), kb, vb); + if (!sz) { break; } + --nentries; + ++other_ind; + } + + other.remove(0, other_ind - 1); // Remove all entries in bulk + assert(other.total_entries() == nentries); + + if (!other.is_leaf() && (other.total_entries() == 0)) { + // Incase other node is an edge node and we moved all the data into this node, move over the edge info as + // well. + this->set_edge_id(other.edge_id()); + other.invalidate_edge(); + } + + // Remove and insert would have set the gen multiple increments, just reset it to increment only by 1 + // TODO: This is bit ugly but needed in-order to avoid repeat the same code again, but see if we can produce + // interface around it. + this->set_gen(this_gen + 1); + other.set_gen(other_gen + 1); + + return (other_ind); + } + + uint32_t move_in_from_right_by_size(const BtreeConfig& cfg, BtreeNode& o, uint32_t size_to_move) override { + auto& other = static_cast< VariableNode& >(o); + uint32_t moved_size = 0U; + auto this_gen = this->node_gen(); + auto other_gen = other.node_gen(); + + uint32_t ind = 0; + while (ind < this->total_entries()) { + sisl::blob kb; + kb.bytes = (uint8_t*)other.get_nth_obj(ind); + kb.size = other.get_nth_key_size(ind); + + sisl::blob vb; + vb.bytes = kb.bytes + kb.size; + vb.size = other.get_nth_value_size(ind); + + if ((kb.size + vb.size + other.get_record_size()) > size_to_move) { + // We reached threshold of how much we could move + break; + } + auto sz = insert(this->total_entries(), kb, vb); // Keep on inserting on the last index. 
+ if (!sz) break; + moved_size += sz; + ind++; + size_to_move -= sz; + } + if (ind) other.remove(0, ind - 1); + + if (!other.is_leaf() && (other.total_entries() == 0)) { + // Incase other node is an edge node and we moved all the data into this node, move over the edge info as + // well. + this->set_edge_id(other.edge_id()); + other.invalidate_edge(); + } + + // Remove and insert would have set the gen multiple increments, just reset it to increment only by 1 + // TODO: This is bit ugly but needed in-order to avoid repeat the same code again, but see if we can produce + // interface around it. + this->set_gen(this_gen + 1); + other.set_gen(other_gen + 1); + + return moved_size; + } */ + + uint32_t available_size() const override { return get_var_node_header_const()->m_available_space; } + + void set_nth_key(uint32_t ind, const BtreeKey& key) { + const auto kb = key.serialize(); + assert(ind < this->total_entries()); + assert(kb.size() == get_nth_key_size(ind)); + memcpy(uintptr_cast(get_nth_obj(ind)), kb.cbytes(), kb.size()); + } + + bool has_room_for_put(btree_put_type put_type, uint32_t key_size, uint32_t value_size) const override { + auto needed_size = key_size + value_size; + if ((put_type == btree_put_type::UPSERT) || (put_type == btree_put_type::INSERT)) { + needed_size += get_record_size(); + } + return (available_size() >= needed_size); + } + + virtual uint32_t get_record_size() const = 0; + virtual void set_nth_key_len(uint8_t* rec_ptr, uint32_t key_len) = 0; + virtual void set_nth_value_len(uint8_t* rec_ptr, uint32_t value_len) = 0; + + void get_nth_key_internal(uint32_t ind, BtreeKey& out_key, bool copy) const override { + assert(ind < this->total_entries()); + sisl::blob b{const_cast< uint8_t* >(get_nth_obj(ind)), get_nth_key_size(ind)}; + out_key.deserialize(b, copy); + } + + void get_nth_value(uint32_t ind, BtreeValue* out_val, bool copy) const override { + if (ind == this->total_entries()) { + DEBUG_ASSERT_EQ(this->is_leaf(), false, "get_nth_value out-of-bound"); + DEBUG_ASSERT_EQ(this->has_valid_edge(), true, "get_nth_value out-of-bound"); + *(BtreeLinkInfo*)out_val = this->get_edge_value(); + } else { + sisl::blob b{const_cast< uint8_t* >(get_nth_obj(ind)) + get_nth_key_size(ind), get_nth_value_size(ind)}; + out_val->deserialize(b, copy); + } + } + + std::string to_string(bool print_friendly = false) const override { + auto str = fmt::format( + "{}id={} level={} nEntries={} {} free_space={}{} ", + (print_friendly ? "---------------------------------------------------------------------\n" : ""), + this->node_id(), this->level(), this->total_entries(), (this->is_leaf() ? "LEAF" : "INTERIOR"), + get_var_node_header_const()->m_available_space, + (this->next_bnode() == empty_bnodeid) ? "" : fmt::format(" next_node={}", this->next_bnode())); + if (!this->is_leaf() && (this->has_valid_edge())) { + fmt::format_to(std::back_inserter(str), "edge_id={}.{}", this->edge_info().m_bnodeid, + this->edge_info().m_link_version); + } + for (uint32_t i{0}; i < this->total_entries(); ++i) { + V val; + get_nth_value(i, &val, false); + fmt::format_to(std::back_inserter(str), "{}Entry{} [Key={} Val={}]", (print_friendly ? 
"\n\t" : " "), i + 1, + BtreeNode::get_nth_key< K >(i, false).to_string(), val.to_string()); + } + return str; + } + + std::string to_dot_keys() const override { return "NOT Supported"; } + + /*int compare_nth_key_range(const BtreeKeyRange& range, uint32_t ind) const { + return get_nth_key(ind, false).compare_range(range); + }*/ + +protected: + uint32_t insert(uint32_t ind, const sisl::blob& key_blob, const sisl::blob& val_blob) { + assert(ind <= this->total_entries()); + LOGTRACEMOD(btree, "{}:{}:{}:{}", ind, get_var_node_header()->tail_offset(), get_arena_free_space(), + get_var_node_header()->available_space()); + uint16_t obj_size = key_blob.size() + val_blob.size(); + uint16_t to_insert_size = obj_size + this->get_record_size(); + if (to_insert_size > get_var_node_header()->available_space()) { + RELEASE_ASSERT(false, "insert failed insert size {} available size {}", to_insert_size, + get_var_node_header()->available_space()); + return 0; + } + + // If we don't have enough space in the tail arena area, we need to compact and get the space. + if (to_insert_size > get_arena_free_space()) { + compact(); + // Expect after compaction to have available space to insert + DEBUG_ASSERT_LE(to_insert_size, get_arena_free_space(), "We should have space available after compaction"); + } + + // Create a room for a new record + uint8_t* rec_ptr = uintptr_cast(get_nth_record_mutable(ind)); + memmove((void*)(rec_ptr + this->get_record_size()), rec_ptr, + (this->total_entries() - ind) * this->get_record_size()); + + // Move up the tail area + assert(get_var_node_header()->m_tail_arena_offset > obj_size); + get_var_node_header()->m_tail_arena_offset -= obj_size; + get_var_node_header()->m_available_space -= (obj_size + this->get_record_size()); + + // Create a new record + set_nth_key_len(rec_ptr, key_blob.size()); + set_nth_value_len(rec_ptr, val_blob.size()); + set_record_data_offset(rec_ptr, get_var_node_header()->m_tail_arena_offset); + + // Copy the contents of key and value in the offset + uint8_t* raw_data_ptr = offset_to_ptr_mutable(get_var_node_header()->m_tail_arena_offset); + memcpy(raw_data_ptr, key_blob.cbytes(), key_blob.size()); + raw_data_ptr += key_blob.size(); + memcpy(raw_data_ptr, val_blob.cbytes(), val_blob.size()); + + // Increment the entries and generation number + this->add_entries(1); + this->inc_gen(); + +#ifndef NDEBUG + this->validate_sanity(); +#endif + + return to_insert_size; + } + + /* + * This method compacts and provides contiguous tail arena space + * so that available space == tail arena space + * */ + void compact() { +#ifndef NDEBUG + this->validate_sanity(); +#endif + // temp ds to sort records in stack space + struct Record { + uint16_t m_obj_offset; + uint16_t orig_record_index; + }; + + uint32_t no_of_entries = this->total_entries(); + if (no_of_entries == 0) { + // this happens when there is only entry and in update, we first remove and than insert + get_var_node_header()->m_tail_arena_offset = get_var_node_header()->m_init_available_space; + LOGTRACEMOD(btree, "Full available size reclaimed"); + return; + } + std::vector< Record > rec; + rec.reserve(no_of_entries); + + uint32_t ind = 0; + while (ind < no_of_entries) { + btree_obj_record* rec_ptr = (btree_obj_record*)(get_nth_record_mutable(ind)); + rec[ind].m_obj_offset = rec_ptr->m_obj_offset; + rec[ind].orig_record_index = ind; + ind++; + } + + // use comparator to sort based on m_obj_offset in desc order + std::sort(rec.begin(), rec.begin() + no_of_entries, + [](Record const& a, Record const& b) -> bool { 
return b.m_obj_offset < a.m_obj_offset; }); + + uint16_t last_offset = get_var_node_header()->m_init_available_space; + + ind = 0; + uint16_t sparce_space = 0; + // loop records + while (ind < no_of_entries) { + uint16_t total_key_value_len = + get_nth_key_size(rec[ind].orig_record_index) + get_nth_value_size(rec[ind].orig_record_index); + sparce_space = last_offset - (rec[ind].m_obj_offset + total_key_value_len); + if (sparce_space > 0) { + // do compaction + uint8_t* old_key_ptr = (uint8_t*)get_nth_obj(rec[ind].orig_record_index); + uint8_t* raw_data_ptr = old_key_ptr + sparce_space; + memmove(raw_data_ptr, old_key_ptr, total_key_value_len); + + // update original record + btree_obj_record* rec_ptr = (btree_obj_record*)(get_nth_record_mutable(rec[ind].orig_record_index)); + rec_ptr->m_obj_offset += sparce_space; + + last_offset = rec_ptr->m_obj_offset; + + } else { + assert(sparce_space == 0); + last_offset = rec[ind].m_obj_offset; + } + ind++; + } + get_var_node_header()->m_tail_arena_offset = last_offset; +#ifndef NDEBUG + this->validate_sanity(); +#endif + LOGTRACEMOD(btree, "Sparse space reclaimed:{}", sparce_space); + } + + const uint8_t* get_nth_record(uint32_t ind) const { + return this->node_data_area_const() + sizeof(var_node_header) + (ind * this->get_record_size()); + } + uint8_t* get_nth_record_mutable(uint32_t ind) { + return this->node_data_area() + sizeof(var_node_header) + (ind * this->get_record_size()); + } + + const uint8_t* get_nth_obj(uint32_t ind) const { + return offset_to_ptr(((btree_obj_record*)get_nth_record(ind))->m_obj_offset); + } + uint8_t* get_nth_obj_mutable(uint32_t ind) { + return offset_to_ptr_mutable(((btree_obj_record*)get_nth_record(ind))->m_obj_offset); + } + + void set_record_data_offset(uint8_t* rec_ptr, uint16_t offset) { + auto r = (btree_obj_record*)rec_ptr; + r->m_obj_offset = offset; + } + + uint8_t* offset_to_ptr_mutable(uint16_t offset) { return this->node_data_area() + offset; } + + const uint8_t* offset_to_ptr(uint16_t offset) const { return this->node_data_area_const() + offset; } + + ///////////// Other Private Methods ////////////////// + inline var_node_header* get_var_node_header() { return r_cast< var_node_header* >(this->node_data_area()); } + + inline const var_node_header* get_var_node_header_const() const { + return r_cast< const var_node_header* >(this->node_data_area_const()); + } + + uint16_t get_arena_free_space() const { + return get_var_node_header_const()->m_tail_arena_offset - sizeof(var_node_header) - + (this->total_entries() * this->get_record_size()); + } +}; + +template < typename K, typename V > +class VarKeySizeNode : public VariableNode< K, V > { +public: + VarKeySizeNode(uint8_t* node_buf, bnodeid_t id, bool init, bool is_leaf, const BtreeConfig& cfg) : + VariableNode< K, V >(node_buf, id, init, is_leaf, cfg) { + this->set_node_type(btree_node_type::VAR_KEY); + } + virtual ~VarKeySizeNode() = default; + + uint32_t get_nth_key_size(uint32_t ind) const override { + return r_cast< const var_key_record* >(this->get_nth_record(ind))->m_key_len; + } + uint32_t get_nth_value_size(uint32_t ind) const override { return dummy_value< V >.serialized_size(); } + uint32_t get_record_size() const override { return sizeof(var_key_record); } + + void set_nth_key_len(uint8_t* rec_ptr, uint32_t key_len) override { + r_cast< var_key_record* >(rec_ptr)->m_key_len = key_len; + } + void set_nth_value_len(uint8_t* rec_ptr, uint32_t value_len) override { + assert(value_len == dummy_value< V >.serialized_size()); + } + +private: +#pragma 
pack(1) + struct var_key_record : public btree_obj_record { + uint16_t m_key_len : 14; + uint16_t reserved : 2; + }; +#pragma pack() +}; + +/***************** Template Specialization for variable value records ******************/ +template < typename K, typename V > +class VarValueSizeNode : public VariableNode< K, V > { +public: + VarValueSizeNode(uint8_t* node_buf, bnodeid_t id, bool init, bool is_leaf, const BtreeConfig& cfg) : + VariableNode< K, V >(node_buf, id, init, is_leaf, cfg) { + this->set_node_type(btree_node_type::VAR_VALUE); + } + virtual ~VarValueSizeNode() = default; + + uint32_t get_nth_key_size(uint32_t ind) const override { return dummy_key< K >.serialized_size(); } + uint32_t get_nth_value_size(uint32_t ind) const override { + return r_cast< const var_value_record* >(this->get_nth_record(ind))->m_value_len; + } + uint32_t get_record_size() const override { return sizeof(var_value_record); } + + void set_nth_key_len(uint8_t* rec_ptr, uint32_t key_len) override { + assert(key_len == dummy_key< K >.serialized_size()); + } + void set_nth_value_len(uint8_t* rec_ptr, uint32_t value_len) override { + r_cast< var_value_record* >(rec_ptr)->m_value_len = value_len; + } + +private: +#pragma pack(1) + struct var_value_record : public btree_obj_record { + uint16_t m_value_len : 14; + uint16_t reserved : 2; + }; +#pragma pack() +}; + +/***************** Template Specialization for variable object records ******************/ +template < typename K, typename V > +class VarObjSizeNode : public VariableNode< K, V > { +public: + VarObjSizeNode(uint8_t* node_buf, bnodeid_t id, bool init, bool is_leaf, const BtreeConfig& cfg) : + VariableNode< K, V >(node_buf, id, init, is_leaf, cfg) { + this->set_node_type(btree_node_type::VAR_OBJECT); + } + virtual ~VarObjSizeNode() = default; + + uint32_t get_nth_key_size(uint32_t ind) const override { + return r_cast< const var_obj_record* >(this->get_nth_record(ind))->m_key_len; + } + uint32_t get_nth_value_size(uint32_t ind) const override { + return r_cast< const var_obj_record* >(this->get_nth_record(ind))->m_value_len; + } + uint32_t get_record_size() const override { return sizeof(var_obj_record); } + + void set_nth_key_len(uint8_t* rec_ptr, uint32_t key_len) override { + r_cast< var_obj_record* >(rec_ptr)->m_key_len = key_len; + } + void set_nth_value_len(uint8_t* rec_ptr, uint32_t value_len) override { + r_cast< var_obj_record* >(rec_ptr)->m_value_len = value_len; + } + +private: +#pragma pack(1) + struct var_obj_record : public btree_obj_record { + uint16_t m_key_len : 14; + uint16_t reserved : 2; + + uint16_t m_value_len : 14; + uint16_t reserved2 : 2; + }; +#pragma pack() +}; +} // namespace homestore +#endif diff --git a/src/include/homestore/btree/detail/prefix_node.hpp b/src/include/homestore/btree/node_variant/prefix_node.hpp similarity index 79% rename from src/include/homestore/btree/detail/prefix_node.hpp rename to src/include/homestore/btree/node_variant/prefix_node.hpp index cd75beca0..a9890cff8 100644 --- a/src/include/homestore/btree/detail/prefix_node.hpp +++ b/src/include/homestore/btree/node_variant/prefix_node.hpp @@ -18,34 +18,31 @@ #include #include -#include "btree_node.hpp" +#include #include -#include - -SISL_LOGGING_DECL(btree) namespace homestore { -template < typename K, typename V > -class Btree; + // Internal format of variable node: // [Persistent Header][prefix_node_header][prefix_area_bitset][KV Suffix][KV Suffix].. ... ... 
[KV Prefix][KV Prefix] // template < typename K, typename V > class FixedPrefixNode : public VariantNode< K, V > { +public: using BtreeNode::get_nth_key_internal; using BtreeNode::get_nth_key_size; using BtreeNode::get_nth_obj_size; using BtreeNode::get_nth_value; using BtreeNode::get_nth_value_size; + using BtreeNode::occupied_size; using BtreeNode::to_string; using VariantNode< K, V >::get_nth_value; - friend class Btree< K, V >; private: #pragma pack(1) struct prefix_node_header { - uint16_t used_slots; // Number of slots actually used. TODO: We can deduce from set_bit_count of bitset - uint16_t tail_slot; // The tail slot number being used. Address will point to the beginning of tail prefix + uint16_t used_slots{0}; // Number of slots actually used. TODO: We can deduce from set_bit_count of bitset + uint16_t tail_slot{0}; // What is the tail slot number being used std::string to_string() const { return fmt::format("slots_used={} tail_slot={} ", used_slots, tail_slot); } @@ -91,21 +88,6 @@ class FixedPrefixNode : public VariantNode< K, V > { } } - int compare(BtreeKey const& key, BtreeValue const& val) const { - if constexpr (std::is_base_of_v< BtreeIntervalKey, K > && std::is_base_of_v< BtreeIntervalValue, V >) { - sisl::blob const kblob = s_cast< K const& >(key).serialize_prefix(); - sisl::blob const vblob = s_cast< V const& >(val).serialize_prefix(); - DEBUG_ASSERT_EQ(kblob.size(), key_size(), "Prefix key size mismatch with serialized prefix size"); - DEBUG_ASSERT_EQ(vblob.size(), value_size(), "Prefix value size mismatch with serialized prefix size"); - uint8_t const* cur_ptr = r_cast< uint8_t const* >(this) + sizeof(prefix_entry); - int cmp = std::memcmp(cur_ptr, kblob.cbytes(), kblob.size()); - if (cmp) { return cmp; } - cmp = std::memcmp(cur_ptr + kblob.size(), vblob.cbytes(), vblob.size()); - return cmp; - } - return 0; - } - sisl::blob key_buf() const { return sisl::blob{r_cast< uint8_t const* >(this) + sizeof(prefix_entry), key_size()}; } @@ -166,22 +148,23 @@ class FixedPrefixNode : public VariantNode< K, V > { sisl::CompactBitSet prefix_bitset_; public: - FixedPrefixNode(uint8_t* node_buf, bnodeid_t id, bool init, bool is_leaf, const BtreeConfig& cfg) : - VariantNode< K, V >(node_buf, id, init, is_leaf, cfg), - prefix_bitset_{sisl::blob{bitset_area(), reqd_bitset_size(cfg)}, init} { - this->set_node_type(btree_node_type::PREFIX); - if (init) { - auto phdr = prefix_header(); - phdr->used_slots = 0; - phdr->tail_slot = 0; - } + FixedPrefixNode(bnodeid_t id, bool is_leaf, uint32_t node_size, BtreeNode::Allocator::Token token) : + VariantNode< K, V >(id, is_leaf, node_size, token), + prefix_bitset_{sisl::blob{bitset_area(), reqd_bitset_size(this->node_data_size())}, /*init=*/true} { + this->set_node_type(btree_node_type::FIXED_PREFIX); + this->m_variant_private_data = reqd_bitset_size(this->node_data_size()); + new (this->node_data_area()) prefix_node_header(); } - virtual ~FixedPrefixNode() = default; - virtual void on_update_phys_buf() override { - // Update the prefix bitset with the new buffer - prefix_bitset_ = sisl::CompactBitSet{sisl::blob{bitset_area(), prefix_bitset_.size() / 8}, false}; + FixedPrefixNode(uint8_t* node_buf, bnodeid_t id, BtreeNode::Allocator::Token token) : + VariantNode< K, V >(node_buf, id, token), + prefix_bitset_{sisl::blob{bitset_area(), reqd_bitset_size(this->node_data_size())}, /*init=*/false} { + DEBUG_ASSERT_EQ(this->get_node_type(), btree_node_type::FIXED_PREFIX); + this->m_variant_private_data = reqd_bitset_size(this->node_data_size()); } + + 
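[Editorial aside] The constructor pair above separates creating a fresh node (which default-initializes the header and bitset) from attaching to an existing buffer (which only re-reads it). The address arithmetic behind the layout is simple: suffix entries grow forward from just past the header and bitset, while prefix slots grow backward from the end of the data area, mirroring get_prefix_entry() later in this class. A minimal standalone sketch of that arithmetic, with hypothetical names and sizes (not the real prefix_entry/suffix_entry types):

#include <cstdint>
#include <cstdio>

// Simplified model of the layout: [header][bitset][suffixes ->] ... [<- prefixes]
struct prefix_layout {
    uint32_t data_size;   // usable node data area, in bytes
    uint32_t hdr_size;    // header placed at offset 0
    uint32_t bitset_size; // slot-allocation bitmap right after the header
    uint32_t suffix_size; // fixed bytes per suffix entry
    uint32_t prefix_size; // fixed bytes per prefix entry

    // Suffix entries grow forward from just past header + bitset.
    uint32_t suffix_offset(uint32_t idx) const { return hdr_size + bitset_size + (idx * suffix_size); }

    // Prefix slots grow backward from the end of the data area.
    uint32_t prefix_offset(uint16_t slot) const { return data_size - ((slot + 1u) * prefix_size); }

    // Whatever lies between the two growing ends is free (before any compaction).
    uint32_t free_between(uint32_t num_suffixes, uint16_t tail_slot) const {
        uint32_t const suffix_end = suffix_offset(num_suffixes);
        uint32_t const prefix_begin = data_size - (tail_slot * prefix_size);
        return (prefix_begin > suffix_end) ? (prefix_begin - suffix_end) : 0;
    }
};

int main() {
    prefix_layout const l{4096, 4, 64, 8, 24};
    std::printf("suffix[0]=%u prefix[0]=%u free=%u\n", l.suffix_offset(0), l.prefix_offset(0),
                l.free_between(10, 3));
    return 0;
}

[End aside]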
virtual ~FixedPrefixNode() = default; + ///////////////////////////// All overrides of BtreeIntervalNode /////////////////////////////////// /// @brief Upserts a batch of entries into a prefix node. /// @@ -195,15 +178,14 @@ class FixedPrefixNode : public VariantNode< K, V > { /// batch_upsert_decision_t value. If the function returns: /// batch_upsert_decision_t::replace, the entry is upserted with the new value. /// batch_upsert_decision_t::remove, the entry is removed from the node. - /// batch_upsert_decision_t::keep, the entry is not modified and the method moves on to the - /// next entry. - /// @param app_ctx User supplied private context data. + /// batch_upsert_decision_t::keep, the entry is not modified and the method moves on to + /// the next entry. /// @return An optional key that was not upserted due to lack of space in the node. /// If all keys were upserted successfully, the method returns std::nullopt. /// If the method ran out of space in the node, the method returns the key that was last upserted btree_status_t multi_put(BtreeKeyRange< K > const& keys, BtreeKey const& first_input_key, BtreeValue const& val, - btree_put_type put_type, K* last_failed_key, put_filter_cb_t const& filter_cb = nullptr, - void* app_ctx = nullptr) override { + btree_put_type put_type, K* last_failed_key, + put_filter_cb_t const& filter_cb = nullptr) override { DEBUG_ASSERT_EQ(this->is_leaf(), true, "Multi put entries on node are supported only for leaf nodes"); if constexpr (std::is_base_of_v< BtreeIntervalKey, K > && std::is_base_of_v< BtreeIntervalValue, V >) { uint32_t modified{0}; @@ -211,7 +193,7 @@ class FixedPrefixNode : public VariantNode< K, V > { uint16_t prefix_slot{std::numeric_limits< uint16_t >::max()}; K cur_key = keys.start_key(); - if (!keys.is_start_inclusive()) { cur_key.shift(1, app_ctx); } + if (!keys.is_start_inclusive()) { cur_key.shift(1); } if (!has_room(1u)) { return btree_status_t::space_not_avail; } bool upserted_all{false}; @@ -246,7 +228,7 @@ class FixedPrefixNode : public VariantNode< K, V > { } else { std::memmove(get_suffix_entry(idx + 1), get_suffix_entry(idx), (this->total_entries() - idx) * suffix_entry::size()); - this->inc_entries(); + this->add_entries(1); } } @@ -255,16 +237,11 @@ class FixedPrefixNode : public VariantNode< K, V > { prefix_slot = add_prefix(cur_key, val); } V new_val{s_cast< V const& >(val)}; - new_val.shift(s_cast< K const& >(cur_key).distance(first_input_key), app_ctx); - if (get_prefix_entry_c(prefix_slot)->compare(cur_key, new_val)) { - LOGTRACEMOD(btree, "Adding new prefix entry for key={} val={}", cur_key.to_string(), - new_val.to_string()); - prefix_slot = add_prefix(cur_key, new_val); - } + new_val.shift(s_cast< K const& >(cur_key).distance(first_input_key)); write_suffix(idx, prefix_slot, cur_key, new_val); } - cur_key.shift(1, app_ctx); + cur_key.shift(1); if (!has_room(1u)) { break; } if (decision != put_filter_decision::remove) { ++idx; } @@ -301,12 +278,11 @@ class FixedPrefixNode : public VariantNode< K, V > { * * @return Returns number of objects removed */ - uint32_t multi_remove(BtreeKeyRange< K > const& keys, remove_filter_cb_t const& filter_cb = nullptr, - void* app_ctx = nullptr) override { + uint32_t multi_remove(BtreeKeyRange< K > const& keys, remove_filter_cb_t const& filter_cb = nullptr) override { DEBUG_ASSERT_EQ(this->is_leaf(), true, "remove_batch api is supported only for leaf node"); if constexpr (std::is_base_of_v< BtreeIntervalKey, K > && std::is_base_of_v< BtreeIntervalValue, V >) { K cur_key = 
keys.start_key(); - if (!keys.is_start_inclusive()) { cur_key.shift(1, app_ctx); } + if (!keys.is_start_inclusive()) { cur_key.shift(1); } uint32_t num_removed{0}; auto [_, idx] = this->find(cur_key, nullptr, false); @@ -315,18 +291,20 @@ class FixedPrefixNode : public VariantNode< K, V > { auto x = cur_key.compare(keys.end_key()); if ((x > 0) || ((x == 0) && !keys.is_end_inclusive())) { break; } + bool remove{true}; if (!filter_cb || filter_cb(cur_key, get_nth_value(idx, false))) { suffix_entry* sentry = get_suffix_entry(idx); deref_remove_prefix(sentry->prefix_slot); std::memmove(uintptr_cast(sentry), uintptr_cast(get_suffix_entry(idx + 1)), (this->total_entries() - idx - 1) * suffix_entry::size()); - this->dec_entries(); + this->sub_entries(1); ++num_removed; } else { ++idx; } } if (num_removed) { this->inc_gen(); } + #ifndef NDEBUG validate_sanity(); #endif @@ -341,7 +319,7 @@ class FixedPrefixNode : public VariantNode< K, V > { DEBUG_ASSERT_LT(idx, this->total_entries(), "node={}", to_string()); suffix_entry const* sentry = get_suffix_entry_c(idx); prefix_entry const* pentry = get_prefix_entry_c(sentry->prefix_slot); - DEBUG_ASSERT(prefix_bitset_.is_bit_set(sentry->prefix_slot), + DEBUG_ASSERT(prefix_bitset_.is_bit_set(cbitset_blob(), sentry->prefix_slot), "Prefix slot number is in suffix entry, but corresponding bit is not set"); s_cast< BtreeIntervalKey& >(out_key).deserialize(pentry->key_buf(), sentry->key_buf(), true); } @@ -354,7 +332,7 @@ class FixedPrefixNode : public VariantNode< K, V > { } else { suffix_entry const* sentry = get_suffix_entry_c(idx); prefix_entry const* pentry = get_prefix_entry_c(sentry->prefix_slot); - DEBUG_ASSERT(prefix_bitset_.is_bit_set(sentry->prefix_slot), + DEBUG_ASSERT(prefix_bitset_.is_bit_set(cbitset_blob(), sentry->prefix_slot), "Prefix slot number is in suffix entry, but corresponding bit is not set"); s_cast< BtreeIntervalValue* >(out_val)->deserialize(pentry->val_buf(), sentry->val_buf(), true); } @@ -366,8 +344,6 @@ class FixedPrefixNode : public VariantNode< K, V > { return get_prefix_entry_c(get_suffix_entry_c(idx)->prefix_slot)->ref_count; } - uint32_t compact_saving() const { return num_prefix_holes() * prefix_entry::size(); } - uint32_t available_size() const override { auto num_holes = num_prefix_holes(); if (num_holes > prefix_node_header::min_holes_to_compact) { @@ -377,32 +353,21 @@ class FixedPrefixNode : public VariantNode< K, V > { } } - uint32_t occupied_size() const override { - return (this->node_data_size() - sizeof(prefix_node_header) - (prefix_bitset_.size() / 8) - - this->available_size()); - } - - bool has_room_for_put(btree_put_type, uint32_t, uint32_t) const override { -#ifdef _PRERELEASE - auto max_keys = this->max_keys_in_node(); - if (max_keys && this->total_entries() > max_keys) { return false; } -#endif - return has_room(1u); - } + bool has_room_for_put(btree_put_type, uint32_t, uint32_t) const override { return has_room(1u); } uint32_t get_nth_key_size(uint32_t) const override { return dummy_key< K >.serialized_size(); } uint32_t get_nth_value_size(uint32_t) const override { return dummy_value< V >.serialized_size(); } - uint32_t move_out_to_right_by_size(const BtreeConfig& cfg, BtreeNode& on, uint32_t size_to_move) override { - return move_out_to_right_internal(cfg, on, true /* by_size*/, size_to_move); + uint32_t move_out_to_right_by_size(BtreeNode& on, uint32_t size_to_move) override { + return move_out_to_right_internal(on, true /* by_size*/, size_to_move); } - uint32_t move_out_to_right_by_entries(const 
BtreeConfig& cfg, BtreeNode& on, uint32_t num_entries) override { - return move_out_to_right_internal(cfg, on, false /* by_size*/, num_entries); + uint32_t move_out_to_right_by_entries(BtreeNode& on, uint32_t num_entries) override { + return move_out_to_right_internal(on, false /* by_size*/, num_entries); } - uint32_t move_out_to_right_internal(const BtreeConfig& cfg, BtreeNode& on, bool by_size, uint32_t limit) { + uint32_t move_out_to_right_internal(BtreeNode& on, bool by_size, uint32_t limit) { FixedPrefixNode& dst_node = s_cast< FixedPrefixNode& >(on); uint32_t dst_node_size = dst_node.occupied_size(); @@ -481,7 +446,7 @@ class FixedPrefixNode : public VariantNode< K, V > { validate_sanity(); dst_node.validate_sanity(); #endif - return by_size ? num_moved : dst_node_size; + return num_moved; } btree_status_t insert(uint32_t idx, BtreeKey const& key, BtreeValue const& val) override { @@ -491,7 +456,7 @@ class FixedPrefixNode : public VariantNode< K, V > { (this->total_entries() - idx) * suffix_entry::size()); write_suffix(idx, add_prefix(key, val), key, val); - this->inc_entries(); + this->add_entries(1); this->inc_gen(); #ifndef NDEBUG @@ -546,7 +511,7 @@ class FixedPrefixNode : public VariantNode< K, V > { deref_remove_prefix(sentry->prefix_slot); std::memmove(uintptr_cast(sentry), uintptr_cast(get_suffix_entry(idx + 1)), (this->total_entries() - idx - 1) * suffix_entry::size()); - this->dec_entries(); + this->sub_entries(1); } this->inc_gen(); } @@ -557,14 +522,13 @@ class FixedPrefixNode : public VariantNode< K, V > { } } - void remove_all(BtreeConfig const& cfg) override { + void remove_all() override { this->sub_entries(this->total_entries()); this->invalidate_edge(); this->inc_gen(); - prefix_bitset_ = sisl::CompactBitSet{sisl::blob{bitset_area(), reqd_bitset_size(cfg)}, true}; - auto phdr = prefix_header(); - phdr->used_slots = 0; - phdr->tail_slot = 0; + prefix_bitset_ = sisl::CompactBitSet{bitset_blob(), true}; + + new (this->node_data_area()) prefix_node_header(); #ifndef NDEBUG validate_sanity(); #endif @@ -572,6 +536,7 @@ class FixedPrefixNode : public VariantNode< K, V > { uint32_t get_nth_obj_size(uint32_t) const override { return get_key_size() + get_value_size(); } +#if 0 uint32_t num_entries_by_size(uint32_t start_idx, uint32_t size) const { uint32_t num_entries{0}; uint32_t cum_size{0}; @@ -590,29 +555,48 @@ class FixedPrefixNode : public VariantNode< K, V > { } return num_entries; } + uint32_t copy_by_entries(BtreeNode const& o, uint32_t start_idx, uint32_t nentries) { + return copy_internal(o, start_idx, false /* by_size*/, nentries); + } + +#endif + + uint32_t copy_by_size(BtreeNode const& o, uint32_t start_idx, uint32_t size) { + return copy_internal(o, start_idx, true /* by_size*/, size); + } - uint32_t copy_by_size(BtreeConfig const& cfg, BtreeNode const& o, uint32_t start_idx, uint32_t size) override { - return copy_internal(cfg, o, start_idx, true /* by_size*/, size); + uint32_t get_entries_size(uint32_t start_idx, uint32_t end_idx) const override { + return (prefix_entry::size() + suffix_entry::size()) * (end_idx - start_idx); } - uint32_t copy_by_entries(BtreeConfig const& cfg, BtreeNode const& o, uint32_t start_idx, - uint32_t nentries) override { - if (nentries == 0) { return 0; } - if (!has_room(nentries) && has_room_after_compaction(nentries)) { compact(); } - return copy_internal(cfg, o, start_idx, false /* by_size*/, nentries); + bool append_copy_in_upto_size(const BtreeNode& o, uint32_t& other_cursor, uint32_t upto_size, + bool copy_only_if_fits) 
override {
+        // Make all fit calculations assuming that we will compact the node
+        auto const filled_size = this->node_data_size() - available_size_with_compaction();
+        if (filled_size >= upto_size) { return false; } // Already filled beyond what's asked for
+        if (o.total_entries() == 0) { return true; }    // No entries
+        auto const room = upto_size - filled_size;      // This is how much room we actually have
+
+        auto const bringin_size = o.get_entries_size(other_cursor, o.total_entries());
+        if (copy_only_if_fits) {
+            if (bringin_size > room) { return false; }
+        }
+
+        // The size calculations above assume we may compact; however, if all of the other node fits
+        // without compaction, we try to avoid it.
+        if (bringin_size > available_size_without_compaction()) { compact(); }
+        auto const ncopied = copy_internal(o, other_cursor, true /* by_size*/, room);
+        other_cursor += ncopied;
+
+        if (copy_only_if_fits) {
+            DEBUG_ASSERT_EQ(other_cursor, o.total_entries(),
+                            "We proceeded to copy after checking the size, but ended up not copying all");
+        }
+        return true;
     }
 
-    uint32_t copy_internal(BtreeConfig const& cfg, BtreeNode const& o, uint32_t start_idx, bool by_size,
-                           uint32_t limit) {
+    uint32_t copy_internal(BtreeNode const& o, uint32_t start_idx, bool by_size, uint32_t limit) {
         FixedPrefixNode const& src_node = s_cast< FixedPrefixNode const& >(o);
-#ifdef _PRERELEASE
-        if (by_size) {
-            const uint32_t max_keys = this->max_keys_in_node();
-            if (max_keys) {
-                if (this->total_entries() + limit > max_keys) { limit = max_keys - this->total_entries(); }
-            }
-        }
-#endif
 
         // Adjust the size_to_move to cover the new node's reqd header space.
         uint32_t copied_size{0};
@@ -677,23 +661,23 @@ class FixedPrefixNode : public VariantNode< K, V > {
 #ifndef NDEBUG
         validate_sanity();
 #endif
-        return by_size ? num_copied : copied_size;
+        return num_copied;
     }
 
     std::string to_string(bool print_friendly = false) const override {
-        auto str =
-            fmt::format("{}id={} level={} nEntries={} {} next_node={} available_size={} occupied_size={} ",
-                        (print_friendly ? "------------------------------------------------------------\n" : ""),
-                        this->node_id(), this->level(), this->total_entries(), (this->is_leaf() ? "LEAF" : "INTERIOR"),
-                        this->next_bnode(), this->available_size(), this->occupied_size());
+        auto str = fmt::format("{}id={} level={} nEntries={} {} next_node={} available_size={} ",
+                               (print_friendly ? "------------------------------------------------------------\n" : ""),
+                               this->node_id(), this->level(), this->total_entries(),
+                               (this->is_leaf() ? "LEAF" : "INTERIOR"), this->next_bnode(), this->available_size());
         if (!this->is_leaf() && (this->has_valid_edge())) {
             fmt::format_to(std::back_inserter(str), "edge_id={}.{}", this->edge_info().m_bnodeid,
                            this->edge_info().m_link_version);
         }
-        fmt::format_to(std::back_inserter(str), "{}Prefix_Hdr=[{}], Prefix_Bitmap = [{}] # of holes = {}\n",
-                       (print_friendly ? "\n\t" : " "), cprefix_header()->to_string(), this->compact_bitset(),
-                       this->num_prefix_holes());
+        fmt::format_to(std::back_inserter(str), "{}Prefix_Hdr={}, Prefix_Bitmap=[{}]\n",
+                       (print_friendly ? "\n\t" : " "), cprefix_header()->to_string(),
+                       prefix_bitset_.to_string(cbitset_blob()));
+
         for (uint32_t i{0}; i < this->total_entries(); ++i) {
            fmt::format_to(std::back_inserter(str), "{}Entry{} [Key={} Val={} slot#={} ref_count={}]",
                            (print_friendly ?
"\n\t" : " "), i + 1, BtreeNode::get_nth_key< K >(i, false).to_string(), @@ -718,16 +702,17 @@ class FixedPrefixNode : public VariantNode< K, V > { } uint16_t alloc_prefix() { - auto const slot_num = prefix_bitset_.get_next_reset_bit(0); + auto const slot_num = prefix_bitset_.get_next_reset_bit(cbitset_blob(), 0); if (slot_num == std::numeric_limits< uint16_t >::max()) { DEBUG_ASSERT(false, "Unable to alloc slot, shouldn't be mutating in this node without splitting"); return std::numeric_limits< uint16_t >::max(); } - prefix_bitset_.set_bit(slot_num); + prefix_bitset_.set_bit(sisl::blob{bitset_area(), uint32_cast(bitset_size())}, slot_num); auto phdr = prefix_header(); ++phdr->used_slots; - if (slot_num + 1u > phdr->tail_slot) { phdr->tail_slot = slot_num + 1u; } + if (s_cast< uint16_t >(slot_num) >= phdr->tail_slot) { phdr->tail_slot = slot_num + 1; } + DEBUG_ASSERT_LE(phdr->used_slots, phdr->tail_slot, "Prefix slot number {} is not less than tail slot number {}", slot_num, phdr->tail_slot); return slot_num; @@ -743,9 +728,9 @@ class FixedPrefixNode : public VariantNode< K, V > { if (--pentry->ref_count == 0) { --phdr->used_slots; - prefix_bitset_.reset_bit(slot_num); - if (slot_num + 1u == phdr->tail_slot) { - uint16_t prev_slot = prefix_bitset_.get_prev_set_bit(slot_num); + prefix_bitset_.reset_bit(sisl::blob{bitset_area(), uint32_cast(bitset_size())}, slot_num); + if ((slot_num == phdr->tail_slot - 1)) { + uint16_t prev_slot = prefix_bitset_.get_prev_set_bit(cbitset_blob(), slot_num); phdr->tail_slot = prev_slot + 1u; } } @@ -760,10 +745,11 @@ class FixedPrefixNode : public VariantNode< K, V > { uint32_t available_size_without_compaction() const { uint8_t const* suffix = r_cast< uint8_t const* >(get_suffix_entry_c(this->total_entries())); - uint8_t const* prefix = r_cast< uint8_t const* >(get_prefix_entry_c(cprefix_header()->tail_slot)); + uint8_t const* prefix = + r_cast< uint8_t const* >(get_prefix_entry_c(cprefix_header()->tail_slot)) + prefix_entry::size(); - if (suffix <= prefix + prefix_entry::size()) { - return prefix - suffix + prefix_entry::size(); + if (suffix <= prefix) { + return prefix - suffix; } else { DEBUG_ASSERT(false, "Node data is corrupted, suffix area is overlapping prefix area {}", int64_t(suffix - prefix)); @@ -771,7 +757,9 @@ class FixedPrefixNode : public VariantNode< K, V > { } } - uint32_t available_size_with_compaction() const { return available_size_without_compaction() + compact_saving(); } + uint32_t available_size_with_compaction() const { + return available_size_without_compaction() + (num_prefix_holes() * prefix_entry::size()); + } bool has_room(uint16_t for_nentries) const { return (available_size_without_compaction() >= (prefix_entry::size() + (for_nentries * suffix_entry::size()))); @@ -783,8 +771,7 @@ class FixedPrefixNode : public VariantNode< K, V > { uint32_t num_prefix_holes() const { auto phdr = cprefix_header(); - DEBUG_ASSERT_LE(phdr->used_slots, phdr->tail_slot, "Prefix slot number {} is not less than tail slot number {}", - phdr->used_slots, phdr->tail_slot); + DEBUG_ASSERT_LE(phdr->used_slots, phdr->tail_slot, "Prefix slot number is not less than tail slot number"); return (phdr->tail_slot - phdr->used_slots); } @@ -803,10 +790,10 @@ class FixedPrefixNode : public VariantNode< K, V > { uint16_t from_slot{prefix_header()->used_slots}; uint16_t to_slot{0}; while (true) { - from_slot = prefix_bitset_.get_next_set_bit(from_slot); + from_slot = prefix_bitset_.get_next_set_bit(cbitset_blob(), from_slot); if (from_slot == std::numeric_limits< 
uint16_t >::max()) { break; } - auto const to_slot = prefix_bitset_.get_next_reset_bit(0u); + auto const to_slot = prefix_bitset_.get_next_reset_bit(cbitset_blob(), 0u); DEBUG_ASSERT_NE(to_slot, std::numeric_limits< uint16_t >::max(), "Didn't find a free location on to compaction side, not expected"); DEBUG_ASSERT_LT(to_slot, prefix_header()->used_slots, @@ -814,8 +801,8 @@ class FixedPrefixNode : public VariantNode< K, V > { std::memcpy(uintptr_cast(get_prefix_entry(to_slot)), (void*)get_prefix_entry(from_slot), prefix_entry::size()); - prefix_bitset_.reset_bit(from_slot); - prefix_bitset_.set_bit(to_slot); + prefix_bitset_.reset_bit(sisl::blob{bitset_area(), uint32_cast(bitset_size())}, from_slot); + prefix_bitset_.set_bit(sisl::blob{bitset_area(), uint32_cast(bitset_size())}, to_slot); // Move all the suffixes that are referencing this prefix to the new location auto range = prefix_to_suffix.equal_range(from_slot); @@ -828,7 +815,7 @@ class FixedPrefixNode : public VariantNode< K, V > { // Finally adjust the tail offset to the compacted area. auto phdr = prefix_header(); phdr->tail_slot = phdr->used_slots; - DEBUG_ASSERT_EQ(phdr->tail_slot, prefix_bitset_.get_next_reset_bit(0u), + DEBUG_ASSERT_EQ(phdr->tail_slot, prefix_bitset_.get_next_reset_bit(cbitset_blob(), 0u), "Tail slot is not equal to the next reset bit, not expected"); DEBUG_ASSERT_EQ(this->num_prefix_holes(), 0, "Shouldn't be any hole after compression, not expected"); } @@ -851,9 +838,8 @@ class FixedPrefixNode : public VariantNode< K, V > { #endif //////////////////////// All Helper methods section //////////////////////// - static uint32_t reqd_bitset_size(BtreeConfig const& cfg) { - return sisl::round_up((cfg.node_data_size() - sizeof(prefix_node_header) - suffix_entry::size()) / - prefix_entry::size() / 8, + static uint32_t reqd_bitset_size(uint32_t node_data_size) { + return sisl::round_up(node_data_size / (prefix_entry::key_size() + prefix_entry::value_size()) / 8, sisl::CompactBitSet::size_multiples()); } @@ -864,14 +850,17 @@ class FixedPrefixNode : public VariantNode< K, V > { uint8_t* bitset_area() { return this->node_data_area() + sizeof(prefix_node_header); } uint8_t const* cbitset_area() const { return this->node_data_area_const() + sizeof(prefix_node_header); } + uint16_t bitset_size() const { return this->m_variant_private_data; } + sisl::blob bitset_blob() { return sisl::blob{bitset_area(), uint32_cast(bitset_size())}; } + sisl::blob cbitset_blob() const { return sisl::blob{cbitset_area(), uint32_cast(bitset_size())}; } - uint8_t* suffix_kv_area() { return bitset_area() + (prefix_bitset_.size() / 8); } - uint8_t const* csuffix_kv_area() const { return cbitset_area() + (prefix_bitset_.size() / 8); } + uint8_t* suffix_kv_area() { return bitset_area() + bitset_size(); } + uint8_t const* csuffix_kv_area() const { return cbitset_area() + bitset_size(); } prefix_entry* get_prefix_entry(uint16_t slot_num) { return r_cast< prefix_entry* >( this->node_data_area() + - (this->node_data_size() - (static_cast< uint16_t >(slot_num + 1) * prefix_entry::size()))); + (this->node_data_size() - (s_cast< uint16_t >(slot_num + 1) * prefix_entry::size()))); } prefix_entry const* get_prefix_entry_c(uint16_t slot_num) const { @@ -889,39 +878,5 @@ class FixedPrefixNode : public VariantNode< K, V > { static constexpr uint32_t get_key_size() { return prefix_entry::key_size() + suffix_entry::key_size(); } static constexpr uint32_t get_value_size() { return prefix_entry::value_size() + suffix_entry::value_size(); } - - std::string 
compact_bitset() const { - auto x = prefix_bitset_.to_string(); - std::ostringstream result; - std::vector< size_t > indices; - for (size_t i = 0; i < x.size(); ++i) { - if (x[i] == '1') { indices.push_back(i); } - } - - if (indices.empty()) { return result.str(); } - - size_t start = indices[0]; - size_t end = start; - result << "size = " << indices.size() << " : "; - for (size_t i = 1; i < indices.size(); ++i) { - if (indices[i] == end + 1) { - end = indices[i]; - } else { - if (start == end) { - result << start << ", "; - } else { - result << start << "-" << end << ", "; - } - start = end = indices[i]; - } - } - if (start == end) { - result << start; - } else { - result << start << "-" << end; - } - - return result.str(); - } }; } // namespace homestore diff --git a/src/include/homestore/btree/detail/simple_node.hpp b/src/include/homestore/btree/node_variant/simple_node.hpp similarity index 78% rename from src/include/homestore/btree/detail/simple_node.hpp rename to src/include/homestore/btree/node_variant/simple_node.hpp index 85dd88021..d4f50acfe 100644 --- a/src/include/homestore/btree/detail/simple_node.hpp +++ b/src/include/homestore/btree/node_variant/simple_node.hpp @@ -16,33 +16,41 @@ #pragma once #include -#include +#include #include -#include "homestore/index/index_internal.hpp" using namespace std; using namespace boost; -SISL_LOGGING_DECL(btree) - namespace homestore { template < typename K, typename V > class SimpleNode : public VariantNode< K, V > { public: - SimpleNode(uint8_t* node_buf, bnodeid_t id, bool init, bool is_leaf, const BtreeConfig& cfg) : - VariantNode< K, V >(node_buf, id, init, is_leaf, cfg) { + SimpleNode(bnodeid_t id, bool is_leaf, uint32_t node_size, BtreeNode::Allocator::Token token) : + VariantNode< K, V >(id, is_leaf, node_size, token) { this->set_node_type(btree_node_type::FIXED); } + SimpleNode(uint8_t* node_buf, bnodeid_t id, BtreeNode::Allocator::Token token) : + VariantNode< K, V >(node_buf, id, token) { + DEBUG_ASSERT_EQ(this->get_node_type(), btree_node_type::FIXED); + } + + virtual ~SimpleNode() = default; + + using BtreeNode::add_entries; using BtreeNode::get_nth_key_internal; using BtreeNode::get_nth_key_size; using BtreeNode::get_nth_obj_size; using BtreeNode::get_nth_value; using BtreeNode::get_nth_value_size; + using BtreeNode::inc_gen; + using BtreeNode::occupied_size; + using BtreeNode::sub_entries; using BtreeNode::to_string; + using BtreeNode::total_entries; using VariantNode< K, V >::get_nth_value; - using VariantNode< K, V >::max_keys_in_node; // Insert the key and value in provided index // Assumption: Node lock is already taken @@ -50,9 +58,9 @@ class SimpleNode : public VariantNode< K, V > { uint32_t sz = (this->total_entries() - (ind + 1) + 1) * get_nth_obj_size(0); if (sz != 0) { std::memmove(get_nth_obj(ind + 1), get_nth_obj(ind), sz); } - this->set_nth_obj(ind, key, val); - this->inc_entries(); - this->inc_gen(); + set_nth_obj(ind, key, val); + add_entries(1); + inc_gen(); #ifndef NDEBUG validate_sanity(); @@ -92,14 +100,14 @@ class SimpleNode : public VariantNode< K, V > { // Set the last key/value as edge entry and by decrementing entry count automatically removed the last // entry. 
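[Editorial aside on the removal path just below] In a fixed-record node every entry occupies the same number of bytes, so deleting a contiguous range [ind_s, ind_e] reduces to one memmove of the tail plus an entry-count decrement, which is exactly what the else branch that follows does. A standalone model of that step under the fixed-record assumption (hypothetical names, not Homestore's API):

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Model: records live at index * rec_size in a flat area; removing the range
// [ind_s, ind_e] shifts the tail left in a single memmove.
static void remove_range(std::vector<uint8_t>& area, uint32_t& entries, uint32_t rec_size,
                         uint32_t ind_s, uint32_t ind_e) {
    assert(ind_s <= ind_e && ind_e < entries);
    uint32_t const tail_bytes = (entries - ind_e - 1) * rec_size;
    if (tail_bytes != 0) {
        std::memmove(area.data() + (ind_s * rec_size), area.data() + ((ind_e + 1) * rec_size), tail_bytes);
    }
    entries -= (ind_e - ind_s + 1);
}

int main() {
    uint32_t entries = 5;
    uint32_t const rec_size = sizeof(uint64_t);
    std::vector<uint8_t> area(entries * rec_size);
    for (uint32_t i = 0; i < entries; ++i) {
        uint64_t const v = i;
        std::memcpy(area.data() + (i * rec_size), &v, rec_size);
    }
    remove_range(area, entries, rec_size, 1, 3); // keeps records 0 and 4
    assert(entries == 2);
    return 0;
}

[End aside]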
BtreeLinkInfo new_edge;
-            this->get_nth_value(ind_s - 1, &new_edge, false);
-            this->set_nth_value(total_entries, new_edge);
-            this->sub_entries(total_entries - ind_s + 1);
+            get_nth_value(ind_s - 1, &new_edge, false);
+            set_nth_value(total_entries, new_edge);
+            sub_entries(total_entries - ind_s + 1);
         } else {
             uint32_t sz = (total_entries - ind_e - 1) * get_nth_obj_size(0);
             if (sz != 0) { std::memmove(get_nth_obj(ind_s), get_nth_obj(ind_e + 1), sz); }
-            this->sub_entries(ind_e - ind_s + 1);
+            sub_entries(ind_e - ind_s + 1);
         }
         this->inc_gen();
 #ifndef NDEBUG
@@ -107,8 +115,8 @@
 #endif
     }
 
-    void remove_all(const BtreeConfig&) override {
-        this->sub_entries(this->total_entries());
+    void remove_all() override {
+        sub_entries(this->total_entries());
         this->invalidate_edge();
         this->inc_gen();
 #ifndef NDEBUG
@@ -116,7 +124,7 @@
 #endif
     }
 
-    uint32_t move_out_to_right_by_entries(const BtreeConfig& cfg, BtreeNode& o, uint32_t nentries) override {
+    uint32_t move_out_to_right_by_entries(BtreeNode& o, uint32_t nentries) override {
         auto& other_node = s_cast< SimpleNode< K, V >& >(o);
 
         // Minimum of what's to be moved out and how many slots are available in the other node
@@ -130,7 +138,7 @@
         }
         other_node.add_entries(nentries);
-        this->sub_entries(nentries);
+        sub_entries(nentries);
 
         // If there is an edge entry in this node, it needs to move out as well.
         if (!this->is_leaf() && this->has_valid_edge()) {
@@ -147,31 +155,60 @@
         return nentries;
     }
 
-    uint32_t move_out_to_right_by_size(const BtreeConfig& cfg, BtreeNode& o, uint32_t size) override {
-        return (get_nth_obj_size(0) * move_out_to_right_by_entries(cfg, o, size / get_nth_obj_size(0)));
+    uint32_t move_out_to_right_by_size(BtreeNode& o, uint32_t size) override {
+        return move_out_to_right_by_entries(o, size / get_nth_obj_size(0));
     }
 
-    uint32_t num_entries_by_size(uint32_t start_idx, uint32_t size) const override {
-        return std::min(size / get_nth_obj_size(0), this->total_entries() - start_idx);
+    uint32_t get_entries_size(uint32_t start_idx, uint32_t end_idx) const override {
+        return get_nth_obj_size(0) * (end_idx - start_idx);
+    }
+
+    bool append_copy_in_upto_size(const BtreeNode& o, uint32_t& other_cursor, uint32_t upto_size,
+                                  bool copy_only_if_fits) override {
+        auto& other = s_cast< const SimpleNode< K, V >& >(o);
+        if (occupied_size() >= upto_size) { return false; }
+        if (other.total_entries() == 0) { return true; }
+        auto const room = upto_size - occupied_size();
+
+        if (copy_only_if_fits) {
+            // What's coming in is more than what we are supposed to accept or have available. The check
+            // here ensures that even when we have available space, we bail out if what remains to copy
+            // exceeds the requested upto_size.
+            if (other.get_entries_size(other_cursor, other.total_entries()) > room) { return false; }
+        }
+
+        DEBUG_ASSERT_LT(other_cursor, other.total_entries(), "Invalid cursor pointed in src node={}",
+                        other.to_string());
+        auto const nentries = std::min(room / get_nth_obj_size(0), other.total_entries() - other_cursor);
+        std::memcpy(get_nth_obj(total_entries()), other.get_nth_obj_const(other_cursor),
+                    nentries * get_nth_obj_size(0));
+        other_cursor += nentries;
+        add_entries(nentries);
+        inc_gen();
+
+        // If we copied everything from start_idx till the end and if it's an edge node, need to copy the edge id as well.
+        if (other.has_valid_edge() && (other_cursor == other.total_entries())) {
+            this->set_edge_info(other.edge_info());
+        }
+
+        if (copy_only_if_fits) {
+            DEBUG_ASSERT_EQ(other_cursor, other.total_entries(),
+                            "We proceeded to copy after checking the size, but ended up not copying all");
+        }
+        return true;
     }
 
-    uint32_t copy_by_size(const BtreeConfig& cfg, const BtreeNode& o, uint32_t start_idx, uint32_t size) override {
+#if 0
+    uint32_t copy_by_size(const BtreeNode& o, uint32_t start_idx, uint32_t size) override {
         auto& other = s_cast< const SimpleNode< K, V >& >(o);
-        return copy_by_entries(cfg, o, start_idx, other.num_entries_by_size(start_idx, size));
+        return copy_by_entries(o, start_idx, other.num_entries_by_size(start_idx, size));
     }
 
-    uint32_t copy_by_entries(const BtreeConfig& cfg, const BtreeNode& o, uint32_t start_idx,
-                             uint32_t nentries) override {
+    uint32_t copy_by_entries(const BtreeNode& o, uint32_t start_idx, uint32_t nentries) override {
         auto& other = s_cast< const SimpleNode< K, V >& >(o);
 
         nentries = std::min(nentries, other.total_entries() - start_idx);
         nentries = std::min(nentries, this->get_available_entries());
-#ifdef _PRERELEASE
-        const uint64_t max_keys = this->max_keys_in_node();
-        if (max_keys) {
-            if (this->total_entries() + nentries > max_keys) { nentries = max_keys - this->total_entries(); }
-        }
-#endif
         uint32_t sz = nentries * get_nth_obj_size(0);
         if (sz != 0) { std::memcpy(get_nth_obj(this->total_entries()), other.get_nth_obj_const(start_idx), sz); }
         this->add_entries(nentries);
@@ -184,6 +221,12 @@
         return nentries;
     }
 
+    uint32_t num_entries_by_size(uint32_t start_idx, uint32_t size) const override {
+        return std::min(size / get_nth_obj_size(0), this->total_entries() - start_idx);
+    }
+
+#endif
+
     uint32_t available_size() const override {
         return (this->node_data_size() - (this->total_entries() * get_nth_obj_size(0)));
     }
@@ -208,10 +251,6 @@
     }
 
     bool has_room_for_put(btree_put_type put_type, uint32_t key_size, uint32_t value_size) const override {
-#ifdef _PRERELEASE
-        auto max_keys = max_keys_in_node();
-        if (max_keys) { return (this->total_entries() < max_keys); }
-#endif
         return ((put_type == btree_put_type::UPSERT) || (put_type == btree_put_type::INSERT))
                    ? (get_available_entries() > 0)
                    : true;
@@ -219,11 +258,10 @@
     std::string to_string(bool print_friendly = false) const override {
         auto snext = this->next_bnode() == empty_bnodeid ? "" : fmt::format("next_node={}", this->next_bnode());
-        auto str =
-            fmt::format("{}id={} level={} nEntries={} {} {} {}",
-                        (print_friendly ? "------------------------------------------------------------\n" : ""),
-                        this->node_id(), this->level(), this->total_entries(), (this->is_leaf() ? "LEAF" : "INTERIOR"),
-                        snext, this->is_node_deleted() ? " Deleted" : " LIVE");
+        auto str = fmt::format("{}id={} level={} nEntries={} {} {} ",
+                               (print_friendly ? "------------------------------------------------------------\n" : ""),
+                               this->node_id(), this->level(), this->total_entries(),
+                               (this->is_leaf() ? "LEAF" : "INTERIOR"), snext);
         if (this->has_valid_edge()) {
             fmt::format_to(std::back_inserter(str), " edge={}.{}", this->edge_info().m_bnodeid,
                            this->edge_info().m_link_version);
@@ -231,14 +269,11 @@
         for (uint32_t i{0}; i < this->total_entries(); ++i) {
             fmt::format_to(std::back_inserter(str), "{}Entry{} [Key={} Val={}]", (print_friendly ? "\n\t" : " "), i + 1,
-                           BtreeNode::get_nth_key< K >(i, false).to_string(),
-                           this->get_nth_value(i, false).to_string());
+                           BtreeNode::get_nth_key< K >(i, false).to_string(), get_nth_value(i, false).to_string());
         }
         return str;
     }
 
-    std::string to_dot_keys() const override {
-        return to_dot_keys_impl(std::is_same< decltype(std::declval< K& >().key()), uint64_t >{});
-    }
+    std::string to_dot_keys() const override { return to_dot_keys_impl(std::is_same< K, uint64_t >{}); }
 
     std::string to_dot_keys_impl(std::false_type) const { return ""; }
 
@@ -384,9 +419,9 @@
         return (this->node_data_area_const() + (get_nth_obj_size(ind) * ind));
     }
 
-    void set_nth_key(uint32_t ind, const BtreeKey& key) {
+    void set_nth_key(uint32_t ind, BtreeKey* key) {
         uint8_t* entry = this->node_data_area() + (get_nth_obj_size(ind) * ind);
-        sisl::blob const b = key.serialize();
+        sisl::blob const b = key->serialize();
         memcpy(entry, b.cbytes(), b.size());
     }
 
diff --git a/src/include/homestore/btree/detail/variant_node.hpp b/src/include/homestore/btree/node_variant/variant_node.hpp
similarity index 90%
rename from src/include/homestore/btree/detail/variant_node.hpp
rename to src/include/homestore/btree/node_variant/variant_node.hpp
index b9ee4dd35..332402b5a 100644
--- a/src/include/homestore/btree/detail/variant_node.hpp
+++ b/src/include/homestore/btree/node_variant/variant_node.hpp
@@ -15,7 +15,7 @@
 #pragma once
 
 #include
-#include
+#include
 
 namespace homestore {
 template < typename K >
@@ -25,13 +25,17 @@
 template < typename V >
 static V dummy_value;
 
 template < typename K, typename V >
-class VariantNode : public StoreSpecificBtreeNode {
+class VariantNode : public BtreeNode {
 public:
     using BtreeNode::get_nth_key_size;
     using BtreeNode::get_nth_value;
 
-    VariantNode(uint8_t* node_buf, bnodeid_t id, bool init_buf, bool is_leaf, BtreeConfig const& cfg) :
-        StoreSpecificBtreeNode(node_buf, id, init_buf, is_leaf, cfg) {}
+    VariantNode(bnodeid_t id, bool is_leaf, uint32_t node_size, BtreeNode::Allocator::Token token) :
+        BtreeNode(id, is_leaf, node_size, token) {}
+
+    VariantNode(uint8_t* node_buf, bnodeid_t id, BtreeNode::Allocator::Token token) : BtreeNode(node_buf, id, token) {}
+
+    virtual ~VariantNode() = default;
 
     ///////////////////////////////////////// Get related APIs of the node /////////////////////////////////////////
@@ -192,14 +196,14 @@
     /// translates into one of "Insert", "Update" or "Upsert".
     /// @param existing_val [optional] A pointer to a value to store the value of the existing entry if it was updated.
     /// @param filter_cb [optional] A callback function to be called for each entry found in the node that has a key. It
-    /// is used as a filter to remove anything that needn't be updated.
-    /// @return A status code indicating whether the operation was successful.
+    /// is used as a filter to skip anything that needn't be updated.
+    /// @return A boolean indicating whether the operation was successful.
/// - virtual btree_status_t put(BtreeKey const& key, BtreeValue const& val, btree_put_type put_type, - BtreeValue* existing_val, put_filter_cb_t const& filter_cb = nullptr) { + virtual bool put(BtreeKey const& key, BtreeValue const& val, btree_put_type put_type, BtreeValue* existing_val, + put_filter_cb_t const& filter_cb = nullptr) { LOGMSG_ASSERT_EQ(magic(), BTREE_NODE_MAGIC, "Magic mismatch on btree_node {}", get_persistent_header_const()->to_string()); - auto ret = btree_status_t::success; + bool ret = true; DEBUG_ASSERT_EQ( this->is_leaf(), true, @@ -211,25 +215,21 @@ class VariantNode : public StoreSpecificBtreeNode { if (filter_cb && filter_cb(get_nth_key< K >(idx, false), get_nth_value(idx, false), val) != put_filter_decision::replace) { - LOGINFO("Filter callback rejected the update for key {}", key.to_string()); - return btree_status_t::filtered_out; + return false; } } if (put_type == btree_put_type::INSERT) { if (found) { - LOGINFO("Attempt to insert duplicate entry {}", key.to_string()); - return btree_status_t::already_exists; + LOGDEBUG("Attempt to insert duplicate entry {}", key.to_string()); + return false; } - ret = insert(idx, key, val); + ret = (insert(idx, key, val) == btree_status_t::success); } else if (put_type == btree_put_type::UPDATE) { - if (!found) { - LOGINFO("Attempt to update non-existent entry {}", key.to_string()); - return btree_status_t::not_found; - } + if (!found) return false; update(idx, key, val); } else if (put_type == btree_put_type::UPSERT) { - found ? update(idx, key, val) : (void)insert(idx, key, val); + (found) ? update(idx, key, val) : (void)insert(idx, key, val); } else { DEBUG_ASSERT(false, "Wrong put_type {}", put_type); } @@ -251,14 +251,13 @@ class VariantNode : public StoreSpecificBtreeNode { /// put_filter_decision::replace, the entry is upserted with the new value. /// put_filter_decision::remove, the entry is removed from the node. /// put_filter_decision::keep, the entry is not modified and the method moves on to the next entry. - /// @param app_ctx User supplied private context data. /// @return Btree status typically . /// If all keys were upserted successfully, the method returns btree_status_t::success. /// If the method ran out of space in the node, the method returns the key that was last put and the status /// as btree_status_t::has_more virtual btree_status_t multi_put(BtreeKeyRange< K > const& keys, BtreeKey const&, BtreeValue const& val, btree_put_type put_type, K* last_failed_key, - put_filter_cb_t const& filter_cb = nullptr, void* app_ctx = nullptr) { + put_filter_cb_t const& filter_cb = nullptr) { if (put_type != btree_put_type::UPDATE) { DEBUG_ASSERT(false, "For non-interval keys multi-put should be really update and cannot insert"); return btree_status_t::not_supported; @@ -292,8 +291,7 @@ class VariantNode : public StoreSpecificBtreeNode { } ///////////////////////////////////////// Remove related APIs of the node ///////////////////////////////////////// - virtual uint32_t multi_remove(BtreeKeyRange< K > const& keys, remove_filter_cb_t const& filter_cb = nullptr, - void* usr_ctx = nullptr) { + virtual uint32_t multi_remove(BtreeKeyRange< K > const& keys, remove_filter_cb_t const& filter_cb = nullptr) { DEBUG_ASSERT_EQ(this->is_leaf(), true, "Multi put entries on node are supported only for leaf nodes"); // Match the key range to get start and end idx. 
If none of the ranges here matches, we have to return not_found @@ -313,6 +311,5 @@ class VariantNode : public StoreSpecificBtreeNode { } return ret; } - virtual void on_update_phys_buf() override {}; }; } // namespace homestore \ No newline at end of file diff --git a/src/include/homestore/btree/detail/varlen_node.hpp b/src/include/homestore/btree/node_variant/varlen_node.hpp similarity index 81% rename from src/include/homestore/btree/detail/varlen_node.hpp rename to src/include/homestore/btree/node_variant/varlen_node.hpp index fcb7ff79c..83c0a9648 100644 --- a/src/include/homestore/btree/detail/varlen_node.hpp +++ b/src/include/homestore/btree/node_variant/varlen_node.hpp @@ -17,11 +17,8 @@ #pragma once #include -#include +#include #include -#include "homestore/index/index_internal.hpp" - -SISL_LOGGING_DECL(btree) namespace homestore { #pragma pack(1) @@ -33,10 +30,6 @@ struct btree_obj_record { struct var_node_header { uint16_t m_tail_arena_offset; // Tail side of the arena where new keys are inserted uint16_t m_available_space; - uint16_t m_init_available_space; // remember initial node area size to later use for compaction - // TODO: - // We really dont require storing m_init_available_space in each node. - // Instead add method in variant node to fetch config uint16_t tail_offset() const { return m_tail_arena_offset; } uint16_t available_space() const { return m_available_space; } @@ -54,26 +47,20 @@ class VariableNode : public VariantNode< K, V > { using BtreeNode::get_nth_obj_size; using BtreeNode::get_nth_value; using BtreeNode::get_nth_value_size; + using BtreeNode::occupied_size; using BtreeNode::to_string; using VariantNode< K, V >::get_nth_value; - VariableNode(uint8_t* node_buf, bnodeid_t id, bool init, bool is_leaf, const BtreeConfig& cfg) : - VariantNode< K, V >(node_buf, id, init, is_leaf, cfg) { - if (init) { - // Tail arena points to the edge of the node as data arena grows backwards. 
Entire space is now available - // except for the header itself - get_var_node_header()->m_init_available_space = this->node_data_size(); - get_var_node_header()->m_tail_arena_offset = this->node_data_size(); - get_var_node_header()->m_available_space = - get_var_node_header()->m_tail_arena_offset - sizeof(var_node_header); - } + VariableNode(bnodeid_t id, bool is_leaf, uint32_t node_size, BtreeNode::Allocator::Token token) : + VariantNode< K, V >(id, is_leaf, node_size, token) { + get_var_node_header()->m_tail_arena_offset = this->node_data_size(); + get_var_node_header()->m_available_space = get_var_node_header()->m_tail_arena_offset - sizeof(var_node_header); } - virtual ~VariableNode() = default; + VariableNode(uint8_t* node_buf, bnodeid_t id, BtreeNode::Allocator::Token token) : + VariantNode< K, V >(node_buf, id, token) {} - uint32_t occupied_size() const override { - return (get_var_node_header_const()->m_init_available_space - sizeof(var_node_header) - available_size()); - } + virtual ~VariableNode() = default; /* Insert the key and value in provided index * Assumption: Node lock is already taken */ @@ -190,11 +177,10 @@ class VariableNode : public VariantNode< K, V > { this->inc_gen(); } - void remove_all(const BtreeConfig&) override { + void remove_all() override { this->sub_entries(this->total_entries()); this->invalidate_edge(); this->inc_gen(); - get_var_node_header()->m_init_available_space = this->node_data_size(); get_var_node_header()->m_tail_arena_offset = this->node_data_size(); get_var_node_header()->m_available_space = get_var_node_header()->m_tail_arena_offset - sizeof(var_node_header); #ifndef NDEBUG @@ -213,7 +199,7 @@ class VariableNode : public VariantNode< K, V > { } }*/ - uint32_t move_out_to_right_by_entries(const BtreeConfig& cfg, BtreeNode& o, uint32_t nentries) override { + uint32_t move_out_to_right_by_entries(BtreeNode& o, uint32_t nentries) override { auto& other = static_cast< VariableNode& >(o); const auto this_gen = this->node_gen(); const auto other_gen = other.node_gen(); @@ -256,7 +242,7 @@ class VariableNode : public VariantNode< K, V > { return (start_ind - ind); } - uint32_t move_out_to_right_by_size(const BtreeConfig& cfg, BtreeNode& o, uint32_t size_to_move) override { + uint32_t move_out_to_right_by_size(BtreeNode& o, uint32_t size_to_move) override { auto& other = static_cast< VariableNode& >(o); auto this_gen = this->node_gen(); auto other_gen = other.node_gen(); @@ -295,21 +281,38 @@ class VariableNode : public VariantNode< K, V > { return nmoved; } - uint32_t num_entries_by_size(uint32_t start_idx, uint32_t size) const override { - auto idx = start_idx; + uint32_t get_entries_size(uint32_t start_idx, uint32_t end_idx) const override { + if ((start_idx == 0) && (end_idx == this->total_entries())) { + return (this->occupied_size() - sizeof(var_node_header)); + } + uint32_t cum_size{0}; + for (uint32_t i = start_idx; i < end_idx; ++i) { + cum_size += get_nth_key_size(i) + get_nth_value_size(i) + this->get_record_size(); + } + return cum_size; + } - while (idx < this->total_entries()) { - uint32_t const rec_size = this->get_record_size() + get_nth_key_size(idx) + get_nth_value_size(idx); - cum_size += rec_size; - if (cum_size > size) { break; } - ++idx; + bool append_copy_in_upto_size(const BtreeNode& o, uint32_t& other_cursor, uint32_t upto_size, + bool copy_only_if_fits) override { + if (occupied_size() >= upto_size) { return false; } + if (o.total_entries() == 0) { return true; } + auto const room = upto_size - occupied_size(); + + if 
(copy_only_if_fits) {
+ if (o.get_entries_size(other_cursor, o.total_entries()) > room) { return false; } }
+ auto const ncopied = copy_by_size(o, other_cursor, room);
+ other_cursor += ncopied;
- return idx - start_idx;
+ if (copy_only_if_fits) {
+ DEBUG_ASSERT_EQ(other_cursor, o.total_entries(),
+ "We proceeded to copy after checking the size, but ended up not copying everything");
+ }
+ return true;
}
- uint32_t copy_by_size(const BtreeConfig& cfg, const BtreeNode& o, uint32_t start_idx, uint32_t copy_size) override {
+ uint32_t copy_by_size(const BtreeNode& o, uint32_t start_idx, uint32_t copy_size) {
auto& other = static_cast< const VariableNode& >(o);
auto this_gen = this->node_gen();
@@ -337,8 +340,8 @@ class VariableNode : public VariantNode< K, V > {
return n;
}
- uint32_t copy_by_entries(const BtreeConfig& cfg, const BtreeNode& o, uint32_t start_idx,
- uint32_t nentries) override {
+#if 0
+ uint32_t copy_by_entries(const BtreeNode& o, uint32_t start_idx, uint32_t nentries) override {
auto& other = static_cast< const VariableNode& >(o);
auto this_gen = this->node_gen();
@@ -363,92 +366,20 @@ class VariableNode : public VariantNode< K, V > {
return n;
}
- /*uint32_t move_in_from_right_by_entries(const BtreeConfig& cfg, BtreeNode& o, uint32_t nentries) override {
- auto& other = static_cast< VariableNode& >(o);
- auto this_gen = this->node_gen();
- auto other_gen = other.node_gen();
- nentries = std::min(nentries, other.total_entries());
-
- if (nentries == 0) { return 0; }
- uint32_t other_ind = 0;
- while (nentries) {
- // Get the ith key and value blob and then remove the entry from here and insert to the other node
- sisl::blob kb;
- kb.bytes = (uint8_t*)other.get_nth_obj(other_ind);
- kb.size = other.get_nth_key_size(other_ind);
-
- sisl::blob vb;
- vb.bytes = kb.bytes + kb.size;
- vb.size = other.get_nth_value_size(other_ind);
-
- auto sz = insert(this->total_entries(), kb, vb);
- if (!sz) { break; }
- --nentries;
- ++other_ind;
- }
-
- other.remove(0, other_ind - 1); // Remove all entries in bulk
- assert(other.total_entries() == nentries);
+ uint32_t num_entries_by_size(uint32_t start_idx, uint32_t size) const override {
+ auto idx = start_idx;
+ uint32_t cum_size{0};
- if (!other.is_leaf() && (other.total_entries() == 0)) {
- // Incase other node is an edge node and we moved all the data into this node, move over the edge info as
- // well.
- this->set_edge_id(other.edge_id());
- other.invalidate_edge();
+ while (idx < this->total_entries()) {
+ uint32_t const rec_size = this->get_record_size() + get_nth_key_size(idx) + get_nth_value_size(idx);
+ cum_size += rec_size;
+ if (cum_size > size) { break; }
+ ++idx;
}
- // Remove and insert would have set the gen multiple increments, just reset it to increment only by 1
- // TODO: This is bit ugly but needed in-order to avoid repeat the same code again, but see if we can produce
- // interface around it.
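To make the new cursor-based copy API concrete, here is a small caller-side sketch of append_copy_in_upto_size() (illustrative only, not part of the patch; the node objects and the byte budget are assumed):

// Copy as many whole entries from src into dst as fit within `budget` bytes
// of dst's occupied space. `cursor` is advanced past the entries copied.
uint32_t cursor = 0;
bool has_room = dst.append_copy_in_upto_size(src, cursor, budget, false /* copy_only_if_fits */);
// On return, entries [0, cursor) of src now live in dst; a merge pass could
// spill the remaining [cursor, src.total_entries()) entries into a new node.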
- this->set_gen(this_gen + 1); - other.set_gen(other_gen + 1); - - return (other_ind); + return idx - start_idx; } - - uint32_t move_in_from_right_by_size(const BtreeConfig& cfg, BtreeNode& o, uint32_t size_to_move) override { - auto& other = static_cast< VariableNode& >(o); - uint32_t moved_size = 0U; - auto this_gen = this->node_gen(); - auto other_gen = other.node_gen(); - - uint32_t ind = 0; - while (ind < this->total_entries()) { - sisl::blob kb; - kb.bytes = (uint8_t*)other.get_nth_obj(ind); - kb.size = other.get_nth_key_size(ind); - - sisl::blob vb; - vb.bytes = kb.bytes + kb.size; - vb.size = other.get_nth_value_size(ind); - - if ((kb.size + vb.size + other.get_record_size()) > size_to_move) { - // We reached threshold of how much we could move - break; - } - auto sz = insert(this->total_entries(), kb, vb); // Keep on inserting on the last index. - if (!sz) break; - moved_size += sz; - ind++; - size_to_move -= sz; - } - if (ind) other.remove(0, ind - 1); - - if (!other.is_leaf() && (other.total_entries() == 0)) { - // Incase other node is an edge node and we moved all the data into this node, move over the edge info as - // well. - this->set_edge_id(other.edge_id()); - other.invalidate_edge(); - } - - // Remove and insert would have set the gen multiple increments, just reset it to increment only by 1 - // TODO: This is bit ugly but needed in-order to avoid repeat the same code again, but see if we can produce - // interface around it. - this->set_gen(this_gen + 1); - other.set_gen(other_gen + 1); - - return moved_size; - } */ +#endif uint32_t available_size() const override { return get_var_node_header_const()->m_available_space; } @@ -556,7 +487,7 @@ class VariableNode : public VariantNode< K, V > { memcpy(raw_data_ptr, val_blob.cbytes(), val_blob.size()); // Increment the entries and generation number - this->inc_entries(); + this->add_entries(1); this->inc_gen(); #ifndef NDEBUG @@ -583,7 +514,7 @@ class VariableNode : public VariantNode< K, V > { uint32_t no_of_entries = this->total_entries(); if (no_of_entries == 0) { // this happens when there is only entry and in update, we first remove and than insert - get_var_node_header()->m_tail_arena_offset = get_var_node_header()->m_init_available_space; + get_var_node_header()->m_tail_arena_offset = this->node_data_size(); LOGTRACEMOD(btree, "Full available size reclaimed"); return; } @@ -602,7 +533,7 @@ class VariableNode : public VariantNode< K, V > { std::sort(rec.begin(), rec.begin() + no_of_entries, [](Record const& a, Record const& b) -> bool { return b.m_obj_offset < a.m_obj_offset; }); - uint16_t last_offset = get_var_node_header()->m_init_available_space; + uint16_t last_offset = this->node_data_size(); ind = 0; uint16_t sparce_space = 0; @@ -675,10 +606,16 @@ class VariableNode : public VariantNode< K, V > { template < typename K, typename V > class VarKeySizeNode : public VariableNode< K, V > { public: - VarKeySizeNode(uint8_t* node_buf, bnodeid_t id, bool init, bool is_leaf, const BtreeConfig& cfg) : - VariableNode< K, V >(node_buf, id, init, is_leaf, cfg) { + VarKeySizeNode(bnodeid_t id, bool is_leaf, uint32_t node_size, BtreeNode::Allocator::Token token) : + VariableNode< K, V >(id, is_leaf, node_size, token) { this->set_node_type(btree_node_type::VAR_KEY); } + + VarKeySizeNode(uint8_t* node_buf, bnodeid_t id, BtreeNode::Allocator::Token token) : + VariableNode< K, V >(node_buf, id, token) { + DEBUG_ASSERT_EQ(this->get_node_type(), btree_node_type::VAR_KEY); + } + virtual ~VarKeySizeNode() = default; uint32_t 
get_nth_key_size(uint32_t ind) const override { @@ -707,10 +644,16 @@ class VarKeySizeNode : public VariableNode< K, V > { template < typename K, typename V > class VarValueSizeNode : public VariableNode< K, V > { public: - VarValueSizeNode(uint8_t* node_buf, bnodeid_t id, bool init, bool is_leaf, const BtreeConfig& cfg) : - VariableNode< K, V >(node_buf, id, init, is_leaf, cfg) { + VarValueSizeNode(bnodeid_t id, bool is_leaf, uint32_t node_size, BtreeNode::Allocator::Token token) : + VariableNode< K, V >(id, is_leaf, node_size, token) { this->set_node_type(btree_node_type::VAR_VALUE); } + + VarValueSizeNode(uint8_t* node_buf, bnodeid_t id, BtreeNode::Allocator::Token token) : + VariableNode< K, V >(node_buf, id, token) { + DEBUG_ASSERT_EQ(this->get_node_type(), btree_node_type::VAR_VALUE); + } + virtual ~VarValueSizeNode() = default; uint32_t get_nth_key_size(uint32_t ind) const override { return dummy_key< K >.serialized_size(); } @@ -739,10 +682,16 @@ class VarValueSizeNode : public VariableNode< K, V > { template < typename K, typename V > class VarObjSizeNode : public VariableNode< K, V > { public: - VarObjSizeNode(uint8_t* node_buf, bnodeid_t id, bool init, bool is_leaf, const BtreeConfig& cfg) : - VariableNode< K, V >(node_buf, id, init, is_leaf, cfg) { + VarObjSizeNode(bnodeid_t id, bool is_leaf, uint32_t node_size, BtreeNode::Allocator::Token token) : + VariableNode< K, V >(id, is_leaf, node_size, token) { this->set_node_type(btree_node_type::VAR_OBJECT); } + + VarObjSizeNode(uint8_t* node_buf, bnodeid_t id, BtreeNode::Allocator::Token token) : + VariableNode< K, V >(node_buf, id, token) { + DEBUG_ASSERT_EQ(this->get_node_type(), btree_node_type::VAR_OBJECT); + } + virtual ~VarObjSizeNode() = default; uint32_t get_nth_key_size(uint32_t ind) const override { diff --git a/src/include/homestore/checkpoint/cp.hpp b/src/include/homestore/checkpoint/cp.hpp index 266713c55..a4b5e802f 100644 --- a/src/include/homestore/checkpoint/cp.hpp +++ b/src/include/homestore/checkpoint/cp.hpp @@ -44,7 +44,6 @@ * CP end :- when cp flush is completed. It frees the CP. */ namespace homestore { -SISL_LOGGING_DECL(cp, replay) #define CP_PERIODIC_LOG(level, cp_id, msg, ...) \ HS_PERIODIC_DETAILED_LOG(level, cp, "cp_id", cp_id, , , msg, ##__VA_ARGS__) @@ -70,11 +69,6 @@ class CPContext; class CPManager; VENUM(cp_consumer_t, uint8_t, - // Sealer is a special consumer that provides information regarding where the cp is up to. - // It will be the first one during cp switch over , as a conservative marker of everything - // before or equals to this point, should be in current cp, possibly some consumer are above this point which is - // fine. And Sealer is the last one during cp flush after all other services flushed successfully. 
- SEALER = 3, HS_CLIENT = 0, // Client of the homestore module INDEX_SVC = 1, // Index service module BLK_DATA_SVC = 2, // Block data service module @@ -89,10 +83,7 @@ struct CP { cp_id_t m_cp_id; std::array< std::unique_ptr< CPContext >, (size_t)cp_consumer_t::SENTINEL > m_contexts; folly::SharedPromise< bool > m_comp_promise; - Clock::time_point m_cp_start_time; -#ifdef _PRERELEASE - std::atomic< bool > m_abrupt_cp{false}; -#endif + bool m_is_on_shutdown{false}; // Is this CP taken as part of shutdown of homestore public: CP(CPManager* mgr) : m_cp_mgr{mgr} {} diff --git a/src/include/homestore/checkpoint/cp_mgr.hpp b/src/include/homestore/checkpoint/cp_mgr.hpp index 43bf8b3ae..6892ec82a 100644 --- a/src/include/homestore/checkpoint/cp_mgr.hpp +++ b/src/include/homestore/checkpoint/cp_mgr.hpp @@ -28,13 +28,13 @@ #include namespace homestore { -static constexpr size_t MAX_CP_COUNT{2}; - class CPMgrMetrics : public sisl::MetricsGroup { public: explicit CPMgrMetrics() : sisl::MetricsGroup("CPMgr") { REGISTER_COUNTER(back_to_back_cps, "back to back cp"); REGISTER_COUNTER(cp_cnt, "cp cnt"); + REGISTER_COUNTER(cp_by_timer, "Cp taken because of timer"); + REGISTER_COUNTER(cp_by_index_full, "Cp taken because of index dirty buffer/free blks fulls"); REGISTER_HISTOGRAM(cp_latency, "cp latency (in us)"); register_me_to_farm(); } @@ -53,19 +53,13 @@ class CPContext { public: CPContext(CP* cp) : m_cp{cp} {} + virtual ~CPContext() = default; + CP* cp() { return m_cp; } cp_id_t id() const; - void complete(bool status) { m_flush_comp.setValue(status); } -#ifdef _PRERELEASE - void abrupt() { - m_cp->m_abrupt_cp.store(true); - complete(true); - } - bool is_abrupt() { return m_cp->m_abrupt_cp.load(); } -#endif - folly::Future< bool > get_future() { return m_flush_comp.getFuture(); } - virtual ~CPContext() = default; + void complete(bool status); + folly::Future< bool > get_future() { return m_flush_comp.getFuture(); } }; class CPCallbacks { @@ -129,27 +123,39 @@ class CPGuard { // and passes the cp1 to thread2. However, before accessing cp1, thread2 already takes cp2 critical section and then // access cp1, then it needs to wind up with cp1 and once cp1 is done, has to go back to cp2. This nesting can // potentially happen recursively (although such pattern is not great, it can exist). That is why we use stack here - static thread_local std::stack< CP* > t_cp_stack; + static iomgr::FiberManagerLib::FiberLocal< std::stack< CP* > > t_cp_stack; public: CPGuard(CPManager* mgr); - ~CPGuard(); + virtual ~CPGuard(); CPGuard(const CPGuard& other); - CPGuard operator=(const CPGuard& other); + virtual CPGuard operator=(const CPGuard& other); CPContext* context(cp_consumer_t consumer); - CP& operator*(); - CP* operator->(); - CP* get(); + virtual CP* operator->(); + virtual CP* get(); }; +VENUM(CPTriggerReason, uint8_t, + Unknown = 0, // Caller has not given a reason for it + Timer = 1, // Time was up + IndexBufferFull = 2, // Index Dirty buffer was full + IndexFreeBlksExceeded = 3, // Index blocks freed has hit a limit + LogStoreFull = 4, // Log store has gotten really full + DataFreeBlksExceeded = 5, // Number of free blks in data service exceeded + UserDriven = 6, // User explicitly requested for +); + /* It is responsible to trigger the checkpoints when all concurrent IOs are completed. 
* @ cp_type :- It is a consumer checkpoint with a base class of cp */
class CPManager {
friend class CPGuard;
+public:
+ static constexpr size_t max_concurent_cps{2};
+
private:
CP* m_cur_cp{nullptr}; // Current CP information
std::unique_ptr< CPMgrMetrics > m_metrics;
@@ -163,7 +169,7 @@ class CPManager {
bool m_in_flush_phase{false};
bool m_pending_trigger_cp{false}; // Is there is a waiter for a cp flush to start
folly::SharedPromise< bool > m_pending_trigger_cp_comp;
- iomgr::io_fiber_t m_timer_fiber;
+ // std::vector< uint64_t > m_trigger_reasons;
public:
CPManager();
@@ -188,6 +194,8 @@ class CPManager {
/// @param callbacks : Callbacks denoted by the consumers. Details are provided in CPCallbacks class
void register_consumer(cp_consumer_t consumer_id, std::unique_ptr< CPCallbacks > callbacks);
+ CPCallbacks* get_consumer(cp_consumer_t consumer_id);
+
/// @brief Call this method before every IO that needs to be checkpointed. It marks the entrance of critical section
/// of the returned CP and ensures that until it is exited, flush of the CP will not happen.
///
@@ -214,7 +222,7 @@ class CPManager {
/// @brief Trigger a checkpoint flush on all subsystems registered. There is only 1 checkpoint per checkpoint
/// manager. Checkpoint flush will wait for cp to exited all critical io sections.
/// @param force : Do we need to force queue the checkpoint flush, in case previous checkpoint is being flushed
- folly::Future< bool > trigger_cp_flush(bool force = false);
+ folly::Future< bool > trigger_cp_flush(bool force = false, CPTriggerReason reason = CPTriggerReason::Unknown);
const std::array< std::unique_ptr< CPCallbacks >, (size_t)cp_consumer_t::SENTINEL >& consumer_list() const {
return m_cp_cb_table;
@@ -222,6 +230,11 @@ class CPManager {
iomgr::io_fiber_t pick_blocking_io_fiber() const;
+ /// @brief Check whether the given cp has already finished flushing
+ /// @param cp_id
+ /// @return True if the cp has flushed, false otherwise
+ bool has_cp_flushed(cp_id_t cp_id) const;
+
private:
void cp_ref(CP* cp);
void create_first_cp();
@@ -230,10 +243,8 @@ class CPManager {
void cleanup_cp(CP* cp);
void on_meta_blk_found(const sisl::byte_view& buf, void* meta_cookie);
void start_cp_thread();
- folly::Future< bool > do_trigger_cp_flush(bool force, bool flush_on_shutdown);
- uint64_t cp_timer_us();
- void start_timer_thread();
- void stop_timer_thread();
+ folly::Future< bool > do_trigger_cp_flush(bool force, bool flush_on_shutdown,
+ CPTriggerReason reason = CPTriggerReason::Unknown);
};
extern CPManager& cp_mgr();
diff --git a/src/include/homestore/chunk_selector.h b/src/include/homestore/chunk_selector.h
index c04af9991..1d66f63cc 100644
--- a/src/include/homestore/chunk_selector.h
+++ b/src/include/homestore/chunk_selector.h
@@ -22,11 +22,9 @@ class ChunkSelector {
public:
ChunkSelector() = default;
virtual void add_chunk(cshared< Chunk >&) = 0;
- virtual void remove_chunk(cshared< Chunk >&) {};
+ virtual void remove_chunk(cshared< Chunk >&){};
virtual void foreach_chunks(std::function< void(cshared< Chunk >&) >&& cb) = 0;
virtual cshared< Chunk > select_chunk(blk_count_t nblks, const blk_alloc_hints& hints) = 0;
- virtual void on_alloc_blk(chunk_num_t chunk_num, blk_count_t nblks) {}
- virtual void on_free_blk(chunk_num_t chunk_num, blk_count_t nblks) {}
virtual ~ChunkSelector() = default;
};
diff --git a/src/include/homestore/fault_cmt_service.hpp b/src/include/homestore/fault_cmt_service.hpp
deleted file mode 100644
index 484d18402..000000000
--- a/src/include/homestore/fault_cmt_service.hpp
+++ 
/dev/null @@ -1,45 +0,0 @@ -/********************************************************************************* - * Modifications Copyright 2017-2019 eBay Inc. - * - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software distributed - * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR - * CONDITIONS OF ANY KIND, either express or implied. See the License for the - * specific language governing permissions and limitations under the License. - * - *********************************************************************************/ -#pragma once -#include -#include -#include - -#include -#include - -namespace homestore { -ENUM(FaultContainmentEvent, uint8_t, ENTER = 0, EXIT = 1, ENTER_GLOBAL = 2); - -class FaultContainmentCallback { -public: - virtual ~FaultContainmentCallback() = default; - virtual void on_fault_containment(FaultContainmentEvent evt, void* cookie, const std::string& reason) { assert(0); } -}; - -class FaultContainmentService { -private: - std::unique_ptr< FaultContainmentCallback > m_cb; - -public: - FaultContainmentService(std::unique_ptr< FaultContainmentCallback > cb) : m_cb(std::move(cb)) {} - ~FaultContainmentService() = default; - void trigger_fc(FaultContainmentEvent evt, void* cookie, const std::string& reason = "") { - m_cb->on_fault_containment(evt, cookie, reason); - } -}; - -} // namespace homestore diff --git a/src/include/homestore/homestore.hpp b/src/include/homestore/homestore.hpp index 69d766092..541fdf80d 100644 --- a/src/include/homestore/homestore.hpp +++ b/src/include/homestore/homestore.hpp @@ -53,8 +53,6 @@ class VirtualDev; class ChunkSelector; class ReplDevListener; class ReplApplication; -class FaultContainmentService; -class FaultContainmentCallback; #ifdef _PRERELEASE class CrashSimulator; @@ -62,46 +60,60 @@ class CrashSimulator; using HomeStoreSafePtr = std::shared_ptr< HomeStore >; +using hs_before_services_starting_cb_t = std::function< void(void) >; + +struct hs_stats { + uint64_t total_capacity{0ul}; + uint64_t used_capacity{0ul}; +}; + +ENUM(ServiceType, uint32_t, // List of all services we support + META = 0, // Meta Service + LOG = 1, // Log Service + DATA = 2, // Data Service + INDEX = 3, // Index Service + REPLICATION = 4 // Replication Service +); +using HS_SERVICE = ServiceType; // Alias for easier porting of code + +ENUM(ServiceSubType, uint32_t, // All sub types within services. 
At this point it is a global list for all services
+ DEFAULT = 0, // No sub type
+ INDEX_BTREE_COPY_ON_WRITE = 1, // Copy on Write btree index
+ INDEX_BTREE_INPLACE = 2, // Inplace Btree based index
+ INDEX_BTREE_MEMORY = 3, // Memory based index
+);
+
VENUM(hs_vdev_type_t, uint32_t, DATA_VDEV = 1, INDEX_VDEV = 2, META_VDEV = 3, LOGDEV_VDEV = 4);
#pragma pack(1)
struct hs_vdev_context {
enum hs_vdev_type_t type;
+ ServiceSubType sub_type{ServiceSubType::DEFAULT};
sisl::blob to_blob() { return sisl::blob{uintptr_cast(this), sizeof(*this)}; }
};
#pragma pack()
-using hs_before_services_starting_cb_t = std::function< void(void) >;
+struct ServiceId {
+ ServiceType type;
+ ServiceSubType sub_type;
-struct hs_stats {
- uint64_t total_capacity{0ul};
- uint64_t used_capacity{0ul};
+ ServiceId(ServiceType st, ServiceSubType sst) : type{st}, sub_type{sst} {}
+ ServiceId(ServiceType st) : type{st}, sub_type{ServiceSubType::DEFAULT} {}
};
+} // namespace homestore
-struct HS_SERVICE {
- static constexpr uint32_t META = 1 << 0;
- static constexpr uint32_t LOG = 1 << 1;
- static constexpr uint32_t DATA = 1 << 2;
- static constexpr uint32_t INDEX = 1 << 3;
- static constexpr uint32_t REPLICATION = 1 << 4;
- static constexpr uint32_t FAULT_CMT = 1 << 5;
-
- uint32_t svcs;
-
- HS_SERVICE() : svcs{META} {}
-
- std::string list() const {
- std::string str;
- if (svcs & META) { str += "meta,"; }
- if (svcs & DATA) { str += "data,"; }
- if (svcs & INDEX) { str += "index,"; }
- if (svcs & LOG) { str += "log,"; }
- if (svcs & REPLICATION) { str += "replication,"; }
- if (svcs & FAULT_CMT) { str += "fault_containment,"; }
- return str
+namespace std {
+template <>
+struct less< homestore::ServiceId > {
+ bool operator()(const homestore::ServiceId& lhs, const homestore::ServiceId& rhs) const {
+ return (lhs.type == rhs.type) ? (uint32_cast(lhs.sub_type) < uint32_cast(rhs.sub_type))
: (uint32_cast(lhs.type) < uint32_cast(rhs.type));
}
};
+} // namespace std
+
+namespace homestore {
/*
* IO errors handling by homestore.
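With the ServiceId key and the std::less specialization above, a start-up sequence might be sketched as follows (illustrative only: the hs_format_params field names are assumptions, HomeStore::instance() is assumed to return the singleton, and format_and_start() is the ServiceId-keyed overload declared further down in this header):

using namespace homestore;

std::map< ServiceId, hs_format_params > format_opts;
format_opts.emplace(ServiceId{ServiceType::META}, hs_format_params{/* e.g. .size_pct = 5.0 */});
format_opts.emplace(ServiceId{ServiceType::INDEX, ServiceSubType::INDEX_BTREE_COPY_ON_WRITE},
                    hs_format_params{/* e.g. .size_pct = 30.0 */});
HomeStore::instance()->format_and_start(std::move(format_opts));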
@@ -119,8 +131,9 @@ class HomeStore { std::unique_ptr< MetaBlkService > m_meta_service; std::unique_ptr< LogStoreService > m_log_service; std::unique_ptr< IndexService > m_index_service; +#ifdef REPLICATION_SUPPORT std::shared_ptr< ReplicationService > m_repl_service; - std::unique_ptr< FaultContainmentService > m_fc_service; +#endif std::unique_ptr< DeviceManager > m_dev_mgr; shared< sisl::logging::logger_t > m_periodic_logger; @@ -129,12 +142,12 @@ class HomeStore { std::unique_ptr< CPManager > m_cp_mgr; shared< sisl::Evictor > m_evictor; - HS_SERVICE m_services; // Services homestore is starting with + std::vector< std::vector< ServiceSubType > > m_services; // Services homestore is starting with hs_before_services_starting_cb_t m_before_services_starting_cb{nullptr}; std::atomic< bool > m_init_done{false}; public: - HomeStore() = default; + HomeStore(); virtual ~HomeStore() = default; /////////////////////////////////////////// static HomeStore member functions ///////////////////////////////// @@ -151,13 +164,14 @@ class HomeStore { HomeStore& with_data_service(cshared< ChunkSelector >& custom_chunk_selector = nullptr); HomeStore& with_log_service(); HomeStore& with_index_service(std::unique_ptr< IndexServiceCallbacks > cbs, - cshared< ChunkSelector >& custom_chunk_selector = nullptr); + std::vector< ServiceSubType > sub_types); +#ifdef REPLICATION_SUPPORT HomeStore& with_repl_data_service(cshared< ReplApplication >& repl_app, cshared< ChunkSelector >& custom_chunk_selector = nullptr); - HomeStore& with_fault_containment(std::unique_ptr< FaultContainmentCallback > cb); +#endif bool start(const hs_input_params& input, hs_before_services_starting_cb_t svcs_starting_cb = nullptr); - void format_and_start(std::map< uint32_t, hs_format_params >&& format_opts); + void format_and_start(std::map< ServiceId, hs_format_params >&& format_opts); void shutdown(); // cap_attrs get_system_capacity() const; // Need to move this to homeblks/homeobj @@ -170,7 +184,7 @@ class HomeStore { bool has_meta_service() const; bool has_log_service() const; bool has_repl_data_service() const; - bool has_fc_service() const; + std::string services_list() const; BlkDataService& data_service() { return *m_data_service; } MetaBlkService& meta_service() { return *m_meta_service; } @@ -179,11 +193,9 @@ class HomeStore { if (!m_index_service) { throw std::runtime_error("index_service is nullptr"); } return *m_index_service; } +#ifdef REPLICATION_SUPPORT ReplicationService& repl_service() { return *m_repl_service; } - FaultContainmentService& fc_service() { - if (!m_fc_service) { throw std::runtime_error("fc_service is nullptr"); } - return *m_fc_service; - } +#endif DeviceManager* device_mgr() { return m_dev_mgr.get(); } ResourceMgr& resource_mgr() { return *m_resource_mgr.get(); } CPManager& cp_mgr() { return *m_cp_mgr.get(); } @@ -196,7 +208,6 @@ class HomeStore { #endif private: - void init_cache(); shared< VirtualDev > create_vdev_cb(const vdev_info& vinfo, bool load_existing); uint64_t pct_to_size(float pct, HSDevType dev_type) const; void do_start(); diff --git a/src/include/homestore/homestore_decl.hpp b/src/include/homestore/homestore_decl.hpp index 05d62ebb6..859b4c59c 100644 --- a/src/include/homestore/homestore_decl.hpp +++ b/src/include/homestore/homestore_decl.hpp @@ -112,7 +112,8 @@ ENUM(vdev_size_type_t, uint8_t, VDEV_SIZE_STATIC, VDEV_SIZE_DYNAMIC); ////////////// All structs /////////////////// struct dev_info { - explicit dev_info(std::string name, HSDevType type = HSDevType::Data) : 
dev_name{std::move(name)}, dev_type{type} {} + explicit dev_info(std::string name, HSDevType type = HSDevType::Data, uint64_t size = 0) : + dev_name{std::move(name)}, dev_type{type}, dev_size{size} {} std::string to_string() const { return fmt::format("{} - {} size={}", dev_name, enum_name(dev_type), dev_size); } std::string dev_name; @@ -212,5 +213,3 @@ struct cap_attrs { } // namespace homestore ////////////// Misc /////////////////// -#define HOMESTORE_LOG_MODS \ - btree, device, blkalloc, cp, metablk, wbcache, logstore, transient, replication, journalvdev, solorepl diff --git a/src/include/homestore/index/index_common.h b/src/include/homestore/index/index_common.h new file mode 100644 index 000000000..7494dba8e --- /dev/null +++ b/src/include/homestore/index/index_common.h @@ -0,0 +1,19 @@ +#pragma once +#include +#include + +namespace homestore { +class IndexStore { +public: + SCOPED_ENUM_DECL(Type, uint8_t); + + IndexStore() = default; + virtual ~IndexStore() = default; + virtual void stop() = 0; + + virtual std::string store_type() const = 0; + virtual void on_recovery_completed() = 0; +}; + +SCOPED_ENUM_DEF(IndexStore, Type, uint8_t, MEM_BTREE, COPY_ON_WRITE_BTREE, INPLACE_BTREE); +} // namespace homestore \ No newline at end of file diff --git a/src/include/homestore/index/index_internal.hpp b/src/include/homestore/index/index_internal.hpp deleted file mode 100644 index 1d6dbd4e9..000000000 --- a/src/include/homestore/index/index_internal.hpp +++ /dev/null @@ -1,186 +0,0 @@ -/********************************************************************************* - * Modifications Copyright 2017-2019 eBay Inc. - * - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software distributed - * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR - * CONDITIONS OF ANY KIND, either express or implied. See the License for the - * specific language governing permissions and limitations under the License. 
- * - *********************************************************************************/ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#pragma once -#ifdef StoreSpecificBtreeNode -#undef StoreSpecificBtreeNode -#endif - -#define StoreSpecificBtreeNode homestore::IndexBtreeNode - -namespace homestore { - -using bnodeid_t = uint64_t; -typedef int64_t cp_id_t; - -static constexpr uint64_t indx_sb_magic{0xbedabb1e}; -static constexpr uint32_t indx_sb_version{0x2}; - -#pragma pack(1) -struct index_table_sb { - uint64_t magic{indx_sb_magic}; - uint32_t version{indx_sb_version}; - uuid_t uuid; // UUID of the index - uuid_t parent_uuid; // UUID of the parent container of index (controlled by user) - - // Btree Section - bnodeid_t root_node{empty_bnodeid}; // Btree Root Node ID - uint64_t root_link_version{0}; // Link version to btree root node - int64_t index_size{0}; // Size of the Index - // seq_id_t last_seq_id{-1}; // TODO: See if this is needed - uint64_t total_leaf_nodes{0}; // Number of leaf nodes in the index - uint64_t total_interior_nodes{0}; // Number of internal nodes in the index - uint8_t btree_depth{0}; // Depth of the btree - - uint32_t ordinal{0}; // Ordinal of the Index - - uint32_t user_sb_size; // Size of the user superblk - uint8_t user_sb_bytes[0]; - uint32_t pdev_id; - uint32_t index_num_chunks{0}; - // List of chunk ids allocated for this index table are stored after this. - void init_chunks(std::vector< chunk_num_t > const& chunk_ids) { - index_num_chunks = chunk_ids.size(); - auto chunk_id_ptr = get_chunk_ids_mutable(); - for (auto& chunk_id : chunk_ids) { - *chunk_id_ptr = chunk_id; - chunk_id_ptr++; - } - } - chunk_num_t* get_chunk_ids_mutable() { return r_cast< chunk_num_t* >(uintptr_cast(this) + sizeof(index_table_sb)); } - const chunk_num_t* get_chunk_ids() const { - return r_cast< const chunk_num_t* >(reinterpret_cast< const uint8_t* >(this) + sizeof(index_table_sb)); - } -}; -#pragma pack() - -struct IndexBuffer; -using IndexBufferPtr = std::shared_ptr< IndexBuffer >; -using IndexBufferPtrList = folly::small_vector< IndexBufferPtr, 3 >; - -// An Empty base class to have the IndexService not having to template and refer the IndexTable virtual class -class IndexTableBase { -public: - virtual ~IndexTableBase() = default; - virtual uuid_t uuid() const = 0; - virtual void recovery_completed() = 0; - virtual uint32_t ordinal() const = 0; - virtual uint64_t used_size() const = 0; - virtual btree_status_t destroy() = 0; - virtual void stop() = 0; - virtual void repair_node(IndexBufferPtr const& buf) = 0; - virtual void repair_root_node(IndexBufferPtr const& buf) = 0; - virtual void delete_stale_children(IndexBufferPtr const& buf) = 0; - virtual void audit_tree() const = 0; - virtual void update_sb() = 0; - virtual void load_metrics(uint64_t interior, uint64_t leaf, uint8_t depth) = 0; - virtual bool sanity_check(const IndexBufferPtrList& bufs) const = 0; -}; - -enum class index_buf_state_t : uint8_t { - CLEAN, // Buffer is clean - DIRTY, // Buffer is dirty and yet to start flush - FLUSHING, // Buffer is current flushing -}; - -///////////////////////// Btree Node and Buffer Portion ////////////////////////// - -// IndexBuffer is for each CP. The dependent index buffers are chained using -// m_up_buffer and each buffer is flushed only its wait_for_leaders reaches 0 -// which means all its dependent buffers are flushed. 
-struct IndexBuffer : public sisl::ObjLifeCounter< IndexBuffer > { - BlkId m_blkid; // BlkId where this needs to be persisted - cp_id_t m_dirtied_cp_id{-1}; // Last CP that dirtied this index buffer - cp_id_t m_created_cp_id{-1}; // CP id when this buffer is created. - std::atomic< index_buf_state_t > m_state{index_buf_state_t::CLEAN}; // Is buffer yet to persist? - uint8_t* m_bytes{nullptr}; // Actual data buffer - uint32_t m_node_level{0}; // levels of the node in the btree - - std::shared_ptr< IndexBuffer > m_up_buffer; // Parent buffer in the chain to persisted - sisl::atomic_counter< int > m_wait_for_down_buffers{0}; // Number of children need to wait for before persisting -#ifndef NDEBUG - // Down buffers are not mandatory members, but only to keep track of any bugs and asserts - std::vector< std::weak_ptr< IndexBuffer > > m_down_buffers; - std::mutex m_down_buffers_mtx; - std::shared_ptr< IndexBuffer > m_prev_up_buffer; // Keep a copy for debugging -#endif - -#ifdef _PRERELEASE - bool m_crash_flag_on{false}; - void set_crash_flag() { m_crash_flag_on = true; } -#endif - - uint32_t m_index_ordinal{0}; // Ordinal of the index table this buffer belongs to, used only during recovery - uint8_t m_is_meta_buf{false}; // Is the index buffer writing to metablk? - bool m_node_freed{false}; - - IndexBuffer(BlkId blkid, uint32_t buf_size, uint32_t align_size); - IndexBuffer(uint8_t* raw_bytes, BlkId blkid); - virtual ~IndexBuffer(); - - BlkId blkid() const { return m_blkid; } - uint8_t* raw_buffer() { return m_bytes; } - bool is_clean() const { return (m_state.load() == index_buf_state_t::CLEAN); } - index_buf_state_t state() const { return m_state.load(); } - void set_state(index_buf_state_t st) { m_state.store(st); } - bool is_meta_buf() const { return m_is_meta_buf; } - - std::string to_string() const; - std::string to_string_dot() const; - - void add_down_buffer(const IndexBufferPtr& buf); - - void remove_down_buffer(const IndexBufferPtr& buf); -#ifndef NDEBUG - bool is_in_down_buffers(const IndexBufferPtr& buf); -#endif -}; - -// This is a special buffer which is used to write to the meta block -struct MetaIndexBuffer : public IndexBuffer { - MetaIndexBuffer(superblk< index_table_sb >& sb); - MetaIndexBuffer(shared< MetaIndexBuffer > const& other); - virtual ~MetaIndexBuffer(); - void copy_sb_to_buf(); - - bool m_valid{true}; - superblk< index_table_sb >& m_sb; -}; - -struct IndexBtreeNode : public BtreeNode { -public: - IndexBufferPtr m_idx_buf; // Buffer backing this node - -public: - template < typename... Args > - IndexBtreeNode(Args&&... args) : BtreeNode(std::forward< Args >(args)...) 
{}
- virtual ~IndexBtreeNode() { m_idx_buf.reset(); }
-
- void attach_buf(IndexBufferPtr const& buf) { m_idx_buf = buf; }
- uint8_t* raw_buffer() { return m_idx_buf->raw_buffer(); }
-};
-
-} // namespace homestore
diff --git a/src/include/homestore/index_service.hpp b/src/include/homestore/index_service.hpp
index 2cef7eb7e..a80790d01 100644
--- a/src/include/homestore/index_service.hpp
+++ b/src/include/homestore/index_service.hpp
@@ -17,107 +17,134 @@
#include
#include
#include
+#include
+#include
#include
#include
+#include
#include
-#include
#include
-#include
+#include
+#include
namespace homestore {
-class IndexWBCacheBase;
-class IndexTableBase;
+class Index;
class VirtualDev;
-class ChunkSelector;
+
+#pragma pack(1)
+struct IndexSuperBlock {
+ static constexpr uint64_t indx_sb_magic{0xbedabb1e};
+ static constexpr uint32_t indx_sb_version{0x3};
+
+ // Common Area for all index implementations
+ uint64_t magic{indx_sb_magic};
+ uint32_t version{indx_sb_version};
+ uuid_t uuid; // UUID of the index
+ uuid_t parent_uuid; // UUID of the parent container of index (controlled by user)
+ uint32_t ordinal; // Ordinal of the Index (unique within the homestore instance)
+ IndexStore::Type index_store_type; // Underlying store type for this index
+
+ static constexpr size_t index_impl_sb_size = 512;
+ std::array< uint8_t, index_impl_sb_size > underlying_index_sb;
+
+ // User area of the superblock, which can be updated with cp guard.
+ uint32_t user_sb_size; // Size of the user superblk
+ uint8_t user_sb_bytes[0]; // Raw bytes of the sb. Better to access with helper routine below
+
+ sisl::blob user_sb() { return sisl::blob{&user_sb_bytes[0], user_sb_size}; }
+};
+
+struct IndexStoreSuperBlock {
+ IndexStore::Type index_store_type;
+};
+
+#pragma pack()
class IndexServiceCallbacks {
public:
virtual ~IndexServiceCallbacks() = default;
- virtual std::shared_ptr< IndexTableBase > on_index_table_found(superblk< index_table_sb >&&) {
+ virtual shared< Index > on_index_table_found(superblk< IndexSuperBlock >&&) {
assert(0);
return nullptr;
}
};
+class Index : public std::enable_shared_from_this< Index > {
+protected:
+ bool const m_is_ephemeral; // Is this index ephemeral (i.e. not persisted)?
+ superblk< IndexSuperBlock > m_sb;
+
+public:
+ Index(bool is_ephemeral) : m_is_ephemeral{is_ephemeral}, m_sb{"index_table"} {}
+ virtual ~Index() = default;
+
+ bool is_ephemeral() const { return m_is_ephemeral; }
+
+ // Destroys the index and removes all its resources. This could be a delayed call, as the actual destroy could
+ // potentially take place in subsequent checkpoints. Hence the caller should not assume that destroy is completed
+ // instantly. This is an idempotent call and the implementer of this method needs to support that.
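The idempotency contract is easiest to see from a hypothetical caller of the destroy() declared just below (a sketch, not part of the patch; `tbl` is assumed to be a shared< Index >):

// Calling destroy() twice is legal; the second call must be a no-op if the
// destroy is already scheduled or has completed in an earlier checkpoint.
tbl->destroy();
tbl->destroy().thenValue([](folly::Unit) { LOGINFO("index destroy finished"); });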
+ virtual folly::Future< folly::Unit > destroy() = 0; + + // Getters + uuid_t uuid() const { return m_sb->uuid; } + virtual uint64_t space_occupied() const = 0; + virtual uint32_t ordinal() const = 0; + + superblk< IndexSuperBlock > const& super_blk() const { return m_sb; } + superblk< IndexSuperBlock >& super_blk() { + return const_cast< superblk< IndexSuperBlock >& >(s_cast< const Index* >(this)->super_blk()); + } +}; + class IndexService { private: - std::unique_ptr< IndexServiceCallbacks > m_svc_cbs; - std::unique_ptr< IndexWBCacheBase > m_wb_cache; - std::shared_ptr< VirtualDev > m_vdev; - std::pair< meta_blk*, sisl::byte_view > m_wbcache_sb{ - std::pair< meta_blk*, sisl::byte_view >{nullptr, sisl::byte_view{}}}; - std::vector< std::pair< meta_blk*, sisl::byte_view > > m_itable_sbs; - std::unique_ptr< sisl::IDReserver > m_ordinal_reserver; - std::shared_ptr< ChunkSelector > m_custom_chunk_selector; - - mutable std::mutex m_index_map_mtx; - std::map< uuid_t, std::shared_ptr< IndexTableBase > > m_index_map; - std::unordered_map< uint32_t, std::shared_ptr< IndexTableBase > > m_ordinal_index_map; + unique< IndexServiceCallbacks > m_svc_cbs; + std::unordered_map< ServiceSubType, shared< VirtualDev > > m_vdevs; + std::vector< superblk< IndexSuperBlock > > m_index_sbs; + std::vector< superblk< IndexStoreSuperBlock > > m_store_sbs; + unique< sisl::IDReserver > m_ordinal_reserver; + std::unordered_map< IndexStore::Type, shared< IndexStore > > m_index_stores; + + mutable std::shared_mutex m_index_map_mtx; + std::map< uuid_t, shared< Index > > m_index_map; + std::unordered_map< uint32_t, shared< Index > > m_ordinal_index_map; public: - IndexService(std::unique_ptr< IndexServiceCallbacks > cbs, shared< ChunkSelector > custom_chunk_selector = nullptr); - ~IndexService(); + IndexService(unique< IndexServiceCallbacks > cbs, std::vector< ServiceSubType > const& sub_types); // Creates the vdev that is needed to initialize the device - void create_vdev(uint64_t size, HSDevType devType, uint32_t num_chunks, - chunk_selector_type_t chunk_sel_type = chunk_selector_type_t::ROUND_ROBIN); + void create_vdev(ServiceSubType sub_type, uint64_t size, HSDevType devType, uint32_t num_chunks); + // Open the existing vdev which is represnted by the vdev_info_block - shared< VirtualDev > open_vdev(const vdev_info& vb, bool load_existing); - std::shared_ptr< ChunkSelector > get_chunk_selector() { return m_custom_chunk_selector; }; - // for now, we don't support start after stop and there is no use case for this. 
- // TODO: support start after stop if necessary + shared< VirtualDev > open_vdev(ServiceSubType sub_type, const vdev_info& vb, bool load_existing); - // Start the Index Service + // Start the Index Service void start(); // Stop the Index Service void stop(); // Add/Remove Index Table to/from the index service - uint64_t num_tables(); - bool add_index_table(const std::shared_ptr< IndexTableBase >& tbl); - bool remove_index_table(const std::shared_ptr< IndexTableBase >& tbl); - std::shared_ptr< IndexTableBase > get_index_table(uuid_t uuid) const; - std::shared_ptr< IndexTableBase > get_index_table(uint32_t ordinal) const; - void write_sb(uint32_t ordinal); - bool sanity_check(const uint32_t index_ordinal, const IndexBufferPtrList& bufs) const; - - // Reserve/unreserve an ordinal for the index table - uint32_t reserve_ordinal(); - bool reserve_ordinal(uint32_t ordinal); - bool unreserve_ordinal(uint32_t ordinal); + void add_index_table(shared< Index > const& tbl); + folly::Future< folly::Unit > destroy_index_table(shared< Index > const& tbl); - uint64_t used_size() const; - uint32_t node_size() const; + shared< Index > get_index_table(uuid_t uuid) const; + shared< Index > get_index_table(uint32_t ordinal) const; + std::vector< shared< Index > > get_all_index_tables() const; - // the following methods are used wb_cache , which will not used by upper layer. so graceful shutdown just skips - // them for now. - void repair_index_node(uint32_t ordinal, IndexBufferPtr const& node_buf); - void parent_recover(uint32_t ordinal, IndexBufferPtr const& node_buf); - void update_root(uint32_t ordinal, IndexBufferPtr const& node_buf); + IndexStore* lookup_store(IndexStore::Type store_type); + uint64_t space_occupied() const; + uint32_t reserve_ordinal(); - IndexWBCacheBase& wb_cache() { - if (!m_wb_cache) { throw std::runtime_error("Attempted to access a null pointer wb_cache"); } - return *m_wb_cache; - } + shared< IndexStore > lookup_or_create_store(IndexStore::Type store_type, + std::vector< superblk< IndexStoreSuperBlock > > sbs); private: - // graceful shutdown related - std::atomic_bool m_stopping{false}; - mutable std::atomic_uint64_t pending_request_num{0}; - - bool is_stopping() const { return m_stopping.load(); } - void start_stopping() { m_stopping = true; } - - uint64_t get_pending_request_num() const { return pending_request_num.load(); } - - void incr_pending_request_num() const { pending_request_num++; } - void decr_pending_request_num() const { pending_request_num--; } + shared< VirtualDev > get_vdev(ServiceSubType sub_type); }; extern IndexService& index_service(); -extern IndexWBCacheBase& wb_cache(); } // namespace homestore diff --git a/src/include/homestore/logstore/log_store.hpp b/src/include/homestore/logstore/log_store.hpp index cfeecc05f..91735be79 100644 --- a/src/include/homestore/logstore/log_store.hpp +++ b/src/include/homestore/logstore/log_store.hpp @@ -89,7 +89,7 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { * @param cb [OPTIONAL] Callback if caller wants specific callback as against common/default callback registed. 
* The callback returns the request back with status of execution */ - logstore_seq_num_t write_async(logstore_req* req, const log_req_comp_cb_t& cb = nullptr); + void write_async(logstore_req* req, const log_req_comp_cb_t& cb = nullptr); /** * @brief Write the blob at the user specified seq number @@ -99,8 +99,7 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { * @param cookie : Any cookie or context which will passed back in the callback * @param cb Callback upon completion which is called with the status, seq_num and cookie that was passed. */ - logstore_seq_num_t write_async(logstore_seq_num_t seq_num, const sisl::io_blob& b, void* cookie, - const log_write_comp_cb_t& cb); + void write_async(logstore_seq_num_t seq_num, const sisl::io_blob& b, void* cookie, const log_write_comp_cb_t& cb); /** * @brief This method appends the blob into the log and makes a callback at the end of the append. @@ -126,7 +125,7 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { * @param seq_num: Seq number to write to * @param b : Blob of data */ - logstore_seq_num_t write_and_flush(logstore_seq_num_t seq_num, const sisl::io_blob& b); + void write_and_flush(logstore_seq_num_t seq_num, const sisl::io_blob& b); /** * @brief Read the log provided the sequence number synchronously. This is not the most efficient way to read @@ -151,11 +150,9 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { * completed, a device truncation can be triggered for all the logstores. The device truncation is more * expensive and grouping them together yields better results. * - * @return True on success - * * Note: this flag currently is not used, meaning all truncate is in memory only; */ - bool truncate(logstore_seq_num_t upto_seq_num, bool in_memory_truncate_only = true); + void truncate(logstore_seq_num_t upto_seq_num, bool in_memory_truncate_only = true); /** * @brief Fill the gap in the seq_num with a dummy value. This ensures that get_contiguous_issued and completed @@ -163,9 +160,8 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { * result in out_of_range exception. * * @param seq_num: Seq_num to fill to. - * @return True on success */ - bool fill_gap(logstore_seq_num_t seq_num); + void fill_gap(logstore_seq_num_t seq_num); /** * @brief Get the last truncated seqnum upto which we have truncated. If called after recovery, it returns the @@ -196,9 +192,8 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { * @param start_idx idx to start with; * @param cb called with current idx and log buffer. * Return value of the cb: true means proceed, false means stop; - * @return True on success */ - bool foreach (int64_t start_idx, const std::function< bool(logstore_seq_num_t, log_buffer) >& cb); + void foreach (int64_t start_idx, const std::function< bool(logstore_seq_num_t, log_buffer) >& cb); /** * @brief Get the store id of this HomeLogStore @@ -232,9 +227,8 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { * * @param seq_num Sequence number upto which logs are to be flushed. If not provided, will wait to flush all seq * numbers issued prior. 
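Since these APIs no longer return a status, completion is observed through the callback or a subsequent flush; a minimal usage sketch under that model (illustrative; `store`, `seq_num`, `blob`, `cookie` and `cb` are assumed to be set up per the declarations above):

store->write_async(seq_num, blob, cookie, cb); // cb fires when the write completes
store->flush();                                // waits for all seq numbers issued so far
store->truncate(upto_seq, true /* in_memory_truncate_only */);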
- * @return True on success */ - bool flush(logstore_seq_num_t upto_seq_num = invalid_lsn()); + void flush(logstore_seq_num_t upto_seq_num = invalid_lsn()); /** * @brief Rollback the given instance to the given sequence number @@ -283,8 +277,6 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { auto get_comp_cb() const { return m_comp_cb; } - void stop(); - private: logstore_id_t m_store_id; std::shared_ptr< LogDev > m_logdev; @@ -303,18 +295,5 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { LogStoreServiceMetrics& m_metrics; logdev_key m_trunc_ld_key{0, 0}; - -private: - // graceful shutdown related fields - std::atomic_bool m_stopping{false}; - mutable std::atomic_uint64_t pending_request_num{0}; - - bool is_stopping() const { return m_stopping.load(); } - void start_stopping() { m_stopping = true; } - - uint64_t get_pending_request_num() const { return pending_request_num.load(); } - - void incr_pending_request_num() const { pending_request_num++; } - void decr_pending_request_num() const { pending_request_num--; } }; } // namespace homestore diff --git a/src/include/homestore/logstore_service.hpp b/src/include/homestore/logstore_service.hpp index 48183a56c..039e14114 100644 --- a/src/include/homestore/logstore_service.hpp +++ b/src/include/homestore/logstore_service.hpp @@ -68,7 +68,6 @@ class LogStoreService { public: LogStoreService(); - ~LogStoreService(); LogStoreService(const LogStoreService&) = delete; LogStoreService(LogStoreService&&) noexcept = delete; LogStoreService& operator=(const LogStoreService&) = delete; @@ -195,19 +194,6 @@ class LogStoreService { LogStoreServiceMetrics m_metrics; std::unordered_set< logdev_id_t > m_unopened_logdev; superblk< logstore_service_super_block > m_sb; - -private: - // graceful shutdown related - std::atomic_bool m_stopping{false}; - mutable std::atomic_uint64_t pending_request_num{0}; - - bool is_stopping() const { return m_stopping.load(); } - void start_stopping() { m_stopping = true; } - - uint64_t get_pending_request_num() const { return pending_request_num.load(); } - - void incr_pending_request_num() const { pending_request_num++; } - void decr_pending_request_num() const { pending_request_num--; } }; extern LogStoreService& logstore_service(); diff --git a/src/include/homestore/meta_service.hpp b/src/include/homestore/meta_service.hpp index 118677e54..81cb79e3c 100644 --- a/src/include/homestore/meta_service.hpp +++ b/src/include/homestore/meta_service.hpp @@ -24,10 +24,12 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -80,8 +82,8 @@ class MetaBlkService { private: static bool s_self_recover; std::shared_ptr< VirtualDev > m_sb_vdev; // super block vdev - std::mutex m_meta_mtx; // mutex to access to meta_map; - std::mutex m_shutdown_mtx; // protects concurrent operations between recover and shutdown; + iomgr::FiberManagerLib::mutex m_meta_mtx; // mutex to access to meta_map; + iomgr::FiberManagerLib::mutex m_shutdown_mtx; // protects concurrent operations between recover and shutdown; meta_blk_map_t m_meta_blks; // subsystem type to meta blk map; ovf_hdr_map_t m_ovf_blk_hdrs; // ovf blk map; client_info_map_t m_sub_info; // map of callbacks diff --git a/src/include/homestore/replication/repl_decls.h b/src/include/homestore/replication/repl_decls.h index c52466779..88a928aa3 100644 --- a/src/include/homestore/replication/repl_decls.h +++ b/src/include/homestore/replication/repl_decls.h @@ -10,10 +10,6 @@ #include #include 
-SISL_LOGGING_DECL(replication) - -#define REPL_LOG_MODS grpc_server, HOMESTORE_LOG_MODS, nuraft_mesg, nuraft, replication - namespace homestore { // clang-format off VENUM(ReplServiceError, int32_t, @@ -30,7 +26,6 @@ VENUM(ReplServiceError, int32_t, SERVER_IS_LEAVING = -10, TERM_MISMATCH = -11, RETRY_REQUEST = -12, - STOPPING = -13, RESULT_NOT_EXIST_YET = -10000, NOT_IMPLEMENTED = -10001, NO_SPACE_LEFT = -20000, @@ -38,17 +33,7 @@ VENUM(ReplServiceError, int32_t, DATA_DUPLICATED = -20002, QUIENCE_STATE = -20003, QUORUM_NOT_MET = -20004, - REPLACE_MEMBER_TASK_MISMATCH = -20005, - UNREADY_STATE = -20006, FAILED = -32768); - -VENUM(ReplaceMemberStatus, int32_t, - COMPLETED = 0, - IN_PROGRESS = 1, - NOT_LEADER = 2, - TASK_ID_MISMATCH = 3, - TASK_NOT_FOUND = 4, - UNKNOWN = 5); // clang-format on template < typename V, typename E > diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index 70d564d88..45e2488c6 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -38,21 +38,18 @@ VENUM(repl_req_state_t, uint32_t, DATA_WRITTEN = 1 << 2, // Data has been written to the storage LOG_RECEIVED = 1 << 3, // Log is received and waiting for data LOG_FLUSHED = 1 << 4, // Log has been flushed - ERRORED = 1 << 5, // Error has happened and cleaned up - DATA_COMMITTED = 1 << 6 // Data has already been committed, used in duplication handling, will skip commit_blk + ERRORED = 1 << 5, // Error has happened and cleaned up + DATA_COMMITTED = 1 << 6 // Data has already been committed, used in duplication handling, will skip commit_blk ) VENUM(journal_type_t, uint16_t, - HS_DATA_LINKED = 0, // Linked data where each entry will store physical blkid where data reside - HS_DATA_INLINED = 1, // Data is inlined in the header of journal entry - HS_CTRL_DESTROY = 2, // Control message to destroy the repl_dev - HS_CTRL_START_REPLACE = 3, // Control message to start replace a member - HS_CTRL_COMPLETE_REPLACE = 4, // Control message to complete replace a member, - HS_CTRL_UPDATE_TRUNCATION_BOUNDARY = 5, // Control message to update truncation boundary + HS_DATA_LINKED = 0, // Linked data where each entry will store physical blkid where data reside + HS_DATA_INLINED = 1, // Data is inlined in the header of journal entry + HS_CTRL_DESTROY = 2, // Control message to destroy the repl_dev + HS_CTRL_START_REPLACE = 3, // Control message to start replace a member + HS_CTRL_COMPLETE_REPLACE = 4, // Control message to complete replace a member ) -ENUM(repl_dev_stage_t, uint8_t, INIT, ACTIVE, UNREADY, DESTROYING, DESTROYED, PERMANENT_DESTROYED); - // magic num comes from the first 8 bytes of 'echo homestore_resync_data | md5sum' static constexpr uint64_t HOMESTORE_RESYNC_DATA_MAGIC = 0xa65dbd27c213f327; static constexpr uint32_t HOMESTORE_RESYNC_DATA_PROTOCOL_VERSION_V1 = 0x01; @@ -120,7 +117,7 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: repl_req_ctx() { m_start_time = Clock::now(); } virtual ~repl_req_ctx(); ReplServiceError init(repl_key rkey, journal_type_t op_code, bool is_proposer, sisl::blob const& user_header, - sisl::blob const& key, uint32_t data_size, cshared< ReplDevListener >& listener); + sisl::blob const& key, uint32_t data_size, cshared< ReplDevListener >& listener); /////////////////////// All getters /////////////////////// repl_key const& rkey() const { return m_rkey; } @@ -372,12 +369,12 @@ class ReplDevListener { virtual void on_destroy(const group_id_t& group_id) = 
0; /// @brief Called when start replace member. - virtual void on_start_replace_member(const std::string& task_id, const replica_member_info& member_out, - const replica_member_info& member_in, trace_id_t tid) = 0; + virtual void on_start_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, + trace_id_t tid) = 0; /// @brief Called when complete replace member. - virtual void on_complete_replace_member(const std::string& task_id, const replica_member_info& member_out, - const replica_member_info& member_in, trace_id_t tid) = 0; + virtual void on_complete_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, + trace_id_t tid) = 0; /// @brief Called when the snapshot is being created by nuraft virtual AsyncReplResult<> create_snapshot(shared< snapshot_context > context) = 0; @@ -417,8 +414,8 @@ class ReplDevListener { /// @brief ask upper layer to handle no_space_left event // @param lsn - on which repl_lsn no_space_left happened - // @param header - on which header no_space_left happened when trying to allocate blk - virtual void on_no_space_left(repl_lsn_t lsn, sisl::blob const& header) = 0; + // @param chunk_id - on which chunk no_space_left happened + virtual void on_no_space_left(repl_lsn_t lsn, chunk_num_t chunk_id) = 0; /// @brief when restart, after all the logs are replayed and before joining raft group, notify the upper layer virtual void on_log_replay_done(const group_id_t& group_id) {}; @@ -544,26 +541,9 @@ class ReplDev { /// @return true if ready, false otherwise virtual bool is_ready_for_traffic() const = 0; - /// @brief Set the stage of this repl dev, this helps user to set unready state when the condition is not met(e.g. - /// disk is unhealthy) and vice versa which supports to run in degrade mode. - virtual void set_stage(repl_dev_stage_t stage) = 0; - - /// @brief Get the stage of this repl dev. - /// @return current stage of this repl dev. - virtual repl_dev_stage_t get_stage() const = 0; - /// @brief Clean up resources on this repl dev. virtual void purge() = 0; - /// @brief Pause repl dev state machine, timeout is in milliseconds. - virtual void pause_state_machine(size_t timeout) = 0; - - /// @brief Resume repl dev state machine. - virtual void resume_state_machine() = 0; - - /// @brief Check if the state machine is paused. 
- virtual bool is_state_machine_paused() = 0; - virtual std::shared_ptr< snapshot_context > deserialize_snapshot_context(sisl::io_blob_safe& snp_ctx) = 0; virtual void attach_listener(shared< ReplDevListener > listener) { m_listener = std::move(listener); } @@ -579,6 +559,7 @@ class ReplDev { // we have no shutdown for repl_dev, since shutdown repl_dev is done by repl_service void stop() { +#if 0 start_stopping(); while (true) { auto pending_request_num = get_pending_request_num(); @@ -586,6 +567,7 @@ class ReplDev { std::this_thread::sleep_for(std::chrono::milliseconds(1000)); } +#endif } // complete all the requests that are in progress and start refusing new reqs @@ -599,19 +581,6 @@ class ReplDev { protected: shared< ReplDevListener > m_listener; - - // graceful shutdown related -protected: - std::atomic_bool m_stopping{false}; - mutable std::atomic_uint64_t pending_request_num{0}; - - bool is_stopping() const { return m_stopping.load(); } - void start_stopping() { m_stopping = true; } - - uint64_t get_pending_request_num() const { return pending_request_num.load(); } - - void incr_pending_request_num() const { pending_request_num++; } - void decr_pending_request_num() const { pending_request_num--; } }; } // namespace homestore diff --git a/src/include/homestore/replication_service.hpp b/src/include/homestore/replication_service.hpp index 60eba96c4..f28704546 100644 --- a/src/include/homestore/replication_service.hpp +++ b/src/include/homestore/replication_service.hpp @@ -43,34 +43,16 @@ class ReplicationService { /// @brief Replace one of the members with a new one. /// @param group_id Group where the replace member happens - /// @param task_id Id of the task which is going to be used for this operation. This is used to track the replace - /// member. /// @param member_out The member which is going to be replaced /// @param member_in The member which is going to be added in place of member_out /// @param commit_quorum Commit quorum to be used for this operation. If 0, it will use the default commit quorum. /// @return A Future on replace the member accepted or Future ReplServiceError upon error - virtual AsyncReplResult<> replace_member(group_id_t group_id, std::string& task_id, - const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum = 0, - uint64_t trace_id = 0) const = 0; - - virtual AsyncReplResult<> flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, - uint32_t commit_quorum, bool wait_and_verify = true, - uint64_t trace_id = 0) const = 0; - - /// @brief Get status of member replacement. - /// @param group_id Group where the replace member happens - /// @param task_id Id of the replace member task. 
This is used to track the replace - /// @param member_out The member which is going to be replaced - /// @param member_in The member which is going to be added in place of member_out - /// @param others Other members excluding member_out, member_in - /// @return ReplaceMemberStatus - virtual ReplaceMemberStatus get_replace_member_status(group_id_t group_id, std::string& task_id, - const replica_member_info& member_out, - const replica_member_info& member_in, - const std::vector< replica_member_info >& others, - uint64_t trace_id = 0) const = 0; + virtual AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum = 0, + uint64_t trace_id = 0) const = 0; + virtual AsyncReplResult<> flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, uint32_t commit_quorum, + bool wait_and_verify = true, uint64_t trace_id = 0) const = 0; /// @brief Get the repl dev for a given group id if it is already created or opened /// @param group_id Group id interested in /// @return ReplDev is opened or ReplServiceError::SERVER_NOT_FOUND if it doesn't exist diff --git a/src/include/homestore/vchunk.h b/src/include/homestore/vchunk.h index 9dc0dd208..4b69b1332 100644 --- a/src/include/homestore/vchunk.h +++ b/src/include/homestore/vchunk.h @@ -31,10 +31,8 @@ class VChunk { const uint8_t* get_user_private() const; blk_num_t get_total_blks() const; blk_num_t available_blks() const; - blk_num_t get_used_blks() const; blk_num_t get_defrag_nblks() const; uint32_t get_pdev_id() const; - const std::string& get_pdev_name() const; uint16_t get_chunk_id() const; cshared< Chunk > get_internal_chunk() const; uint64_t size() const; diff --git a/src/lib/blkalloc/append_blk_allocator.cpp b/src/lib/blkalloc/append_blk_allocator.cpp index 2cabcd9f2..2f6cec25c 100644 --- a/src/lib/blkalloc/append_blk_allocator.cpp +++ b/src/lib/blkalloc/append_blk_allocator.cpp @@ -76,19 +76,21 @@ BlkAllocStatus AppendBlkAllocator::alloc(blk_count_t nblks, const blk_alloc_hint LOGERROR("No space left to serve request nblks: {}, available_blks: {}, actual available_blks(exclude reserved " "blks): {}", nblks, available_blks(), avail_blks); + // the caller can know in which chunk no_space_left happened; + out_bid = BlkId{0, 0, m_chunk_id}; return BlkAllocStatus::SPACE_FULL; } else if (nblks > max_blks_per_blkid()) { // consumer(vdev) already handles this case. 
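// Caller-side sketch of the failure convention above (illustrative only):
// BlkId bid;
// if (allocator.alloc(nblks, hints, bid) == BlkAllocStatus::SPACE_FULL) {
//     // bid carries zero blocks, but bid.chunk_num() still identifies the
//     // chunk that ran out, so the vdev can report no_space_left per chunk.
// }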
// COUNTER_INCREMENT(m_metrics, num_alloc_failure, 1); LOGERROR("Can't serve request nblks: {} larger than max_blks_in_op: {}", nblks, max_blks_per_blkid()); + out_bid = BlkId{0, 0, m_chunk_id}; return BlkAllocStatus::FAILED; } // Push 1 blk to the vector which has all the requested nblks; out_bid = BlkId{m_last_append_offset.fetch_add(nblks), nblks, m_chunk_id}; - LOGDEBUG("chunk {} has successfully allocated nblks: {}, totally used blks: {}, available_blks: {}, actual " - "available_blks(exclude reserved blks): {}, last_append_offset: {}", - m_chunk_id, nblks, get_used_blks(), available_blks(), avail_blks, m_last_append_offset.load()); + + // COUNTER_INCREMENT(m_metrics, num_alloc, 1); return BlkAllocStatus::SUCCESS; } @@ -165,9 +167,9 @@ std::string AppendBlkAllocator::to_string() const { blk_num_t AppendBlkAllocator::available_blks() const { return get_total_blks() - get_used_blks(); } -blk_num_t AppendBlkAllocator::get_used_blks() const { return m_last_append_offset.load(); } +blk_num_t AppendBlkAllocator::get_used_blks() const { return m_last_append_offset.load(std::memory_order_relaxed); } -blk_num_t AppendBlkAllocator::get_defrag_nblks() const { return m_freeable_nblks.load(); } +blk_num_t AppendBlkAllocator::get_defrag_nblks() const { return m_freeable_nblks.load(std::memory_order_relaxed); } nlohmann::json AppendBlkAllocator::get_status(int log_level) const { nlohmann::json j; diff --git a/src/lib/blkalloc/blk.cpp b/src/lib/blkalloc/blk.cpp index e98d017ba..30a4b155c 100644 --- a/src/lib/blkalloc/blk.cpp +++ b/src/lib/blkalloc/blk.cpp @@ -40,6 +40,12 @@ void BlkId::invalidate() { s.m_nblks = 0; } bool BlkId::is_valid() const { return (blk_count() > 0); } +std::pair< BlkId, BlkId > BlkId::split(blk_count_t count) const { + BlkId lb{blk_num(), count, chunk_num()}; + BlkId rb{blk_num() + count, (blk_count_t)(blk_count() - count), chunk_num()}; + return std::pair(lb, rb); +} + std::string BlkId::to_string() const { return is_valid() ? fmt::format("blk#={} count={} chunk={}", blk_num(), blk_count(), chunk_num()) : "Invalid_Blkid"; } @@ -122,6 +128,12 @@ uint16_t MultiBlkId::num_pieces() const { return BlkId::is_valid() ? n_addln_pie bool MultiBlkId::has_room() const { return (n_addln_piece < max_addln_pieces); } +std::pair< MultiBlkId, MultiBlkId > MultiBlkId::split(blk_count_t count) const { + MultiBlkId lb{blk_num(), count, chunk_num()}; + MultiBlkId rb{blk_num() + count, (blk_count_t)(blk_count() - count), chunk_num()}; + return std::pair(lb, rb); +} + MultiBlkId::iterator MultiBlkId::iterate() const { return MultiBlkId::iterator{*this}; } std::string MultiBlkId::to_string() const { diff --git a/src/lib/blkalloc/blk_allocator.h b/src/lib/blkalloc/blk_allocator.h index f02aa3dd2..8c64fc8e5 100644 --- a/src/lib/blkalloc/blk_allocator.h +++ b/src/lib/blkalloc/blk_allocator.h @@ -34,9 +34,6 @@ #include "common/homestore_config.hpp" #include "common/homestore_assert.hpp" -SISL_LOGGING_DECL(blkalloc) -SISL_LOGGING_DECL(transient) - namespace homestore { #define BLKALLOC_LOG(level, msg, ...) HS_SUBMOD_LOG(level, blkalloc, , "blkalloc", get_name(), msg, ##__VA_ARGS__) #define BLKALLOC_DBG_ASSERT(cond, msg, ...) 
\ diff --git a/src/lib/blkalloc/fixed_blk_allocator.h b/src/lib/blkalloc/fixed_blk_allocator.h index 4d743f60b..01f1e1138 100644 --- a/src/lib/blkalloc/fixed_blk_allocator.h +++ b/src/lib/blkalloc/fixed_blk_allocator.h @@ -41,7 +41,7 @@ class FixedBlkAllocator : public BitmapBlkAllocator { blk_num_t available_blks() const override; blk_num_t get_used_blks() const override; blk_num_t get_defrag_nblks() const override; - void reset() override {}; + void reset() override{}; bool is_blk_alloced(BlkId const& in_bid, bool use_lock = false) const override; std::string to_string() const override; diff --git a/src/lib/blkalloc/ss_blk_allocator.h b/src/lib/blkalloc/ss_blk_allocator.h new file mode 100644 index 000000000..a3abe139d --- /dev/null +++ b/src/lib/blkalloc/ss_blk_allocator.h @@ -0,0 +1,53 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + * + *********************************************************************************/ +#pragma once + +#include "bitmap_blk_allocator.h" + +namespace homestore { +/* SSBlkAllocator is a fast allocator where it allocates only 1 size block and ALL free blocks are cached instead + * of selectively caching few blks which are free. Thus there is no sweeping of bitmap or other to refill the cache. 
+ * It does not support temperature of blocks and allocates simply on first come first serve basis + */ +class SSBlkAllocator : public BitmapBlkAllocator { +public: + SSBlkAllocator(BlkAllocConfig const& cfg, bool is_fresh, chunk_num_t chunk_id); + SSBlkAllocator(SSBlkAllocator const&) = delete; + SSBlkAllocator(SSBlkAllocator&&) noexcept = delete; + SSBlkAllocator& operator=(SSBlkAllocator const&) = delete; + SSBlkAllocator& operator=(SSBlkAllocator&&) noexcept = delete; + virtual ~SSBlkAllocator() = default; + + void load() override; + + BlkAllocStatus alloc_contiguous(BlkId& bid) override; + BlkAllocStatus alloc(blk_count_t nblks, blk_alloc_hints const& hints, BlkId& out_blkid) override; + BlkAllocStatus mark_blk_allocated(BlkId const& b) override; + void free(BlkId const& b) override; + + blk_num_t available_blks() const override; + blk_num_t get_used_blks() const override; + blk_num_t get_freeable_nblks() const override; + blk_num_t get_defrag_nblks() const override; + bool is_blk_alloced(BlkId const& in_bid, bool use_lock = false) const override; + std::string to_string() const override; + +private: + blk_num_t init_portion(BlkAllocPortion& portion, blk_num_t start_blk_num); + +private: + std::unique_lock m_sweep_lock; // Lock protecting multiple threads sweeping on-disk bmap +}; +} // namespace homestore diff --git a/src/lib/blkalloc/varsize_blk_allocator.cpp b/src/lib/blkalloc/varsize_blk_allocator.cpp index bfa92902d..0c445921e 100644 --- a/src/lib/blkalloc/varsize_blk_allocator.cpp +++ b/src/lib/blkalloc/varsize_blk_allocator.cpp @@ -28,8 +28,6 @@ #include "varsize_blk_allocator.h" -SISL_LOGGING_DECL(blkalloc) - template <> struct fmt::formatter< std::thread::id > { constexpr auto parse(format_parse_context& ctx) -> format_parse_context::iterator { return ctx.begin(); } @@ -586,9 +584,7 @@ blk_count_t VarsizeBlkAllocator::alloc_blks_direct(blk_count_t nblks, blk_alloc_ auto start_portion_num = m_start_portion_num; auto const max_pieces = hints.is_contiguous ? 1u : MultiBlkId::max_pieces; - blk_count_t const min_blks = hints.is_contiguous && !hints.partial_alloc_ok - ? nblks - : std::min< blk_count_t >(nblks, hints.min_blks_per_piece); + blk_count_t const min_blks = hints.is_contiguous ? 
nblks : std::min< blk_count_t >(nblks, hints.min_blks_per_piece); blk_count_t nblks_remain = nblks; do { BlkAllocPortion& portion = get_blk_portion(portion_num); @@ -621,11 +617,9 @@ blk_count_t VarsizeBlkAllocator::alloc_blks_direct(blk_count_t nblks, blk_alloc_ if (nblks_remain) { auto curr_portion = portion_num; if (++portion_num == get_num_portions()) { portion_num = 0; } - BLKALLOC_LOG(TRACE, - "alloc direct unable to find in curr portion {}, will searching in portion={}, " - "start_portion={},continue={}, out_blkid num_pieces={} , max_pieces={}", - curr_portion, portion_num, start_portion_num, hints.is_contiguous, out_blkid.num_pieces(), - max_pieces); + BLKALLOC_LOG( + TRACE, "alloc direct unable to find in curr portion {}, will searching in portion={}, start_portion={},continue={}, out_blkid num_pieces={} , max_pieces={}", + curr_portion, portion_num, start_portion_num, hints.is_contiguous, out_blkid.num_pieces(), max_pieces); } } while (nblks_remain && (portion_num != start_portion_num) && (out_blkid.num_pieces() < max_pieces)); @@ -779,8 +773,8 @@ void VarsizeBlkAllocator::alloc_sanity_check(blk_count_t nblks, blk_alloc_hints } BLKALLOC_REL_ASSERT((nblks == alloced_nblks), "Requested blks={} alloced_blks={} num_pieces={}", nblks, alloced_nblks, out_blkid.num_pieces()); - // BLKALLOC_REL_ASSERT((!hints.is_contiguous || (out_blkid.num_pieces() == 1)), - // "Multiple blkids allocated for contiguous request"); + BLKALLOC_REL_ASSERT((!hints.is_contiguous || (out_blkid.num_pieces() == 1)), + "Multiple blkids allocated for contiguous request"); } } #endif diff --git a/src/lib/blkalloc/varsize_blk_allocator.h b/src/lib/blkalloc/varsize_blk_allocator.h index 04945ab52..03a507b03 100644 --- a/src/lib/blkalloc/varsize_blk_allocator.h +++ b/src/lib/blkalloc/varsize_blk_allocator.h @@ -222,7 +222,7 @@ class VarsizeBlkAllocator : public BitmapBlkAllocator { blk_num_t get_used_blks() const override; bool is_blk_alloced(BlkId const& in_bid, bool use_lock = false) const override; std::string to_string() const override; - void reset() override {}; + void reset() override{}; nlohmann::json get_metrics_in_json(); private: diff --git a/src/lib/blkdata_svc/blkdata_service.cpp b/src/lib/blkdata_svc/blkdata_service.cpp index 87a59f8e2..b17fc0a61 100644 --- a/src/lib/blkdata_svc/blkdata_service.cpp +++ b/src/lib/blkdata_svc/blkdata_service.cpp @@ -80,8 +80,6 @@ static auto collect_all_futures(std::vector< folly::Future< std::error_code > >& folly::Future< std::error_code > BlkDataService::async_read(MultiBlkId const& blkid, uint8_t* buf, uint32_t size, bool part_of_batch) { - if (is_stopping()) return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); - incr_pending_request_num(); auto do_read = [this](BlkId const& bid, uint8_t* buf, uint32_t size, bool part_of_batch) { m_blk_read_tracker->insert(bid); @@ -92,7 +90,6 @@ folly::Future< std::error_code > BlkDataService::async_read(MultiBlkId const& bl }; if (blkid.num_pieces() == 1) { - decr_pending_request_num(); return do_read(blkid.to_single_blkid(), buf, size, part_of_batch); } else { static thread_local std::vector< folly::Future< std::error_code > > s_futs; @@ -104,15 +101,13 @@ folly::Future< std::error_code > BlkDataService::async_read(MultiBlkId const& bl s_futs.emplace_back(do_read(*bid, buf, sz, part_of_batch)); buf += sz; } - decr_pending_request_num(); + return collect_all_futures(s_futs); } } folly::Future< std::error_code > BlkDataService::async_read(MultiBlkId const& blkid, sisl::sg_list& sgs, 
uint32_t size, bool part_of_batch) { - if (is_stopping()) return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); - incr_pending_request_num(); // TODO: sg_iovs_t should not be passed by value. We need it pass it as const&, but that is failing because // iovs.data() will then return "const iovec*", but unfortunately all the way down to iomgr, we take iovec* // instead it can easily take "const iovec*". Until we change this is made as copy by value @@ -127,7 +122,6 @@ folly::Future< std::error_code > BlkDataService::async_read(MultiBlkId const& bl }; if (blkid.num_pieces() == 1) { - decr_pending_request_num(); return do_read(blkid.to_single_blkid(), sgs.iovs, size, part_of_batch); } else { static thread_local std::vector< folly::Future< std::error_code > > s_futs; @@ -139,7 +133,7 @@ folly::Future< std::error_code > BlkDataService::async_read(MultiBlkId const& bl uint32_t const sz = bid->blk_count() * m_blk_size; s_futs.emplace_back(do_read(*bid, sg_it.next_iovs(sz), sz, part_of_batch)); } - decr_pending_request_num(); + return collect_all_futures(s_futs); } } @@ -147,25 +141,17 @@ folly::Future< std::error_code > BlkDataService::async_read(MultiBlkId const& bl folly::Future< std::error_code > BlkDataService::async_alloc_write(const sisl::sg_list& sgs, const blk_alloc_hints& hints, MultiBlkId& out_blkids, bool part_of_batch) { - if (is_stopping()) return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); - incr_pending_request_num(); const auto status = alloc_blks(sgs.size, hints, out_blkids); if (status != BlkAllocStatus::SUCCESS) { - decr_pending_request_num(); return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::resource_unavailable_try_again)); } - auto ret = async_write(sgs, out_blkids, part_of_batch); - decr_pending_request_num(); - return ret; + return async_write(sgs, out_blkids, part_of_batch); } folly::Future< std::error_code > BlkDataService::async_write(const char* buf, uint32_t size, MultiBlkId const& blkid, bool part_of_batch) { - if (is_stopping()) return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); - incr_pending_request_num(); if (blkid.num_pieces() == 1) { // Shortcut to most common case - decr_pending_request_num(); return m_vdev->async_write(buf, size, blkid.to_single_blkid(), part_of_batch); } else { static thread_local std::vector< folly::Future< std::error_code > > s_futs; @@ -178,21 +164,17 @@ folly::Future< std::error_code > BlkDataService::async_write(const char* buf, ui s_futs.emplace_back(m_vdev->async_write(ptr, sz, *bid, part_of_batch)); ptr += sz; } - decr_pending_request_num(); return collect_all_futures(s_futs); } } folly::Future< std::error_code > BlkDataService::async_write(sisl::sg_list const& sgs, MultiBlkId const& blkid, bool part_of_batch) { - if (is_stopping()) return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); - incr_pending_request_num(); // TODO: Async write should pass this by value the sgs.size parameter as well, currently vdev write routine // walks through again all the iovs and then getting the len to pass it down to iomgr. 
This defeats the purpose of // taking size parameters (which was done exactly done to avoid this walk through) if (blkid.num_pieces() == 1) { // Shortcut to most common case - decr_pending_request_num(); return m_vdev->async_writev(sgs.iovs.data(), sgs.iovs.size(), blkid.to_single_blkid(), part_of_batch); } else { static thread_local std::vector< folly::Future< std::error_code > > s_futs; @@ -204,78 +186,51 @@ folly::Future< std::error_code > BlkDataService::async_write(sisl::sg_list const const auto iovs = sg_it.next_iovs(bid->blk_count() * m_blk_size); s_futs.emplace_back(m_vdev->async_writev(iovs.data(), iovs.size(), *bid, part_of_batch)); } - decr_pending_request_num(); return collect_all_futures(s_futs); } } folly::Future< std::error_code > BlkDataService::async_write(sisl::sg_list const& sgs, std::vector< MultiBlkId > const& blkids, bool part_of_batch) { - if (is_stopping()) return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); - incr_pending_request_num(); static thread_local std::vector< folly::Future< std::error_code > > s_futs; s_futs.clear(); - sisl::sg_iterator sg_it{sgs.iovs}; for (const auto& blkid : blkids) { - auto sgs_size = blkid.blk_count() * data_service().get_blk_size(); - const auto iovs = sg_it.next_iovs(sgs_size); - sisl::sg_list single_sgs{sgs_size, iovs}; - s_futs.emplace_back(async_write(single_sgs, blkid, part_of_batch)); + s_futs.emplace_back(async_write(sgs, blkid, part_of_batch)); } - decr_pending_request_num(); return collect_all_futures(s_futs); } void BlkDataService::submit_io_batch() { m_vdev->submit_batch(); } BlkAllocStatus BlkDataService::alloc_blks(uint32_t size, const blk_alloc_hints& hints, MultiBlkId& out_blkids) { - if (is_stopping()) return BlkAllocStatus::FAILED; - incr_pending_request_num(); HS_DBG_ASSERT_EQ(size % m_blk_size, 0, "Non aligned size requested size={} blk_size={}", size, m_blk_size); blk_count_t nblks = static_cast< blk_count_t >(size / m_blk_size); - auto ret = m_vdev->alloc_blks(nblks, hints, out_blkids); - decr_pending_request_num(); - return ret; + return m_vdev->alloc_blks(nblks, hints, out_blkids); } BlkAllocStatus BlkDataService::alloc_blks(uint32_t size, const blk_alloc_hints& hints, std::vector< BlkId >& out_blkids) { - if (is_stopping()) return BlkAllocStatus::FAILED; - incr_pending_request_num(); HS_DBG_ASSERT_EQ(size % m_blk_size, 0, "Non aligned size requested size={} blk_size={}", size, m_blk_size); blk_count_t nblks = static_cast< blk_count_t >(size / m_blk_size); - auto ret = m_vdev->alloc_blks(nblks, hints, out_blkids); - decr_pending_request_num(); - return ret; + return m_vdev->alloc_blks(nblks, hints, out_blkids); } BlkAllocStatus BlkDataService::commit_blk(MultiBlkId const& blkid) { - if (is_stopping()) return BlkAllocStatus::FAILED; - incr_pending_request_num(); - if (blkid.num_pieces() == 1) { // Shortcut to most common case - auto ret = m_vdev->commit_blk(blkid); - decr_pending_request_num(); - return ret; + return m_vdev->commit_blk(blkid); } auto it = blkid.iterate(); while (auto const bid = it.next()) { auto alloc_status = m_vdev->commit_blk(*bid); - if (alloc_status != BlkAllocStatus::SUCCESS) { - decr_pending_request_num(); - return alloc_status; - } + if (alloc_status != BlkAllocStatus::SUCCESS) return alloc_status; } - decr_pending_request_num(); return BlkAllocStatus::SUCCESS; } folly::Future< std::error_code > BlkDataService::async_free_blk(MultiBlkId const& bids) { - if (is_stopping()) return folly::makeFuture< std::error_code 
>(std::make_error_code(std::errc::operation_canceled)); - incr_pending_request_num(); // create blk read waiter instance; folly::Promise< std::error_code > promise; auto f = promise.getFuture(); @@ -291,7 +246,7 @@ folly::Future< std::error_code > BlkDataService::async_free_blk(MultiBlkId const p.setValue(std::error_code{}); }); } - decr_pending_request_num(); + return f; } @@ -301,18 +256,7 @@ void BlkDataService::start() { std::move(std::make_unique< DataSvcCPCallbacks >(m_vdev))); } -void BlkDataService::stop() { - start_stopping(); - // we have no way to track the completion of each async io in detail which should be done in iomanager level, so - // we just wait for 3 seconds, and we expect each io will be completed within this time. - - // TODO: find a better solution to track the completion of these aysnc calls - std::this_thread::sleep_for(std::chrono::milliseconds(3000)); - while (true) { - if (!get_pending_request_num()) break; - std::this_thread::sleep_for(std::chrono::milliseconds(1000)); - } -} +void BlkDataService::stop() {} uint64_t BlkDataService::get_total_capacity() const { return m_vdev->size(); } diff --git a/src/lib/checkpoint/cp_mgr.cpp b/src/lib/checkpoint/cp_mgr.cpp index 960d885e2..98f0f7cbb 100644 --- a/src/lib/checkpoint/cp_mgr.cpp +++ b/src/lib/checkpoint/cp_mgr.cpp @@ -14,7 +14,6 @@ * *********************************************************************************/ #include -#include #include #include @@ -24,9 +23,11 @@ #include "common/homestore_config.hpp" #include "common/resource_mgr.hpp" #include "cp_internal.hpp" - +#ifdef _PRERELEASE +#include "common/crash_simulator.hpp" +#endif namespace homestore { -thread_local std::stack< CP* > CPGuard::t_cp_stack; +iomgr::FiberManagerLib::FiberLocal< std::stack< CP* > > CPGuard::t_cp_stack; CPManager& cp_mgr() { return hs()->cp_mgr(); } @@ -34,15 +35,12 @@ CPManager::CPManager() : m_metrics{std::make_unique< CPMgrMetrics >()}, m_wd_cp{std::make_unique< CPWatchdog >(this)}, m_sb{"CPSuperBlock"} { + // m_trigger_reasons{enum_count< CPTriggerReason >(), 0ul} { meta_service().register_handler( "CPSuperBlock", [this](meta_blk* mblk, sisl::byte_view buf, size_t size) { on_meta_blk_found(std::move(buf), (void*)mblk); }, nullptr); - resource_mgr().register_dirty_buf_exceed_cb( - [this]([[maybe_unused]] int64_t dirty_buf_count, bool critical) { this->trigger_cp_flush(false /* force */); }); - - start_timer_thread(); start_cp_thread(); } @@ -56,48 +54,11 @@ void CPManager::start(bool first_time_boot) { } } -uint64_t CPManager::cp_timer_us() { - if (SISL_OPTIONS.count("cp_timer_ms")) { - auto const n = SISL_OPTIONS["cp_timer_ms"].as< uint64_t >() * 1000; - LOGINFO("Using cp_timer_ms option value: {}", n); - return n; - } else { - return HS_DYNAMIC_CONFIG(generic.cp_timer_us); - } -} - -void CPManager::start_timer_thread() { - std::latch latch{1}; - m_timer_fiber = nullptr; - iomanager.create_reactor("cp_timer_thread", iomgr::TIGHT_LOOP | iomgr::ADAPTIVE_LOOP, 1 /* num_fibers */, - [this, &latch](bool is_started) { - if (is_started) { - m_timer_fiber = iomanager.iofiber_self(); - latch.count_down(); - } - }); - latch.wait(); -} - -void CPManager::stop_timer_thread() { - std::latch latch{1}; - iomanager.run_on_forget(m_timer_fiber, [this, &latch]() mutable { - if (m_cp_timer_hdl != iomgr::null_timer_handle) { - iomanager.cancel_timer(m_cp_timer_hdl, true); - m_cp_timer_hdl = iomgr::null_timer_handle; - } - latch.count_down(); - }); - latch.wait(); -} - void CPManager::start_timer() { - auto usecs = cp_timer_us(); - LOGINFO("cp 
timer is set to {} usec", usecs); - iomanager.run_on_wait(m_timer_fiber, [this, usecs]() { - m_cp_timer_hdl = iomanager.schedule_thread_timer(usecs * 1000, true /* recurring */, nullptr /* cookie */, - [this](void*) { trigger_cp_flush(false /* false */); }); - }); + LOGINFO("cp timer is set to {} usec", HS_DYNAMIC_CONFIG(generic.cp_timer_us)); + m_cp_timer_hdl = iomanager.schedule_global_timer( + HS_DYNAMIC_CONFIG(generic.cp_timer_us) * 1000, true, nullptr /*cookie*/, iomgr::reactor_regex::all_worker, + [this](void*) { trigger_cp_flush(false /* false */, CPTriggerReason::Timer); }, true /* wait_to_schedule */); } void CPManager::on_meta_blk_found(const sisl::byte_view& buf, void* meta_cookie) { @@ -111,22 +72,29 @@ void CPManager::create_first_cp() { m_cur_cp = new CP(this); m_cur_cp->m_cp_status = cp_status_t::cp_io_ready; m_cur_cp->m_cp_id = m_sb->m_last_flushed_cp + 1; - m_cur_cp->m_cp_start_time = Clock::now(); } void CPManager::shutdown() { LOGINFO("Stopping cp timer"); - stop_timer_thread(); + iomanager.cancel_timer(m_cp_timer_hdl, true); + m_cp_timer_hdl = iomgr::null_timer_handle; { std::unique_lock< std::mutex > lk(m_trigger_cp_mtx); m_cp_shutdown_initiated = true; } - LOGINFO("Trigger cp flush at CP shutdown"); - auto success = do_trigger_cp_flush(true /* force */, true /* flush_on_shutdown */).get(); - HS_REL_ASSERT_EQ(success, true, "CP Flush failed"); - LOGINFO("Trigger cp done"); +#ifdef _PRERELEASE + if (!hs()->crash_simulator().is_in_crashing_phase()) { +#endif + LOGINFO("Trigger cp flush at CP shutdown"); + auto success = + do_trigger_cp_flush(true /* force */, true /* flush_on_shutdown */, CPTriggerReason::Timer).get(); + HS_REL_ASSERT_EQ(success, true, "CP Flush failed"); + LOGINFO("Trigger cp done"); +#ifdef _PRERELEASE + } +#endif delete (m_cur_cp); rcu_xchg_pointer(&m_cur_cp, nullptr); @@ -146,6 +114,11 @@ void CPManager::register_consumer(cp_consumer_t consumer_id, std::unique_ptr< CP } } +CPCallbacks* CPManager::get_consumer(cp_consumer_t consumer_id) { + size_t idx = (size_t)consumer_id; + return m_cp_cb_table[idx].get(); +} + [[nodiscard]] CPGuard CPManager::cp_guard() { return CPGuard{this}; } CP* CPManager::cp_io_enter() { @@ -186,11 +159,11 @@ CP* CPManager::get_cur_cp() { return p; } -folly::Future< bool > CPManager::trigger_cp_flush(bool force) { - return do_trigger_cp_flush(force, false /* flush_on_shutdown */); +folly::Future< bool > CPManager::trigger_cp_flush(bool force, CPTriggerReason reason) { + return do_trigger_cp_flush(force, false /* flush_on_shutdown */, reason); } -folly::Future< bool > CPManager::do_trigger_cp_flush(bool force, bool flush_on_shutdown) { +folly::Future< bool > CPManager::do_trigger_cp_flush(bool force, bool flush_on_shutdown, CPTriggerReason reason) { std::unique_lock< std::mutex > lk(m_trigger_cp_mtx); if (m_in_flush_phase) { @@ -210,34 +183,27 @@ folly::Future< bool > CPManager::do_trigger_cp_flush(bool force, bool flush_on_s } } m_in_flush_phase = true; + //++m_trigger_reasons[(size_t)reason]; folly::Future< bool > ret_fut = folly::Future< bool >::makeEmpty(); auto cur_cp = cp_guard(); cur_cp->m_cp_status = cp_status_t::cp_trigger; - HS_PERIODIC_LOG(INFO, cp, "<<<<<<<<<<< Triggering flush of the CP {}", cur_cp->to_string()); + cur_cp->m_is_on_shutdown = flush_on_shutdown; + CP_PERIODIC_LOG(INFO, cur_cp->id(), "Time to flush the CP {}", cur_cp->to_string()); COUNTER_INCREMENT(*m_metrics, cp_cnt, 1); m_wd_cp->set_cp(cur_cp.get()); // allocate a new cp and ask consumers to switchover to new cp auto new_cp = new CP(this); 
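    // (annotation, not part of the patch) Switchover contract, as visible in this hunk: every
    // registered consumer returns a fresh CPContext for new_cp below, so new IOs start dirtying
    // new_cp while cur_cp drains; when a CP is later flushed, cp_flush(cp) gives each consumer
    // back the context it created for that CP.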
new_cp->m_cp_id = cur_cp->m_cp_id + 1; - new_cp->m_cp_start_time = Clock::now(); - - HS_PERIODIC_LOG(DEBUG, cp, "Create New CP session", new_cp->id()); - // sealer should be the first one to switch over - auto& sealer_cp = m_cp_cb_table[(size_t)cp_consumer_t::SEALER]; - if (sealer_cp) { - new_cp->m_contexts[(size_t)cp_consumer_t::SEALER] = - std::move(sealer_cp->on_switchover_cp(cur_cp.get(), new_cp)); - } - // switch over other consumers - for (size_t svcid = 0; svcid < (size_t)cp_consumer_t::SENTINEL; svcid++) { - if (svcid == (size_t)cp_consumer_t::SEALER) { continue; } - auto& consumer = m_cp_cb_table[svcid]; - if (consumer) { new_cp->m_contexts[svcid] = std::move(consumer->on_switchover_cp(cur_cp.get(), new_cp)); } + + CP_PERIODIC_LOG(DEBUG, new_cp->id(), "Create New CP session"); + size_t idx{0}; + for (auto& consumer : m_cp_cb_table) { + if (consumer) { new_cp->m_contexts[idx] = std::move(consumer->on_switchover_cp(cur_cp.get(), new_cp)); } + ++idx; } - HS_PERIODIC_LOG(DEBUG, cp, "CP Attached completed, proceed to exit cp critical section"); if (m_pending_trigger_cp) { // Triggered because of back-2-back CP, use the pending promise/future. cur_cp->m_comp_promise = std::move(m_pending_trigger_cp_comp); @@ -257,28 +223,27 @@ folly::Future< bool > CPManager::do_trigger_cp_flush(bool force, bool flush_on_s // might start cp flush and we don't want that to hold this mutex. lk.unlock(); - HS_PERIODIC_LOG(DEBUG, cp, "CP critical section done, doing cp_io_exit"); + HS_PERIODIC_LOG(DEBUG, cp, "Active CP switch completed"); return ret_fut; } void CPManager::cp_start_flush(CP* cp) { std::vector< folly::Future< bool > > futs; - HS_PERIODIC_LOG(INFO, cp, "Starting CP {} flush", cp->id()); + CP_PERIODIC_LOG(INFO, cp->id(), "Starting CP flush"); cp->m_cp_status = cp_status_t::cp_flushing; + for (size_t svcid = 0; svcid < (size_t)cp_consumer_t::SENTINEL; svcid++) { - if (svcid == (size_t)cp_consumer_t::SEALER) { continue; } + if (svcid == (size_t)cp_consumer_t::REPLICATION_SVC) { continue; } auto& consumer = m_cp_cb_table[svcid]; bool participated = (cp->m_contexts[svcid] != nullptr); if (consumer && participated) { futs.emplace_back(std::move(consumer->cp_flush(cp))); } } folly::collectAllUnsafe(futs).thenValue([this, cp](auto) { - // Sync flushing SEALER svc which is the replication service - // at last as the cp_lsn updated here. Other component should - // at least flushed to cp_lsn. - auto& sealer_cp = m_cp_cb_table[(size_t)cp_consumer_t::SEALER]; - bool participated = (cp->m_contexts[(size_t)cp_consumer_t::SEALER] != nullptr); - if (sealer_cp && participated) { sealer_cp->cp_flush(cp).wait(); } + // Sync flushing replication svc at last as the cp_lsn updated here + // other component should at least flushed to cp_lsn + auto& repl_cp = m_cp_cb_table[(size_t)cp_consumer_t::REPLICATION_SVC]; + if (repl_cp) { repl_cp->cp_flush(cp).wait(); } // All consumers have flushed for the cp on_cp_flush_done(cp); }); @@ -293,13 +258,14 @@ void CPManager::on_cp_flush_done(CP* cp) { ++(m_sb->m_last_flushed_cp); m_sb.write(); - HISTOGRAM_OBSERVE(*m_metrics, cp_latency, get_elapsed_time_us(cp->m_cp_start_time)); + CP_PERIODIC_LOG(INFO, cp->id(), "CP Flush completed"); cleanup_cp(cp); // Setting promise will cause the CP manager destructor to cleanup before getting a chance to do the // checking if shutdown has been initiated or not. 
 auto promise = std::move(cp->m_comp_promise);
 m_wd_cp->reset_cp();
+ bool is_shutdown_cp = cp->m_is_on_shutdown;
 delete cp;

 bool trigger_back_2_back_cp{false};
@@ -311,13 +277,18 @@ void CPManager::on_cp_flush_done(CP* cp) {
 }
 promise.setValue(true);

+ if (!is_shutdown_cp) { // No need for back-to-back CP etc. on shutdown.
+ // Don't access any cp state after this, in case trigger_back_2_back_cp is false, because it's false on
+ // cp_shutdown_initiated and setting this promise could destruct the CPManager itself.
+ if (trigger_back_2_back_cp) {
+ HS_PERIODIC_LOG(INFO, cp, "Triggering back to back CP");
+ COUNTER_INCREMENT(*m_metrics, back_to_back_cps, 1);
+ trigger_cp_flush(false, CPTriggerReason::Timer);
+ }

- // Dont access any cp state after this, in case trigger_back_2_back_cp is false, because its false on
- // cp_shutdown_initated and setting this promise could destruct the CPManager itself.
- if (trigger_back_2_back_cp) {
- HS_PERIODIC_LOG(INFO, cp, "Triggering back to back CP");
- COUNTER_INCREMENT(*m_metrics, back_to_back_cps, 1);
- trigger_cp_flush(false);
+#ifdef _PRERELEASE
+ if (hs()->crash_simulator().is_in_crashing_phase()) { hs()->crash_simulator().crash_now(); }
+#endif
 }
 });
}
@@ -343,9 +314,7 @@ void CPManager::start_cp_thread() {
 // Multiple sync_io fibers may acquire a thread-level mutex and perform synchronous I/O using io_uring.
 // This can block the fiber and allow other fibers to be scheduled.
 // If another fiber tries to acquire the same mutex, a deadlock can occur.
- auto const num_fibers = HS_DYNAMIC_CONFIG(generic.cp_io_fibers); // default: 2
- LOGINFO("Starting CP IO fibers with count: {}", num_fibers);
- iomanager.create_reactor("cp_io", iomgr::INTERRUPT_LOOP, num_fibers, [this, ctx](bool is_started) {
+ iomanager.create_reactor("cp_io", iomgr::INTERRUPT_LOOP, 2u, [this, ctx](bool is_started) {
 if (is_started) {
 {
 std::unique_lock< std::mutex > lk{ctx->mtx};
@@ -370,24 +339,28 @@ iomgr::io_fiber_t CPManager::pick_blocking_io_fiber() const {
 return m_cp_io_fibers[rand_fiber(s_re)];
}

+bool CPManager::has_cp_flushed(cp_id_t cp_id) const { return (m_sb->m_last_flushed_cp >= cp_id); }
+
//////////////////////////////////////// CP Guard class ////////////////////////////
CPGuard::CPGuard(CPManager* mgr) {
- if (t_cp_stack.empty()) {
+ if (mgr == nullptr) { return; }
+
+ if (t_cp_stack->empty()) {
 // First CP in this thread stack.
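 // (annotation, not part of the patch) cp_io_enter() hands out the manager's current CP with a
 // reference taken on it; the nested-guard branch below instead reuses the CP already on the
 // fiber-local stack and bumps its refcount via cp_ref().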
m_cp = mgr->cp_io_enter(); } else { // Nested CP sections - m_cp = t_cp_stack.top(); + m_cp = t_cp_stack->top(); m_cp->m_cp_mgr->cp_ref(m_cp); } - t_cp_stack.push(m_cp); + t_cp_stack->push(m_cp); m_pushed = true; // m_pushed represented if this is added to current thread stack } CPGuard::~CPGuard() { - if (m_pushed && !t_cp_stack.empty()) { + if (m_pushed && !t_cp_stack->empty()) { // HS_DBG_ASSERT_EQ((void*)m_cp, (void*)t_cp_stack.top(), "CPGuard mismatch of CP pointers"); - t_cp_stack.pop(); + t_cp_stack->pop(); } if (m_cp) { m_cp->m_cp_mgr->cp_io_exit(m_cp); } } @@ -395,25 +368,27 @@ CPGuard::~CPGuard() { CPGuard::CPGuard(const CPGuard& other) { m_cp = other.m_cp; m_pushed = false; - m_cp->m_cp_mgr->cp_ref(m_cp); + if (m_cp) { m_cp->m_cp_mgr->cp_ref(m_cp); } } CPGuard CPGuard::operator=(const CPGuard& other) { m_cp = other.m_cp; m_pushed = false; - m_cp->m_cp_mgr->cp_ref(m_cp); + if (m_cp) { m_cp->m_cp_mgr->cp_ref(m_cp); } return *this; } -CP& CPGuard::operator*() { return *get(); } CP* CPGuard::operator->() { return get(); } -CPContext* CPGuard::context(cp_consumer_t consumer) { return get()->context(consumer); } +CPContext* CPGuard::context(cp_consumer_t consumer) { + CP* cp = get(); + return cp ? cp->context(consumer) : nullptr; +} CP* CPGuard::get() { - HS_DBG_ASSERT_NE((void*)m_cp, (void*)nullptr, "CPGuard get on empty CP pointer"); - if (!m_pushed) { + // HS_DBG_ASSERT_NE((void*)m_cp, (void*)nullptr, "CPGuard get on empty CP pointer"); + if (!m_pushed && m_cp) { // m_pushed is false in case cp guard is moved from one thread to other - t_cp_stack.push(m_cp); + t_cp_stack->push(m_cp); m_pushed = true; } return m_cp; @@ -498,4 +473,6 @@ void CPWatchdog::cp_watchdog_timer() { cp_id_t CPContext::id() const { return m_cp->id(); } +void CPContext::complete(bool status) { m_flush_comp.setValue(status); } + } // namespace homestore diff --git a/src/lib/common/concurrent_vector.hpp b/src/lib/common/concurrent_vector.hpp new file mode 100644 index 000000000..040897080 --- /dev/null +++ b/src/lib/common/concurrent_vector.hpp @@ -0,0 +1,97 @@ +/********************************************************************************* + * + * Author/Developer(s): Harihara Kadayam + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. 
+ * + *********************************************************************************/ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace homestore { + +template < typename T, size_t IncrementalSize = 1024 > +class ConcurrentVector { +private: + struct Block { + std::array< T, IncrementalSize > m_data; + }; + + std::vector< Block > m_blocks; + std::mutex m_mutex; + std::atomic< size_t > m_size{0}; + +public: + struct iterator { + size_t slot_num{0}; + ConcurrentVector* vec; + + iterator() = default; + iterator(ConcurrentVector* v, size_t s) : slot_num{s}, vec{v} {} + + void operator++() { ++slot_num; } + void operator+=(int64_t count) { slot_num += count; } + + bool operator==(iterator const& other) const = default; + bool operator!=(iterator const& other) const = default; + + T const& operator*() const { return vec->at(slot_num); } + T const* operator->() const { return &(vec->at(slot_num)); } + T&& operator*() { return std::move(vec->at(slot_num)); } + }; + +public: + friend class ConcurrentVector::iterator; + + ConcurrentVector() { m_blocks.emplace_back(Block{}); } + ConcurrentVector(size_t size) : m_blocks{(size + IncrementalSize - 1) / IncrementalSize} {} + ConcurrentVector(const ConcurrentVector&) = delete; + ConcurrentVector(ConcurrentVector&&) noexcept = delete; + ConcurrentVector& operator=(const ConcurrentVector&) = delete; + ConcurrentVector& operator=(ConcurrentVector&&) noexcept = delete; + ~ConcurrentVector() = default; + + template < typename U = T > + std::enable_if_t< std::is_copy_constructible_v< U >, void > push_back(U const& ele) { + *(data(get_next_slot())) = ele; + } + void emplace_back(T&& ele) { *(data(get_next_slot())) = std::move(ele); } + + T& at(size_t slot) { return *(data(slot)); } + T const& at(size_t slot) const { return *(data_const(slot)); } + T& operator[](size_t slot) { return *(data(slot)); } + T const& operator[](size_t slot) const { return *(data_const(slot)); } + size_t size() const { return m_size.load(); } + void clear() { m_size.store(0); } + + iterator begin() { return iterator(this, 0); } + iterator end() { return iterator(this, size()); } + +private: + size_t get_next_slot() { + auto next_id = m_size.fetch_add(1); + if (next_id >= m_blocks.size() * IncrementalSize) { + std::unique_lock lg{m_mutex}; + m_blocks.emplace_back(Block{}); + } + return next_id; + } + + T* data(size_t slot) { return &m_blocks[slot / IncrementalSize].m_data[slot % IncrementalSize]; } + T const* data_const(size_t slot) { return &m_blocks[slot / IncrementalSize].m_data[slot % IncrementalSize]; } +}; +} diff --git a/src/lib/common/crash_simulator.hpp b/src/lib/common/crash_simulator.hpp index 9ae7e5236..e8826b61d 100644 --- a/src/lib/common/crash_simulator.hpp +++ b/src/lib/common/crash_simulator.hpp @@ -12,7 +12,7 @@ class CrashSimulator { CrashSimulator(std::function< void(void) > cb = nullptr) : m_restart_cb{std::move(cb)} {} ~CrashSimulator() = default; - void crash() { + void crash_now() { if (m_restart_cb) { m_crashed.update([](auto* s) { *s = true; }); @@ -27,11 +27,15 @@ class CrashSimulator { } } - bool is_crashed() const { return *(m_crashed.access().get()); } + void start_crash() { + m_crashed.update([](auto* s) { *s = true; }); + } + + bool is_in_crashing_phase() const { return *(m_crashed.access().get()); } bool crash_if_flip_set(const std::string& flip_name) { if (iomgr_flip::instance()->test_flip(flip_name)) { - this->crash(); + this->crash_now(); return true; } else { return false; @@ -43,7 +47,7 @@ class 
CrashSimulator {
private:
 std::function< void(void) > m_restart_cb{nullptr};
- std::atomic< bool > m_will_crash{false};
+ std::atomic<bool> m_will_crash{false};
 sisl::urcu_scoped_ptr< bool > m_crashed;
};
} // namespace homestore
diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs
index 5a63bb9d5..4a7f9bd8b 100644
--- a/src/lib/common/homestore_config.fbs
+++ b/src/lib/common/homestore_config.fbs
@@ -57,6 +57,18 @@ table Btree {
 max_nodes_to_rebalance: uint32 = 3;

 mem_btree_page_size: uint32 = 8192;
+
+ /* Maximum number of btree pages to persist per IO */
+ max_btree_write_size_per_io: uint64 = 16777216; /* 16 MB */
+
+ // Maximum number of incremental map CPs flushed for the cow btree before we force a full map flush during a CP.
+ // This is a maximum; on an actively written system we may do full map flushes much sooner.
+ cow_max_incremental_map_flushes: uint64 = 10000 (hotswap);
+
+ // Percentage of metablk filled size up to which we can do an incremental map. Once this threshold is reached on
+ // the metablk, we force a full map flush on all btrees. A full map flush should free up the space once the CP
+ // completes, so setting this higher is reasonable to avoid full map flushes.
+ cow_full_map_flush_size_threshold_pct: double = 80 (hotswap);
}

table Cache {
@@ -123,11 +135,9 @@ table Generic {
 // cp timer in us
 cp_timer_us: uint64 = 60000000 (hotswap);

- // number of fibers for cp_io thread;
- cp_io_fibers: uint32 = 2;
-
- // writeback cache flush threads
- cache_flush_threads : int32 = 1;
+ // Btree CP flush threads and fibers per thread; effectively threads * fibers concurrent IOs.
+ btree_cp_flush_threads : uint32 = 2;
+ btree_cp_flush_fibers_per_thread : uint32 = 8;

 cp_watchdog_timer_sec : uint32 = 10; // it checks if cp stuck every 10 seconds
@@ -161,7 +171,14 @@ table Generic {

table ResourceLimits {
 /* it is going to use 2 times of this space because of two concurrent cps */
- dirty_buf_percent: uint32 = 1 (hotswap);
+
+ // Percentage of total IO memory that may hold dirty buffers; beyond this, a flush is forced by
+ // triggering a checkpoint.
+ index_max_dirty_memory_percent: double = 1 (hotswap);
+
+ // Percentage of index vdev space up to which freed space is allowed to accumulate, after which a
+ // checkpoint is triggered to capture the free space.
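+ // Illustrative arithmetic (annotation, not part of the patch): at the default of 1 percent, a
+ // 1 TiB index vdev lets roughly 10 GiB of freed-but-uncaptured space accumulate before a
+ // checkpoint is forced to reclaim it.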
+ index_max_free_space_accumulate_percent: double = 1 (hotswap);

 /* it is going to use 2 times of this space because of two concurrent cps */
 free_blk_cnt: uint32 = 10000000 (hotswap);
@@ -184,8 +201,7 @@ table ResourceLimits {

 /* num entries that raft logstore wants to reserve -- its truncate should not across this */
 /* 0 means HomeStore doesn't reserve anything and let nuraft controlls the truncation */
- /* default reserve 1 million logs */
- raft_logstore_reserve_threshold: uint32 = 1000000 (hotswap);
+ raft_logstore_reserve_threshold: uint32 = 0 (hotswap);

 /* resource audit timer in ms */
 resource_audit_timer_ms: uint32 = 120000;
@@ -238,9 +254,7 @@ table Consensus {
 snapshot_freq_distance: uint32 = 2000;

 // Num reserved log items while triggering compact from raft server, only consumed by nuraft server;
- // Set it same as snapshot_freq_distance, so that every create_snapshot will trigger compact
- // which is helpful for truncate unused logs
- num_reserved_log_items: uint32 = 2000;
+ num_reserved_log_items: uint32 = 20000;

 // Max append batch size
 max_append_batch_size: int32 = 64;
@@ -306,9 +320,6 @@ table Consensus {

 // The interval in ms to check if the new member in replace_member is fully synced and ready to take over
 replace_member_sync_check_interval_ms: uint64 = 60000;
-
- // Enable tee logs to console, this is helpful for sherlock to collect logs
- enable_console_log: bool = true;
}

table HomeStoreSettings {
diff --git a/src/lib/common/homestore_utils.cpp b/src/lib/common/homestore_utils.cpp
index 937083012..a3eacbfdc 100644
--- a/src/lib/common/homestore_utils.cpp
+++ b/src/lib/common/homestore_utils.cpp
@@ -51,7 +51,7 @@ bool hs_utils::mod_aligned_sz(size_t size_to_check, size_t align_sz) {

bool hs_utils::is_ptr_aligned(void* ptr, std::size_t alignment) {
 // Cast the pointer to uintptr_t, which is an integer type capable of holding a pointer
- auto intptr = reinterpret_cast< std::uintptr_t >(ptr);
+ auto intptr = reinterpret_cast<std::uintptr_t>(ptr);
 // Check if the pointer is a multiple of the alignment
 return (intptr % alignment) == 0;
}
diff --git a/src/lib/common/homestore_utils.hpp b/src/lib/common/homestore_utils.hpp
index d8980ae19..b6989ff48 100644
--- a/src/lib/common/homestore_utils.hpp
+++ b/src/lib/common/homestore_utils.hpp
@@ -54,6 +54,7 @@ class hs_utils {
 std::vector< std::string >& ordered_entries);
};

-static bool wait_and_check(const std::function< bool() >& check_func, uint32_t timeout_ms, uint32_t interval_ms = 100);
+static bool wait_and_check(const std::function< bool() >& check_func, uint32_t timeout_ms,
+ uint32_t interval_ms = 100);

} // namespace homestore
diff --git a/src/lib/common/large_id_reserver.hpp b/src/lib/common/large_id_reserver.hpp
new file mode 100644
index 000000000..cadc50bdd
--- /dev/null
+++ b/src/lib/common/large_id_reserver.hpp
@@ -0,0 +1,56 @@
+#pragma once
+#include <cstdint>
+#include <limits>
+#include <boost/icl/interval_set.hpp>
+#include "common/homestore_assert.hpp"
+
+namespace homestore {
+class LargeIDReserver {
+private:
+ using IntervalSet = boost::icl::interval_set< uint32_t >;
+ using Interval = IntervalSet::interval_type;
+
+ IntervalSet m_iset;
+ uint64_t m_max;
+
+public:
+ LargeIDReserver(uint32_t max_count) : m_max{max_count} {}
+ ~LargeIDReserver() = default;
+
+ static constexpr uint64_t out_of_bounds = std::numeric_limits< uint64_t >::max();
+ uint64_t reserve() {
+ uint64_t id = find_next();
+ if (id >= m_max) { return out_of_bounds; }
+ m_iset.insert(Interval::right_open(id, id + 1));
+ return id;
+ }
+
+ void reserve(uint64_t id) {
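 // (annotation, not part of the patch) Claims a caller-chosen id, e.g. while rebuilding
 // reservations from persisted state; boost::icl merges the single-id interval into any
 // adjacent reserved range automatically.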
HS_DBG_ASSERT(!is_reserved(id), "Reserving an already reserved id={}", id); + m_iset.insert(Interval::right_open(id, id + 1)); + } + + void unreserve(uint64_t id) { + HS_DBG_ASSERT_LT(id, m_max, "Unreserving an id which was out of bounds"); + m_iset.erase(Interval::right_open(id, id + 1)); + } + + bool is_reserved(uint64_t id) const { return (m_iset.find(id) != m_iset.end()); } + +private: + uint64_t find_next() const { + uint64_t next = 0; + auto it = m_iset.begin(); + while (it != m_iset.end()) { + if (it->lower() != 0) { + next = it->lower() - 1; + break; + } else { + next = it->upper(); + ++it; + } + } + return next; + } +}; +} // namespace homestore \ No newline at end of file diff --git a/src/lib/common/resource_mgr.cpp b/src/lib/common/resource_mgr.cpp index 7dcb6190f..8440d6f68 100644 --- a/src/lib/common/resource_mgr.cpp +++ b/src/lib/common/resource_mgr.cpp @@ -15,13 +15,15 @@ *********************************************************************************/ #include #include -#include +#include #include #include "resource_mgr.hpp" #include "homestore_assert.hpp" -#include "replication/repl_dev/raft_repl_dev.h" -#include "replication/service/generic_repl_svc.h" +#ifdef REPLICATION_SUPPORT +#include +#include "replication/repl_dev/raft_repl_dev.h" +#endif namespace homestore { ResourceMgr& resource_mgr() { return hs()->resource_mgr(); } @@ -32,14 +34,13 @@ void ResourceMgr::start(uint64_t total_cap) { void ResourceMgr::stop() { LOGINFO("Cancel resource manager timer."); - m_is_stopped_ = true; if (m_res_audit_timer_hdl != iomgr::null_timer_handle) { iomanager.cancel_timer(m_res_audit_timer_hdl); } m_res_audit_timer_hdl = iomgr::null_timer_handle; } // -// 1. Conceptually in rare case(not poosible for NuObject, possibly true for NuBlox2.0) truncate itself can't garunteen -// the space is freed up upto satisfy resource manager. e.g. multiple log stores on this same descriptor and one +// 1. Conceptually in rare case truncate itself can't guarantee the space is freed up upto satisfy resource manager. +// e.g. multiple log stores on this same descriptor and one // logstore lagging really behind and not able to truncate much space. Doing multiple truncation won't help in this // case. // 2. And any write on any other descriptor will trigger a high_watermark_check, and if it were to trigger critial @@ -48,21 +49,7 @@ void ResourceMgr::stop() { // writes on this descriptor; // void ResourceMgr::trigger_truncate() { - if (m_is_stopped_.load()) { - // when we are here, it means HomeStore is shutting down and since this API is called in timer thread, the timer - // thread might already been triggered while RM is tring to cancel it; - // and since shutdown and timer thread happen parallel, by the time we are here, shutdown might already cleaned - // up all replication service instances. and it will throw heap-use-after-free; - LOGINFO("Resource manager is stopped, so not triggering truncate"); - return; - } - if (hs()->has_repl_data_service()) { - auto& repl_svc = dynamic_cast< GenericReplService& >(hs()->repl_service()); - if (repl_svc.get_impl_type() == repl_impl_type::solo) { - // skip truncation from RM for solo repl dev; - return; - } /* * DO NOT NEED : raft will truncate logs. 
* // first make sure all repl dev's underlying raft log store make corresponding reservation during @@ -98,61 +85,7 @@ void ResourceMgr::start_timer() { true /* wait_to_schedule */); } -/* monitor dirty buffer count */ -void ResourceMgr::inc_dirty_buf_size(const uint32_t size) { - HS_REL_ASSERT_GT(size, 0); - const auto dirty_buf_cnt = m_hs_dirty_buf_cnt.fetch_add(size, std::memory_order_relaxed); - COUNTER_INCREMENT(m_metrics, dirty_buf_cnt, size); - if (m_dirty_buf_exceed_cb && ((dirty_buf_cnt + size) > get_dirty_buf_limit())) { - m_dirty_buf_exceed_cb(dirty_buf_cnt + size, false /* critical */); - } -} - -void ResourceMgr::dec_dirty_buf_size(const uint32_t size) { - HS_REL_ASSERT_GT(size, 0); - const int64_t dirty_buf_cnt = m_hs_dirty_buf_cnt.fetch_sub(size, std::memory_order_relaxed); - COUNTER_DECREMENT(m_metrics, dirty_buf_cnt, size); - if (dirty_buf_cnt < size) { LOGERROR("dirty_buf_cnt {} of now is less then size {}", dirty_buf_cnt, size); } - // HS_REL_ASSERT_GE(dirty_buf_cnt, size); -} - -void ResourceMgr::register_dirty_buf_exceed_cb(exceed_limit_cb_t cb) { m_dirty_buf_exceed_cb = std::move(cb); } - -/* monitor free blk cnt */ -void ResourceMgr::inc_free_blk(int size) { - // trigger hs cp when either one of the limit is reached - auto cnt = m_hs_fb_cnt.fetch_add(1, std::memory_order_relaxed); - auto sz = m_hs_fb_size.fetch_add(size, std::memory_order_relaxed); - COUNTER_INCREMENT(m_metrics, free_blk_size_in_cp, size); - COUNTER_INCREMENT(m_metrics, free_blk_cnt_in_cp, 1); -} - -void ResourceMgr::dec_free_blk(int size) { - auto dirty_fb_cnt = m_hs_fb_cnt.fetch_sub(1, std::memory_order_relaxed); - HS_REL_ASSERT_GE(dirty_fb_cnt, 0); - auto dirty_fb_size = m_hs_fb_size.fetch_sub(size, std::memory_order_relaxed); - HS_REL_ASSERT_GE(dirty_fb_size, 0); - COUNTER_DECREMENT(m_metrics, free_blk_size_in_cp, size); - COUNTER_DECREMENT(m_metrics, free_blk_cnt_in_cp, 1); -} - -void ResourceMgr::register_free_blks_exceed_cb(exceed_limit_cb_t cb) { m_free_blks_exceed_cb = std::move(cb); } - -bool ResourceMgr::can_add_free_blk(int cnt) const { - if ((cur_free_blk_cnt() + cnt) > get_free_blk_cnt_limit() || (cur_free_blk_size()) > get_free_blk_size_limit()) { - return false; - } else { - return true; - } -} - -int64_t ResourceMgr::cur_free_blk_cnt() const { return m_hs_fb_cnt.load(std::memory_order_relaxed); } -int64_t ResourceMgr::get_free_blk_cnt_limit() const { return ((HS_DYNAMIC_CONFIG(resource_limits.free_blk_cnt))); } -int64_t ResourceMgr::cur_free_blk_size() const { return m_hs_fb_size.load(std::memory_order_relaxed); } -int64_t ResourceMgr::get_free_blk_size_limit() const { - return ((m_total_cap * HS_DYNAMIC_CONFIG(resource_limits.free_blk_size_percent)) / 100); -} - +//////////////////////// Index Resource Tracking //////////////////////////////////// /* monitor memory used to store seqid --> data mapping during recovery */ void ResourceMgr::inc_mem_used_in_recovery(int size) { m_memory_used_in_recovery.fetch_add(size, std::memory_order_relaxed); @@ -231,9 +164,4 @@ void ResourceMgr::increase_dirty_buf_qd() { void ResourceMgr::reset_dirty_buf_qd() { m_flush_dirty_buf_q_depth = HS_DYNAMIC_CONFIG(generic.cache_max_throttle_cnt); } - -int64_t ResourceMgr::get_dirty_buf_limit() const { - return int64_cast((HS_DYNAMIC_CONFIG(resource_limits.dirty_buf_percent) * HS_STATIC_CONFIG(input.io_mem_size())) / - 100); -} } // namespace homestore diff --git a/src/lib/common/resource_mgr.hpp b/src/lib/common/resource_mgr.hpp index 6a9023627..7579123b4 100644 --- a/src/lib/common/resource_mgr.hpp +++ 
b/src/lib/common/resource_mgr.hpp
@@ -22,7 +22,7 @@ namespace homestore {
class RsrcMgrMetrics : public sisl::MetricsGroup {
public:
 explicit RsrcMgrMetrics() : sisl::MetricsGroup("resource_mgr", "resource_mgr") {
- REGISTER_COUNTER(dirty_buf_cnt, "Total wb cache dirty buffer cnt", sisl::_publish_as::publish_as_gauge);
+ REGISTER_COUNTER(index_dirty_size, "Total Index cache dirty buffer size", sisl::_publish_as::publish_as_gauge);
 REGISTER_COUNTER(free_blk_size_in_cp, "Total free blks size accumulated in a cp",
 sisl::_publish_as::publish_as_gauge);
 REGISTER_COUNTER(free_blk_cnt_in_cp, "Total free blks cnt accumulated in a cp",
@@ -47,24 +47,6 @@ class ResourceMgr {
 void start(uint64_t total_cap);
 void stop();

- /* monitor dirty buffer count */
- void inc_dirty_buf_size(const uint32_t size);
- void dec_dirty_buf_size(const uint32_t size);
- void register_dirty_buf_exceed_cb(exceed_limit_cb_t cb);
-
- /* monitor free blk cnt */
- void inc_free_blk(int size);
-
- void dec_free_blk(int size);
- void register_free_blks_exceed_cb(exceed_limit_cb_t cb);
-
- bool can_add_free_blk(int cnt) const;
-
- int64_t cur_free_blk_cnt() const;
- int64_t get_free_blk_cnt_limit() const;
- int64_t cur_free_blk_size() const;
- int64_t get_free_blk_size_limit() const;
-
 /* monitor memory used to store seqid --> data mapping during recovery */
 void inc_mem_used_in_recovery(int size);
 void dec_mem_used_in_recovery(int size);
@@ -129,26 +111,19 @@ class ResourceMgr {
 void trigger_truncate();

private:
- int64_t get_dirty_buf_limit() const;
-
 /**
 * Starts resource manager resource audit timer.
 */
 void start_timer();

private:
- std::atomic< int64_t > m_hs_dirty_buf_cnt;
-
 std::atomic< int64_t > m_hs_fb_cnt; // free count
 std::atomic< int64_t > m_hs_fb_size; // free size
 std::atomic< int64_t > m_hs_ab_cnt; // alloc count
 std::atomic< int64_t > m_memory_used_in_recovery;
 std::atomic< uint32_t > m_flush_dirty_buf_q_depth{64};
- std::atomic< bool > m_is_stopped_{false};
 uint64_t m_total_cap;

 // TODO: make it event_cb
- exceed_limit_cb_t m_dirty_buf_exceed_cb;
- exceed_limit_cb_t m_free_blks_exceed_cb;
 exceed_limit_cb_t m_journal_vdev_exceed_cb;

 RsrcMgrMetrics m_metrics;
diff --git a/src/lib/device/chunk.cpp b/src/lib/device/chunk.cpp
index ecbd293cf..4962be386 100644
--- a/src/lib/device/chunk.cpp
+++ b/src/lib/device/chunk.cpp
@@ -30,7 +30,7 @@ std::string Chunk::to_string() const {
}

float Chunk::get_blk_usage() const {
- return s_cast< float >(m_blk_allocator->get_used_blks()) / s_cast< float >(m_blk_allocator->get_total_blks());
+ return s_cast<float>(m_blk_allocator->get_used_blks()) / s_cast<float>(m_blk_allocator->get_total_blks());
}

void Chunk::set_user_private(const sisl::blob& data) {
diff --git a/src/lib/device/device.h b/src/lib/device/device.h
index 3d9818ed0..1c3843534 100644
--- a/src/lib/device/device.h
+++ b/src/lib/device/device.h
@@ -133,6 +133,7 @@ class DeviceManager {
 sisl::sparse_vector< std::unique_ptr< PhysicalDev > > m_all_pdevs;
 std::map< HSDevType, std::vector< PhysicalDev* > > m_pdevs_by_type;
+ uint32_t m_cur_pdev_id{0};

 std::map< uint16_t, shared< Chunk > > m_chunks; // Chunks organized as array (indexed on chunk id)
 sisl::Bitset m_chunk_id_bm{hs_super_blk::MAX_CHUNKS_IN_SYSTEM}; // Bitmap to keep track of chunk ids available
@@ -154,7 +155,6 @@ class DeviceManager {
 bool is_first_time_boot() const { return m_first_time_boot; }

 void format_devices();
- uint32_t format_single_device(dev_info& dinfo);
 void commit_formatting();
 void load_devices();
 void close_devices();
@@ -165,11 +165,7 @@ class
DeviceManager { /// @param event_cb Event handler in case of /// @return shared< VirtualDev > create_vdev(vdev_parameters&& vdev_param); - void compose_vparam(uint64_t vdev_id, vdev_parameters& vparam, std::vector< PhysicalDev* > pdevs); - std::map< PhysicalDev*, uint32_t > calculate_vdev_chunk_num_on_new_pdevs(shared< VirtualDev > vdev, - std::vector< PhysicalDev* > pdevs, - uint64_t total_chunk_num); - void add_pdev_to_vdev(shared< VirtualDev > vdev, PhysicalDev* pdev, uint32_t total_chunk_num_in_pdev); + const Chunk* get_chunk(uint16_t chunk_id) { return get_chunk_mutable(chunk_id); } Chunk* get_chunk_mutable(uint16_t chunk_id) { diff --git a/src/lib/device/device_manager.cpp b/src/lib/device/device_manager.cpp index ff4cb5d88..28eb37e33 100644 --- a/src/lib/device/device_manager.cpp +++ b/src/lib/device/device_manager.cpp @@ -29,8 +29,6 @@ #include "common/homestore_utils.hpp" #include "common/homestore_assert.hpp" -#include - namespace homestore { static int determine_open_flags(io_flag oflags) { @@ -62,8 +60,6 @@ static bool is_hdd(const std::string& devname) { static void populate_vdev_info(const vdev_parameters& vparam, uint32_t vdev_id, const std::vector< PhysicalDev* >& pdevs, vdev_info* out_info); -static void populate_vparam(vdev_parameters& vparam, vdev_info& vinfo); - DeviceManager::DeviceManager(const std::vector< dev_info >& devs, vdev_create_cb_t vdev_create_cb) : m_dev_infos{devs}, m_vdev_create_cb{std::move(vdev_create_cb)} { bool found_hdd_dev{false}; @@ -100,8 +96,6 @@ DeviceManager::DeviceManager(const std::vector< dev_info >& devs, vdev_create_cb } void DeviceManager::format_devices() { - // Only the first time boot, we will generate the first block header. After that, the first block header will be - // loaded from the existing devices. 
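 // (annotation, not part of the patch) The generation number below is bumped once per format
 // pass and stamped into every first block written here, so each device header records the
 // format pass that produced it.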
++m_first_blk_hdr.gen_number; m_first_blk_hdr.version = first_block_header::CURRENT_SUPERBLOCK_VERSION; std::strncpy(m_first_blk_hdr.product_name, first_block_header::PRODUCT_NAME, @@ -114,44 +108,38 @@ void DeviceManager::format_devices() { // Get common iomgr_attributes for (auto& dinfo : m_dev_infos) { - format_single_device(dinfo); - } -} + auto attr = iomgr::DriveInterface::get_attributes(dinfo.dev_name); + if (dinfo.dev_size == 0) { dinfo.dev_size = PhysicalDev::get_dev_size(dinfo.dev_name); } + auto sb_size = hs_super_blk::total_used_size(dinfo); + auto buf = hs_utils::iobuf_alloc(sb_size, sisl::buftag::superblk, attr.align_size); + std::memset(buf, 0, sb_size); -uint32_t DeviceManager::format_single_device(dev_info& dinfo) { - HS_LOG_ASSERT(!m_first_blk_hdr.is_empty(), "Empty first block header, cannot format device {}", dinfo.dev_name); - auto attr = iomgr::DriveInterface::get_attributes(dinfo.dev_name); - if (dinfo.dev_size == 0) { dinfo.dev_size = PhysicalDev::get_dev_size(dinfo.dev_name); } - auto sb_size = hs_super_blk::total_used_size(dinfo); - auto buf = hs_utils::iobuf_alloc(sb_size, sisl::buftag::superblk, attr.align_size); - std::memset(buf, 0, sb_size); - - first_block* fblk = r_cast< first_block* >(buf); - fblk->magic = first_block::HOMESTORE_MAGIC; - fblk->checksum = 0; // Computed while writing the first block - fblk->formatting_done = 0x0; // Formatting is not done yet, until homestore is completely started - fblk->hdr = m_first_blk_hdr; // Entire header is copied as is - auto pdev_id = populate_pdev_info(dinfo, attr, m_first_blk_hdr.system_uuid, fblk->this_pdev_hdr); - fblk->checksum = crc32_ieee(init_crc32, uintptr_cast(fblk), first_block::s_atomic_fb_size); - - auto pdev = std::make_unique< PhysicalDev >(dinfo, device_open_flags(dinfo.dev_name), fblk->this_pdev_hdr); - - LOGINFO("Formatting Homestore on Device[dev_name={}, pdev_id={}] with first block as: [{}] total_super_blk_size={}", - dinfo.dev_name, pdev_id, fblk->to_string(), sb_size); - pdev->write_super_block(buf, sb_size, hs_super_blk::first_block_offset()); - - auto it = m_pdevs_by_type.find(dinfo.dev_type); - if (it == m_pdevs_by_type.end()) { - bool happened; - std::tie(it, happened) = m_pdevs_by_type.insert(std::pair{dinfo.dev_type, std::vector< PhysicalDev* >{}}); - } - it->second.push_back(pdev.get()); + first_block* fblk = r_cast< first_block* >(buf); + fblk->magic = first_block::HOMESTORE_MAGIC; + fblk->checksum = 0; // Computed while writing the first block + fblk->formatting_done = 0x0; // Formatting is not done yet, until homestore is completely started + fblk->hdr = m_first_blk_hdr; // Entire header is copied as is + auto pdev_id = populate_pdev_info(dinfo, attr, m_first_blk_hdr.system_uuid, fblk->this_pdev_hdr); + fblk->checksum = crc32_ieee(init_crc32, uintptr_cast(fblk), first_block::s_atomic_fb_size); - pdev->format_chunks(); - m_all_pdevs[pdev_id] = std::move(pdev); + auto pdev = std::make_unique< PhysicalDev >(dinfo, device_open_flags(dinfo.dev_name), fblk->this_pdev_hdr); - hs_utils::iobuf_free(buf, sisl::buftag::superblk); - return pdev_id; + LOGINFO("Formatting Homestore on Device={} with first block as: [{}] total_super_blk_size={}", dinfo.dev_name, + fblk->to_string(), sb_size); + pdev->write_super_block(buf, sb_size, hs_super_blk::first_block_offset()); + + auto it = m_pdevs_by_type.find(dinfo.dev_type); + if (it == m_pdevs_by_type.end()) { + bool happened; + std::tie(it, happened) = m_pdevs_by_type.insert(std::pair{dinfo.dev_type, std::vector< PhysicalDev* >{}}); + } + 
it->second.push_back(pdev.get()); + + pdev->format_chunks(); + m_all_pdevs[pdev_id] = std::move(pdev); + + hs_utils::iobuf_free(buf, sisl::buftag::superblk); + } } void DeviceManager::load_devices() { @@ -165,115 +153,33 @@ void DeviceManager::load_devices() { m_boot_in_degraded_mode = true; } - // 1. Load all physical devices. - std::vector< dev_info > pdevs_to_format; - auto stale_first_blk_found = false; - for (auto& d : m_dev_infos) { + for (const auto& d : m_dev_infos) { first_block fblk = PhysicalDev::read_first_block(d.dev_name, device_open_flags(d.dev_name)); pdev_info_header* pinfo = &fblk.this_pdev_hdr; - if (!fblk.is_valid()) { - pdevs_to_format.emplace_back(d); - LOGINFO("Empty first block found on device {}, format it", d.dev_name); - } else { - RELEASE_ASSERT_EQ(pinfo->get_system_uuid_str(), m_first_blk_hdr.get_system_uuid_str(), - "Device {} has uuid stamp different than this instance uuid. Perhaps device from other " - "homestore is provided?", - d.dev_name); - - auto pdev = std::make_unique< PhysicalDev >(d, device_open_flags(d.dev_name), *pinfo); - LOGINFO("Loading Homestore from Device={} with first block as: [{}]", d.dev_name, fblk.to_string()); - - auto it = m_pdevs_by_type.find(d.dev_type); - if (it == m_pdevs_by_type.end()) { - bool happened; - std::tie(it, happened) = m_pdevs_by_type.insert(std::pair{d.dev_type, std::vector< PhysicalDev* >{}}); - } - it->second.push_back(pdev.get()); - m_all_pdevs[pinfo->pdev_id] = std::move(pdev); - stale_first_blk_found = fblk.hdr.gen_number != m_first_blk_hdr.gen_number; - if (fblk.hdr.gen_number > m_first_blk_hdr.gen_number) { - // cur_pdev_id will be updated to the max pdev id found in the formatted devices. The stale number will - // be flushed in commit_formatting(). - LOGINFO("newer generation number {} found in device {}, updating first block header", - fblk.hdr.gen_number, d.dev_name); - m_first_blk_hdr = fblk.hdr; - } + RELEASE_ASSERT_EQ(pinfo->get_system_uuid_str(), m_first_blk_hdr.get_system_uuid_str(), + "Device {} has uuid stamp different than this instance uuid. Perhaps device from other " + "homestore is provided?", + d.dev_name); + + auto pdev = std::make_unique< PhysicalDev >(d, device_open_flags(d.dev_name), *pinfo); + LOGINFO("Loading Homestore from Device={} with first block as: [{}]", d.dev_name, fblk.to_string()); + + auto it = m_pdevs_by_type.find(d.dev_type); + if (it == m_pdevs_by_type.end()) { + bool happened; + std::tie(it, happened) = m_pdevs_by_type.insert(std::pair{d.dev_type, std::vector< PhysicalDev* >{}}); } - } + it->second.push_back(pdev.get()); - // 2. format new devices. - for (auto& d : pdevs_to_format) { - auto pdev_id = format_single_device(d); - LOGINFO("Device {} has been formatted, pdev_id {}", d.dev_name, pdev_id); + m_all_pdevs[pinfo->pdev_id] = std::move(pdev); } - // 3. Recover vdevs from the physical devices. load_vdevs(); - - if (pdevs_to_format.empty() && !stale_first_blk_found) return; - - if (!pdevs_to_format.empty()) { - ++m_first_blk_hdr.gen_number; - // 4. Add new physical devices to existing vdevs. 
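The load path above refuses to adopt a disk whose first block was stamped by a different homestore instance, using the system uuid as the ownership mark. That guard, isolated as a sketch (belongs_to_this_instance() is hypothetical, and the open-flags type is an assumption):

static bool belongs_to_this_instance(const dev_info& d, const first_block_header& our_hdr, int open_flags) {
    first_block const fblk = PhysicalDev::read_first_block(d.dev_name, open_flags);
    // Reject garbage blocks and devices stamped by some other homestore instance.
    return fblk.is_valid() && (fblk.this_pdev_hdr.get_system_uuid_str() == our_hdr.get_system_uuid_str());
}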
- for (auto vdev : m_vdevs) { - vdev_parameters vparam; - auto vinfo = vdev->info(); - populate_vparam(vparam, vinfo); - if (vparam.size_type == vdev_size_type_t::VDEV_SIZE_DYNAMIC || - vparam.multi_pdev_opts == vdev_multi_pdev_opts_t::SINGLE_FIRST_PDEV || - vparam.multi_pdev_opts == vdev_multi_pdev_opts_t::SINGLE_ANY_PDEV) { - LOGINFO("Skipping adding new devices to vdev {}, as it is dynamic or single pdev type", - vinfo.get_name()); - continue; - } - - std::vector< PhysicalDev* > pdevs = pdevs_by_type_internal(vparam.dev_type); - RELEASE_ASSERT_GT( - pdevs.size(), 0, - "vdev is loaded from at least one pdev, but unable to find any pdevs for given vdev type"); - RELEASE_ASSERT(vparam.blk_size % pdevs[0]->align_size() == 0, - "blk_size should be multiple of pdev align_size"); - - // vparam.num_chunks will be inferred. - compose_vparam(vdev->info().vdev_id, vparam, pdevs); - if (vdev->get_pdevs().size() == pdevs.size()) { - LOGDEBUG("Virtual device {} is already sized correctly, no new devices to add", - vdev->info().get_name()); - continue; - } - LOGINFO("Virtual device {} is undersized, pdevs already added={}, qualified pdevs ={}, need to add new " - "devices to it", - vdev->info().get_name(), vdev->get_pdevs().size(), pdevs.size()); - - // calculate the number of chunks to be created in each new pdev - auto pdev_chunk_num_map = calculate_vdev_chunk_num_on_new_pdevs(vdev, pdevs, vparam.num_chunks); - - std::unique_lock lg{m_vdev_mutex}; - auto buf = hs_utils::iobuf_alloc(vdev_info::size, sisl::buftag::superblk, pdevs[0]->align_size()); - std::memcpy(buf, &vinfo, sizeof(vdev_info)); - uint64_t offset = hs_super_blk::vdev_sb_offset() + (vdev->info().vdev_id * vdev_info::size); - - // add the new pdevs to the vdev - for (auto pdev_to_add : pdev_chunk_num_map) { - auto pdev = pdev_to_add.first; - add_pdev_to_vdev(vdev, pdev_to_add.first, pdev_to_add.second); - LOGINFO("Added pdev[name={}, id={}] with total_chunk_num_in_pdev={} to vdev {}", pdev->get_devname(), - pdev->pdev_id(), pdev_to_add.second, vdev->info().get_name()); - - // Update vdev info in the super block area of the pdev - pdev->write_super_block(buf, vdev_info::size, offset); - } - - hs_utils::iobuf_free(buf, sisl::buftag::superblk); - } - } - commit_formatting(); } void DeviceManager::commit_formatting() { auto buf = hs_utils::iobuf_alloc(hs_super_blk::first_block_size(), sisl::buftag::superblk, 512); - LOGINFO("commit formatting first block with gen_number={}", m_first_blk_hdr.gen_number); for (auto& pdev : m_all_pdevs) { if (!pdev) { continue; } @@ -284,8 +190,6 @@ void DeviceManager::commit_formatting() { } first_block* fblk = r_cast< first_block* >(buf); - fblk->hdr.gen_number = m_first_blk_hdr.gen_number; - fblk->hdr.cur_pdev_id = m_first_blk_hdr.cur_pdev_id; fblk->formatting_done = 0x1; fblk->checksum = crc32_ieee(init_crc32, uintptr_cast(fblk), first_block::s_atomic_fb_size); @@ -308,62 +212,10 @@ shared< VirtualDev > DeviceManager::create_vdev(vdev_parameters&& vparam) { auto vdev_id = m_vdev_id_bm.get_next_reset_bit(0u); if (vdev_id == sisl::Bitset::npos) { throw std::out_of_range("System has no room for additional vdev"); } m_vdev_id_bm.set_bit(vdev_id); + std::vector< PhysicalDev* > pdevs = pdevs_by_type_internal(vparam.dev_type); RELEASE_ASSERT_GT(pdevs.size(), 0, "Unable to find any pdevs for given vdev type, can't create vdev"); RELEASE_ASSERT(vparam.blk_size % pdevs[0]->align_size() == 0, "blk_size should be multiple of pdev align_size"); - - // Populate the vdev parameters based on the given cfg and pdevs - 
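create_vdev() above claims its vdev id from a bitmap: take the first reset bit, set it, and treat exhaustion as out_of_range. The same claim pattern recurs for chunk ids further down. Condensed as a sketch (claim_id() is hypothetical):

static auto claim_id(sisl::Bitset& id_bm) {
    auto const id = id_bm.get_next_reset_bit(0u);
    if (id == sisl::Bitset::npos) { throw std::out_of_range("no free ids left"); }
    id_bm.set_bit(id); // mark the id as taken before handing it out
    return id;
}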
compose_vparam(vdev_id, vparam, pdevs); - - // Convert the vparameters to the vdev_info - auto buf = hs_utils::iobuf_alloc(vdev_info::size, sisl::buftag::superblk, pdevs[0]->align_size()); - auto vinfo = new (buf) vdev_info(); - populate_vdev_info(vparam, vdev_id, pdevs, vinfo); - - // Do a callback for the upper layer to create the vdev instance from vdev_info - shared< VirtualDev > vdev = m_vdev_create_cb(*vinfo, false /* load_existing */); - m_vdevs[vdev_id] = vdev; - - // different type might have different capacity, so we need to spread all the newly created chunks to all pdevs - // according to their capacity - - auto pdev_chunk_num_map = calculate_vdev_chunk_num_on_new_pdevs(vdev, pdevs, vparam.num_chunks); - - uint32_t total_created_chunks{0}; - - for (auto& pdev : pdevs) { - if (total_created_chunks >= vparam.num_chunks) break; - - // the total number of chunks will be created in this pdev - auto total_chunk_num_in_pdev = pdev_chunk_num_map[pdev]; - - RELEASE_ASSERT(vparam.num_chunks >= total_chunk_num_in_pdev, - "chunks in pdev {} is {}, larger than total chunks {} , which is expected to be created ", - pdev->get_devname(), total_chunk_num_in_pdev, vparam.num_chunks); - - LOGINFO("{} chunks is created on pdev {} for vdev {}, pdev data size is {}", total_chunk_num_in_pdev, - pdev->get_devname(), vparam.vdev_name, pdev->data_size()); - - add_pdev_to_vdev(vdev, pdev, total_chunk_num_in_pdev); - total_created_chunks += total_chunk_num_in_pdev; - } - - LOGINFO("{} chunks is created for vdev {}, expected {}", total_created_chunks, vparam.vdev_name, vparam.num_chunks); - // Handle any initialization needed. - vdev->init(); - // Locate and write the vdev info in the super blk area of all pdevs this vdev will be created on - for (auto& pdev : pdevs) { - uint64_t offset = hs_super_blk::vdev_sb_offset() + (vdev_id * vdev_info::size); - pdev->write_super_block(buf, vdev_info::size, offset); - } - - vinfo->~vdev_info(); - hs_utils::iobuf_free(buf, sisl::buftag::superblk); - LOGINFO("Virtal Dev={} of size={} successfully created", vparam.vdev_name, in_bytes(vparam.vdev_size)); - return vdev; -} - -void DeviceManager::compose_vparam(uint64_t vdev_id, vdev_parameters& vparam, std::vector< PhysicalDev* > pdevs) { // Identify the number of chunks if (vparam.multi_pdev_opts == vdev_multi_pdev_opts_t::ALL_PDEV_STRIPED) { auto total_streams = std::accumulate(pdevs.begin(), pdevs.end(), 0u, @@ -465,69 +317,74 @@ void DeviceManager::compose_vparam(uint64_t vdev_id, vdev_parameters& vparam, st "adjusted as follows: VDev_Size={} Num_pdevs={} Total_chunks_across_all_pdevs={} Each_Chunk_Size={}", vparam.vdev_name, in_bytes(input_vdev_size), vdev_id, vparam.multi_pdev_opts, in_bytes(vparam.vdev_size), pdevs.size(), vparam.num_chunks, in_bytes(vparam.chunk_size)); -} -// The actual total chunk num might be not the same as vdev.num_chunks, as it is calculated based on the pdevs data -// size proportion. 
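The hunk that follows restores the proportional placement rule inline in create_vdev(): each pdev receives a share of num_chunks proportional to its data size, and the float truncation means the per-pdev counts can sum to slightly less than num_chunks, which is why created and expected totals are logged separately. The arithmetic on its own, as a sketch (split_chunks() is hypothetical):

#include <cstdint>
#include <map>
#include <numeric>
#include <vector>

static std::map< PhysicalDev*, uint32_t > split_chunks(const std::vector< PhysicalDev* >& pdevs,
                                                       uint32_t num_chunks) {
    uint64_t const total = std::accumulate(pdevs.begin(), pdevs.end(), 0ull,
                                           [](uint64_t r, const PhysicalDev* p) { return r + p->data_size(); });
    std::map< PhysicalDev*, uint32_t > per_pdev;
    for (auto* pdev : pdevs) {
        // Truncation is what keeps the created total at or below num_chunks
        per_pdev[pdev] = static_cast< uint32_t >(num_chunks * (pdev->data_size() / static_cast< float >(total)));
    }
    return per_pdev;
}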
-std::map< PhysicalDev*, uint32_t > -DeviceManager::calculate_vdev_chunk_num_on_new_pdevs(shared< VirtualDev > vdev, std::vector< PhysicalDev* > pdevs, - uint64_t total_chunk_num) { - auto added_pdevs = vdev->get_pdevs(); - uint64_t total_pdev_data_size = 0; - uint32_t chunk_num = 0; - if (added_pdevs.size() == 0) { - // vdev is created newly, so we need to calculate the total bytes of all pdevs - total_pdev_data_size = std::accumulate(pdevs.begin(), pdevs.end(), 0ull, - [](uint64_t r, const PhysicalDev* a) { return r + a->data_size(); }); - chunk_num = total_chunk_num; - LOGDEBUG("total size of type {} in this homestore is {}", vdev->get_dev_type(), total_pdev_data_size) - } else { - // vdev is recovered from existing pdevs, in this case, calculate the number of chunks needed based on the - // proportional relationship between the size of the new disk and the existing disks. - total_pdev_data_size = std::accumulate(added_pdevs.begin(), added_pdevs.end(), 0ull, + // Convert the vparameters to the vdev_info + auto buf = hs_utils::iobuf_alloc(vdev_info::size, sisl::buftag::superblk, pdevs[0]->align_size()); + auto vinfo = new (buf) vdev_info(); + populate_vdev_info(vparam, vdev_id, pdevs, vinfo); + + // Do a callback for the upper layer to create the vdev instance from vdev_info + shared< VirtualDev > vdev = m_vdev_create_cb(*vinfo, false /* load_existing */); + m_vdevs[vdev_id] = vdev; + + // different type might have different capacity, so we need to spread all the newly created chunks to all pdevs + // according to their capacity + + // the total size of all pdevs of a certain type + uint64_t total_type_size = std::accumulate(pdevs.begin(), pdevs.end(), 0ull, [](uint64_t r, const PhysicalDev* a) { return r + a->data_size(); }); - chunk_num = vdev->get_total_chunk_num(); - LOGDEBUG("size of all added pdevs={}, current_chunk_num={} of type {} in vdev {}", total_pdev_data_size, - chunk_num, vdev->get_dev_type(), vdev->info().get_name()); - } - std::map< PhysicalDev*, uint32_t > pdev_chunk_num_map; - for (auto pdev : pdevs) { - if (added_pdevs.contains(pdev)) { - LOGDEBUG("pdev {} is already added to vdev {}, skip it", pdev->get_devname(), vdev->info().get_name()); - continue; + + LOGINFO("total size of type {} in this homestore is {}", vparam.dev_type, total_type_size); + + uint32_t total_created_chunks{0}; + + for (auto& pdev : pdevs) { + if (total_created_chunks >= vparam.num_chunks) break; + std::vector< uint32_t > chunk_ids; + + // the total number of chunks that will be created in this pdev + auto total_chunk_num_in_pdev = + static_cast< uint32_t >(vparam.num_chunks * (pdev->data_size() / static_cast< float >(total_type_size))); + + RELEASE_ASSERT(vparam.num_chunks >= total_chunk_num_in_pdev, + "chunks in pdev {} is {}, larger than total chunks {}, which are expected to be created", + pdev->get_devname(), total_chunk_num_in_pdev, vparam.num_chunks); + + LOGINFO("{} chunks are created on pdev {} for vdev {}, pdev data size is {}", total_chunk_num_in_pdev, + pdev->get_devname(), vparam.vdev_name, pdev->data_size()); + + // Create chunk ids for all chunks in each of these pdevs + for (uint32_t c{0}; c < total_chunk_num_in_pdev; ++c) { + auto chunk_id = m_chunk_id_bm.get_next_reset_bit(0u); + if (chunk_id == sisl::Bitset::npos) { throw std::out_of_range("System has no room for additional chunks"); } + m_chunk_id_bm.set_bit(chunk_id); + chunk_ids.push_back(chunk_id); + } - // the device size is expected to be the same, so multiple should be an integer, and chunk_num can be divisible - // by
multiple. - auto multiple = static_cast< float >(total_pdev_data_size) / pdev->data_size(); - auto expect_chunk_num_on_pdev = static_cast< uint32_t >(chunk_num / multiple); - auto available_chunks_on_pdev = static_cast< uint32_t >(pdev->data_size() / vdev->info().chunk_size); - pdev_chunk_num_map[pdev] = std::min(expect_chunk_num_on_pdev, available_chunks_on_pdev); - LOGINFO("pdev {} should add {} chunks to vdev {} , expect_chunk_num_on_pdev={}, available_chunks_on_pdev={}, " - "pdev_size={}", - pdev->get_devname(), pdev_chunk_num_map[pdev], vdev->info().get_name(), expect_chunk_num_on_pdev, - available_chunks_on_pdev, pdev->data_size()); - } - return pdev_chunk_num_map; -} -void DeviceManager::add_pdev_to_vdev(shared< VirtualDev > vdev, PhysicalDev* pdev, uint32_t chunks_on_pdev) { - std::vector< uint32_t > chunk_ids; + // Create all chunks at one shot and add each one to the vdev + auto chunks = pdev->create_chunks(chunk_ids, vdev_id, vparam.chunk_size); + for (auto& chunk : chunks) { + vdev->add_chunk(chunk, true /* fresh_chunk */); + m_chunks[chunk->chunk_id()] = chunk; + } - LOGINFO("Add pdev {} to vdev {}, chunks_on_pdev={}", pdev->get_devname(), vdev->info().get_name(), chunks_on_pdev); - // Create chunk ids for all chunks in each of these pdevs - for (uint32_t c{0}; c < chunks_on_pdev; ++c) { - auto chunk_id = m_chunk_id_bm.get_next_reset_bit(0u); - if (chunk_id == sisl::Bitset::npos) { throw std::out_of_range("System has no room for additional chunks"); } - m_chunk_id_bm.set_bit(chunk_id); - chunk_ids.push_back(chunk_id); + total_created_chunks += total_chunk_num_in_pdev; } - // Create all chunks at one shot and add each one to the vdev - auto chunks = pdev->create_chunks(chunk_ids, vdev->info().get_vdev_id(), vdev->info().chunk_size); - for (auto& chunk : chunks) { - vdev->add_chunk(chunk, true /* fresh_chunk */); - m_chunks[chunk->chunk_id()] = chunk; + LOGINFO("{} chunks are created for vdev {}, expected {}", total_created_chunks, vparam.vdev_name, vparam.num_chunks); + // Handle any initialization needed. + vdev->init(); + + // Locate and write the vdev info in the super blk area of all pdevs this vdev will be created on + for (auto& pdev : pdevs) { + uint64_t offset = hs_super_blk::vdev_sb_offset() + (vdev_id * vdev_info::size); + pdev->write_super_block(buf, vdev_info::size, offset); } + + vinfo->~vdev_info(); + hs_utils::iobuf_free(buf, sisl::buftag::superblk); + LOGINFO("Virtual Dev={} of size={} successfully created", vparam.vdev_name, in_bytes(vparam.vdev_size)); + return vdev; } void DeviceManager::load_vdevs() { @@ -547,7 +404,6 @@ void DeviceManager::load_vdevs() { for (auto& pdev : m_all_pdevs) { // we might have some missing pdevs in the sparse_vector m_all_pdevs, so skip them if (!pdev) continue; - // Empty device will skip this callback. pdev->load_chunks([this](cshared< Chunk >& chunk) -> bool { // Found a chunk for which vdev information is missing if (m_vdevs[chunk->vdev_id()] == nullptr) { @@ -664,7 +520,7 @@ uint32_t DeviceManager::populate_pdev_info(const dev_info& dinfo, const iomgr::d const uuid_t& uuid, pdev_info_header& pinfo) { bool hdd = is_hdd(dinfo.dev_name); - pinfo.pdev_id = ++m_first_blk_hdr.cur_pdev_id; + pinfo.pdev_id = m_cur_pdev_id++; pinfo.mirror_super_block = hdd ?
0x01 : 0x00; pinfo.max_pdev_chunks = hs_super_blk::max_chunks_in_pdev(dinfo); @@ -716,23 +572,6 @@ static void populate_vdev_info(const vdev_parameters& vparam, uint32_t vdev_id, out_info->compute_checksum(); } -// This function populates the vdev_parameters from the vdev_info(loaded from existing disks) in the vdev recovery -// process. Because vdev_info doesn't store chunk_num, leave vparam.chunk_num empty and it will be calculated in -// `compose_vparam` as an intermediate param to calculate the chunk num on each pdev. -static void populate_vparam(vdev_parameters& vparam, vdev_info& vinfo) { - vparam.vdev_size = vinfo.vdev_size; - vparam.chunk_size = vinfo.chunk_size; - vparam.blk_size = vinfo.blk_size; - vparam.multi_pdev_opts = static_cast< vdev_multi_pdev_opts_t >(vinfo.multi_pdev_choice); - vparam.dev_type = static_cast< HSDevType >(vinfo.hs_dev_type); - vparam.vdev_name = vinfo.name; - vparam.context_data = sisl::blob{vinfo.get_user_private(), vinfo.user_private_size}; - vparam.alloc_type = static_cast< blk_allocator_type_t >(vinfo.alloc_type); - vparam.chunk_sel_type = static_cast< chunk_selector_type_t >(vinfo.chunk_sel_type); - vparam.size_type = vinfo.size_type; - vparam.use_slab_allocator = vinfo.use_slab_allocator == 1; -} - std::vector< vdev_info > DeviceManager::read_vdev_infos(const std::vector< PhysicalDev* >& pdevs) { std::vector< vdev_info > ret_vinfos; auto buf = diff --git a/src/lib/device/hs_super_blk.h b/src/lib/device/hs_super_blk.h index 18a9e963b..9d0a3140d 100644 --- a/src/lib/device/hs_super_blk.h +++ b/src/lib/device/hs_super_blk.h @@ -77,18 +77,16 @@ struct disk_attr { struct first_block_header { static constexpr const char* PRODUCT_NAME{"HomeStore4x"}; static constexpr size_t s_product_name_size{64}; - static constexpr uint32_t CURRENT_SUPERBLOCK_VERSION{5}; + static constexpr uint32_t CURRENT_SUPERBLOCK_VERSION{4}; public: - uint64_t gen_number{0}; // Generation count of this structure, will be incremented on every fields change - uint32_t version{0}; // Version Id of this structure + uint64_t gen_number{0}; // Generation count of this structure + uint32_t version{0}; // Version Id of this structure char product_name[s_product_name_size]{}; // Product name uint32_t num_pdevs{0}; // Total number of pdevs homestore is being created on uint32_t max_vdevs{0}; // Max VDevs possible, this cannot be changed post formatting uint32_t max_system_chunks{0}; // Max Chunks possible, this cannot be changed post formatting - uint32_t cur_pdev_id{0}; // The current max pdev id of all formatted disks and used to assign next pdev id for new - // disks. It is a monotonically increasing value and is not inherited in case of disk replacement. 
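Note that this hunk winds CURRENT_SUPERBLOCK_VERSION back from 5 to 4 and removes the cur_pdev_id field, so a binary built from this series could meet first blocks written by the newer layout. One way a load path could gate on that, sketched purely as an assumption (no such helper exists in this series):

static bool superblock_version_supported(const first_block_header& hdr) {
    // Anything newer than what this binary writes cannot be parsed safely;
    // older-but-known versions would need explicit migration handling.
    return hdr.version <= first_block_header::CURRENT_SUPERBLOCK_VERSION;
}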
uuid_t system_uuid; public: @@ -102,7 +100,6 @@ struct first_block_header { get_product_name(), get_system_uuid_str()); return str; } - bool is_empty() const { return gen_number == 0 && version == 0 && std::string(product_name).empty(); } }; struct pdev_info_header { diff --git a/src/lib/device/journal_vdev.cpp b/src/lib/device/journal_vdev.cpp index 3e4dda2a0..6ca2678fc 100644 --- a/src/lib/device/journal_vdev.cpp +++ b/src/lib/device/journal_vdev.cpp @@ -24,8 +24,6 @@ #include #include #include -#include -#include "replication/repl_dev/raft_repl_dev.h" #include "device/chunk.h" #include "device/device.h" #include "device/physical_dev.hpp" @@ -36,8 +34,6 @@ #include "common/resource_mgr.hpp" #include "common/crash_simulator.hpp" -SISL_LOGGING_DECL(journalvdev) - namespace homestore { JournalVirtualDev::JournalVirtualDev(DeviceManager& dmgr, const vdev_info& vinfo, vdev_event_cb_t event_cb) : VirtualDev{dmgr, vinfo, std::move(event_cb), false /* is_auto_recovery */} { @@ -46,15 +42,15 @@ JournalVirtualDev::JournalVirtualDev(DeviceManager& dmgr, const vdev_info& vinfo m_init_private_data = std::make_shared< JournalChunkPrivate >(); m_chunk_pool = std::make_unique< ChunkPool >( dmgr, - ChunkPool::Params{HS_DYNAMIC_CONFIG(generic.journal_chunk_pool_capacity), - [this]() { - m_init_private_data->created_at = get_time_since_epoch_ms(); - m_init_private_data->end_of_chunk = m_vdev_info.chunk_size; - sisl::blob private_blob{r_cast< uint8_t* >(m_init_private_data.get()), - sizeof(JournalChunkPrivate)}; - return private_blob; - }, - m_vdev_info.hs_dev_type, m_vdev_info.vdev_id, m_vdev_info.chunk_size}); + ChunkPool::Params{ + HS_DYNAMIC_CONFIG(generic.journal_chunk_pool_capacity), + [this]() { + m_init_private_data->created_at = get_time_since_epoch_ms(); + m_init_private_data->end_of_chunk = m_vdev_info.chunk_size; + sisl::blob private_blob{r_cast< uint8_t* >(m_init_private_data.get()), sizeof(JournalChunkPrivate)}; + return private_blob; + }, + m_vdev_info.hs_dev_type, m_vdev_info.vdev_id, m_vdev_info.chunk_size}); resource_mgr().register_journal_vdev_exceed_cb([this]([[maybe_unused]] int64_t dirty_buf_count, bool critical) { // either it is critical or non-critical, call cp_flush; diff --git a/src/lib/device/physical_dev.hpp b/src/lib/device/physical_dev.hpp index 41eb9221d..33010f34d 100644 --- a/src/lib/device/physical_dev.hpp +++ b/src/lib/device/physical_dev.hpp @@ -31,7 +31,6 @@ #include #include "hs_super_blk.h" -SISL_LOGGING_DECL(device) namespace homestore { class PhysicalDevMetrics : public sisl::MetricsGroupWrapper { diff --git a/src/lib/device/vchunk.cpp b/src/lib/device/vchunk.cpp index ecc9c132b..a809450d1 100644 --- a/src/lib/device/vchunk.cpp +++ b/src/lib/device/vchunk.cpp @@ -25,8 +25,6 @@ const uint8_t* VChunk::get_user_private() const { return m_internal_chunk->user_ blk_num_t VChunk::get_total_blks() const { return m_internal_chunk->blk_allocator()->get_total_blks(); } -blk_num_t VChunk::get_used_blks() const { return m_internal_chunk->blk_allocator()->get_used_blks(); } - void VChunk::reset() { m_internal_chunk->blk_allocator_mutable()->reset(); } blk_num_t VChunk::available_blks() const { return m_internal_chunk->blk_allocator()->available_blks(); } @@ -35,8 +33,6 @@ blk_num_t VChunk::get_defrag_nblks() const { return m_internal_chunk->blk_alloca uint32_t VChunk::get_pdev_id() const { return m_internal_chunk->physical_dev()->pdev_id(); } -const std::string& VChunk::get_pdev_name() const { return m_internal_chunk->physical_dev()->get_devname(); } - uint16_t 
VChunk::get_chunk_id() const { return m_internal_chunk->chunk_id(); } uint64_t VChunk::size() const { return m_internal_chunk->size(); } diff --git a/src/lib/device/virtual_dev.cpp b/src/lib/device/virtual_dev.cpp index b06859b0b..a3f060e4a 100644 --- a/src/lib/device/virtual_dev.cpp +++ b/src/lib/device/virtual_dev.cpp @@ -34,6 +34,7 @@ #include #include #include +#include #include "device/chunk.h" #include "device/physical_dev.hpp" @@ -48,8 +49,6 @@ #include "blkalloc/append_blk_allocator.h" #include "blkalloc/fixed_blk_allocator.h" -SISL_LOGGING_DECL(device) - namespace homestore { static std::shared_ptr< BlkAllocator > create_blk_allocator(blk_allocator_type_t btype, uint32_t vblock_size, @@ -202,35 +201,12 @@ BlkAllocStatus VirtualDev::alloc_contiguous_blks(blk_count_t nblks, blk_alloc_hi return ret; } -BlkAllocStatus VirtualDev::alloc_n_contiguous_blks(blk_count_t nblks, blk_alloc_hints hints, MultiBlkId& out_blkid) { - BlkAllocStatus ret; - try { - MultiBlkId mbid; - if (!hints.is_contiguous) { - HS_DBG_ASSERT(false, "Expected alloc_contiguous_blk call to be with hints.is_contiguous=true"); - hints.is_contiguous = true; - } - ret = alloc_blks(nblks, hints, mbid); - - if (ret == BlkAllocStatus::SUCCESS || (ret == BlkAllocStatus::PARTIAL && hints.partial_alloc_ok)) { - out_blkid = mbid; - } - - // for failure case, fall through and return the status to caller; - } catch (const std::exception& e) { - ret = BlkAllocStatus::FAILED; - HS_DBG_ASSERT(0, "{}", e.what()); - } - return ret; -} - BlkAllocStatus VirtualDev::alloc_blks(blk_count_t nblks, blk_alloc_hints const& hints, MultiBlkId& out_blkid) { try { // First select a chunk to allocate it from BlkAllocStatus status; Chunk* chunk; size_t attempt{0}; - auto start_time = Clock::now(); if (hints.chunk_id_hint) { // this is a target-chunk allocation; chunk = m_dmgr.get_chunk_mutable(*(hints.chunk_id_hint)); @@ -241,7 +217,7 @@ BlkAllocStatus VirtualDev::alloc_blks(blk_count_t nblks, blk_alloc_hints const& do { chunk = m_chunk_selector->select_chunk(nblks, hints).get(); if (chunk == nullptr) { - status = BlkAllocStatus::BLK_ALLOC_NONE; + status = BlkAllocStatus::SPACE_FULL; break; } @@ -258,7 +234,6 @@ BlkAllocStatus VirtualDev::alloc_blks(blk_count_t nblks, blk_alloc_hints const& COUNTER_INCREMENT(m_metrics, vdev_num_alloc_failure, 1); } - HISTOGRAM_OBSERVE(m_metrics, blk_alloc_latency, get_elapsed_time_us(start_time)); return status; } catch (const std::exception& e) { LOGERROR("exception happened {}", e.what()); @@ -276,29 +251,23 @@ BlkAllocStatus VirtualDev::alloc_blks(blk_count_t nblks, blk_alloc_hints const& h.is_contiguous = true; blk_count_t nblks_remain = nblks; BlkAllocStatus status; - auto start_time = Clock::now(); + do { - MultiBlkId mbid; - status = alloc_n_contiguous_blks(nblks_remain, h, mbid); + out_blkids.emplace_back(); // Put an empty MultiBlkId and use that for allocating them + BlkId& out_bid = out_blkids.back(); + status = alloc_contiguous_blks(nblks_remain, h, out_bid); + + auto nblks_this_iter = out_bid.blk_count(); + nblks_remain = (nblks_remain < nblks_this_iter) ? 
0 : (nblks_remain - nblks_this_iter); + if (status != BlkAllocStatus::SUCCESS && status != BlkAllocStatus::PARTIAL) { out_blkids.pop_back(); // all chunks has been tried, but still failed to allocate; // break out and return status to caller; break; } - - blk_count_t nblks_this_iter = 0; - auto it = mbid.iterate(); - while (auto const b = it.next()) { - nblks_this_iter += (*b).blk_count(); - out_blkids.emplace_back(*b); - } - - nblks_remain = (nblks_remain < nblks_this_iter) ? 0 : (nblks_remain - nblks_this_iter); - } while (nblks_remain); - HISTOGRAM_OBSERVE(m_metrics, blk_alloc_latency, get_elapsed_time_us(start_time)); return status; } @@ -315,14 +284,6 @@ BlkAllocStatus VirtualDev::alloc_blks_from_chunk(blk_count_t nblks, blk_alloc_hi chunk->blk_allocator_mutable()->free(out_blkid); out_blkid = MultiBlkId{}; status = BlkAllocStatus::FAILED; - } else if (status == BlkAllocStatus::SUCCESS || status == BlkAllocStatus::PARTIAL) { - blk_count_t nblks_alloc = 0; - auto it = out_blkid.iterate(); - while (auto const b = it.next()) { - nblks_alloc += (*b).blk_count(); - } - // Inform chunk selector on the number of blks alloced - m_chunk_selector->on_alloc_blk(chunk->chunk_id(), nblks_alloc); } return status; @@ -339,8 +300,6 @@ void VirtualDev::free_blk(BlkId const& bid, VDevCPContext* vctx) { if (!chunk) HS_DBG_ASSERT(false, "chunk is missing for blkid {}", b.to_string()); BlkAllocator* allocator = chunk->blk_allocator_mutable(); allocator->free(b); - // Inform chunk selector on the number of blks freed - m_chunk_selector->on_free_blk(chunk->chunk_id(), b.blk_count()); } }; @@ -371,7 +330,9 @@ folly::Future< std::error_code > VirtualDev::async_write(const char* buf, uint32 HS_DBG_ASSERT_EQ(bid.is_multi(), false, "async_write needs individual pieces of blkid - not MultiBlkid"); #ifdef _PRERELEASE - if (hs()->crash_simulator().is_crashed()) { return folly::makeFuture< std::error_code >(std::error_code()); } + if (hs()->crash_simulator().is_in_crashing_phase()) { + return folly::makeFuture< std::error_code >(std::error_code()); + } #endif Chunk* chunk; @@ -393,7 +354,9 @@ folly::Future< std::error_code > VirtualDev::async_write(const char* buf, uint32 folly::Future< std::error_code > VirtualDev::async_write(const char* buf, uint32_t size, cshared< Chunk >& chunk, uint64_t offset_in_chunk) { #ifdef _PRERELEASE - if (hs()->crash_simulator().is_crashed()) { return folly::makeFuture< std::error_code >(std::error_code()); } + if (hs()->crash_simulator().is_in_crashing_phase()) { + return folly::makeFuture< std::error_code >(std::error_code()); + } #endif if (sisl_unlikely(!is_chunk_available(chunk))) { @@ -414,7 +377,9 @@ folly::Future< std::error_code > VirtualDev::async_writev(const iovec* iov, cons bool part_of_batch) { HS_DBG_ASSERT_EQ(bid.is_multi(), false, "async_writev needs individual pieces of blkid - not MultiBlkid"); #ifdef _PRERELEASE - if (hs()->crash_simulator().is_crashed()) { return folly::makeFuture< std::error_code >(std::error_code()); } + if (hs()->crash_simulator().is_in_crashing_phase()) { + return folly::makeFuture< std::error_code >(std::error_code()); + } #endif Chunk* chunk; @@ -436,7 +401,9 @@ folly::Future< std::error_code > VirtualDev::async_writev(const iovec* iov, cons folly::Future< std::error_code > VirtualDev::async_writev(const iovec* iov, const int iovcnt, cshared< Chunk >& chunk, uint64_t offset_in_chunk) { #ifdef _PRERELEASE - if (hs()->crash_simulator().is_crashed()) { return folly::makeFuture< std::error_code >(std::error_code()); } + if 
(hs()->crash_simulator().is_in_crashing_phase()) { + return folly::makeFuture< std::error_code >(std::error_code()); + } #endif if (sisl_unlikely(!is_chunk_available(chunk))) { @@ -457,7 +424,7 @@ folly::Future< std::error_code > VirtualDev::async_writev(const iovec* iov, cons ////////////////////////// sync write section ////////////////////////////////// std::error_code VirtualDev::sync_write(const char* buf, uint32_t size, BlkId const& bid) { #ifdef _PRERELEASE - if (hs()->crash_simulator().is_crashed()) { return std::error_code{}; } + if (hs()->crash_simulator().is_in_crashing_phase()) { return std::error_code{}; } #endif HS_DBG_ASSERT_EQ(bid.is_multi(), false, "sync_write needs individual pieces of blkid - not MultiBlkid"); @@ -475,7 +442,7 @@ std::error_code VirtualDev::sync_write(const char* buf, uint32_t size, BlkId con std::error_code VirtualDev::sync_write(const char* buf, uint32_t size, cshared< Chunk >& chunk, uint64_t offset_in_chunk) { #ifdef _PRERELEASE - if (hs()->crash_simulator().is_crashed()) { return std::error_code{}; } + if (hs()->crash_simulator().is_in_crashing_phase()) { return std::error_code{}; } #endif HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", chunk->physical_dev_mutable()->pdev_id(), @@ -491,7 +458,7 @@ std::error_code VirtualDev::sync_writev(const iovec* iov, int iovcnt, BlkId cons HS_DBG_ASSERT_EQ(bid.is_multi(), false, "sync_writev needs individual pieces of blkid - not MultiBlkid"); #ifdef _PRERELEASE - if (hs()->crash_simulator().is_crashed()) { return std::error_code{}; } + if (hs()->crash_simulator().is_in_crashing_phase()) { return std::error_code{}; } #endif Chunk* chunk; @@ -515,7 +482,7 @@ std::error_code VirtualDev::sync_writev(const iovec* iov, int iovcnt, BlkId cons std::error_code VirtualDev::sync_writev(const iovec* iov, int iovcnt, cshared< Chunk >& chunk, uint64_t offset_in_chunk) { #ifdef _PRERELEASE - if (hs()->crash_simulator().is_crashed()) { return std::error_code{}; } + if (hs()->crash_simulator().is_in_crashing_phase()) { return std::error_code{}; } #endif if (sisl_unlikely(!is_chunk_available(chunk))) { @@ -575,6 +542,12 @@ std::error_code VirtualDev::sync_read(char* buf, uint32_t size, BlkId const& bid return chunk->physical_dev_mutable()->sync_read(buf, size, dev_offset); } +std::pair< std::error_code, sisl::io_blob_safe > VirtualDev::sync_read(BlkId const& bid) { + auto buf = sisl::io_blob_safe(bid.blk_count() * block_size(), align_size(), sisl::buftag::common); + auto ec = sync_read(charptr_cast(buf.bytes()), buf.size(), bid); + return std::pair(ec, std::move(buf)); +} + std::error_code VirtualDev::sync_read(char* buf, uint32_t size, cshared< Chunk >& chunk, uint64_t offset_in_chunk) { if (sisl_unlikely(!is_chunk_available(chunk))) { return std::make_error_code(std::errc::resource_unavailable_try_again); @@ -729,10 +702,8 @@ void VirtualDev::cp_flush(VDevCPContext* v_cp_ctx) { CP* cp = v_cp_ctx->cp(); // pass down cp so that underlying components can get their customized CP context if needed; - m_chunk_selector->foreach_chunks([this, cp](cshared< Chunk >& chunk) { - HS_LOG(TRACE, device, "Flushing chunk: {}, vdev: {}", chunk->chunk_id(), m_vdev_info.name); - chunk->blk_allocator_mutable()->cp_flush(cp); - }); + m_chunk_selector->foreach_chunks( + [this, cp](cshared< Chunk >& chunk) { chunk->blk_allocator_mutable()->cp_flush(cp); }); // All of the blkids which were captured in the current vdev cp context will now be freed and hence available for // allocation on the new CP dirty collection session which is 
ongoing diff --git a/src/lib/device/virtual_dev.hpp b/src/lib/device/virtual_dev.hpp index d853acac2..eb6b63192 100644 --- a/src/lib/device/virtual_dev.hpp +++ b/src/lib/device/virtual_dev.hpp @@ -54,7 +54,6 @@ class VirtualDevMetrics : public sisl::MetricsGroupWrapper { REGISTER_COUNTER(default_chunk_allocation_cnt, "default chunk allocation count"); REGISTER_COUNTER(random_chunk_allocation_cnt, "random chunk allocation count"); // ideally it should be zero for hdd - REGISTER_HISTOGRAM(blk_alloc_latency, "Blk allocation latency", "blk_alloc_latency"); register_me_to_farm(); } @@ -139,13 +138,6 @@ class VirtualDev { /// @return BlkAllocStatus : Status about the allocation virtual BlkAllocStatus alloc_contiguous_blks(blk_count_t nblks, blk_alloc_hints const& hints, BlkId& out_blkid); - /// @brief This method allocates multiple contiguous blocks in the vdev - /// @param nblks : Number of blocks to allocate - /// @param hints : Hints about block allocation, (specific device to allocate, stream etc) - /// @param out_blkid : Reference to where allocated MultiBlkId to be placed - /// @return BlkAllocStatus : Status about the allocation - virtual BlkAllocStatus alloc_n_contiguous_blks(blk_count_t nblks, blk_alloc_hints hints, MultiBlkId& out_blkid); - /// @brief This method allocates blocks in the vdev and it could be non-contiguous, hence multiple BlkIds are /// returned /// @param nblks : Number of blocks to allocate @@ -253,6 +245,8 @@ class VirtualDev { // TODO: This needs to be removed once Journal starting to use AppendBlkAllocator std::error_code sync_read(char* buf, uint32_t size, cshared< Chunk >& chunk, uint64_t offset_in_chunk); + std::pair< std::error_code, sisl::io_blob_safe > sync_read(BlkId const& bid); + /// @brief Synchronously read the data for a given BlkId to vector of buffers /// @param iov : Vector of buffer to write read to /// @param iovcnt : Count of buffer diff --git a/src/lib/homestore.cpp b/src/lib/homestore.cpp index 141ff2063..f7e4f9019 100644 --- a/src/lib/homestore.cpp +++ b/src/lib/homestore.cpp @@ -28,11 +28,9 @@ #include #include #include -#include #include #include -#include "index/wb_cache.hpp" #include "common/homestore_utils.hpp" #include "common/homestore_config.hpp" #include "common/homestore_assert.hpp" @@ -42,7 +40,9 @@ #include "device/virtual_dev.hpp" #include "common/resource_mgr.hpp" #include "meta/meta_sb.hpp" +#ifdef REPLICATION_SUPPORT #include "replication/service/generic_repl_svc.h" +#endif #include "common/crash_simulator.hpp" /* @@ -57,10 +57,8 @@ namespace homestore { HomeStoreSafePtr HomeStore::s_instance{nullptr}; -static std::unique_ptr< FaultContainmentCallback > s_fc_cb; static std::unique_ptr< IndexServiceCallbacks > s_index_cbs; -static shared< ChunkSelector > s_custom_data_chunk_selector{nullptr}; -static shared< ChunkSelector > s_custom_index_chunk_selector{nullptr}; +static shared< ChunkSelector > s_custom_chunk_selector{nullptr}; static shared< ReplApplication > s_repl_app{nullptr}; std::string version = PACKAGE_VERSION; @@ -69,40 +67,46 @@ HomeStore* HomeStore::instance() { return s_instance.get(); } -HomeStore& HomeStore::with_fault_containment(std::unique_ptr< FaultContainmentCallback > cb) { - m_services.svcs |= HS_SERVICE::FAULT_CMT; - s_fc_cb = std::move(cb); - return *this; +HomeStore::HomeStore() { + REGISTER_LOG_MODS(btree, device, blkalloc, cp, logstore, replication, journalvdev); + + // Always start the meta service + for (uint32_t i{0}; i < enum_count< ServiceType >(); ++i) { + m_services.emplace_back(std::vector< 
ServiceSubType >{}); + } + m_services[uint32_cast(ServiceType::META)].push_back(ServiceSubType::DEFAULT); } HomeStore& HomeStore::with_data_service(cshared< ChunkSelector >& custom_chunk_selector) { - m_services.svcs |= HS_SERVICE::DATA; - m_services.svcs &= ~HS_SERVICE::REPLICATION; // ReplicationDataSvc or DataSvc are mutually exclusive - s_custom_data_chunk_selector = std::move(custom_chunk_selector); + m_services[uint32_cast(ServiceType::DATA)] = std::vector< ServiceSubType >{1u, ServiceSubType::DEFAULT}; + m_services[uint32_cast(ServiceType::REPLICATION)].clear(); // ReplicationDataSvc or DataSvc are mutually exclusive + s_custom_chunk_selector = std::move(custom_chunk_selector); return *this; } HomeStore& HomeStore::with_index_service(std::unique_ptr< IndexServiceCallbacks > cbs, - cshared< ChunkSelector >& custom_chunk_selector) { - m_services.svcs |= HS_SERVICE::INDEX; + std::vector< ServiceSubType > sub_types) { + m_services[uint32_cast(ServiceType::INDEX)] = std::move(sub_types); s_index_cbs = std::move(cbs); - s_custom_index_chunk_selector = std::move(custom_chunk_selector); return *this; } HomeStore& HomeStore::with_log_service() { - m_services.svcs |= HS_SERVICE::LOG; + m_services[uint32_cast(ServiceType::LOG)] = std::vector< ServiceSubType >{1u, ServiceSubType::DEFAULT}; return *this; } +#ifdef REPLICATION_SUPPORT HomeStore& HomeStore::with_repl_data_service(cshared< ReplApplication >& repl_app, cshared< ChunkSelector >& custom_chunk_selector) { - m_services.svcs |= HS_SERVICE::REPLICATION | HS_SERVICE::LOG; - m_services.svcs &= ~HS_SERVICE::DATA; // ReplicationDataSvc or DataSvc are mutually exclusive + m_services[uint32_cast(ServiceType::REPLICATION)] = std::vector< ServiceSubType >{1u, ServiceSubType::DEFAULT}; + m_services[uint32_cast(ServiceType::LOG)] = std::vector< ServiceSubType >{1u, ServiceSubType::DEFAULT}; + m_services[uint32_cast(ServiceType::DATA)].clear(); // ReplicationDataSvc or DataSvc are mutually exclusive s_repl_app = repl_app; - s_custom_data_chunk_selector = std::move(custom_chunk_selector); + s_custom_chunk_selector = std::move(custom_chunk_selector); return *this; } +#endif #ifdef _PRERELEASE HomeStore& HomeStore::with_crash_simulator(std::function< void(void) > cb) { @@ -111,6 +115,23 @@ HomeStore& HomeStore::with_crash_simulator(std::function< void(void) > cb) { } #endif +std::string HomeStore::services_list() const { + std::string str; + if (has_meta_service()) { str += "meta,"; } + if (has_data_service()) { str += "data,"; } + if (has_index_service()) { + for (auto const& sub_type : m_services[uint32_cast(ServiceType::INDEX)]) { + if (sub_type == ServiceSubType::DEFAULT) { str += "index_default,"; } + if (sub_type == ServiceSubType::INDEX_BTREE_COPY_ON_WRITE) { str += "index_copy_on_write,"; } + if (sub_type == ServiceSubType::INDEX_BTREE_INPLACE) { str += "index_inplace_btree,"; } + if (sub_type == ServiceSubType::INDEX_BTREE_MEMORY) { str += "index_mem_btree,"; } + } + } + if (has_log_service()) { str += "log,"; } + if (has_repl_data_service()) { str += "replication,"; } + return str; +} + bool HomeStore::start(const hs_input_params& input, hs_before_services_starting_cb_t svcs_starting_cb) { auto& hs_config = HomeStoreStaticConfig::instance(); hs_config.input = input; @@ -155,8 +176,8 @@ bool HomeStore::start(const hs_input_params& input, hs_before_services_starting_ HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size) < s_cast< int >(data_fetch_max_size_in_byte)) { LOGERROR("max_grpc_message_size {} is too small to hold max_data_size {}, 
max_snapshot_batch_size {} and " "data_fetch_max_size {}", - HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size), input.max_data_size, input.max_snapshot_batch_size, - data_fetch_max_size_in_byte); + HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size), input.max_data_size, + input.max_snapshot_batch_size, data_fetch_max_size_in_byte); throw std::invalid_argument("max_grpc_message_size is insufficient for the configured data or snapshot sizes"); } @@ -166,34 +187,31 @@ bool HomeStore::start(const hs_input_params& input, hs_before_services_starting_ if (m_crash_simulator == nullptr) { m_crash_simulator = std::make_unique< CrashSimulator >(nullptr); } #endif - LOGINFO("Homestore is loading with following services: {}", m_services.list()); + LOGINFO("Homestore is loading with following services: {}", services_list()); if (has_meta_service()) { m_meta_service = std::make_unique< MetaBlkService >(); } if (has_index_service()) { m_index_service = - std::make_unique< IndexService >(std::move(s_index_cbs), std::move(s_custom_index_chunk_selector)); + std::make_unique< IndexService >(std::move(s_index_cbs), m_services[uint32_cast(ServiceType::INDEX)]); } if (has_repl_data_service()) { m_log_service = std::make_unique< LogStoreService >(); - m_data_service = std::make_unique< BlkDataService >(std::move(s_custom_data_chunk_selector)); + m_data_service = std::make_unique< BlkDataService >(std::move(s_custom_chunk_selector)); +#ifdef REPLICATION_SUPPORT m_repl_service = GenericReplService::create(std::move(s_repl_app)); +#endif } else { if (has_log_service()) { m_log_service = std::make_unique< LogStoreService >(); } if (has_data_service()) { - m_data_service = std::make_unique< BlkDataService >(std::move(s_custom_data_chunk_selector)); + m_data_service = std::make_unique< BlkDataService >(std::move(s_custom_chunk_selector)); } } - if (has_fc_service()) { m_fc_service = std::make_unique< FaultContainmentService >(std::move(s_fc_cb)); } - m_cp_mgr = std::make_unique< CPManager >(); m_dev_mgr = std::make_unique< DeviceManager >(input.devices, bind_this(HomeStore::create_vdev_cb, 2)); if (!m_dev_mgr->is_first_time_boot()) { m_dev_mgr->load_devices(); - if (input.has_fast_dev()) { - hs_utils::set_btree_mempool_size(m_dev_mgr->atomic_page_size({HSDevType::Fast})); - } else { - hs_utils::set_btree_mempool_size(m_dev_mgr->atomic_page_size({HSDevType::Data})); - } + hs_utils::set_btree_mempool_size( + m_dev_mgr->atomic_page_size(input.has_fast_dev() ? 
HSDevType::Fast : HSDevType::Data)); do_start(); return false; } else { @@ -201,10 +219,10 @@ bool HomeStore::start(const hs_input_params& input, hs_before_services_starting_ } } -void HomeStore::format_and_start(std::map< uint32_t, hs_format_params >&& format_opts) { +void HomeStore::format_and_start(std::map< ServiceId, hs_format_params >&& format_opts) { std::map< HSDevType, float > total_pct_by_type = {{HSDevType::Fast, 0.0f}, {HSDevType::Data, 0.0f}}; // Accumulate total percentage of services on each device type - for (const auto& [svc_type, fparams] : format_opts) { + for (const auto& [_, fparams] : format_opts) { total_pct_by_type[fparams.dev_type] += fparams.size_pct; } @@ -232,31 +250,28 @@ void HomeStore::format_and_start(std::map< uint32_t, hs_format_params >&& format } #endif m_dev_mgr->format_devices(); - if (HomeStoreStaticConfig::instance().input.has_fast_dev()) { - hs_utils::set_btree_mempool_size(m_dev_mgr->atomic_page_size({HSDevType::Fast})); - } else { - hs_utils::set_btree_mempool_size(m_dev_mgr->atomic_page_size({HSDevType::Data})); - } + hs_utils::set_btree_mempool_size( + m_dev_mgr->atomic_page_size(HS_STATIC_CONFIG(input).has_fast_dev() ? HSDevType::Fast : HSDevType::Data)); std::vector< folly::Future< std::error_code > > futs; - for (const auto& [svc_type, fparams] : format_opts) { + for (const auto& [svc_id, fparams] : format_opts) { if (fparams.size_pct == 0) { continue; } - if ((svc_type & HS_SERVICE::META) && has_meta_service()) { + if ((svc_id.type == ServiceType::META) && has_meta_service()) { m_meta_service->create_vdev(pct_to_size(fparams.size_pct, fparams.dev_type), fparams.dev_type, fparams.num_chunks); - } else if ((svc_type & HS_SERVICE::LOG) && has_log_service()) { + } else if ((svc_id.type == ServiceType::LOG) && has_log_service()) { futs.emplace_back(m_log_service->create_vdev(pct_to_size(fparams.size_pct, fparams.dev_type), fparams.dev_type, fparams.chunk_size)); - } else if ((svc_type & HS_SERVICE::INDEX) && has_index_service()) { - m_index_service->create_vdev(pct_to_size(fparams.size_pct, fparams.dev_type), fparams.dev_type, - fparams.num_chunks, fparams.chunk_sel_type); - } else if ((svc_type & HS_SERVICE::DATA) && has_data_service()) { + } else if ((svc_id.type == ServiceType::INDEX) && has_index_service()) { + m_index_service->create_vdev(svc_id.sub_type, pct_to_size(fparams.size_pct, fparams.dev_type), + fparams.dev_type, fparams.num_chunks); + } else if ((svc_id.type == ServiceType::DATA) && has_data_service()) { m_data_service->create_vdev(pct_to_size(fparams.size_pct, fparams.dev_type), fparams.dev_type, fparams.block_size, fparams.alloc_type, fparams.chunk_sel_type, fparams.num_chunks, fparams.chunk_size); - } else if ((svc_type & HS_SERVICE::REPLICATION) && has_repl_data_service()) { + } else if ((svc_id.type == ServiceType::REPLICATION) && has_repl_data_service()) { m_data_service->create_vdev(pct_to_size(fparams.size_pct, fparams.dev_type), fparams.dev_type, fparams.block_size, fparams.alloc_type, fparams.chunk_sel_type, fparams.num_chunks, fparams.chunk_size); @@ -291,7 +306,9 @@ void HomeStore::do_start() { if (has_index_service()) { m_index_service->start(); } if (has_repl_data_service()) { +#ifdef REPLICATION_SUPPORT s_cast< GenericReplService* >(m_repl_service.get())->start(); // Replservice starts logstore & data service +#endif } else { if (has_data_service()) { m_data_service->start(); } if (has_log_service() && inp_params.auto_recovery) { @@ -305,7 +322,7 @@ void HomeStore::do_start() { // boot going forward on next reboot. 
if (m_dev_mgr->is_first_time_boot()) { // Take the first CP after we have initialized all subsystems and wait for it to complete. - m_cp_mgr->trigger_cp_flush(true /* force */).get(); + m_cp_mgr->trigger_cp_flush(true /* force */, CPTriggerReason::Timer).get(); m_dev_mgr->commit_formatting(); } @@ -323,39 +340,32 @@ void HomeStore::shutdown() { LOGINFO("Homestore shutdown is started"); - m_resource_mgr->stop(); - - // 1 stop all the services, after which all the upper layer api call are rejected and there is not on-going request. - // Note that, after stopping, all the service are alive. - if (has_repl_data_service()) - // Log and Data services are stopped by repl service - s_cast< GenericReplService* >(m_repl_service.get())->stop(); - else { - if (has_log_service()) m_log_service->stop(); - if (has_data_service()) m_data_service->stop(); - } - - if (has_index_service()) m_index_service->stop(); - - // 2 call cp_manager shutdown, which will which trigger cp flush to make sure all the in-memory data of all the - // services are flushed to disk. since all the upper layer api call are rejected and there is not on-going request, - // so after cp flush is done, we can guarantee all the necessary data are persisted to disk. m_cp_mgr->shutdown(); m_cp_mgr.reset(); - // 3 call reset/shutdown to clear all the services and after that all the services are dead, excluding metasevice + m_resource_mgr->stop(); + if (has_repl_data_service()) { +#ifdef REPLICATION_SUPPORT + // Log and Data services are stopped by repl service + s_cast< GenericReplService* >(m_repl_service.get())->stop(); m_log_service.reset(); m_data_service.reset(); m_repl_service.reset(); +#endif } else { - if (has_log_service()) m_log_service.reset(); - if (has_data_service()) m_data_service.reset(); + if (has_log_service()) { + m_log_service->stop(); + m_log_service.reset(); + } + if (has_data_service()) { m_data_service.reset(); } } - if (has_index_service()) m_index_service.reset(); + if (has_index_service()) { + m_index_service->stop(); + // m_index_service.reset(); + } - // 4 close metaservice and device_manager. 
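Stepping back to the first-time-boot block at the top of this hunk: it pairs with DeviceManager::format_devices() and commit_formatting() from earlier in this patch to form a two-phase commit. First blocks are initially written with formatting_done = 0x0, and only after the first forced CP flush completes is the flag rewritten as 0x1, so a crash anywhere in between re-enters the first-time-boot path instead of trusting half-initialized superblocks. The sequence condensed as a sketch (first_boot_commit() is hypothetical):

void first_boot_commit(CPManager& cp_mgr, DeviceManager& dev_mgr) {
    // Phase 1 happened at format time: every first block carries
    // formatting_done = 0x0, so a crash before this point replays the format.
    cp_mgr.trigger_cp_flush(true /* force */, CPTriggerReason::Timer).get();

    // Phase 2: rewrite each first block with formatting_done = 0x1; subsequent
    // boots take the load_devices() path.
    dev_mgr.commit_formatting();
}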
if (has_meta_service()) { m_meta_service->stop(); m_meta_service.reset(); @@ -378,7 +388,7 @@ cap_attrs HomeStore::get_system_capacity() const { // cap.data_capacity = get_data_blkstore()->get_size(); // } if (has_index_service()) { - cap.used_index_size = m_index_service->used_size(); + cap.used_index_size = m_index_service->space_occupied(); cap.meta_capacity += m_index_service->total_size(); } if (has_log_service()) { @@ -396,15 +406,13 @@ cap_attrs HomeStore::get_system_capacity() const { bool HomeStore::is_first_time_boot() const { return m_dev_mgr->is_first_time_boot(); } -bool HomeStore::has_index_service() const { return m_services.svcs & HS_SERVICE::INDEX; } -bool HomeStore::has_data_service() const { return m_services.svcs & HS_SERVICE::DATA; } -bool HomeStore::has_repl_data_service() const { return m_services.svcs & HS_SERVICE::REPLICATION; } -bool HomeStore::has_meta_service() const { return m_services.svcs & HS_SERVICE::META; } -bool HomeStore::has_log_service() const { - auto const s = m_services.svcs; - return (s & HS_SERVICE::LOG); +bool HomeStore::has_index_service() const { return (m_services[uint32_cast(ServiceType::INDEX)].size() != 0); } +bool HomeStore::has_data_service() const { return (m_services[uint32_cast(ServiceType::DATA)].size() != 0); } +bool HomeStore::has_repl_data_service() const { + return (m_services[uint32_cast(ServiceType::REPLICATION)].size() != 0); } -bool HomeStore::has_fc_service() const { return (m_services.svcs & HS_SERVICE::FAULT_CMT); } +bool HomeStore::has_meta_service() const { return (m_services[uint32_cast(ServiceType::META)].size() != 0); } +bool HomeStore::has_log_service() const { return (m_services[uint32_cast(ServiceType::LOG)].size() != 0); } #if 0 void HomeStore::init_cache() { @@ -449,7 +457,9 @@ shared< VirtualDev > HomeStore::create_vdev_cb(const vdev_info& vinfo, bool load break; case hs_vdev_type_t::INDEX_VDEV: - if (has_index_service()) { ret_vdev = m_index_service->open_vdev(vinfo, load_existing); } + if (has_index_service()) { + ret_vdev = m_index_service->open_vdev(vdev_context->sub_type, vinfo, load_existing); + } break; case hs_vdev_type_t::DATA_VDEV: diff --git a/src/lib/index/CMakeLists.txt b/src/lib/index/CMakeLists.txt index 7bad58240..2bdd749dd 100644 --- a/src/lib/index/CMakeLists.txt +++ b/src/lib/index/CMakeLists.txt @@ -2,10 +2,14 @@ cmake_minimum_required(VERSION 3.11) include_directories (BEFORE ../) +add_subdirectory(cow_btree) +add_subdirectory(mem_btree) + set(INDEX_SOURCE_FILES index_service.cpp + btree_base.cpp index_cp.cpp - wb_cache.cpp ) add_library(hs_index OBJECT ${INDEX_SOURCE_FILES}) target_link_libraries(hs_index ${COMMON_DEPS}) +#add_dependencies(hs_index hs_cow_btree hs_mem_btree) diff --git a/src/lib/index/btree_base.cpp b/src/lib/index/btree_base.cpp new file mode 100644 index 000000000..f2da67036 --- /dev/null +++ b/src/lib/index/btree_base.cpp @@ -0,0 +1,455 @@ +#include +#include +#include +#include +#include +#include "common/homestore_assert.hpp" + +namespace homestore { +BtreeBase::BtreeBase(BtreeConfig const& cfg, uuid_t uuid, uuid_t parent_uuid, uint32_t user_sb_size) : + Index::Index{cfg.store_type() == IndexStore::Type::MEM_BTREE}, + m_bt_cfg{cfg}, + m_metrics{m_bt_cfg.name().c_str()} { + m_sb.create(sizeof(IndexSuperBlock)); + m_sb->uuid = uuid; + m_sb->parent_uuid = parent_uuid; + m_sb->user_sb_size = user_sb_size; + m_sb->index_store_type = cfg.store_type(); + m_sb->ordinal = hs()->index_service().reserve_ordinal(); + + auto bt_sb = new (m_sb.get()->underlying_index_sb.data()) 
BtreeSuperBlock(); + m_store = + std::static_pointer_cast< BtreeStore >(hs()->index_service().lookup_or_create_store(cfg.store_type(), {})); + + if (m_bt_cfg.m_btree_name.empty()) { m_bt_cfg.m_btree_name = "btree" + std::to_string(m_sb->ordinal); } + + // Determine the correct node size + auto const max_node_size = m_store->max_node_size(); + if ((m_bt_cfg.m_node_size == 0) || (m_bt_cfg.m_node_size > max_node_size)) { m_bt_cfg.m_node_size = max_node_size; } + m_bt_cfg.finalize(sizeof(BtreeNode::PersistentHeader)); + + // Create the underlying btree instance + m_bt_private = std::move(m_store->create_underlying_btree(*this, false /* load_existing */)); + + bt_sb->node_size = m_bt_cfg.m_node_size; + m_sb.write(); +} + +BtreeBase::BtreeBase(BtreeConfig const& cfg, superblk< IndexSuperBlock >&& sb) : + Index::Index{cfg.store_type() == IndexStore::Type::MEM_BTREE}, + m_bt_cfg{cfg}, + m_metrics{m_bt_cfg.name().c_str()} { + HS_REL_ASSERT_EQ(cfg.store_type(), sb->index_store_type, + "Config requirement and super block differs in store_type"); + m_sb = std::move(sb); + m_store = + std::static_pointer_cast< BtreeStore >(hs()->index_service().lookup_or_create_store(cfg.store_type(), {})); + + if (m_bt_cfg.m_btree_name.empty()) { m_bt_cfg.m_btree_name = "btree" + std::to_string(m_sb->ordinal); } + + // Retrieve the correct node_size + auto bt_sb = r_cast< BtreeSuperBlock* >(m_sb.get()->underlying_index_sb.data()); + m_bt_cfg.m_node_size = bt_sb->node_size; + HS_DBG_ASSERT_NE(bt_sb->node_size, 0, "Invalid node_size in the btree super block"); + HS_DBG_ASSERT_LE(bt_sb->node_size, m_store->max_node_size(), + "Node size in btree super block, exceeds store max node size"); + m_bt_cfg.finalize(sizeof(BtreeNode::PersistentHeader)); + + m_bt_private = std::move(m_store->create_underlying_btree(*this, true /* load_existing*/)); + m_root_node_info = m_bt_private->load_root_node_id(); +} + +BtreeBase::~BtreeBase() = default; +uint32_t BtreeBase::node_size() const { return m_bt_cfg.node_size(); } + +uint64_t BtreeBase::space_occupied() const { return m_bt_private->space_occupied(); } + +uint32_t BtreeBase::ordinal() const { return m_sb->ordinal; } + +std::string BtreeBase::name() const { return m_bt_cfg.name(); } + +BtreeRouteTracer& BtreeBase::route_tracer() { return m_route_tracer; } + +#define lock_node(a, b, c) _lock_node(a, b, c, __FILE__, __LINE__) + +btree_status_t BtreeBase::create_root_node() { + auto cpg = bt_cp_guard(); + auto cp_context = cpg.context(cp_consumer_t::INDEX_SVC); + + // Assign one node as root node and also create a child leaf node and set it as edge + BtreeNodePtr root = create_leaf_node(cp_context); + if (root == nullptr) { return btree_status_t::space_not_avail; } + + root->set_level(0u); + auto ret = write_node(root, cp_context); + if (ret != btree_status_t::success) { + remove_node(root, locktype_t::NONE, cp_context); + return btree_status_t::space_not_avail; + } + + m_root_node_info = BtreeLinkInfo{root->node_id(), root->link_version()}; + ret = m_bt_private->on_root_changed(root, cp_context); + if (ret != btree_status_t::success) { + remove_node(root, locktype_t::NONE, cp_context); + m_root_node_info = BtreeLinkInfo{}; + } + return ret; +} + +btree_status_t BtreeBase::read_and_lock_node(bnodeid_t id, BtreeNodePtr& node_ptr, locktype_t int_lock_type, + locktype_t leaf_lock_type, CPContext* context) const { + auto ret = m_bt_private->read_node(id, node_ptr); + if (node_ptr == nullptr) { + BT_LOG(ERROR, "read failed, reason: {}", ret); + return ret; + } + + auto acq_lock = 
(node_ptr->is_leaf()) ? leaf_lock_type : int_lock_type; + ret = lock_node(node_ptr, acq_lock, context); + if (ret != btree_status_t::success) { BT_LOG(ERROR, "Node lock and refresh failed"); } + + return ret; +} + +btree_status_t BtreeBase::get_child_and_lock_node(const BtreeNodePtr& node, uint32_t index, BtreeLinkInfo& child_info, + BtreeNodePtr& child_node, locktype_t int_lock_type, + locktype_t leaf_lock_type, CPContext* context) const { + if (index == node->total_entries()) { + if (!node->has_valid_edge()) { + BT_NODE_LOG_ASSERT(false, node, "Child index {} does not have valid bnode_id", index); + return btree_status_t::not_found; + } + child_info = node->get_edge_value(); + } else { + BT_NODE_LOG_ASSERT_LT(index, node->total_entries(), node); + node->get_nth_value(index, &child_info, false /* copy */); + } + + return (read_and_lock_node(child_info.bnode_id(), child_node, int_lock_type, leaf_lock_type, context)); +} + +btree_status_t BtreeBase::write_node(const BtreeNodePtr& node, CPContext* context) { + COUNTER_INCREMENT_IF_ELSE(m_metrics, node->is_leaf(), btree_leaf_node_writes, btree_int_node_writes, 1); + HISTOGRAM_OBSERVE_IF_ELSE(m_metrics, node->is_leaf(), btree_leaf_node_occupancy, btree_int_node_occupancy, + ((node_size() - node->available_size()) * 100) / node_size()); + + return (m_bt_private->write_node(node, context)); +} + +/* Caller of this API doesn't expect read to fail in any circumstance */ +void BtreeBase::read_node_or_fail(bnodeid_t id, BtreeNodePtr& node) const { + BT_NODE_REL_ASSERT_EQ(m_bt_private->read_node(id, node), btree_status_t::success, node); +} + +/* + * This function upgrades the parent node and child node locks from read lock to write lock and takes the required + * steps if things have changed during the upgrade. + * + * Inputs: + * parent_node - Parent Node to upgrade + * child_node - Child Node to upgrade + * parent_cur_lock - Lock type currently held on the parent node + * child_cur_lock - Lock type currently held on the child node + * context - Context to pass down + * + * Returns - If successfully able to upgrade both the nodes, return success; else return the status of the failing + * lock_node() call, or retry if the nodes changed while unlocked. In case of non-success, all node locks are + * released. + * + * NOTE: This function expects both the parent_node and child_node to be already locked. Parent node is + * expected to be read locked and child node could be either read or write locked. + */ +btree_status_t BtreeBase::upgrade_node_locks(const BtreeNodePtr& parent_node, const BtreeNodePtr& child_node, + locktype_t& parent_cur_lock, locktype_t& child_cur_lock, + CPContext* context) { + btree_status_t ret = btree_status_t::success; + + auto const parent_prev_gen = parent_node->node_gen(); + auto const child_prev_gen = child_node->node_gen(); + + unlock_node(child_node, child_cur_lock); + unlock_node(parent_node, parent_cur_lock); + + ret = lock_node(parent_node, locktype_t::WRITE, context); + if (ret != btree_status_t::success) { + parent_cur_lock = child_cur_lock = locktype_t::NONE; + return ret; + } + + ret = lock_node(child_node, locktype_t::WRITE, context); + if (ret != btree_status_t::success) { + unlock_node(parent_node, locktype_t::WRITE); + parent_cur_lock = child_cur_lock = locktype_t::NONE; + return ret; + } + + // If the nodes have changed between the unlock and the relock (for example, a node was invalidated, probably by a + // merge), ask the caller to start over again.
+ if (parent_node->is_node_deleted() || (parent_prev_gen != parent_node->node_gen()) || + child_node->is_node_deleted() || (child_prev_gen != child_node->node_gen())) { + unlock_node(child_node, locktype_t::WRITE); + unlock_node(parent_node, locktype_t::WRITE); + parent_cur_lock = child_cur_lock = locktype_t::NONE; + return btree_status_t::retry; + } + + parent_cur_lock = child_cur_lock = locktype_t::WRITE; +#if 0 +#ifdef _PRERELEASE + { + auto time = iomgr_flip::instance()->get_test_flip< uint64_t >("btree_upgrade_delay"); + if (time) { std::this_thread::sleep_for(std::chrono::microseconds{time.get()}); } + } +#endif +#endif + +#if 0 +#ifdef _PRERELEASE + { + int is_leaf = 0; + + if (child_node && child_node->is_leaf()) { is_leaf = 1; } + if (iomgr_flip::instance()->test_flip("btree_upgrade_node_fail", is_leaf)) { + unlock_node(my_node, cur_lock); + cur_lock = locktype_t::NONE; + if (child_node) { + unlock_node(child_node, child_cur_lock); + child_cur_lock = locktype_t::NONE; + } + ret = btree_status_t::retry; + } + } +#endif +#endif + + return ret; +} + +btree_status_t BtreeBase::upgrade_node_lock(const BtreeNodePtr& node, locktype_t& cur_lock, CPContext* context) { + auto const prev_gen = node->node_gen(); + + unlock_node(node, cur_lock); + cur_lock = locktype_t::NONE; + + auto ret = lock_node(node, locktype_t::WRITE, context); + if (ret != btree_status_t::success) { return ret; } + + if (node->is_node_deleted() || (prev_gen != node->node_gen())) { + unlock_node(node, locktype_t::WRITE); + return btree_status_t::retry; + } + cur_lock = locktype_t::WRITE; + return ret; +} + +btree_status_t BtreeBase::_lock_node(const BtreeNodePtr& node, locktype_t type, CPContext* context, const char* fname, + int line) const { +#ifdef _DEBUG + _start_of_lock(node, type, fname, line); +#endif + node->lock(type); + + auto ret = m_bt_private->refresh_node(node, (type == locktype_t::WRITE), context); + if (ret != btree_status_t::success) { + node->unlock(type); +#ifdef _DEBUG + end_of_lock(node, type); +#endif + return ret; + } + + return btree_status_t::success; +} + +void BtreeBase::unlock_node(const BtreeNodePtr& node, locktype_t type) const { + node->unlock(type); +#ifdef _DEBUG + auto time_spent = end_of_lock(node, type); + observe_lock_time(node, type, time_spent); +#endif +} + +BtreeNodePtr BtreeBase::create_leaf_node(CPContext* context) { + BtreeNodePtr n = m_bt_private->create_node(true /* is_leaf */, context); + if (n) { + COUNTER_INCREMENT(m_metrics, btree_leaf_node_count, 1); + ++m_total_nodes; + } + return n; +} + +BtreeNodePtr BtreeBase::create_interior_node(CPContext* context) { + BtreeNodePtr n = m_bt_private->create_node(false /* is_leaf */, context); + if (n) { + COUNTER_INCREMENT(m_metrics, btree_int_node_count, 1); + ++m_total_nodes; + } + return n; +} + +BtreeNodePtr BtreeBase::clone_temp_node(const BtreeNode& node) { + BtreeNodePtr tmp_node = new_node(node.node_id(), node.is_leaf(), BtreeNode::Allocator::default_token); + tmp_node->overwrite(node); + return tmp_node; +} + +[[nodiscard]] CPGuard BtreeBase::bt_cp_guard() { return CPGuard{is_ephemeral() ? nullptr : &(cp_mgr())}; } + +/* Note:- This function assumes that access of this node is thread safe. 
*/ + +void BtreeBase::remove_node(const BtreeNodePtr& node, locktype_t cur_lock, CPContext* context) { + BT_NODE_LOG(TRACE, node, "Removing node"); + + COUNTER_DECREMENT_IF_ELSE(m_metrics, node->is_leaf(), btree_leaf_node_count, btree_int_node_count, 1); + if (cur_lock != locktype_t::NONE) { + BT_NODE_DBG_ASSERT_NE(cur_lock, locktype_t::READ, node, "We can't remove a node with read lock type right?"); + node->set_node_deleted(); + unlock_node(node, cur_lock); + } + --m_total_nodes; + + m_bt_private->remove_node(node, context); + // intrusive_ptr_release(node.get()); +} + +#ifdef _DEBUG +void BtreeBase::observe_lock_time(const BtreeNodePtr& node, locktype_t type, uint64_t time_spent) const { + if (time_spent == 0) { return; } + + if (type == locktype_t::READ) { + HISTOGRAM_OBSERVE_IF_ELSE(m_metrics, node->is_leaf(), btree_inclusive_time_in_leaf_node, + btree_inclusive_time_in_int_node, time_spent); + } else { + HISTOGRAM_OBSERVE_IF_ELSE(m_metrics, node->is_leaf(), btree_exclusive_time_in_leaf_node, + btree_exclusive_time_in_int_node, time_spent); + } +} + +void BtreeBase::_start_of_lock(const BtreeNodePtr& node, locktype_t ltype, const char* fname, int line) { + NodeLockInfo info; + + info.fname = fname; + info.line = line; + + info.start_time = Clock::now(); + info.node = node.get(); + if (ltype == locktype_t::WRITE) { + thread_vars()->wr_locked_nodes.push_back(info); + LOGTRACEMOD(btree, "ADDING node {} to write locked nodes list, its size={}", (void*)info.node, + thread_vars()->wr_locked_nodes.size()); + } else if (ltype == locktype_t::READ) { + thread_vars()->rd_locked_nodes.push_back(info); + LOGTRACEMOD(btree, "ADDING node {} to read locked nodes list, its size={}", (void*)info.node, + thread_vars()->rd_locked_nodes.size()); + } else { + DEBUG_ASSERT(false, "Invalid locktype_t {}", ltype); + } +} + +bool BtreeBase::remove_locked_node(const BtreeNodePtr& node, locktype_t ltype, NodeLockInfo* out_info) { + auto pnode_infos = (ltype == locktype_t::WRITE) ? &thread_vars()->wr_locked_nodes : &thread_vars()->rd_locked_nodes; + + if (!pnode_infos->empty()) { + auto info = pnode_infos->back(); + if (info.node == node.get()) { + *out_info = info; + pnode_infos->pop_back(); + LOGTRACEMOD(btree, "REMOVING node {} from {} locked nodes list, its size = {}", (void*)info.node, + (ltype == locktype_t::WRITE) ? "write" : "read", pnode_infos->size()); + return true; + } else if (pnode_infos->size() > 1) { + info = pnode_infos->at(pnode_infos->size() - 2); + if (info.node == node.get()) { + *out_info = info; + pnode_infos->at(pnode_infos->size() - 2) = pnode_infos->back(); + pnode_infos->pop_back(); + LOGTRACEMOD(btree, "REMOVING node {} from {} locked nodes list, its size = {}", (void*)info.node, + (ltype == locktype_t::WRITE) ? 
"write" : "read", pnode_infos->size()); + return true; + } + } + } + + if (pnode_infos->empty()) { + LOGERRORMOD(btree, "locked_node_list: node = {} not found, locked node list empty", (void*)node.get()); + } else if (pnode_infos->size() == 1) { + LOGERRORMOD(btree, "locked_node_list: node = {} not found, total list count = 1, Expecting node = {}", + (void*)node.get(), (void*)pnode_infos->back().node); + } else { + LOGERRORMOD(btree, "locked_node_list: node = {} not found, total list count = {}, Expecting nodes = {} or {}", + (void*)node.get(), pnode_infos->size(), (void*)pnode_infos->back().node, + (void*)pnode_infos->at(pnode_infos->size() - 2).node); + } + return false; +} + +uint64_t BtreeBase::end_of_lock(const BtreeNodePtr& node, locktype_t ltype) { + NodeLockInfo info; + if (!remove_locked_node(node, ltype, &info)) { + DEBUG_ASSERT(false, "Expected node = {} is not there in locked_node_list", (void*)node.get()); + return 0; + } + // DEBUG_ASSERT_EQ(node.get(), info.node); + return get_elapsed_time_ns(info.start_time); +} + +void BtreeBase::check_lock_debug() { + // both wr_locked_nodes and rd_locked_nodes are thread_local; + // nothing will be dumpped if there is no assert failure; + for (const auto& x : thread_vars()->wr_locked_nodes) { + x.dump(); + } + for (const auto& x : thread_vars()->rd_locked_nodes) { + x.dump(); + } + DEBUG_ASSERT_EQ(thread_vars()->wr_locked_nodes.size(), 0); + DEBUG_ASSERT_EQ(thread_vars()->rd_locked_nodes.size(), 0); +} +#endif + +BtreeRouteTracer::BtreeRouteTracer(uint32_t buf_size_per_op, bool log_if_rolled) : + m_max_buf_size_per_op{buf_size_per_op}, m_log_if_rolled{log_if_rolled} { + m_enabled_ops.reserve(enum_count< BtreeRouteTracer::Op >()); + m_ops_routes.reserve(enum_count< BtreeRouteTracer::Op >()); + + for (uint32_t i{0}; i < enum_count< BtreeRouteTracer::Op >(); ++i) { + m_enabled_ops.push_back(false); + } +} + +void BtreeRouteTracer::append_to(Op op, std::string const& route_str) { + std::string& cur_buf = m_ops_routes[uint32_cast(op)]; + if (!m_enabled_ops[uint32_cast(op)]) { return; } + + std::unique_lock< iomgr::FiberManagerLib::shared_mutex > lock{m_append_mtx}; + while (cur_buf.size() + route_str.size() > m_max_buf_size_per_op) { + size_t head_pos = cur_buf.find("Route size="); + size_t next_pos = cur_buf.find("Route size=", head_pos + 1); + if (m_log_if_rolled) { + // TODO: We need to change this to btree specific log. 
+            LOGINFOMOD(btree, "Btree Route Trace: {}", std::string_view(cur_buf).substr(head_pos, next_pos - head_pos));
+        }
+
+        if (next_pos == std::string::npos) {
+            cur_buf.clear();
+            break;
+        } else {
+            cur_buf.erase(0, next_pos);
+        }
+    }
+    cur_buf.append(route_str);
+}
+
+std::string BtreeRouteTracer::get(Op op) const {
+    std::shared_lock< iomgr::FiberManagerLib::shared_mutex > lock{m_append_mtx};
+    return m_ops_routes[uint32_cast(op)];
+}
+
+std::vector< std::string > BtreeRouteTracer::get_all() const {
+    std::shared_lock< iomgr::FiberManagerLib::shared_mutex > lock{m_append_mtx};
+    return m_ops_routes;
+}
+} // namespace homestore
\ No newline at end of file
diff --git a/src/lib/index/cow_btree/CMakeLists.txt b/src/lib/index/cow_btree/CMakeLists.txt
new file mode 100644
index 000000000..6d8be490d
--- /dev/null
+++ b/src/lib/index/cow_btree/CMakeLists.txt
@@ -0,0 +1,13 @@
+cmake_minimum_required(VERSION 3.11)
+
+include_directories (BEFORE ../)
+
+add_library(hs_cow_btree OBJECT)
+target_sources(hs_cow_btree PRIVATE
+    cow_btree_store.cpp
+    cow_btree.cpp
+    #cow_btree_node.cpp
+    cow_btree_cp.cpp
+    )
+target_link_libraries(hs_cow_btree hs_common ${COMMON_DEPS})
+#add_library(hs_cow_btree OBJECT ${COW_BTREE_SOURCE_FILES})
diff --git a/src/lib/index/cow_btree/cow_btree.cpp b/src/lib/index/cow_btree/cow_btree.cpp
new file mode 100644
index 000000000..0b04a60a8
--- /dev/null
+++ b/src/lib/index/cow_btree/cow_btree.cpp
@@ -0,0 +1,943 @@
+#include
+#include "index/cow_btree/cow_btree.h"
+#include "index/cow_btree/cow_btree_cp.h"
+//#include "index/cow_btree/cow_btree_node.h"
+#include "index/index_cp.h"
+#include "common/homestore_config.hpp"
+#include "common/homestore_utils.hpp"
+#include "common/crash_simulator.hpp"
+#include "device/virtual_dev.hpp"
+
+namespace homestore {
+#define COWBT_PERIODIC_LOG(level, cp_id, ...)                                                                          \
+    HS_PERIODIC_DETAILED_LOG(level, cp, "cp_id", cp_id, "btree", m_base_btree.bt_config().name(), __VA_ARGS__)
+
+static constexpr uint64_t btree_nodeid_bits = sizeof(uint32_t) * 8;
+static constexpr uint64_t btree_ordinal_bits = 64 - btree_nodeid_bits;
+static constexpr uint64_t btree_nodeid_mask = ((1ull << btree_nodeid_bits) - 1);
+static constexpr uint64_t btree_ordinal_mask = ((1ull << btree_ordinal_bits) - 1) << btree_nodeid_bits;
+
+static constexpr uint32_t initial_bnodeid_map_persistent_size = 512 * 1024;
+
+static inline COWBtree::CompactNodeId to_compact_nodeid(bnodeid_t node_id) { return node_id & btree_nodeid_mask; }
+
+static BlkId alloc_blks_or_fail(VirtualDev* vdev, uint32_t size, blk_alloc_hints const& hints) {
+    BlkId out_blkid;
+    BlkAllocStatus status = vdev->alloc_contiguous_blks(size, hints, out_blkid);
+    HS_REL_ASSERT_EQ(status, BlkAllocStatus::SUCCESS,
+                     "No space to write the bnode map, cannot proceed further, crashing the system for now");
+    return out_blkid;
+}
+
+static void write_or_fail(VirtualDev* vdev, sisl::io_blob const& blob, BlkId location) {
+    auto err = vdev->sync_write(r_cast< const char* >(blob.cbytes()), blob.size(), location);
+    HS_REL_ASSERT(!err, "Flush of full map failed with err={}. 
best is to crash the system and replay", err.message());
+}
+
+COWBtree::COWBtree(BtreeBase& bt, shared< VirtualDev > vdev,
+                   shared< sisl::SimpleCache< bnodeid_t, BtreeNodePtr > > cache,
+                   std::vector< unique< Journal > > journals, BtreeNode::Allocator::Token token, bool load_existing) :
+        m_base_btree{bt},
+        m_cache{std::move(cache)},
+        m_nodeid_generator(std::numeric_limits< uint32_t >::max()),
+        m_vdev{std::move(vdev)},
+        m_btree_ordinal{bt.super_blk()->ordinal},
+        m_ordinal_shifted{uint64_cast(m_btree_ordinal) << btree_nodeid_bits},
+        m_bufalloc_token{token} {
+    for (auto& cp_session : m_cp_sessions) {
+        cp_session = std::make_unique< CPSession >(*this);
+    }
+
+    if (load_existing) {
+        m_root_node_id = m_base_btree.bt_super_blk().root_node_id;
+        if (m_root_node_id != empty_bnodeid) {
+            HS_REL_ASSERT_EQ(m_root_node_id & btree_ordinal_mask, m_ordinal_shifted,
+                             "Ordinal of root node_id inside the superblk doesn't match btree's ordinal");
+        }
+
+        // If we have the full map persisted before, recover that
+        for (uint32_t i{0}; i < cow_bt_super_blk().num_map_heads; ++i) {
+            recover_bnode_map(cow_bt_super_blk().map_heads[i]);
+        }
+
+        // Apply all incremental journal entries containing map updates/removes. Each journal_buf listed here
+        // corresponds to a journal written as part of a cp, sorted by the cp_id
+        for (auto& journal : journals) {
+            apply_incremental_map(*journal);
+        }
+    } else {
+        // New COWBtree, format the cow btree superblk area
+        new (m_base_btree.bt_super_blk().underlying_btree_sb.data()) SuperBlock();
+    }
+}
+
+static inline COWBtreeCPContext* to_my_cp_ctx(CPContext* context) {
+    return IndexCPContext::convert< COWBtreeCPContext >(context, IndexStore::Type::COPY_ON_WRITE_BTREE);
+}
+
+BtreeNodePtr COWBtree::create_node(bool is_leaf, CPContext* context) {
+    auto n = m_base_btree.new_node(generate_node_id(), is_leaf, m_bufalloc_token);
+    // COWBtreeNode::construct(n);
+
+    // Add the node to the cache
+    auto status = m_cache->insert(n);
+    HS_REL_ASSERT_EQ(status, sisl::SimpleCacheStatus::success,
+                     "Unable to add alloc'd node to cache, low memory or duplicate inserts?");
+
+    add_to_dirty_list(FlushNodeInfo{n}, to_my_cp_ctx(context));
+    n->set_modified_cp_id(context->id());
+    return n;
+}
+
+btree_status_t COWBtree::write_node(BtreeNodePtr const& node, CPContext*) {
+    // All the required actions are performed during refresh_node with read_modify_write=true
+    return btree_status_t::success;
+}
+
+btree_status_t COWBtree::read_node(bnodeid_t node_id, BtreeNodePtr& node) const {
+retry:
+    // Attempt to locate the node in the cache
+    auto status = m_cache->get(node_id, node);
+    if (status == sisl::SimpleCacheStatus::success) { return btree_status_t::success; }
+
+    // Need to read from the blk, so check that in the map
+    BlkId blkid = get_blkid_for_nodeid(node_id);
+    if (!blkid.is_valid()) { return btree_status_t::not_found; }
+
+    auto raw_buf = BtreeNode::Allocator::get(m_bufalloc_token).alloc_node_buf(m_base_btree.node_size());
+    m_vdev->sync_read(r_cast< char* >(raw_buf), m_base_btree.node_size(), blkid);
+
+    // Initialize the node
+    node = m_base_btree.load_node(raw_buf, node_id, m_bufalloc_token);
+    // COWBtreeNode::construct(node)
+
+    // Add the node to the cache
+    status = m_cache->insert(node);
+    if (status == sisl::SimpleCacheStatus::duplicate) {
+        // There is a race between 2 concurrent reads of the same node; re-read from the cache again
+        // COWBtreeNode::destruct(node.get());
+        goto retry;
+    } else if (status == sisl::SimpleCacheStatus::success) {
+        return btree_status_t::success;
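+        // Illustrative trace of the race handled here: two fibers miss the cache for the same node id concurrently
+        // and both read it from the vdev; the second insert() returns duplicate, so it drops its private copy and
+        // retries, picking up the copy the first fiber already cached. Only a genuine insert failure (e.g., low
+        // memory) falls through to the assert below.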
+ } else { + HS_DBG_ASSERT(false, "Insert read node to cache failed, probably because of low memory status={}", + enum_name(status)); + return btree_status_t::space_not_avail; + } +} + +btree_status_t COWBtree::refresh_node(BtreeNodePtr const& node, bool for_read_modify_write, CPContext* context) { + if (context == nullptr || !for_read_modify_write) { return btree_status_t::success; } + + auto cp_ctx = to_my_cp_ctx(context); + auto const mod_cp_id = node->get_modified_cp_id(); + auto const cur_cp_id = cp_ctx->id(); + if (mod_cp_id == cur_cp_id) { + // For same cp, we don't need a copy, we can rewrite on the same buffer + return btree_status_t::success; + } else if (mod_cp_id > cur_cp_id) { + return btree_status_t::cp_mismatch; // We are asked to provide the buffer of an older CP, which is not + // possible + } else { + add_to_dirty_list(FlushNodeInfo{node}, cp_ctx); + node->set_modified_cp_id(cur_cp_id); + } + return btree_status_t::success; +} + +void COWBtree::remove_node(BtreeNodePtr const& node, CPContext* context) { + // Add the node id to dirty deleted list, which will be applied during the cp flush + auto cp_ctx = to_my_cp_ctx(context); + add_to_remove_list(node->node_id(), cp_ctx); + + // Now we can remove the node from cache. + BtreeNodePtr tmp; + auto status = m_cache->remove(node->node_id(), tmp); + HS_DBG_ASSERT_EQ(status, sisl::SimpleCacheStatus::success, "Race on cache removal of btree blkid?"); +} + +btree_status_t COWBtree::transact_nodes(const BtreeNodeList& new_nodes, const BtreeNodeList& removed_nodes, + const BtreeNodePtr& left_child_node, const BtreeNodePtr& parent_node, + CPContext* context) { + for (const auto& node : new_nodes) { + m_base_btree.write_node(node, context); + } + m_base_btree.write_node(left_child_node, context); + m_base_btree.write_node(parent_node, context); + + for (const auto& node : removed_nodes) { + m_base_btree.remove_node(node, locktype_t::WRITE, context); + } + return btree_status_t::success; +} + +BtreeLinkInfo COWBtree::load_root_node_id() { + return BtreeLinkInfo{m_root_node_id, m_base_btree.bt_super_blk().root_link_version}; +} + +btree_status_t COWBtree::on_root_changed(BtreeNodePtr const& new_root, CPContext* cp_ctx) { + m_root_node_id = new_root->node_id(); + cp_session(cp_ctx->id())->m_new_root_id.store(m_root_node_id); + return btree_status_t::success; +} + +uint64_t COWBtree::space_occupied() const { + size_t num_nodes{0}; + { + std::shared_lock< iomgr::FiberManagerLib::shared_mutex > lg(m_bnodeid_map.m_mtx); + num_nodes = m_bnodeid_map.m_map.size(); + } + + for (auto const& cp_session : m_cp_sessions) { + num_nodes += cp_session->m_modified_nodes.size(); + num_nodes -= cp_session->m_deleted_nodes.size(); + } + + return num_nodes * m_vdev->block_size(); +} + +void COWBtree::destroy() { + // Free all the blks allocated for the nodes + std::unique_lock< iomgr::FiberManagerLib::shared_mutex > lg(m_bnodeid_map.m_mtx); + BtreeNodePtr tmp; + for (auto const [node_id, blkid] : m_bnodeid_map.m_map) { + m_vdev->free_blk(blkid.to_blkid()); + m_cache->remove(m_ordinal_shifted | node_id, tmp); + } + + // Free all the blks allocated for the map + for (auto const& locs : m_bnodeid_map.m_locations) { + m_vdev->free_blk(locs); + } + + // Reset the map, cp_session etc. + m_bnodeid_map.m_map.clear(); + m_bnodeid_map.m_updates_since_last_flush = 0; + m_bnodeid_map.m_locations.clear(); + + // Reset all the dirty nodes, deleted nodes etc. 
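+    // (Teardown order of destroy(), sketched for clarity: free node blks -> free map blks -> clear the in-memory
+    // map -> reset the per-cp session state below -> finally destroy the superblk. Blk frees are tracked only in
+    // the in-memory bitmap until a cp persists them, so a crash mid-destroy leaves nothing half-freed on disk.)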
+    for (auto& cp_session : m_cp_sessions) {
+        for (auto const& finfo : cp_session->m_modified_nodes) {
+            m_cache->remove(finfo.node->node_id(), tmp);
+        }
+        cp_session->finish();
+    }
+
+    // Destroy this btree's superblk, so that it can be re-initialized again.
+    m_base_btree.super_blk().destroy();
+}
+
+//////////////////////// COWBtree specific methods ////////////////////////////////////////
+bnodeid_t COWBtree::generate_node_id() {
+    std::unique_lock lg{m_id_mtx};
+    return (m_ordinal_shifted | m_nodeid_generator.reserve());
+}
+
+BlkId COWBtree::get_blkid_for_nodeid(bnodeid_t nodeid) const { return lookup_bnode_map(to_compact_nodeid(nodeid)); }
+
+void COWBtree::add_to_dirty_list(FlushNodeInfo finfo, COWBtreeCPContext* cp_ctx) {
+    cp_ctx->increment_dirty_size(finfo.node->node_size());
+    cp_session(cp_ctx->id())->m_modified_nodes.emplace_back(std::move(finfo));
+}
+
+void COWBtree::add_to_remove_list(bnodeid_t node_id, COWBtreeCPContext* cp_ctx) {
+    cp_ctx->increment_pending_free_size(m_base_btree.node_size());
+    cp_session(cp_ctx->id())->m_deleted_nodes.push_back(node_id);
+}
+
+// A FlushUnit represents one contiguous blk into which as many btree nodes as possible are packed and written at once
+struct NodeFlushUnit {
+#pragma pack(1)
+    struct JournalEntry {
+        COWBtree::CompactBlkId nodes_location; // Location where nodes from this unit are written
+        uint16_t n_nodes{0};                   // Total number of nodes written
+        COWBtree::CompactNodeId nodes[1];      // Array of node ids written in the blk above
+
+        uint32_t size() const { return size(n_nodes); }
+        static uint32_t size(uint16_t num_nodes) {
+            return sizeof(JournalEntry) + (num_nodes * sizeof(COWBtree::CompactNodeId)) -
+                sizeof(COWBtree::CompactNodeId);
+        }
+    };
+#pragma pack()
+
+    COWBtreeCPContext* m_cp_ctx;
+    JournalEntry* m_jentry{nullptr};
+    std::vector< iovec > m_iovs;
+    std::vector< COWBtree::FlushNodeInfo > m_flush_infos;
+    BlkId m_nodes_location;
+    uint32_t m_nodes_count{0};
+
+    NodeFlushUnit(COWBtreeCPContext* cp_ctx, BlkId location, sisl::blob& journal_area) :
+            m_cp_ctx{cp_ctx}, m_nodes_location{location} {
+        if (journal_area.bytes() != nullptr) {
+            m_jentry = new (journal_area.bytes()) JournalEntry();
+            m_jentry->nodes_location = location;
+        }
+        m_iovs.reserve(location.blk_count());
+        m_flush_infos.reserve(location.blk_count());
+    }
+
+    void add(COWBtree::FlushNodeInfo finfo) {
+        HS_DBG_ASSERT_LT(m_nodes_count, m_nodes_location.blk_count(), "Adding more nodes than blks allocated for");
+        // m_node_bufs.emplace_back(cow_node->get_flush_version_buf(m_cp_ctx->id()));
+        m_iovs.emplace_back(iovec{.iov_base = finfo.bytes(), .iov_len = finfo.node->node_size()});
+        ++m_nodes_count;
+        if (m_jentry) { m_jentry->nodes[m_jentry->n_nodes++] = to_compact_nodeid(finfo.node->node_id()); }
+        m_flush_infos.emplace_back(std::move(finfo));
+    }
+};
+
+struct BNodeMapWriteUnit {
+#pragma pack(1)
+    struct Header {
+        COWBtree::CompactBlkId next_unit_location; // Location of where the next meta (for map) is present
+        uint32_t size{sizeof(Header)};             // Total size of this unit.
+        uint32_t n_entries{0};                     // Total number of entries in this unit
+        uint32_t checksum{0};                      // Checksum excluding this header
+    };
+
+    // One entry per contiguous run of nodeids.
+    struct MapEntry {
+        COWBtree::CompactNodeId nodeid_start{0};
+        uint16_t nodes_count{0};
+        COWBtree::CompactBlkId nodes_locations[1];
+
+        static size_t size(uint32_t count) {
+            return sizeof(MapEntry) + (count ?
(count - 1) * sizeof(COWBtree::CompactBlkId) : 0);
+        }
+        size_t size() const { return size(nodes_count); }
+
+        bool merge_if_possible(COWBtree::CompactNodeId n, COWBtree::CompactBlkId b) {
+            if (nodes_count == 0) {
+                nodeid_start = n;
+                nodes_locations[nodes_count++] = b;
+                return true;
+            } else if ((nodeid_start + nodes_count) == n) {
+                nodes_locations[nodes_count++] = b;
+                return true;
+            }
+            return false;
+        }
+    };
+#pragma pack()
+
+    VirtualDev* m_vdev;
+    sisl::io_blob_safe m_buf;
+    uint32_t m_available_space{0};
+    BlkId m_location;
+    MapEntry* m_cur_entry{nullptr};
+
+public:
+    // Guess the size expecting 64 nodes packed together.
+    static constexpr const uint32_t expected_nodes_packed_per_entry = 64;
+
+    static uint32_t size_guess(uint32_t num_nodes) {
+        return MapEntry::size(num_nodes / expected_nodes_packed_per_entry);
+    }
+
+    static constexpr uint32_t const min_blks_per_write_unit = 128;
+
+    BNodeMapWriteUnit(VirtualDev* vdev, uint32_t nodes_count) : m_vdev{vdev} {
+        m_available_space = sisl::round_up(size_guess(nodes_count), m_vdev->block_size());
+        auto const reqd_blks = (m_available_space - 1) / m_vdev->block_size() + 1;
+
+        // First allocate the blks, then adjust the available space to however much we were able to allocate
+        // contiguously.
+        blk_alloc_hints hints = {.partial_alloc_ok = true,
+                                 .min_blks_per_piece = std::min(reqd_blks, min_blks_per_write_unit)};
+        m_location = alloc_blks_or_fail(m_vdev, m_available_space, hints);
+        m_available_space = m_location.blk_count() * m_vdev->block_size();
+
+        // Allocate a buffer to hold the disk space we just allocated.
+        m_buf = sisl::io_blob_safe(m_available_space, vdev->align_size(), sisl::buftag::metablk);
+        memset(m_buf.bytes(), 0, m_available_space);
+
+        // Initialize the in-memory pointers
+        new (m_buf.bytes()) Header();
+        m_available_space -= sizeof(Header);
+        m_cur_entry = r_cast< MapEntry* >(m_buf.bytes() + sizeof(Header));
+    }
+
+    // Recovery constructor
+    BNodeMapWriteUnit(VirtualDev* vdev, sisl::io_blob_safe buf, BlkId location) :
+            m_vdev{vdev}, m_buf{std::move(buf)}, m_location{location} {
+        HS_DBG_ASSERT_GE(m_buf.size(), header()->size, "Read buf is less than MapWriteUnit size on-disk");
+        HS_REL_ASSERT_EQ(header()->checksum, compute_crc(), "CRC Mismatch on MapWriteUnit");
+
+        m_available_space = m_buf.size() - header()->size;
+        m_cur_entry = header()->n_entries ? r_cast< MapEntry* >(m_buf.bytes() + sizeof(Header)) : nullptr;
+    }
+
+    bool has_room() const { return (m_available_space > MapEntry::size(1)); }
+
+    bool is_empty() const { return (header_const()->size == sizeof(Header)); }
+
+    void add_entry(COWBtree::CompactNodeId n, COWBtree::CompactBlkId b) {
+        HS_REL_ASSERT_EQ(has_room(), true, "Calling add_entry without any room");
+        if (m_cur_entry->merge_if_possible(n, b)) {
+            header()->size += sizeof(COWBtree::CompactBlkId);
+            m_available_space -= sizeof(COWBtree::CompactBlkId);
+        } else {
+            ++(header()->n_entries);
+            m_cur_entry = r_cast< MapEntry* >(uintptr_cast(m_cur_entry) + m_cur_entry->size());
+            m_cur_entry->merge_if_possible(n, b);
+
+            m_available_space -= MapEntry::size(1u);
+            header()->size += MapEntry::size(1u);
+        }
+    }
+
+    MapEntry* next_entry() {
+        MapEntry* ret_entry = m_cur_entry;
+        if (m_cur_entry) {
+            uint8_t* next_ptr = uintptr_cast(m_cur_entry) + m_cur_entry->size();
+            m_cur_entry = (next_ptr > (m_buf.bytes() + m_buf.size())) ?
nullptr : r_cast< MapEntry* >(next_ptr);
+        }
+        return ret_entry;
+    }
+
+    void link(BNodeMapWriteUnit& next) { header()->next_unit_location = COWBtree::CompactBlkId{next.m_location}; }
+
+    sisl::io_blob finalize() {
+        ++(header()->n_entries); // We increment as the last entry would be open until we finalize
+
+        // Trim down the alloc size and actual blks (if we alloced them)
+        auto const occupied_blks = m_location.blk_count() - (m_available_space / m_vdev->block_size());
+        auto const [valid, freeable] = m_location.split(occupied_blks);
+        m_vdev->free_blk(freeable);
+        m_location = valid;
+        m_available_space = 0;
+
+        // Write the checksum
+        header()->checksum = compute_crc();
+
+        return sisl::io_blob{m_buf.cbytes(), valid.blk_count() * m_vdev->block_size(), true /* is_aligned */};
+    }
+
+    Header* header() { return r_cast< Header* >(m_buf.bytes()); }
+    Header const* header_const() const { return r_cast< Header const* >(m_buf.cbytes()); }
+
+    uint32_t compute_crc() const {
+        return crc32_ieee(init_crc32, r_cast< const uint8_t* >(header_const()) + sizeof(Header),
+                          header_const()->size - sizeof(Header));
+    }
+
+    std::string to_string() const {
+        std::string str;
+        auto* hdr = header_const();
+        fmt::vformat_to(
+            std::back_inserter(str),
+            fmt::string_view{
+                "\nLocation: [{}], Header: [next_unit_location:[{}], size={}, n_entries={}, checksum={}]\n"},
+            fmt::make_format_args(m_location.to_string(), hdr->next_unit_location.to_compact_string(), hdr->size,
+                                  hdr->n_entries, hdr->checksum));
+
+        auto* entry = r_cast< MapEntry const* >(m_buf.cbytes() + sizeof(Header));
+        for (uint32_t i{0}; i < hdr->n_entries; ++i) {
+            fmt::vformat_to(
+                std::back_inserter(str), fmt::string_view{" NodeEntry{}: [ids=[{}-{}], locations:["},
+                fmt::make_format_args(i, entry->nodeid_start, entry->nodeid_start + entry->nodes_count - 1));
+
+            for (uint16_t j{0}; j < entry->nodes_count; ++j) {
+                fmt::vformat_to(std::back_inserter(str), fmt::string_view{"[{}],"},
+                                fmt::make_format_args(entry->nodes_locations[j].to_compact_string()));
+            }
+            fmt::format_to(std::back_inserter(str), "]\n");
+            entry = r_cast< MapEntry const* >(r_cast< uint8_t const* >(entry) + entry->size());
+        }
+        return str;
+    }
+};
+
+std::tuple< bool, unique< COWBtree::Journal > > COWBtree::flush_nodes(COWBtreeCPContext* cp_ctx) {
+    CPSession* session = cp_session(cp_ctx->id());
+
+    // We prepare to flush nodes by allocating blks in the vdev to accommodate all the dirty blks. It's obviously not
+    // possible to put all nodes in a single huge contiguous blk. However, it tries to allocate as big as possible and
+    // then pack nodes inside these blks.
+    if (!session->prepare_to_flush_nodes(cp_ctx)) {
+        // Already flushed the cp and moved on.
+        return std::make_tuple(false, nullptr);
+    }
+
+    // 3 steps per btree CP node flush
+    //
+    // Step 1: Flush all the nodes by building flush units (with each unit consisting of 1 contiguous blk worth) and
+    //         while doing so, keep updating the in-memory map as well as adding the map updates to the incremental
+    //         journal.
+    do {
+        auto [location, mod_it, journal_area] = session->next_dirty();
+        if (!location.is_valid()) {
+            break; // We are done with dirty buffers
+        }
+
+        NodeFlushUnit nfunit(cp_ctx, location, journal_area);
+        for (uint16_t i{0}; i < location.blk_count(); ++i) {
+            FlushNodeInfo finfo{std::move(*mod_it)};
+            ++mod_it;
+
+            // Keep updating the full in-memory map of nodeid and blkid.
+            // IMPORTANT NOTE: We do that before actually writing the data.
It is ok to do so, under the assumption that + // there will be no reads into the bnode map while this is being flushed because nodes are cached until + // flush is completed. If for any reason we need to support skipping cache, then we should update this bnode + // map after it has been written. We are doing this here as an optimization to avoid looping for every node + // and then update. + update_bnode_map(to_compact_nodeid(finfo.node->node_id()), CompactBlkId{location, i}, + false /* in_recovery */); + nfunit.add(std::move(finfo)); + } + + auto err = m_vdev->sync_writev(nfunit.m_iovs.data(), int_cast(nfunit.m_iovs.size()), nfunit.m_nodes_location); + HS_REL_ASSERT(!err, "Flush of nodes failed during cp, best is to crash the system and retry on reboot"); + } while (true); + + if (session->done_flushing_nodes()) { + // + // Step 2: During cp io phase, all deleted nodes are tracked, we delete them from in-memory map and also + // build the journal with this delete operation. + // + auto [it, end_it, journal_area] = session->next_deleted(); + CompactNodeId* delete_jentries = r_cast< CompactNodeId* >(journal_area.bytes()); + uint32_t deleted_count = 0; + while (it != end_it) { + auto nodeid = *it; + delete_from_bnode_map(nodeid, false /* in_recovery */); + if (delete_jentries) { delete_jentries[deleted_count++] = nodeid; } + ++it; + } + + COWBT_PERIODIC_LOG(DEBUG, cp_ctx->id(), "Flushed {} dirty nodes and deleted {} nodes", + session->m_modified_count, session->m_deleted_count); + m_bnodeid_map.m_updates_since_last_flush.fetch_add(session->m_modified_count + session->m_deleted_count); + + // If map has to be updated, we need to hold onto the session and it will be completed after that is done. + // Otherwise, we can complete the session now (which means all dirty node list, deleted node list, journal and + // everything has been cleaned) + if (cp_ctx->need_full_map_flush()) { + return std::make_tuple(session->m_modified_count || session->m_deleted_count, + std::move(session->m_journal)); + } else { + auto journal = std::move(session->m_journal); + + // Step 3: Update the new root into the journal + auto new_root = session->new_root_id(); + if (new_root != empty_bnodeid) { journal->header()->new_root_nodeid = to_compact_nodeid(new_root); } + + bool const has_modified = session->m_modified_count || session->m_deleted_count; + session->finish(); + return std::make_tuple(has_modified, std::move(journal)); + } + } else { + return std::make_tuple(false, nullptr); + } +} + +void COWBtree::flush_map(COWBtreeCPContext* cp_ctx) { + HS_DBG_ASSERT(cp_ctx->need_full_map_flush(), "Flush map called on a cp which doesn't need full map flush"); + + if (m_bnodeid_map.m_updates_since_last_flush.load() == 0) { + COWBT_PERIODIC_LOG(DEBUG, cp_ctx->id(), "No update of the bnodeid map since last flush, so ignoring"); + return; + } + + CPSession* session = cp_session(cp_ctx->id()); + auto const entries = session->prepare_to_flush_map(cp_ctx); + auto count = entries.size(); + if (count == 0) { return; } + + auto it = entries.begin(); + + // Worst Estimate of 1 entry per count packed in a single blk + std::vector< BlkId > map_locations; + map_locations.reserve((BNodeMapWriteUnit::MapEntry::size(1) * count) / m_vdev->block_size()); + + auto munit = std::make_unique< BNodeMapWriteUnit >(m_vdev.get(), count); + while (count > 0) { + if (!munit->has_room()) { + auto new_unit = std::make_unique< BNodeMapWriteUnit >(m_vdev.get(), count); + munit->link(*new_unit); + + auto const blob = munit->finalize(); + 
COWBT_PERIODIC_LOG(TRACE, cp_ctx->id(), "Flushing a map unit: {}", munit->to_string());
+            write_or_fail(m_vdev.get(), blob, munit->m_location);
+            map_locations.emplace_back(munit->m_location);
+
+            munit = std::move(new_unit);
+        }
+        munit->add_entry(it->first, it->second);
+        ++it;
+        --count;
+    }
+
+    if (!munit->is_empty()) {
+        auto const blob = munit->finalize();
+        COWBT_PERIODIC_LOG(TRACE, cp_ctx->id(), "Flushing a map unit: {}", munit->to_string());
+        write_or_fail(m_vdev.get(), blob, munit->m_location);
+        map_locations.emplace_back(munit->m_location);
+    }
+
+    auto const [done, all_map_locations] = session->done_flushing_map(std::move(map_locations), entries.size());
+    if (!done) {
+        // There are still other fibers flushing the map.
+        return;
+    }
+
+#ifdef _PRERELEASE
+    if (iomgr_flip::instance()->test_flip("crash_during_full_map_flush", ordinal())) {
+        LOGINFOMOD(btree, "Simulating crash during the full map flush on btree={}", ordinal());
+        hs()->crash_simulator().start_crash();
+    }
+#endif
+
+    // We are the last fiber to finish the parallel flush of the map; it's time to update the superblk with all map
+    // locations, flush the superblk and free up the old map blks.
+    SuperBlock& sb = cow_bt_super_blk();
+    sb.num_map_heads = 0;
+    sb.cp_id = cp_ctx->id();
+    for (auto const& map_locs : all_map_locations) {
+        sb.map_heads[sb.num_map_heads++] = map_locs[0]; // Pick the head of each map loc chain from different fibers
+    }
+
+    // Persist the superblk now with the updated root_id
+    auto root_node = session->new_root_id();
+    if (root_node != empty_bnodeid) { m_base_btree.bt_super_blk().root_node_id = root_node; }
+    m_base_btree.super_blk().write();
+    session->finish();
+
+    // We have completed the flush of the map and now we can free up the old map blks. It is ok if the system crashed
+    // after persisting the superblk containing the new map locations and before freeing these blks, because the free
+    // only updates the in-memory bitmap; upon restart the old blks are no longer referenced and hence will not be
+    // marked as busy.
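+    // Resulting on-disk picture (illustrative): sb.map_heads[f] -> unit -> unit -> ..., one chain per flusher
+    // fiber f, with each unit linked through its Header.next_unit_location. On restart, recover_bnode_map() walks
+    // every chain and apply_incremental_map() then layers any newer journals on top.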
+    for (auto const& loc : m_bnodeid_map.m_locations) {
+        m_vdev->free_blk(loc);
+    }
+
+    // We need to replace the previous map_locations in-memory with this new set of locations where the map is written
+    m_bnodeid_map.m_locations.clear();
+    for (auto const& loc_array : all_map_locations) {
+        m_bnodeid_map.m_locations.insert(m_bnodeid_map.m_locations.end(), loc_array.begin(), loc_array.end());
+    }
+    m_bnodeid_map.m_updates_since_last_flush.store(0); // Reset the count, as we just flushed the full map
+}
+
+void COWBtree::update_bnode_map(CompactNodeId nodeid, CompactBlkId cblkid, bool in_recovery) {
+    auto do_update = [this](CompactNodeId nodeid, CompactBlkId cblkid) -> bool {
+        auto it = m_bnodeid_map.m_map.find(nodeid);
+        bool newly_inserted{false};
+        if (it != m_bnodeid_map.m_map.end()) {
+            m_vdev->free_blk(it->second.to_blkid());
+            it->second = cblkid;
+        } else {
+            m_bnodeid_map.m_map.emplace(nodeid, cblkid);
+            newly_inserted = true;
+        }
+        return newly_inserted;
+    };
+
+    if (in_recovery) {
+        if (do_update(nodeid, cblkid)) { m_nodeid_generator.reserve(nodeid); }
+        m_vdev->commit_blk(cblkid.to_blkid());
+    } else {
+        std::unique_lock< iomgr::FiberManagerLib::shared_mutex > lg(m_bnodeid_map.m_mtx);
+        do_update(nodeid, cblkid);
+    }
+}
+
+void COWBtree::delete_from_bnode_map(CompactNodeId nodeid, bool in_recovery) {
+    auto do_delete = [this](CompactNodeId nodeid) {
+        auto it = m_bnodeid_map.m_map.find(nodeid);
+        if (it != m_bnodeid_map.m_map.end()) {
+            m_vdev->free_blk(it->second.to_blkid());
+            m_bnodeid_map.m_map.erase(it);
+        }
+        m_nodeid_generator.unreserve(nodeid);
+    };
+
+    // Recovery is single threaded, so (as in update_bnode_map above) the lock is only needed outside of recovery.
+    // Looking up through the iterator directly also avoids re-taking the shared lock inside lookup_bnode_map.
+    if (in_recovery) {
+        do_delete(nodeid);
+    } else {
+        std::unique_lock< iomgr::FiberManagerLib::shared_mutex > lg(m_bnodeid_map.m_mtx);
+        do_delete(nodeid);
+    }
+}
+
+BlkId COWBtree::lookup_bnode_map(CompactNodeId nodeid) const {
+    std::shared_lock< iomgr::FiberManagerLib::shared_mutex > lg(m_bnodeid_map.m_mtx);
+    auto const it = m_bnodeid_map.m_map.find(nodeid);
+    return (it == m_bnodeid_map.m_map.cend()) ? BlkId{} : it->second.to_blkid();
+}
+
+void COWBtree::recover_bnode_map(BlkId const& map_loc) {
+    // LOGINFOMOD(btree, "Recovering NodeID to blkid map from location=[{}]", map_loc);
+
+    BlkId next_loc = map_loc;
+    do {
+        auto [ec, buf] = m_vdev->sync_read(next_loc);
+        HS_REL_ASSERT(!ec, "Error while reading bnodeid map, cannot proceed further");
+
+        m_vdev->commit_blk(next_loc);
+        m_bnodeid_map.m_locations.push_back(next_loc);
+
+        BNodeMapWriteUnit munit(m_vdev.get(), std::move(buf), next_loc);
+        for (uint32_t i{0}; i < munit.header()->n_entries; ++i) {
+            BNodeMapWriteUnit::MapEntry* e = munit.next_entry();
+            for (uint32_t n{0}; n < e->nodes_count; ++n) {
+                update_bnode_map(e->nodeid_start + n, e->nodes_locations[n], true /* in_recovery */);
+            }
+        }
+        next_loc = munit.header()->next_unit_location.to_blkid();
+    } while (next_loc.is_valid());
+}
+
+uint32_t COWBtree::align_size() const { return m_vdev->align_size(); }
+
+void COWBtree::apply_incremental_map(Journal& journal) {
+    auto jhdr = journal.header();
+    HS_REL_ASSERT_EQ(jhdr->ordinal, m_btree_ordinal, "Btree ordinal mismatch between journal and in-memory");
+
+    // If the recovered full map has already recorded this cp_id, we should skip these journals
+    if (journal.m_cp_id <= cow_bt_super_blk().cp_id) {
+        SPECIFIC_BT_LOG(INFO, m_base_btree,
+                        "Btree journal for cp_id={} is SKIPPED, because full map already recovered with cp_id={}",
+                        journal.m_cp_id, cow_bt_super_blk().cp_id);
+        return;
+    }
+
+    if (jhdr->new_root_nodeid != EmptyCompactNodeId) {
+        // Root was changed in this incremental map.
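+        // bnodeid_t layout used below (see the masks at the top of this file): the btree ordinal occupies the
+        // high 32 bits and the compact per-btree id the low 32 bits, i.e.
+        //   bnodeid = (uint64_cast(ordinal) << btree_nodeid_bits) | compact_id;
+        //   compact = bnodeid & btree_nodeid_mask;   // to_compact_nodeid()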
+        m_root_node_id = m_ordinal_shifted | jhdr->new_root_nodeid;
+    }
+
+    for (uint32_t i{0}; i < jhdr->num_flush_units; ++i) {
+        NodeFlushUnit::JournalEntry const* nf_jentry = r_cast< NodeFlushUnit::JournalEntry const* >(journal.m_cur_ptr);
+        BlkId const location = nf_jentry->nodes_location.to_blkid();
+        for (uint16_t n{0}; n < nf_jentry->n_nodes; ++n) {
+            update_bnode_map(nf_jentry->nodes[n], CompactBlkId{location, n}, true /* in_recovery */);
+        }
+        journal.m_cur_ptr += nf_jentry->size();
+    }
+
+    auto* deleted_nodes = r_cast< CompactNodeId const* >(journal.m_cur_ptr);
+    for (uint32_t i{0}; i < jhdr->num_delete_units; ++i) {
+        delete_from_bnode_map(deleted_nodes[i], true /* in_recovery */);
+    }
+}
+
+COWBtree::CPSession* COWBtree::cp_session(cp_id_t cp_id) {
+    COWBtree::CPSession* session = m_cp_sessions[cp_id % CPManager::max_concurent_cps].get();
+    if (sisl_unlikely(session->m_cp_id != cp_id)) {
+        session->m_state = CPSession::FlushState::DIRTYING;
+        session->m_cp_id = cp_id;
+    }
+    return session;
+}
+
+#if 0
+FlushNodeInfo COWBtree::get_flush_version_buf(BtreeNodePtr node, cp_id_t cur_cp_id) {
+    auto [ret_buf, buf_share_count] = node->share_phys_node_buf();
+
+    if (buf_share_count != 0) {
+        // Buffer was already shared with another cp session, we need to make a copy
+        HS_DBG_ASSERT_EQ(node->get_modified_cp_id(), cur_cp_id - 1,
+                         "We have shared buffer of node with cp, but its cp modified id is more than 1, we only "
+                         "support 2 concurrent cp sessions");
+        auto new_buf = hs_utils::iobuf_alloc(node->node_size(), sisl::buftag::btree_node, align_size());
+        std::memcpy(new_buf, ret_buf, node->node_size());
+        ret_buf = new_buf;
+        node->set_phys_node_buf(new_buf);
+    }
+
+    node->set_modified_cp_id(cur_cp_id);
+    return FlushNodeInfo{std::move(node), ret_buf};
+}
+
+void COWBtree::release_flush_version_buf(FlushNodeInfo const& f) {
+    auto const buf_share_count = f.node->release_phys_node_buf();
+    if (buf_share_count > 1) {
+        // Some other cp session already made a copy and updated the phys node buf, so we need to free this buffer
+        hs_utils::iobuf_free(f.buf, sisl::buftag::btree_node);
+    }
+}
+#endif
+
+////////////////////////////////////////////// CPSession Section //////////////////////////////////////////////
+bool COWBtree::CPSession::prepare_to_flush_nodes(COWBtreeCPContext* cp_ctx) {
+    std::lock_guard lg{m_flush_mtx};
+
+    if (m_state == FlushState::NODES_FLUSHED) {
+        return false; // Bail out if we have already flushed this session
+    } else if (m_state == FlushState::NODES_FLUSHING) {
+        ++m_flushing_req_count;
+        return true; // Everything is prepared already, join the flush
+    }
+
+    m_modified_count = m_modified_nodes.size();
+    m_deleted_count = m_deleted_nodes.size();
+
+    if ((m_modified_count == 0) && (m_deleted_count == 0)) {
+        // Nothing has been dirtied in this btree in this session to flush.
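+        // CPSession lifecycle (sketch; the states are declared at the bottom of cow_btree.h):
+        //   DIRTYING -> NODES_FLUSHING -> NODES_FLUSHED -> [MAP_FLUSHING -> MAP_FLUSHED] -> ALL_DONE
+        // where the bracketed MAP_* states are entered only on a full-map-flush cp and finish() lands on ALL_DONE.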
+        m_state = FlushState::NODES_FLUSHED;
+        return false;
+    }
+
+    auto const status =
+        m_bt.m_vdev->alloc_blks(m_modified_count, blk_alloc_hints{.partial_alloc_ok = true}, m_node_locations);
+    if ((status != BlkAllocStatus::SUCCESS) && (status != BlkAllocStatus::PARTIAL)) {
+        HS_REL_ASSERT(false, "Blk allocation to persist btree pages failed, we are crashing for now");
+    }
+
+    // Setup all the iterators
+    m_next_location_idx = 0;
+    m_modified_it = m_modified_nodes.begin();
+    m_deleted_it = m_deleted_nodes.begin();
+
+    if (!cp_ctx->need_full_map_flush()) {
+        // Setup the journal buffers
+        // Size determination:
+        // One location (a blkid) corresponds to 1 flush unit, so the total journal size would be
+        // Journal Header + (Number of flush units * Flush unit header) + Number of nodes + Number of deleted nodes
+        auto const journal_size = sizeof(Journal::Header) +
+            (NodeFlushUnit::JournalEntry::size(0) * m_node_locations.size()) +
+            ((m_modified_count + m_deleted_count) * sizeof(CompactNodeId));
+        m_journal = std::make_unique< Journal >(m_bt.m_btree_ordinal, journal_size, cp_ctx->id());
+        m_journal->header()->num_flush_units = m_node_locations.size();
+        m_journal->header()->num_delete_units = m_deleted_count;
+    }
+
+    m_state = FlushState::NODES_FLUSHING;
+    ++m_flushing_req_count;
+    return true;
+}
+
+std::tuple< BlkId, COWBtree::DirtyNodeList::iterator, sisl::blob > COWBtree::CPSession::next_dirty() {
+    std::lock_guard lg{m_flush_mtx};
+    HS_DBG_ASSERT_EQ(m_state, FlushState::NODES_FLUSHING,
+                     "Unexpected state while pulling dirty nodes, we expect all fibers to have drained the iterator "
+                     "before moving to flushed or collecting state");
+
+    sisl::blob ret_blob;
+    if (m_next_location_idx == m_node_locations.size()) {
+        // We have reached the end of all node locations, which means there should be no more dirty nodes
+        HS_DBG_ASSERT(m_modified_it == m_modified_nodes.end(),
+                      "Mismatch between number of blks allocated for nodes and the dirty node iterator");
+        return std::make_tuple(BlkId{}, m_modified_it, ret_blob);
+    }
+
+    HS_DBG_ASSERT(
+        m_modified_it != m_modified_nodes.end(),
+        "There are more blks allocated for nodes, but the dirty list doesn't have any more nodes to fill them");
+
+    BlkId ret_loc = m_node_locations[m_next_location_idx++];
+    auto ret_it = m_modified_it;
+    m_modified_it += ret_loc.blk_count(); // Move the iterator past the blk_count().
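+    // Illustrative example: if the allocator handed out ret_loc spanning 4 blks, the caller packs exactly 4 dirty
+    // nodes into that location, so every next_dirty() call advances the shared iterator by blk_count() and multiple
+    // fibers can drain the dirty list concurrently.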
+
+    if (m_journal) {
+        auto junit_size = NodeFlushUnit::JournalEntry::size(ret_loc.blk_count());
+        ret_blob = sisl::blob{m_journal->allocate(junit_size), junit_size};
+    }
+    return std::make_tuple(ret_loc, ret_it, ret_blob);
+}
+
+std::tuple< COWBtree::DeletedNodeList::iterator, COWBtree::DeletedNodeList::iterator, sisl::blob >
+COWBtree::CPSession::next_deleted() {
+    std::lock_guard lg{m_flush_mtx};
+    HS_DBG_ASSERT_EQ(m_state, FlushState::NODES_FLUSHED,
+                     "Unexpected state while pulling deleted nodes, we expect all fibers to have drained the iterator "
+                     "before moving to flushed or collecting state");
+    auto ret_it = m_deleted_it;
+    m_deleted_it = m_deleted_nodes.end(); // Set to end, so that any subsequent requests will get ret_it as end iterator
+
+    sisl::blob ret_blob;
+    if (m_journal) {
+        uint32_t const jdel_size = m_deleted_count * sizeof(CompactNodeId);
+        ret_blob = sisl::blob{m_journal->allocate(jdel_size), jdel_size};
+    }
+    return std::make_tuple(std::move(ret_it), m_deleted_nodes.end(), std::move(ret_blob));
+}
+
+bnodeid_t COWBtree::CPSession::new_root_id() { return m_new_root_id.exchange(empty_bnodeid); }
+
+bool COWBtree::CPSession::done_flushing_nodes() {
+    std::lock_guard lg{m_flush_mtx};
+    HS_DBG_ASSERT_EQ(m_state, FlushState::NODES_FLUSHING,
+                     "Received a flush done while state was not in flushing, some race condition?");
+    if (--m_flushing_req_count == 0) {
+        m_state = FlushState::NODES_FLUSHED;
+        return true;
+    }
+    return false;
+}
+
+std::vector< std::pair< COWBtree::CompactNodeId, COWBtree::CompactBlkId > >
+COWBtree::CPSession::prepare_to_flush_map(COWBtreeCPContext* cp_ctx) {
+    std::lock_guard lg{m_flush_mtx};
+
+    auto to_vector = [this](uint64_t count) {
+        std::vector< std::pair< COWBtree::CompactNodeId, COWBtree::CompactBlkId > > ret;
+        ret.reserve(count);
+        for (uint32_t i{0}; ((i < count) && (m_next_full_map_it != m_bt.m_bnodeid_map.m_map.end()));
+             ++m_next_full_map_it, ++i) {
+            ret.emplace_back(*m_next_full_map_it);
+        }
+        return ret;
+    };
+
+    if (m_state == FlushState::MAP_FLUSHING) {
+        // Some other fiber has started the flushing; get the next range of map entries, iterate over them and start
+        // flushing
+        return to_vector(m_parallel_flush_range);
+    } else if (m_state != FlushState::NODES_FLUSHED) {
+        // The nodes themselves have not been flushed, or we have already finished flushing the map, so we don't
+        // need to do anything now
+        HS_DBG_ASSERT(m_next_full_map_it == m_bt.m_bnodeid_map.m_map.end(),
+                      "In {} state, but outstanding count is non zero", m_state);
+        return {};
+    } else {
+        // First fiber to start flushing, prepare the iterator
+        m_state = FlushState::MAP_FLUSHING;
+
+        // First fiber to flush the full map in this session. All fibers get an equal portion of the map to flush,
+        // except the first one which also gets the remainder of the range
+        auto const total_count = m_bt.m_bnodeid_map.m_map.size();
+        m_pending_map_entries_to_flush = total_count;
+        m_parallel_flush_range = total_count / cp_ctx->m_parallel_flushers_count;
+        m_next_full_map_it = m_bt.m_bnodeid_map.m_map.begin();
+        return to_vector(m_parallel_flush_range + (total_count % cp_ctx->m_parallel_flushers_count));
+    }
+}
+
+std::pair< bool, std::vector< std::vector< BlkId > > >
+COWBtree::CPSession::done_flushing_map(std::vector< BlkId > map_locations, size_t num_flushed_entries) {
+    std::lock_guard lg{m_flush_mtx};
+    HS_DBG_ASSERT_EQ(m_state, FlushState::MAP_FLUSHING,
+                     "Received a flush done while state was not in flushing, some race condition?");
+
+    if (!map_locations.empty()) { m_location_chains.emplace_back(std::move(map_locations)); }
+    m_pending_map_entries_to_flush -= num_flushed_entries;
+
+    if (m_pending_map_entries_to_flush > 0) { return std::pair(false, std::vector< std::vector< BlkId > >{}); }
+    HS_DBG_ASSERT(m_next_full_map_it == m_bt.m_bnodeid_map.m_map.end(),
+                  "We have no pending fibers flushing the map, but the iterator is not pointing to the end");
+
+    m_state = FlushState::MAP_FLUSHED;
+    return std::pair(true, std::move(m_location_chains));
+}
+
+void COWBtree::CPSession::finish() {
+    std::lock_guard lg{m_flush_mtx};
+    m_modified_nodes.clear();
+    m_deleted_nodes.clear();
+    m_new_root_id.store(empty_bnodeid);
+    m_state = FlushState::ALL_DONE;
+    m_flushing_req_count = 0;
+
+    m_node_locations.clear();
+    m_next_location_idx = 0;
+    m_modified_it = DirtyNodeList::iterator{};
+    m_deleted_it = DeletedNodeList::iterator{};
+    m_modified_count = 0;
+    m_deleted_count = 0;
+    m_journal.reset();
+
+    m_next_full_map_it = m_bt.m_bnodeid_map.m_map.end();
+    m_parallel_flush_range = 0;
+    m_location_chains.clear();
+}
+} // namespace homestore
\ No newline at end of file
diff --git a/src/lib/index/cow_btree/cow_btree.h b/src/lib/index/cow_btree/cow_btree.h
new file mode 100644
index 000000000..2d4e49dd5
--- /dev/null
+++ b/src/lib/index/cow_btree/cow_btree.h
@@ -0,0 +1,321 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "common/large_id_reserver.hpp"
+#include "common/concurrent_vector.hpp"
+//#include "index/cow_btree/cow_btree_node.h"
+
+namespace homestore {
+class COWBtreeCPContext;
+class VirtualDev;
+
+class COWBtree : public UnderlyingBtree {
+public:
+    struct Journal;
+    struct FlushNodeInfo {
+        BtreeNodePtr node;
+        uint8_t* buf{nullptr};
+
+        FlushNodeInfo() = default;
+        FlushNodeInfo(BtreeNodePtr n) : node{std::move(n)}, buf{node->share_phys_node_buf()} {}
+        FlushNodeInfo(FlushNodeInfo const& other) = delete;
+        FlushNodeInfo& operator=(FlushNodeInfo const& other) = delete;
+        FlushNodeInfo(FlushNodeInfo&& other) {
+            node = std::move(other.node);
+            buf = other.buf;
+            other.buf = nullptr;
+        }
+
+        FlushNodeInfo& operator=(FlushNodeInfo&& other) {
+            node = std::move(other.node);
+            buf = other.buf;
+            other.buf = nullptr;
+            return *this;
+        }
+
+        ~FlushNodeInfo() {
+            if (node) { node->release_phys_node_buf(buf); }
+        }
+        uint8_t* bytes() { return buf; }
+    };
+
+public:
+    COWBtree(BtreeBase& bt, shared< VirtualDev > vdev, shared< sisl::SimpleCache< bnodeid_t, BtreeNodePtr > > cache,
+             std::vector< unique< Journal > > journal_bufs, BtreeNode::Allocator::Token token, bool load_existing);
+    virtual ~COWBtree() = default;
+
+    // All overridden methods of the UnderlyingBtree class
+    BtreeNodePtr create_node(bool is_leaf, CPContext* context) override;
+    
btree_status_t write_node(const BtreeNodePtr& node, CPContext* context) override; + btree_status_t read_node(bnodeid_t id, BtreeNodePtr& node) const override; + btree_status_t refresh_node(const BtreeNodePtr& node, bool for_read_modify_write, CPContext* context) override; + void remove_node(const BtreeNodePtr& node, CPContext* context) override; + btree_status_t transact_nodes(const BtreeNodeList& new_nodes, const BtreeNodeList& removed_nodes, + const BtreeNodePtr& left_child_node, const BtreeNodePtr& parent_node, + CPContext* context) override; + BtreeLinkInfo load_root_node_id() override; + btree_status_t on_root_changed(BtreeNodePtr const& root, CPContext* context) override; + uint64_t space_occupied() const override; + + bnodeid_t generate_node_id(); + void add_to_dirty_list(FlushNodeInfo finfo, COWBtreeCPContext* cp_ctx); + void add_to_remove_list(bnodeid_t node_id, COWBtreeCPContext* cp_ctx); + void destroy(); + + BlkId get_blkid_for_nodeid(bnodeid_t nodeid) const; + uint64_t used_size() const; + uint32_t align_size() const; + uint32_t ordinal() const { return m_btree_ordinal; } + + std::tuple< bool, unique< Journal > > flush_nodes(COWBtreeCPContext* cp_ctx); + void flush_map(COWBtreeCPContext* cp_ctx); + void flush_sb(COWBtreeCPContext* cp_ctx); + + static COWBtree* cast_to(BtreeBase& btree) { return r_cast< COWBtree* >(btree.underlying_btree()); } + + static COWBtree const* cast_to(BtreeBase const& btree) { + return r_cast< COWBtree const* >(btree.underlying_btree()); + } + + static COWBtree* cast_to(Index* index) { + return r_cast< COWBtree* >(s_cast< BtreeBase* >(index)->underlying_btree()); + } + + static COWBtree const* cast_to(Index const* index) { + return r_cast< COWBtree const* >(s_cast< BtreeBase const* >(index)->underlying_btree()); + } + +public: + using CompactNodeId = uint32_t; + static constexpr CompactNodeId EmptyCompactNodeId = std::numeric_limits< CompactNodeId >::max(); + +#pragma pack(1) + struct CompactBlkId { + blk_num_t is_valid : 1; + blk_num_t blk_num : 31; + chunk_num_t chunk_num; + + CompactBlkId() : is_valid{false} {} + CompactBlkId(BlkId const& b) : is_valid{true}, blk_num{b.blk_num()}, chunk_num{b.chunk_num()} {} + CompactBlkId(BlkId const& b, uint16_t offset) : + is_valid{true}, blk_num{b.blk_num() + offset}, chunk_num{b.chunk_num()} {} + + BlkId to_blkid() const { return is_valid ? BlkId{blk_num, 1u, chunk_num} : BlkId{}; }; + std::string to_string() const { + return is_valid ? fmt::format("blknum={},chunk={}", blk_num, chunk_num) : fmt::format("Invalid"); + } + std::string to_compact_string() const { + return is_valid ? fmt::format("{}:{}", blk_num, chunk_num) : fmt::format("NA"); + } + }; +#pragma pack() + +#pragma pack(1) + struct SuperBlock { + cp_id_t cp_id{-1}; // CPID when this superblock was written + uint16_t num_map_heads{0}; // Total number of map heads + BlkId map_heads[1]; // Array of heads of chain which contains the blkid map data + + static uint32_t max_map_heads(uint32_t sb_size) { + return (sb_size - sizeof(SuperBlock) + sizeof(BlkId)) / sizeof(BlkId); + } + }; + static_assert(sizeof(SuperBlock) < 512, "Expected superblk to be within the btree superblk underlying btree size"); +#pragma pack() + + struct Journal { +#pragma pack(1) + struct Header { + uint32_t ordinal; // Journal for which btree ordinal + uint32_t size{sizeof(Header)}; // Size of this journal + uint32_t num_flush_units{0}; // Number of flush units in this journal + uint32_t num_delete_units{0}; // Number of nodes removed for this btree. 
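+
+            // On-disk layout following this header (illustrative):
+            //   [Header][NodeFlushUnit::JournalEntry x num_flush_units][CompactNodeId x num_delete_units]
+            // which is exactly how CPSession::prepare_to_flush_nodes() sizes the journal buffer.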
+            CompactNodeId new_root_nodeid{EmptyCompactNodeId}; // New root node id
+
+            // Followed by an array of FlushUnit entries and then an array of deleted nodeids
+        };
+#pragma pack()
+
+        sisl::io_blob_safe m_base_buf;
+        sisl::byte_view m_loaded_journal_buf; // In case the journal was loaded, we use this
+        Header* m_header{nullptr};
+        uint8_t* m_cur_ptr;
+        cp_id_t m_cp_id; // CP Id this journal is for (mainly useful while loading)
+
+        Journal(uint32_t ordinal, uint32_t initial_size, cp_id_t cp_id) :
+                m_base_buf{std::max(initial_size, uint32_cast(sizeof(Header))), meta_service().align_size(),
+                           sisl::buftag::metablk} {
+            m_header = new (m_base_buf.bytes()) Header();
+            m_header->size = initial_size;
+            m_header->ordinal = ordinal;
+            m_cur_ptr = m_base_buf.bytes() + sizeof(Header);
+            m_cp_id = cp_id;
+        }
+
+        Journal(sisl::byte_view journal_buf, cp_id_t cp_id) :
+                m_loaded_journal_buf{journal_buf},
+                m_header{const_cast< Header* >(r_cast< Header const* >(m_loaded_journal_buf.bytes()))},
+                m_cur_ptr{const_cast< uint8_t* >(m_loaded_journal_buf.bytes()) + sizeof(Header)},
+                m_cp_id{cp_id} {}
+
+        uint8_t* allocate(uint32_t num_bytes) {
+            if (available_space() < num_bytes) {
+                // We need to realloc the buffer and adjust the pointers. By default try to increase 50% more every
+                // time (instead of doubling).
+                auto const cur_size = occupied_size();
+                m_base_buf.buf_realloc(
+                    std::max(num_bytes - available_space(), m_base_buf.size() + m_base_buf.size() / 2),
+                    meta_service().align_size(), sisl::buftag::metablk);
+                m_header = r_cast< Header* >(m_base_buf.bytes()); // Rebind, since the realloc may move the buffer
+                m_cur_ptr = m_base_buf.bytes() + cur_size;
+                header()->size += num_bytes;
+            }
+            auto ret_ptr = m_cur_ptr;
+            m_cur_ptr += num_bytes;
+            return ret_ptr;
+        }
+
+        uint8_t* make_room(uint32_t num_bytes) {
+            if (available_space() < num_bytes) {
+                // We need to realloc the buffer and adjust the pointers. By default try to increase 50% more every
+                // time (instead of doubling).
+                auto const cur_size = occupied_size();
+                m_base_buf.buf_realloc(
+                    std::max(num_bytes - available_space(), m_base_buf.size() + m_base_buf.size() / 2),
+                    meta_service().align_size(), sisl::buftag::metablk);
+                m_header = r_cast< Header* >(m_base_buf.bytes()); // Rebind, since the realloc may move the buffer
+                m_cur_ptr = m_base_buf.bytes() + cur_size;
+            }
+            return m_cur_ptr;
+        }
+
+        sisl::io_blob& raw_buf() { return m_base_buf; }
+        Header* header() { return m_header; }
+        uint32_t occupied_size() const { return m_cur_ptr - m_base_buf.cbytes(); }
+        uint32_t available_space() const { return (m_base_buf.size() - occupied_size()); }
+    };
+
+    using BNodeIDMap = std::map< CompactNodeId, CompactBlkId >;
+
+    struct FullBNodeIdMap {
+        //
+        // Why std::map with a mutex instead of unordered_map or a concurrent hashmap?
+        //
+        // We persist this map sorted by nodeid, so as to pack consecutive nodes together. Given that we try to
+        // allocate node ids in a consecutive manner, such a structure results in significant savings in persisted
+        // data size and thus performance.
+        BNodeIDMap m_map;
+        mutable iomgr::FiberManagerLib::shared_mutex m_mtx;
+
+        // Why persist as a chain instead of meta_blks?
+        //
+        // Metablk as of now expects the entire map to be created in one large memory area and then persisted
+        // synchronously in pieces. For such a large map, this could be very slow, since only 1 thread would be
+        // doing IO for the large map. The approach here uses a link of blkids (similar to metablk_mgr), but we
+        // persist every time we need to find a fragment or break in the chain (every link) and also concurrently.
+        // This should speed up the persistence of the map.
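+        //
+        // Illustrative example of the packing win: nodeids 100..103 flushed together collapse into a single
+        // MapEntry{nodeid_start=100, nodes_count=4, nodes_locations[0..3]} in a BNodeMapWriteUnit, instead of
+        // four standalone entries -- which is also why generate_node_id() hands out ids consecutively.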
// List of locations where bnodeid maps are chained together
+        std::vector< BlkId > m_locations;
+
+        // Keeps track of the number of updates to the map since the last full map flush. This prevents an
+        // unnecessary full flush on dormant btrees
+        std::atomic< uint64_t > m_updates_since_last_flush{0};
+    };
+
+    // using DirtyNodeList = sisl::ConcurrentInsertVector< BtreeNodePtr >;
+    // using DeletedNodeList = sisl::ConcurrentInsertVector< CompactNodeId >;
+    using DirtyNodeList = ConcurrentVector< FlushNodeInfo >;
+    using DeletedNodeList = ConcurrentVector< CompactNodeId >;
+
+    struct CPSession {
+    public:
+        /////////////// All dirtying operation related ///////////////////////
+        COWBtree& m_bt;
+        cp_id_t m_cp_id{-1};
+        DirtyNodeList m_modified_nodes;
+        DeletedNodeList m_deleted_nodes;
+        std::atomic< bnodeid_t > m_new_root_id{empty_bnodeid};
+
+        /////////////// Common flushing related entities ///////////////////////
+        SCOPED_ENUM_DECL(FlushState, uint8_t);
+        iomgr::FiberManagerLib::mutex m_flush_mtx;
+        FlushState m_state;
+        int32_t m_flushing_req_count{0};
+
+        /////////////// Node flush related entities ///////////////////////
+        std::vector< BlkId > m_node_locations;
+        size_t m_next_location_idx;             // Next blkid to pick for the next unit
+        DirtyNodeList::iterator m_modified_it;  // Iterator of the dirtied nodes
+        DeletedNodeList::iterator m_deleted_it; // Iterator of the deleted nodes
+        uint32_t m_modified_count;              // Cached count of the modified nodes
+        uint32_t m_deleted_count;               // Cached, since m_deleted_nodes.size() is an expensive operation
+        unique< Journal > m_journal;
+
+        /////////////// Map and SB flush related entities ///////////////////////
+        BNodeIDMap::iterator m_next_full_map_it;
+        uint32_t m_parallel_flush_range{0};
+        size_t m_pending_map_entries_to_flush{0};
+        std::vector< std::vector< BlkId > > m_location_chains;
+
+    public:
+        CPSession(COWBtree& bt) : m_bt{bt} {}
+        bool prepare_to_flush_nodes(COWBtreeCPContext* cp_ctx);
+        std::tuple< BlkId, DirtyNodeList::iterator, sisl::blob > next_dirty();
+        std::tuple< DeletedNodeList::iterator, DeletedNodeList::iterator, sisl::blob > next_deleted();
+        bnodeid_t new_root_id();
+        bool done_flushing_nodes();
+
+        std::vector< std::pair< COWBtree::CompactNodeId, COWBtree::CompactBlkId > >
+        prepare_to_flush_map(COWBtreeCPContext* cp_ctx);
+        std::pair< bool, std::vector< std::vector< BlkId > > > done_flushing_map(std::vector< BlkId > map_locations,
+                                                                                 size_t num_flushed_entries);
+
+        void finish();
+    };
+
+    friend class CPSession;
+
+private:
+    BtreeBase& m_base_btree;
+    shared< sisl::SimpleCache< bnodeid_t, BtreeNodePtr > > m_cache;
+    FullBNodeIdMap m_bnodeid_map;
+    LargeIDReserver m_nodeid_generator;
+    shared< VirtualDev > m_vdev;
+
+    uint32_t m_btree_ordinal;
+    uint64_t m_ordinal_shifted;
+    bnodeid_t m_root_node_id;
+
+    // All dirty items for a btree for each cp are tracked here (instead of in the cp_ctx)
+    std::array< unique< CPSession >, CPManager::max_concurent_cps > m_cp_sessions;
+
+    // Flush related structures
+    iomgr::FiberManagerLib::mutex m_flush_mtx;
+    iomgr::FiberManagerLib::mutex m_id_mtx;
+    BtreeNode::Allocator::Token m_bufalloc_token;
+
+private:
+    void update_bnode_map(CompactNodeId nodeid, CompactBlkId blkid, bool in_recovery);
+    void delete_from_bnode_map(CompactNodeId nodeid, bool in_recovery);
+    void recover_bnode_map(BlkId const& map_loc);
+    BlkId lookup_bnode_map(CompactNodeId nodeid) const;
+    void apply_incremental_map(Journal& journal);
+
+    CPSession* cp_session(cp_id_t cp_id);
+
+    SuperBlock const& cow_bt_super_blk() const {
+        return *(r_cast<
SuperBlock const* >(m_base_btree.bt_super_blk().underlying_btree_sb.data())); + } + + SuperBlock& cow_bt_super_blk() { + return const_cast< SuperBlock& >(s_cast< const COWBtree* >(this)->cow_bt_super_blk()); + } +}; + +SCOPED_ENUM_DEF(COWBtree::CPSession, FlushState, uint8_t, DIRTYING, NODES_FLUSHING, NODES_FLUSHED, MAP_FLUSHING, + MAP_FLUSHED, ALL_DONE); +} // namespace homestore \ No newline at end of file diff --git a/src/lib/index/cow_btree/cow_btree_cp.cpp b/src/lib/index/cow_btree/cow_btree_cp.cpp new file mode 100644 index 000000000..d010ecc0d --- /dev/null +++ b/src/lib/index/cow_btree/cow_btree_cp.cpp @@ -0,0 +1,120 @@ +#include +#include "index/cow_btree/cow_btree_cp.h" +#include "index/cow_btree/cow_btree_store.h" +#include "index/index_cp.h" +#include "common/homestore_assert.hpp" + +namespace homestore { +COWBtreeCPCallbacks::COWBtreeCPCallbacks(COWBtreeStore* store) : m_bt_store{store} {} + +std::unique_ptr< CPContext > COWBtreeCPCallbacks::on_switchover_cp(CP* cur_cp, CP* new_cp) { + return std::make_unique< COWBtreeCPContext >(new_cp, m_bt_store); +} + +folly::Future< bool > COWBtreeCPCallbacks::cp_flush(CP* cp) { + auto ctx = IndexCPContext::store_context< COWBtreeCPContext >(cp, IndexStore::Type::COPY_ON_WRITE_BTREE); + return m_bt_store->async_cp_flush(ctx); +} + +void COWBtreeCPCallbacks::cp_cleanup(CP* cp) {} + +int COWBtreeCPCallbacks::cp_progress_percent() { return 100; } + +/////////////////////// COWBtreeCPContext section /////////////////////////// +COWBtreeCPContext::COWBtreeCPContext(CP* cp, COWBtreeStore* bt_store) : + CPContext(cp), + m_parallel_flushers_count{bt_store->parallel_map_flushers_count()}, + m_merged_journal_buf{4096u, bt_store->align_size(), sisl::buftag::btree_journal} { + // NOTE: We calculate this on every CP is because we are making this resource limit of max dirty as hot swappable. + // However, instead of doing this calculation on every dirty buf increment, it is reasonable to calculate the dirty + // size per CP + m_max_dirty_size = uint64_cast(HS_DYNAMIC_CONFIG(resource_limits.index_max_dirty_memory_percent) * + HS_STATIC_CONFIG(input.io_mem_size()) / 100); + m_max_pending_free_size = uint64_cast(HS_DYNAMIC_CONFIG(resource_limits.index_max_free_space_accumulate_percent) * + bt_store->max_capacity() / 100); +} + +bool COWBtreeCPContext::need_full_map_flush() const { return m_is_full_map_flush; } + +std::string COWBtreeCPContext::to_string() const { + // TODO: Fill with approp details + return std::string(); +} + +void COWBtreeCPContext::increment_dirty_size(uint32_t size) { + if (m_dirty_size.increment_test_ge(m_max_dirty_size, size)) { + hs()->cp_mgr().trigger_cp_flush(false /* force */, CPTriggerReason::IndexBufferFull); + } +} + +void COWBtreeCPContext::increment_pending_free_size(uint32_t size) { + if (m_pending_free_size.increment_test_ge(m_max_pending_free_size, size)) { + hs()->cp_mgr().trigger_cp_flush(false /* force */, CPTriggerReason::IndexFreeBlksExceeded); + } +} + +void COWBtreeCPContext::prepare_to_flush(bool full_map_flush) { + // First we need to decide if this should be full map flush or incremental flush + CP_PERIODIC_LOG(DEBUG, id(), + "CowBtree has dirty node buffer size={}, pending node free size={} across all btrees, flushing " + "the nodes and flush {} map", + m_dirty_size.get(), m_pending_free_size.get(), full_map_flush ? 
"FULL" : "only INCREMENTAL"); + m_is_full_map_flush = full_map_flush; + + if (!m_is_full_map_flush) { + COWBtreeStore::Journal hdr_sb; + hdr_sb.cp_id = id(); + hdr_sb.index_store_type = IndexStore::Type::COPY_ON_WRITE_BTREE; + + m_merged_journal_buf.append(sisl::blob{uintptr_cast(&hdr_sb), uint32_cast(sizeof(COWBtreeStore::Journal))}); + m_journal_header = r_cast< COWBtreeStore::Journal* >(m_merged_journal_buf.bytes()); + } + + // Get all the current btrees in the system. + m_all_btrees = std::move(hs()->index_service().get_all_index_tables()); +} + +void COWBtreeCPContext::flushed_a_btree(COWBtree* cow_btree, COWBtree::Journal const* journal) { + std::unique_lock lg{m_bt_list_mtx}; + ++m_flushed_btrees_count; + + // This btree was dirtied in this cp, keep track of these btrees to persist their full map (if full + // map cp) or if superblk is changed. + // NOTE: We cannot persist superblk before persisting the journal that all btrees have been built. + // That is why we need to keep track of all btrees whose superblk has been changed and then write + // later. + if (m_is_full_map_flush) { + m_active_btree_list.emplace_back(cow_btree); + } else { + append_btree_journal(journal->m_base_buf); + } +} + +folly::Future< folly::Unit > COWBtreeCPContext::add_to_destroyed_list(shared< Index > btree) { + std::unique_lock lg{m_bt_list_mtx}; + m_destroyed_btrees.emplace_back(std::pair(btree, folly::Promise< folly::Unit >{})); + return m_destroyed_btrees.back().second.getFuture(); +} + +void COWBtreeCPContext::actual_destroy_btrees() { + // If there are any destroyed btrees as part of the CP, do the actual destroy now. + for (auto& [btree, p] : m_destroyed_btrees) { + COWBtree::cast_to(btree.get())->destroy(); + p.setValue(); + } + + CP_PERIODIC_LOG(INFO, id(), + "CowBtreeStore has {} btrees destroyed in this cp, destroyed all persistent structures for them", + m_destroyed_btrees.size()); +} + +void COWBtreeCPContext::append_btree_journal(sisl::io_blob_safe const& btree_journal_buf) { + HS_DBG_ASSERT_EQ(m_is_full_map_flush, false, "Btree journal update on full map flush"); + ++m_journal_header->num_btrees; + m_journal_header->size += btree_journal_buf.size(); + m_merged_journal_buf.append(btree_journal_buf); +} + +sisl::byte_view COWBtreeCPContext::store_journal() const { return m_merged_journal_buf.view(); } + +} // namespace homestore diff --git a/src/lib/index/cow_btree/cow_btree_cp.h b/src/lib/index/cow_btree/cow_btree_cp.h new file mode 100644 index 000000000..3b05e5440 --- /dev/null +++ b/src/lib/index/cow_btree/cow_btree_cp.h @@ -0,0 +1,84 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. 
+ * + *********************************************************************************/ +#pragma once +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include "device/virtual_dev.hpp" +#include "index/cow_btree/cow_btree_store.h" +#include "index/cow_btree/cow_btree.h" + +namespace homestore { +class Index; +class COWBtree; + +class COWBtreeCPCallbacks : public CPCallbacks { +public: + COWBtreeCPCallbacks(COWBtreeStore* store); + virtual ~COWBtreeCPCallbacks() = default; + +public: + std::unique_ptr< CPContext > on_switchover_cp(CP* cur_cp, CP* new_cp) override; + folly::Future< bool > cp_flush(CP* cp) override; + void cp_cleanup(CP* cp) override; + int cp_progress_percent() override; + +private: + COWBtreeStore* m_bt_store; +}; + +struct COWBtreeCPContext : public CPContext { +public: + sisl::atomic_counter< int64_t > m_dirty_size{0}; + sisl::atomic_counter< int64_t > m_pending_free_size{0}; + sisl::atomic_counter< int64_t > m_flushing_fibers_count{0}; + uint32_t const m_parallel_flushers_count; + + bool m_is_full_map_flush{false}; + iomgr::FiberManagerLib::shared_mutex m_bt_list_mtx; + std::vector< shared< Index > > m_all_btrees; + uint32_t m_flushed_btrees_count{0}; + std::vector< std::pair< shared< Index >, folly::Promise< folly::Unit > > > m_destroyed_btrees; + std::vector< COWBtree* > m_active_btree_list; + sisl::buf_builder m_merged_journal_buf; + COWBtreeStore::Journal* m_journal_header; + uint64_t m_max_dirty_size; + uint64_t m_max_pending_free_size; + +public: + COWBtreeCPContext(CP* cp, COWBtreeStore* bt_store); + virtual ~COWBtreeCPContext() = default; + bool need_full_map_flush() const; + void increment_dirty_size(uint32_t size); + void increment_pending_free_size(uint32_t size); + void prepare_to_flush(bool full_map_flush); + + void flushed_a_btree(COWBtree* cow_btree, COWBtree::Journal const* journal); + folly::Future< folly::Unit > add_to_destroyed_list(shared< Index > btree); + void actual_destroy_btrees(); + bool any_dirty_nodes() const { return (!m_dirty_size.testz() || !m_pending_free_size.testz()); } + void append_btree_journal(sisl::io_blob_safe const& btree_journal_buf); + sisl::byte_view store_journal() const; + std::string to_string() const; +}; +} // namespace homestore diff --git a/src/lib/index/cow_btree/cow_btree_node.cpp b/src/lib/index/cow_btree/cow_btree_node.cpp new file mode 100644 index 000000000..b2cb8e8fd --- /dev/null +++ b/src/lib/index/cow_btree/cow_btree_node.cpp @@ -0,0 +1,79 @@ +#include +#include "index/cow_btree/cow_btree_cp.h" +#include "index/cow_btree/cow_btree_node.h" +#include "index/cow_btree/cow_btree.h" +#include "common/homestore_utils.hpp" +#include "common/homestore_assert.hpp" + +namespace homestore { +BtreeNode* COWBtreeNode::to_btree_node() { return r_cast< BtreeNode* >(uintptr_cast(this) + sizeof(COWBtreeNode)); } + +COWBtreeNode* COWBtreeNode::construct(BtreeNodePtr const& node) { + return new (uintptr_cast(node.get()) - sizeof(COWBtreeNode)) COWBtreeNode(); +} + +void COWBtreeNode::destruct(BtreeNode* node) { + HS_DBG_ASSERT_EQ(COWBtreeNode::convert(node)->m_is_buf_flushing.load(), false, + "A destructing node shouldn't be flushing the buffer with cp, but here it is shared.", + fmt::ptr(node->get_phys_node_buf())); + + // Release the node buffer + hs_utils::iobuf_free(node->get_phys_node_buf(), sisl::buftag::btree_node); + + // Release the entire BtreeNode covering structure + uint8_t* ptr = uintptr_cast(node) - sizeof(COWBtreeNode); + r_cast< COWBtreeNode* >(ptr)->~COWBtreeNode(); + 
delete[] ptr; +} + +COWBtreeNode* COWBtreeNode::convert(BtreeNodePtr const& n) { + return r_cast< COWBtreeNode* >(uintptr_cast(n.get()) - sizeof(COWBtreeNode)); +} + +COWBtreeNode* COWBtreeNode::convert(BtreeNode* n) { + return r_cast< COWBtreeNode* >(uintptr_cast(n) - sizeof(COWBtreeNode)); +} + +COWBtreeNode::FlushInfo COWBtreeNode::prepare_flush_buf(COWBtree const& bt, BtreeNodePtr node, cp_id_t cur_cp_id) { + bool expected = false; + // If the buffer is already flushing, we need to make a copy, otherwise it is safe to mark it as flushing and let + // flush thread use the same buffer. + uint8_t* ret_buf = m_is_buf_flushing.compare_exchange_strong(expected, true) ? node->get_phys_node_buf() : nullptr; + + // If the buffer for the current version was written as part of previous cp (exactly 1 behind requested cp), + // then we need to check if previous cp is still in flushing. If so, we have to make a copy and use new version + // to write. We preserve existing version in the dirty list until it is flushed. + auto const node_cp_id = node->get_modified_cp_id(); + if (node_cp_id == (cur_cp_id - 1)) { + if (ret_buf == nullptr) { + // We couldn't share the buffer, because the node physical buffer is already shared. + auto new_buf = hs_utils::iobuf_alloc(node->node_size(), sisl::buftag::btree_node, bt.align_size()); + std::memcpy(new_buf, node->get_phys_node_buf(), node->node_size()); + ret_buf = new_buf; + node->set_phys_node_buf(new_buf); + } + } else { + HS_DBG_ASSERT_NE((void*)ret_buf, (void*)nullptr, + "Node={} was modified by earlier cp_id, but we couldn't share the buffer.", node->to_string()); + } + node->set_modified_cp_id(cur_cp_id); + return FlushInfo{std::move(node), ret_buf}; +} + +void COWBtreeNode::release_buf(uint8_t* buf) { + auto flushing_flag_on = m_is_buf_flushing.exchange(false); + if (!flushing_flag_on) { + // Looks like the earlier buffer which was shared to us has been released and a new copy was used. So we + // need to free the buffer + hs_utils::iobuf_free(buf, sisl::buftag::btree_node); + } else { + HS_DBG_ASSERT_EQ((void*)buf, (void*)to_btree_node()->get_phys_node_buf(), + "Buffer is not same as the one we shared to flush. 
buf={}", fmt::ptr(buf)); + } +} + +COWBtreeNode::FlushInfo::~FlushInfo() { + if (buf) { COWBtreeNode::convert(node)->release_buf(buf); } +} + +} // namespace homestore \ No newline at end of file diff --git a/src/lib/index/cow_btree/cow_btree_node.h b/src/lib/index/cow_btree/cow_btree_node.h new file mode 100644 index 000000000..3318ac205 --- /dev/null +++ b/src/lib/index/cow_btree/cow_btree_node.h @@ -0,0 +1,53 @@ +#pragma once +#include +#include + +namespace homestore { +class COWBtree; + +struct COWBtreeNode { +public: + // Is the buffer for the node is currently being flushed + std::atomic< bool > m_is_buf_flushing{false}; + + struct FlushInfo { + BtreeNodePtr node; + uint8_t* buf{nullptr}; + + FlushInfo() = default; + FlushInfo(BtreeNodePtr n, uint8_t* b) : node{std::move(n)}, buf{b} {} + FlushInfo(FlushInfo const& other) = delete; + FlushInfo& operator=(FlushInfo const& other) = delete; + ~FlushInfo(); + + FlushInfo(FlushInfo&& other) { + node = std::move(other.node); + buf = other.buf; + other.buf = nullptr; + } + + FlushInfo& operator=(FlushInfo&& other) { + node = std::move(other.node); + buf = other.buf; + other.buf = nullptr; + return *this; + } + uint8_t* bytes() { return buf; } + }; + + static COWBtreeNode* construct(BtreeNodePtr const& node); + static void destruct(BtreeNode* node); + static COWBtreeNode* convert(BtreeNodePtr const& node); + static COWBtreeNode* convert(BtreeNode* node); + +private: + COWBtreeNode() = default; + ~COWBtreeNode() = default; + +public: + FlushInfo prepare_flush_buf(COWBtree const& bt, BtreeNodePtr node, cp_id_t cur_cp_id); + void release_buf(uint8_t* buf); + BtreeNode* to_btree_node(); +}; + +} // namespace homestore \ No newline at end of file diff --git a/src/lib/index/cow_btree/cow_btree_store.cpp b/src/lib/index/cow_btree/cow_btree_store.cpp new file mode 100644 index 000000000..e1369df8b --- /dev/null +++ b/src/lib/index/cow_btree/cow_btree_store.cpp @@ -0,0 +1,326 @@ +#include +#include +#include "index/cow_btree/cow_btree_store.h" +//#include "index/cow_btree/cow_btree_node.h" +#include "index/cow_btree/cow_btree.h" +#include "index/cow_btree/cow_btree_cp.h" +#include "index/index_cp.h" +#include "device/virtual_dev.hpp" +#include "common/crash_simulator.hpp" + +namespace homestore { + +static std::vector< iomgr::io_fiber_t > start_flush_threads() { + // Start WBCache flush threads + struct Context { + std::condition_variable cv; + std::mutex mtx; + uint32_t thread_cnt{0}; + std::vector< iomgr::io_fiber_t > cp_flush_fibers; + }; + auto ctx = std::make_shared< Context >(); + + auto const nthreads = HS_DYNAMIC_CONFIG(generic.btree_cp_flush_threads); + for (uint32_t i{0}; i < nthreads; ++i) { + iomanager.create_reactor("index_cp_flush" + std::to_string(i), iomgr::INTERRUPT_LOOP, + HS_DYNAMIC_CONFIG(generic.btree_cp_flush_fibers_per_thread), [ctx](bool is_started) { + if (is_started) { + { + auto fibers = iomanager.sync_io_capable_fibers(); + std::unique_lock< std::mutex > lk{ctx->mtx}; + ctx->cp_flush_fibers.insert(ctx->cp_flush_fibers.end(), fibers.begin(), + fibers.end()); + ++(ctx->thread_cnt); + } + ctx->cv.notify_one(); + } + }); + } + + { + std::unique_lock< std::mutex > lk{ctx->mtx}; + ctx->cv.wait(lk, [ctx, nthreads] { return (ctx->thread_cnt == nthreads); }); + } + return std::move(ctx->cp_flush_fibers); +} + +COWBtreeStore::COWBtreeStore(shared< VirtualDev > vdev, std::vector< superblk< IndexStoreSuperBlock > > store_sbs) : + m_vdev{std::move(vdev)}, + m_cache{std::make_shared< sisl::SimpleCache< bnodeid_t, BtreeNodePtr > >( + 
hs()->evictor(), 500000 /* num_buckets */,
+          [](const BtreeNodePtr& node) -> bnodeid_t { return node->node_id(); },
+          [](const BtreeNodePtr& node) -> uint32_t { return node->node_size(); },
+          [](const sisl::CacheRecord& rec) -> bool {
+              const auto& hnode = (sisl::SingleEntryHashNode< BtreeNodePtr >&)rec;
+              return (hnode.m_value->m_refcount.test_le(1));
+          })} {
+    m_bufalloc_token = BtreeNode::Allocator::add(BtreeNode::Allocator{
+        [](uint32_t size) { return new uint8_t[size]; }, // alloc_btree_node
+        [](BtreeNode* node) {
+            node->~BtreeNode();
+            delete[] uintptr_cast(node);
+        }, // free_btree_node
+        [this](uint32_t node_size) -> uint8_t* { // alloc_node_buf
+            return hs_utils::iobuf_alloc(node_size, sisl::buftag::btree_node, m_vdev->align_size());
+        },
+        [](uint8_t* buf) { hs_utils::iobuf_free(buf, sisl::buftag::btree_node); }});
+
+    if (store_sbs.size()) {
+        // There can be multiple sbs, each containing the journal of one particular cp. We need to sort them by
+        // cp_id and then split each journal into per-btree journal lists, in that order.
+        std::sort(store_sbs.begin(), store_sbs.end(), [](auto& lhs, auto& rhs) {
+            return (r_cast< Journal* >(lhs.get())->cp_id < r_cast< Journal* >(rhs.get())->cp_id);
+        });
+
+        m_journals_by_cpid = std::move(store_sbs);
+        for (auto& journal : m_journals_by_cpid) {
+            load_journal(journal);
+        }
+    }
+    m_cp_flush_fibers = std::move(start_flush_threads());
+
+    // Register ourselves to the IndexCPCallbacks. Make sure this is called at the end of the constructor.
+    r_cast< IndexCPCallbacks* >(cp_mgr().get_consumer(cp_consumer_t::INDEX_SVC))
+        ->register_consumer(IndexStore::Type::COPY_ON_WRITE_BTREE, std::make_unique< COWBtreeCPCallbacks >(this));
+}
+
+void COWBtreeStore::stop() {
+    m_cache.reset();
+    BtreeNode::Allocator::remove(m_bufalloc_token);
+}
+
+uint32_t COWBtreeStore::max_capacity() const { return m_vdev->size(); }
+uint32_t COWBtreeStore::max_node_size() const { return m_vdev->atomic_page_size(); }
+uint32_t COWBtreeStore::align_size() const { return m_vdev->align_size(); }
+
+void COWBtreeStore::on_recovery_completed() {
+    HS_DBG_ASSERT_EQ(m_journals_by_btree.size(), 0,
+                     "Even after recovery is completed, some btree journals are yet to be loaded, perhaps "
+                     "their index super block is missing?");
+
+    // All btrees are loaded and recovery is completed. We can free up the journal buffers now. Note that we do not
+    // free up the superblk itself, which contains the critical meta_cookie info needed to remove the journal record
+    // once we do a full map flush.
+    for (auto& journal : m_journals_by_cpid) {
+        journal.raw_buf().reset(); // This should free up the underlying byte_array only.
+    }
+}
+
+unique< UnderlyingBtree > COWBtreeStore::create_underlying_btree(BtreeBase& btree, bool load_existing) {
+    unique< COWBtree > cbtree;
+
+    auto it = m_journals_by_btree.find(btree.ordinal());
+    if (it == m_journals_by_btree.end()) {
+        cbtree = std::make_unique< COWBtree >(btree, m_vdev, m_cache, std::vector< unique< COWBtree::Journal > >{},
+                                              m_bufalloc_token, load_existing);
+    } else {
+        HS_DBG_ASSERT_EQ(load_existing, true, "Btree is found, but we are asked to create a new one");
+        cbtree = std::make_unique< COWBtree >(btree, m_vdev, m_cache, std::move(it->second), m_bufalloc_token,
+                                              load_existing);
+        m_journals_by_btree.erase(it); // We no longer need btree-specific journal records after the btree is created.
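+        // Note: the per-btree journal vector handed to COWBtree above is already ordered oldest-to-newest by
+        // cp_id, since the constructor sorted the store superblks on cp_id before load_journal() split them per
+        // btree ordinal. The btree can therefore apply the incremental map updates in cp order while loading.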
+    }
+    return cbtree;
+}
+
+folly::Future< folly::Unit > COWBtreeStore::destroy_underlying_btree(BtreeBase& bt) {
+    CPGuard cpg = cp_mgr().cp_guard();
+    auto context = cpg->context(cp_consumer_t::INDEX_SVC);
+    auto cp_ctx = IndexCPContext::convert< COWBtreeCPContext >(context, IndexStore::Type::COPY_ON_WRITE_BTREE);
+    return cp_ctx->add_to_destroyed_list(bt.shared_from_this());
+}
+
+// void COWBtreeStore::on_node_freed(BtreeNode* node) { COWBtreeNode::destruct(node); }
+
+class FlushGuard {
+public:
+    FlushGuard(COWBtreeCPContext* ctx, std::function< void(COWBtreeCPContext* cp_ctx) > done_cb) :
+            m_cp_ctx{ctx}, m_done_cb{std::move(done_cb)} {
+        ctx->m_flushing_fibers_count.increment(1);
+    }
+
+    ~FlushGuard() {
+        if (m_cp_ctx->m_flushing_fibers_count.decrement_testz(1)) { m_done_cb(m_cp_ctx); }
+    }
+
+    FlushGuard(FlushGuard const& other) {
+        m_cp_ctx = other.m_cp_ctx;
+        m_done_cb = other.m_done_cb;
+        m_cp_ctx->m_flushing_fibers_count.increment(1);
+    }
+
+    FlushGuard(FlushGuard&& other) = delete;
+
+    // Return by reference; returning by value would create a temporary whose destructor decrements the fiber
+    // count and could fire done_cb prematurely. Also release our current context before taking the new one,
+    // so its count stays balanced.
+    FlushGuard& operator=(FlushGuard const& other) {
+        if (this != &other) {
+            if (m_cp_ctx->m_flushing_fibers_count.decrement_testz(1)) { m_done_cb(m_cp_ctx); }
+            m_cp_ctx = other.m_cp_ctx;
+            m_done_cb = other.m_done_cb;
+            m_cp_ctx->m_flushing_fibers_count.increment(1);
+        }
+        return *this;
+    }
+
+    FlushGuard& operator=(FlushGuard&& other) = delete;
+
+    COWBtreeCPContext* cp_ctx() { return m_cp_ctx; }
+
+private:
+    COWBtreeCPContext* m_cp_ctx;
+    std::function< void(COWBtreeCPContext* ctx) > m_done_cb;
+};
+
+folly::Future< bool > COWBtreeStore::async_cp_flush(COWBtreeCPContext* cp_ctx) {
+    CP_PERIODIC_LOG(DEBUG, cp_ctx->id(), "Starting COWBtree CP Flush with cp context={}", cp_ctx->to_string());
+    if (!cp_ctx->any_dirty_nodes()) {
+        if (cp_ctx->id() == 0) {
+            // For the first CP, we need to flush the journal buffer to the meta blk
+            // LOGINFO("First time boot cp, we shall flush the vdev to ensure all cp information is created");
+            // m_vdev->cp_flush(cp_ctx);
+        } else {
+            CP_PERIODIC_LOG(DEBUG, cp_ctx->id(), "Btree does not have any dirty buffers to flush");
+        }
+        return folly::makeFuture< bool >(true); // nothing to flush
+    }
+
+    auto has_hit_incremental_flush_count_threshold = [this]() -> bool {
+        return (m_num_incremental_flushes >= HS_DYNAMIC_CONFIG(btree->cow_max_incremental_map_flushes));
+    };
+
+    auto has_hit_meta_vdev_size_threshold = [this]() -> bool {
+        return (
+            meta_service().used_size() >
+            uint64_cast(
+                (HS_DYNAMIC_CONFIG(btree->cow_full_map_flush_size_threshold_pct) * meta_service().total_size()) / 100));
+    };
+
+    // First determine if this CP flush should be a full_map flush
+    if (has_hit_incremental_flush_count_threshold() || has_hit_meta_vdev_size_threshold()) {
+        cp_ctx->prepare_to_flush(true); // Full map flush
+    } else {
+        cp_ctx->prepare_to_flush(false); // Incremental map flush
+        ++m_num_incremental_flushes;
+    }
+
+    auto on_flush_nodes_done = [this](COWBtreeCPContext* cp_ctx) {
+        cp_ctx->actual_destroy_btrees();
+
+        CP_PERIODIC_LOG(
+            INFO, cp_ctx->id(),
+            "CowBtreeStore has {} btrees destroyed in this cp, destroyed all persistent structures for them",
+            cp_ctx->m_destroyed_btrees.size());
+
+        // All dirty nodes from all btrees have been flushed, now we can flush the full map or journal
+        // (depending on cp type) for each of the modified btrees
+        flush_map(cp_ctx);
+    };
+
+    FlushGuard fg{cp_ctx, on_flush_nodes_done};
+    for (auto& fiber : m_cp_flush_fibers) {
+        iomanager.run_on_forget(fiber, [fg]() mutable {
+            // Each thread will walk through all btrees created and alive at the point of CP flush and try to flush
+            // their dirty nodes. 
We take this approach as against marking the dirtied btree seperately while + // dirtying is that, we keep the code path of dirtying as waitfree as possible. It is more critical code + // path. However, we pay the cost during the flushing by walking across all btrees and then check if + // they are dirty. I feel this is much lower cost than doing in critical IO path. + auto cp_ctx = fg.cp_ctx(); + for (auto const& btree : cp_ctx->m_all_btrees) { + COWBtree* cow_btree = COWBtree::cast_to(btree.get()); + auto const [has_flushed, journal] = cow_btree->flush_nodes(cp_ctx); + +#ifdef _PRERELEASE + if (iomgr_flip::instance()->test_flip("crash_on_flush_cow_btree_nodes", cow_btree->ordinal())) { + LOGINFOMOD(btree, "Simulating crash while flushing node for btree={}", cow_btree->ordinal()); + hs()->crash_simulator().start_crash(); + break; + } +#endif + + if (has_flushed) { + // Notify the cp context that we have flushed a btree and provide the journal. CP context will + // build the journal, which we will flush after all btrees are done flushing the nodes. + cp_ctx->flushed_a_btree(cow_btree, journal.get()); + } + } + }); + } + + return std::move(cp_ctx->get_future()); +} + +void COWBtreeStore::flush_map(COWBtreeCPContext* cp_ctx) { + if (cp_ctx->need_full_map_flush()) { + auto on_flush_map_done = [this](COWBtreeCPContext* cp_ctx) { + // We just flushed the full bnode map of all btrees, we can remove all previous journal + // superblks + for (auto& journal : m_journals_by_cpid) { + journal.destroy(); + } + CP_PERIODIC_LOG( + INFO, cp_ctx->id(), + "CowBtree has completed flush of nodes across {} btrees and persisted full map for all btrees", + cp_ctx->m_flushed_btrees_count); + + cp_ctx->complete(true); + }; + + FlushGuard fg{cp_ctx, on_flush_map_done}; + for (auto& fiber : m_cp_flush_fibers) { + iomanager.run_on_forget(fiber, [fg]() mutable { + auto cp_ctx = fg.cp_ctx(); + + // Yes we access m_active_btree_list outside of lock, but we are sure that there is no one mutating + // this btree list + for (auto cow_btree : cp_ctx->m_active_btree_list) { + cow_btree->flush_map(cp_ctx); + } + }); + } + } else { +#ifdef _PRERELEASE + if (iomgr_flip::instance()->test_flip("crash_before_incr_map_flush_commit")) { + LOGINFO("Simulating crash before we commit the incremental map flush (btree journal write)"); + hs()->crash_simulator().start_crash(); + } +#endif + + auto sb = superblk< IndexStoreSuperBlock >{"index_store"}; + sb.load(cp_ctx->store_journal(), nullptr); // Load an empty meta_blk but with given buffer + sb.write(); // Write the metablk + auto const sb_size = sb.raw_buf()->size(); + sb.raw_buf().reset(); // after we wrote the superblk, we no longer need the merged journal buffer, free it + + // We only keep track of the metablk here, not buffer (so as to free after full map write) + m_journals_by_cpid.emplace_back(std::move(sb)); + + CP_PERIODIC_LOG(INFO, cp_ctx->id(), + "CowBtree has completed flush of nodes across {} btrees and persisted incremental journal for " + "map, journal size={}", + cp_ctx->m_flushed_btrees_count, sb_size); + cp_ctx->complete(true); + } +} + +void COWBtreeStore::load_journal(superblk< IndexStoreSuperBlock >& sb) { + auto store_journal = r_cast< COWBtreeStore::Journal* >(sb.get()); + uint32_t cur_offset = sizeof(COWBtreeStore::Journal); + + for (uint32_t i{0}; i < store_journal->num_btrees; ++i) { + COWBtree::Journal::Header* cur_bj = r_cast< COWBtree::Journal::Header* >(sb.get() + cur_offset); + + auto it = m_journals_by_btree.find(cur_bj->ordinal); + if (it == 
m_journals_by_btree.end()) {
+            bool happened;
+            std::tie(it, happened) =
+                m_journals_by_btree.insert(std::pair(cur_bj->ordinal, std::vector< unique< COWBtree::Journal > >{}));
+            HS_DBG_ASSERT(happened, "Failed to insert journal into the journals list for ordinal={}", cur_bj->ordinal);
+        }
+        it->second.emplace_back(std::make_unique< COWBtree::Journal >(
+            sisl::byte_view{sb.raw_buf(), cur_offset, cur_bj->size}, store_journal->cp_id));
+        cur_offset += cur_bj->size;
+    }
+}
+
+uint32_t COWBtreeStore::parallel_map_flushers_count() const {
+    // We cannot have more parallel fibers flushing than the max heads we can put in the btree superblk, because each
+    // fiber will flush a portion of the full map and will have a head of the location chain.
+    return std::min(uint32_cast(m_cp_flush_fibers.size()),
+                    COWBtree::SuperBlock::max_map_heads(BtreeSuperBlock::underlying_btree_sb_size));
+}
+} // namespace homestore
\ No newline at end of file
diff --git a/src/lib/index/cow_btree/cow_btree_store.h b/src/lib/index/cow_btree/cow_btree_store.h
new file mode 100644
index 000000000..fefc59e26
--- /dev/null
+++ b/src/lib/index/cow_btree/cow_btree_store.h
@@ -0,0 +1,80 @@
+#pragma once
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "common/homestore_utils.hpp"
+#include "index/cow_btree/cow_btree.h"
+
+namespace homestore {
+class COWBtreeCPContext;
+class VirtualDev;
+
+class COWBtreeStore : public BtreeStore {
+public:
+#pragma pack(1)
+    struct Journal : public IndexStoreSuperBlock {
+    public:
+        cp_id_t cp_id;                  // CP id for this journal; there is one meta blk containing a journal per CP
+        uint32_t size{sizeof(Journal)}; // Total journal size
+        uint32_t num_btrees{0};         // Total number of btrees updated in this journal
+
+        // Followed by multiple cowbtree journals
+    };
+#pragma pack()
+
+private:
+    shared< VirtualDev > m_vdev;
+
+    shared< sisl::SimpleCache< bnodeid_t, BtreeNodePtr > > m_cache;
+
+    // List of fibers to flush on (note that these could be spread across multiple threads)
+    std::vector< iomgr::io_fiber_t > m_cp_flush_fibers;
+
+    // All loaded journals arranged by the btree ordinals
+    std::unordered_map< uint32_t, std::vector< unique< COWBtree::Journal > > > m_journals_by_btree;
+
+    // All journals maintained (sorted) by their cp_id
+    std::vector< superblk< IndexStoreSuperBlock > > m_journals_by_cpid;
+
+    // Total number of incremental cp flushes since the last full flush
+    uint32_t m_num_incremental_flushes{0};
+
+    BtreeNode::Allocator::Token m_bufalloc_token;
+
+public:
+    COWBtreeStore(shared< VirtualDev > vdev, std::vector< superblk< IndexStoreSuperBlock > > store_sbs);
+    virtual ~COWBtreeStore() = default;
+    void stop() override;
+
+    //////////////////////// Override of IndexStore Interfaces //////////////////////////
+    std::string store_type() const override { return "COW_BTREE"; }
+    void on_recovery_completed() override;
+
+    ////////////////// Override Implementation of underlying store requirements //////////////////
+    unique< UnderlyingBtree > create_underlying_btree(BtreeBase& btree, bool load_existing) override;
+    folly::Future< folly::Unit > destroy_underlying_btree(BtreeBase& bt) override;
+    // void on_node_freed(BtreeNode* node) override;
+    bool is_fast_destroy_supported() const override { return true; }
+    bool is_ephemeral() const { return false; }
+    uint32_t max_node_size() const override;
+    uint32_t align_size() const;
+    uint32_t max_capacity() const;
+
+    // Implementations for flush
+    folly::Future< bool > async_cp_flush(COWBtreeCPContext* cp_ctx);
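+    // Note on the flush flow: async_cp_flush() above first decides between a full map flush and an incremental
+    // journal flush, based on the number of incremental flushes done since the last full flush and on how much of
+    // the meta vdev the accumulated journals consume. It then fans the per-btree node flush out across
+    // m_cp_flush_fibers and completes the returned future once the map (or journal) of every dirtied btree is
+    // persisted.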
+ uint32_t parallel_map_flushers_count() const; + +private: + void flush_map(COWBtreeCPContext* cp_ctx); + void load_journal(superblk< IndexStoreSuperBlock >& store_journal); +}; +} // namespace homestore diff --git a/src/lib/index/index_cp.cpp b/src/lib/index/index_cp.cpp index 30e6ac02e..1eb91da64 100644 --- a/src/lib/index/index_cp.cpp +++ b/src/lib/index/index_cp.cpp @@ -1,440 +1,66 @@ -#include -#include - -#include -#include "index/index_cp.hpp" -#include "index/wb_cache.hpp" -#include "common/homestore_assert.hpp" +#include +#include "index/index_cp.h" namespace homestore { -IndexCPCallbacks::IndexCPCallbacks(IndexWBCache* wb_cache) : m_wb_cache{wb_cache} {} +IndexCPCallbacks::IndexCPCallbacks() : m_store_cp_callbacks{enum_count< IndexStore::Type >()} {} std::unique_ptr< CPContext > IndexCPCallbacks::on_switchover_cp(CP* cur_cp, CP* new_cp) { - return std::make_unique< IndexCPContext >(new_cp); + std::vector< unique< CPContext > > store_contexts; + store_contexts.reserve(enum_count< IndexStore::Type >()); + for (auto& cp_callbacks : m_store_cp_callbacks) { + store_contexts.emplace_back(cp_callbacks ? std::move(cp_callbacks->on_switchover_cp(cur_cp, new_cp)) : nullptr); + } + return std::make_unique< IndexCPContext >(new_cp, std::move(store_contexts)); } folly::Future< bool > IndexCPCallbacks::cp_flush(CP* cp) { - auto ctx = s_cast< IndexCPContext* >(cp->context(cp_consumer_t::INDEX_SVC)); - return m_wb_cache->async_cp_flush(ctx); -} - -void IndexCPCallbacks::cp_cleanup(CP* cp) {} - -int IndexCPCallbacks::cp_progress_percent() { return 100; } - -/////////////////////// IndexCPContext section /////////////////////////// -IndexCPContext::IndexCPContext(CP* cp) : VDevCPContext(cp) {} - -void IndexCPContext::add_to_txn_journal(uint32_t index_ordinal, const IndexBufferPtr& parent_buf, - const IndexBufferPtr& left_child_buf, const IndexBufferPtrList& created_bufs, - const IndexBufferPtrList& freed_bufs) { - auto record_size = txn_record::size_for_num_ids(created_bufs.size() + freed_bufs.size() + (left_child_buf ? 1 : 0) + - (parent_buf ? 
1 : 0)); - std::unique_lock< iomgr::FiberManagerLib::mutex > lg{m_txn_journal_mtx}; - if (m_txn_journal_buf.bytes() == nullptr) { - m_txn_journal_buf = - std::move(sisl::io_blob_safe{std::max(sizeof(txn_journal), 512ul), 512, sisl::buftag::metablk}); - txn_journal* tj = new (m_txn_journal_buf.bytes()) txn_journal(); - tj->cp_id = id(); - } - - txn_journal* tj = r_cast< txn_journal* >(m_txn_journal_buf.bytes()); - if (m_txn_journal_buf.size() < tj->size + record_size) { - m_txn_journal_buf.buf_realloc(m_txn_journal_buf.size() + std::max(tj->size + record_size, 512u), 512, - sisl::buftag::metablk); - tj = r_cast< txn_journal* >(m_txn_journal_buf.bytes()); - } - - { - auto rec = tj->append_record(index_ordinal); - if (parent_buf) { - rec->append(op_t::parent_inplace, parent_buf->blkid()); - if (parent_buf->is_meta_buf()) { rec->is_parent_meta = 0x1; } - } - if (left_child_buf && (left_child_buf != parent_buf)) { - rec->append(op_t::child_inplace, left_child_buf->blkid()); - } - for (auto const& buf : created_bufs) { - rec->append(op_t::child_new, buf->blkid()); - } - for (auto const& buf : freed_bufs) { - rec->free_node_level = buf->m_node_level; - rec->append(op_t::child_freed, buf->blkid()); - } + std::vector< folly::Future< bool > > futs; + for (auto& cp_callbacks : m_store_cp_callbacks) { + if (cp_callbacks) { futs.emplace_back(cp_callbacks->cp_flush(cp)); } } -} - -void IndexCPContext::add_to_dirty_list(const IndexBufferPtr& buf) { - m_dirty_buf_list.push_back(buf); - buf->set_state(index_buf_state_t::DIRTY); - m_dirty_buf_count.increment(1); -} - -bool IndexCPContext::any_dirty_buffers() const { return !m_dirty_buf_count.testz(); } - -void IndexCPContext::prepare_flush_iteration() { m_dirty_buf_it = m_dirty_buf_list.begin(); } -std::optional< IndexBufferPtr > IndexCPContext::next_dirty() { - if (m_dirty_buf_it == m_dirty_buf_list.end()) { return std::nullopt; } - IndexBufferPtr ret = *m_dirty_buf_it; - ++m_dirty_buf_it; - return ret; -} - -std::string IndexCPContext::to_string_small() { - return fmt::format("IndexCPContext cpid={}, dirty_buf_count={}, dirty_buf_list_size={}", m_cp->id(), - m_dirty_buf_count.get(), m_dirty_buf_list.size()); -} - -std::string IndexCPContext::to_string_free_list() { - std::string str{ - fmt::format("IndexCPContext cpid={} free_blkid_list_size={}\n[", m_cp->id(), m_free_blkid_list.size())}; - if (m_free_blkid_list.size() == 0) { return str + "empty]"; } - m_free_blkid_list.foreach_entry( - [&str](BlkId bid) { fmt::format_to(std::back_inserter(str), "{}:{}, ", bid.to_integer(), bid.to_string()); }); - return str + "]"; -} - -std::string IndexCPContext::to_string() { - std::string str{fmt::format("IndexCPContext cpid={} dirty_buf_count={} dirty_buf_list_size={}\n", m_cp->id(), - m_dirty_buf_count.get(), m_dirty_buf_list.size())}; - - // Mapping from a node to all its parents in the graph. - // Display all buffers and its dependencies and state. - std::unordered_map< IndexBuffer*, std::vector< IndexBuffer* > > parents; - - m_dirty_buf_list.foreach_entry([&parents](IndexBufferPtr buf) { - // Add this buf to his children. 
- parents[buf->m_up_buffer.get()].emplace_back(buf.get()); - }); - - m_dirty_buf_list.foreach_entry([&str, &parents](IndexBufferPtr buf) { - fmt::format_to(std::back_inserter(str), "{}", buf->to_string()); - auto first = true; - for (const auto& p : parents[buf.get()]) { - if (first) { - fmt::format_to(std::back_inserter(str), "\nDepends:"); - first = false; + return folly::collectAllUnsafe(futs).thenValue([](auto&& vf) { + bool all_success = true; + for (auto const& success : vf) { + if (!success.value()) { + all_success = false; + break; } - fmt::format_to(std::back_inserter(str), " {}({})", r_cast< void* >(p), s_cast< int >(p->state())); } - fmt::format_to(std::back_inserter(str), "\n"); + return folly::makeFuture< bool >(std::move(all_success)); }); - return str; } -void IndexCPContext::to_string_dot(const std::string& filename) { - std::ofstream file(filename); - if (!file.is_open()) { throw std::runtime_error("Failed to open file: " + filename); } - - file << "digraph G {\n"; - - // Mapping from a node to all its parents in the graph. - std::unordered_map< IndexBuffer*, std::vector< IndexBuffer* > > parents; - - m_dirty_buf_list.foreach_entry([&parents](IndexBufferPtr buf) { - // Add this buf to his children. - parents[buf->m_up_buffer.get()].emplace_back(buf.get()); - }); - m_dirty_buf_list.foreach_entry([&file, &parents, this](IndexBufferPtr buf) { - std::vector< std::string > colors = {"lightgreen", "lightcoral", "lightyellow"}; - auto sbuf = BtreeNode::to_string_buf(buf->raw_buffer()); - auto pos = sbuf.find("LEAF"); - if (pos != std::string::npos) { - sbuf.insert(pos + 4, "
"); - } else { - pos = sbuf.find("INTERIOR"); - if (pos != std::string::npos) { sbuf.insert(pos + 8, "
"); } - } - file << fmt::format( - "\"{}\" [shape={}, label=< {}
{} >, fillcolor=\"{}\", style=\"filled\", fontname=\"bold\"];\n", - r_cast< void* >(buf.get()), m_cp->id() == buf->m_created_cp_id ? "ellipse" : "box", buf->to_string_dot(), - sbuf, colors[s_cast< int >(buf->state())]); - for (const auto& p : parents[buf.get()]) { - file << fmt::format("\"{}\" -> \"{}\";\n", r_cast< void* >(p), r_cast< void* >(buf.get())); - } - }); - file << "}\n"; - - file.close(); - LOGINFO("cp dag is stored in file {}", filename); -} - -uint16_t IndexCPContext::num_dags() { - // count number of buffers whose up_buffers are nullptr - uint16_t count = 0; - std::unique_lock lg{m_flush_buffer_mtx}; - m_dirty_buf_list.foreach_entry([&count](IndexBufferPtr buf) { - if (buf->m_up_buffer == nullptr) { count++; } - }); - return count; -} - -std::string IndexCPContext::to_string_with_dags() { - struct DagNode { - IndexBufferPtr buf; - std::vector< shared< DagNode > > down_nodes; - }; - std::vector< shared< DagNode > > group_roots; - std::unordered_map< IndexBufferPtr, shared< DagNode > > buf_to_dag_node; - - auto get_insert_buf = [&buf_to_dag_node](IndexBufferPtr buf) { - auto it = buf_to_dag_node.find(buf); - if (it == buf_to_dag_node.end()) { - auto dgn = std::make_shared< DagNode >(); - dgn->buf = buf; - buf_to_dag_node[buf] = dgn; - return dgn; - } - return it->second; - }; - - std::unique_lock lg{m_flush_buffer_mtx}; - // Create the graph - m_dirty_buf_list.foreach_entry([&get_insert_buf, &group_roots](IndexBufferPtr buf) { - if (buf->m_up_buffer == nullptr) { - auto dgn = get_insert_buf(buf); - group_roots.emplace_back(dgn); - } else { - auto dgn = get_insert_buf(buf); - auto up_dgn = get_insert_buf(buf->m_up_buffer); - up_dgn->down_nodes.emplace_back(dgn); - } - }); - - // Now walk through the list of graphs and prepare formatted string - std::string str{fmt::format("IndexCPContext cpid={} dirty_buf_count={} dirty_buf_list_size={} #_of_dags={}\n", - m_cp->id(), m_dirty_buf_count.get(), m_dirty_buf_list.size(), group_roots.size())}; - int cnt = 1; - for (const auto& root : group_roots) { - std::vector< std::tuple< std::shared_ptr< DagNode >, int, int > > stack; - stack.emplace_back(root, 0, cnt++); - while (!stack.empty()) { - auto [node, level, index] = stack.back(); - stack.pop_back(); - fmt::format_to(std::back_inserter(str), "{}{}-{} \n", std::string(level * 4, ' '), index, - node->buf->to_string()); - int c = node->down_nodes.size(); - for (const auto& d : node->down_nodes) { - stack.emplace_back(d, level + 1, c--); - } - } - } - - return str; -} - -void IndexCPContext::log_dags() { - LOGINFO("{}", to_string_with_dags()); - sisl::logging::GetLogger()->flush(); -} - -std::map< BlkId, IndexBufferPtr > IndexCPContext::recover(sisl::byte_view sb) { - txn_journal const* tj = r_cast< txn_journal const* >(sb.bytes()); - if (tj->cp_id != id()) { - // On clean shutdown, cp_id would be lesser than the current cp_id, in that case ignore this sb - HS_DBG_ASSERT_LT(tj->cp_id, id(), "Persisted cp in wb txn journal is more than current cp"); - return {}; +void IndexCPCallbacks::cp_cleanup(CP* cp) { + for (auto& cp_callbacks : m_store_cp_callbacks) { + if (cp_callbacks) { cp_callbacks->cp_cleanup(cp); } } - HS_DBG_ASSERT_GT(tj->num_txns, 0, "Invalid txn_journal, num_txns is zero"); - HS_DBG_ASSERT_GT(tj->size, 0, "Invalid txn_journal, size of records is zero"); - - std::map< BlkId, IndexBufferPtr > buf_map; - uint8_t const* cur_ptr = r_cast< uint8_t const* >(tj) + sizeof(txn_journal); - - for (uint32_t t{0}; t < tj->num_txns; ++t) { - txn_record const* rec = r_cast< txn_record const* 
>(cur_ptr); - HS_DBG_ASSERT_GT(rec->total_ids(), 0, "Invalid txn_record, has no ids in it"); - - process_txn_record(rec, buf_map); - cur_ptr += rec->size(); - LOGTRACEMOD(wbcache, "Recovered txn record: {}: {}", t, rec->to_string()); - } - auto modifyBuffer = [](IndexBufferPtr& buffer) { - IndexBufferPtr up_buf = buffer->m_up_buffer; - auto real_up_buf = up_buf; - while (real_up_buf && real_up_buf->m_node_freed) { - real_up_buf = real_up_buf->m_up_buffer; - } - if (real_up_buf != up_buf) { - up_buf->remove_down_buffer(buffer); - buffer->m_up_buffer = real_up_buf; - real_up_buf->add_down_buffer(buffer); - LOGTRACEMOD(wbcache, "Change upbuffer from {} to {}", up_buf->to_string(), - buffer->m_up_buffer->to_string()); - } - }; -#if 0 - auto dag_print = [](const std::map< BlkId, IndexBufferPtr >& dags, std::string delimiter) { - int index = 1; - for (const auto& [blkid, bufferPtr] : dags) { - LOGTRACEMOD(wbcache, "{}{} - blkid {} buffer {} ", delimiter, index++, blkid.to_integer(), - bufferPtr->to_string()); - } - }; - LOGTRACEMOD(wbcache,"Before modify : \n "); - dag_print(buf_map, "Before: "); -#endif - for (auto& [blkid, bufferPtr] : buf_map) { - modifyBuffer(bufferPtr); - } - // LOGTRACEMOD(wbcache,"\n\n\nAFTER modify : \n "); - // dag_print(buf_map, "After: "); - - auto sanityCheck = [](const std::map< BlkId, IndexBufferPtr >& dags) { - for (const auto& [blkid, bufferPtr] : dags) { - auto up_buffer = bufferPtr->m_up_buffer; - if (up_buffer) { - HS_REL_ASSERT( - !up_buffer->m_node_freed, - "Sanity check failed: Buffer {} blkdid {} has an up_buffer {} blkid that is marked as freed.", - bufferPtr->to_string(), blkid.to_integer(), up_buffer->to_string(), - up_buffer->blkid().to_integer()); - HS_REL_ASSERT_EQ(up_buffer->m_created_cp_id, -1, - "Sanity check failed: Buffer {} has an up_buffer {} that just created", - bufferPtr->to_string(), up_buffer->to_string()); - HS_REL_ASSERT_EQ(up_buffer->m_index_ordinal, bufferPtr->m_index_ordinal, - "Sanity check failed: Buffer {} has an up_buffer {} that has different index_ordinal.", - bufferPtr->to_string(), up_buffer->to_string()); - HS_REL_ASSERT(!bufferPtr->is_meta_buf(), - "Sanity check failed: down buffer {} is meta buffer of up buffer {}", - bufferPtr->to_string(), up_buffer->to_string()); - HS_REL_ASSERT( - !up_buffer->m_wait_for_down_buffers.testz(), - "Sanity check failed: Buffer {} has an up_buffer {} that has zero m_wait_for_down_buffers.", - bufferPtr->to_string(), up_buffer->to_string()); -#ifdef _PRERELEASE - HS_DBG_ASSERT(up_buffer->is_in_down_buffers(bufferPtr), - "Sanity check failed: up_buffer {} has't {} as a down_buffer.", up_buffer->to_string(), - bufferPtr->to_string()); -#endif - } - HS_REL_ASSERT(!bufferPtr->m_node_freed || bufferPtr->m_wait_for_down_buffers.testz(), - "Sanity check failed: Freed buffer {} has non-zero m_wait_for_down_buffers.", - bufferPtr->to_string()); -#ifdef _PRERELEASE - HS_DBG_ASSERT(bufferPtr->m_wait_for_down_buffers.test_eq(bufferPtr->m_down_buffers.size()), - "Sanity check failed: Buffer {} has a mismatch between down_buffers_count and " - "m_wait_for_down_buffers.", - bufferPtr->to_string()); -#endif - } - }; - - sanityCheck(buf_map); - return buf_map; } -void IndexCPContext::process_txn_record(txn_record const* rec, std::map< BlkId, IndexBufferPtr >& buf_map) { - auto cpg = cp_mgr().cp_guard(); - - auto const rec_to_buf = [&buf_map, &cpg](txn_record const* rec, bool is_meta, BlkId const& bid, - IndexBufferPtr const& up_buf) -> IndexBufferPtr { - IndexBufferPtr buf; - auto it = buf_map.find(bid); - if 
(it == buf_map.end()) { - if (is_meta) { - superblk< index_table_sb > tmp_sb; - buf = std::make_shared< MetaIndexBuffer >(tmp_sb); - } else { - buf = std::make_shared< IndexBuffer >(nullptr, bid); - } - - [[maybe_unused]] auto [it2, happened] = buf_map.insert(std::make_pair(bid, buf)); - DEBUG_ASSERT(happened, "buf_map insert failed"); - - buf->m_dirtied_cp_id = cpg->id(); - buf->m_index_ordinal = rec->index_ordinal; - } else { - buf = it->second; - } - - if (up_buf) { - auto real_up_buf = up_buf; - if (up_buf->m_created_cp_id == cpg->id()) { - real_up_buf = up_buf->m_up_buffer; - } else if (up_buf->m_node_freed) { - real_up_buf = up_buf->m_up_buffer; - LOGTRACEMOD(wbcache, "\n\n change upbuffer from {} to {}\n\n", up_buf->to_string(), - real_up_buf->to_string()); - } - -#ifndef NDEBUG - // if (!is_sibling_link || (buf->m_up_buffer == real_up_buf)) { return buf;} - // Already linked with same buf or its not a sibling link to override - if (real_up_buf->is_in_down_buffers(buf)) { return buf; } -#endif - - if (buf->m_up_buffer != real_up_buf) { - if (buf->m_up_buffer) { buf->m_up_buffer->remove_down_buffer(buf); } - real_up_buf->add_down_buffer(buf); - buf->m_up_buffer = real_up_buf; - } +int IndexCPCallbacks::cp_progress_percent() { + uint32_t count = 0; + uint32_t pct = 0; + for (auto& cp_callbacks : m_store_cp_callbacks) { + if (cp_callbacks) { + pct += cp_callbacks->cp_progress_percent(); + ++count; } - return buf; - }; - - uint32_t cur_idx = 0; - IndexBufferPtr parent_buf{nullptr}; - if (rec->has_inplace_parent) { parent_buf = rec_to_buf(rec, rec->is_parent_meta, rec->blk_id(cur_idx++), nullptr); } - - IndexBufferPtr inplace_child_buf{nullptr}; - if (rec->has_inplace_child) { - inplace_child_buf = rec_to_buf(rec, false /* is_meta */, rec->blk_id(cur_idx++), parent_buf); - } - - for (uint8_t idx{0}; idx < rec->num_new_ids; ++idx) { - auto new_buf = rec_to_buf(rec, false /* is_meta */, rec->blk_id(cur_idx++), - inplace_child_buf ? inplace_child_buf : parent_buf); - new_buf->m_created_cp_id = cpg->id(); - } - - for (uint8_t idx{0}; idx < rec->num_freed_ids; ++idx) { - auto freed_buf = rec_to_buf(rec, false /* is_meta */, rec->blk_id(cur_idx++), - inplace_child_buf ? inplace_child_buf : parent_buf); - freed_buf->m_node_level = rec->free_node_level; - freed_buf->m_node_freed = true; } + return (count) ? 
pct / count : 100; } -void IndexCPContext::txn_journal::log_records() const { LOGINFO("{}", to_string()); } - -std::string IndexCPContext::txn_journal::to_string() const { - std::string str = fmt::format("cp_id={}, num_txns={}, size={}", cp_id, num_txns, size); - uint8_t const* cur_ptr = r_cast< uint8_t const* >(this) + sizeof(txn_journal); - for (uint32_t t{0}; t < num_txns; ++t) { - txn_record const* rec = r_cast< txn_record const* >(cur_ptr); - fmt::format_to(std::back_inserter(str), "\n {}: {}", t, rec->to_string()); - cur_ptr += rec->size(); +void IndexCPCallbacks::register_consumer(IndexStore::Type store_type, unique< CPCallbacks > store_cp_cbs) { + // As soon as store cp callbacks is registered, we need to provide them the option to create new cp context + if (store_cp_cbs) { + auto cpg = cp_mgr().cp_guard(); + auto ctx = s_cast< IndexCPContext* >(cpg.context(cp_consumer_t::INDEX_SVC)); + ctx->m_store_contexts[(size_t)store_type] = std::move(store_cp_cbs->on_switchover_cp(nullptr, cpg.get())); } - return str; + m_store_cp_callbacks[uint32_cast(store_type)] = std::move(store_cp_cbs); } -std::string IndexCPContext::txn_record::to_string() const { - auto add_to_string = [this](std::string& str, uint8_t& idx, uint8_t id_count) { - if (id_count == 0) { - fmt::format_to(std::back_inserter(str), "empty]"); - } else { - for (uint8_t i{0}; i < id_count; ++i) { - fmt::format_to(std::back_inserter(str), "[{}],", blk_id(idx++).to_integer()); - } - fmt::format_to(std::back_inserter(str), "]"); - } - }; - - std::string str = fmt::format("ordinal={}, parent=[{}], in_place_child=[{}]", index_ordinal, parent_id_string(), - child_id_string(), num_new_ids, num_freed_ids); - - uint8_t idx = ((has_inplace_parent == 0x1) ? 1 : 0) + ((has_inplace_child == 0x1) ? 1 : 0); - fmt::format_to(std::back_inserter(str), ", new_ids=["); - add_to_string(str, idx, num_new_ids); +/////////////////////// IndexCPContext section /////////////////////////// +IndexCPContext::IndexCPContext(CP* cp, std::vector< unique< CPContext > > store_ctxs) : + CPContext(cp), m_store_contexts{std::move(store_ctxs)} {} - fmt::format_to(std::back_inserter(str), ", freed_ids=["); - add_to_string(str, idx, num_freed_ids); - if (num_freed_ids) { - fmt::format_to(std::back_inserter(str), ", freed_node_level= {}", (uint8_t)(free_node_level)); - }; - fmt::format_to(std::back_inserter(str), "{}", (is_parent_meta ? ", parent is meta" : "")); - return str; -} -} // namespace homestore +} // namespace homestore \ No newline at end of file diff --git a/src/lib/index/index_cp.h b/src/lib/index/index_cp.h new file mode 100644 index 000000000..17c87b0e1 --- /dev/null +++ b/src/lib/index/index_cp.h @@ -0,0 +1,63 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. 
+ * + *********************************************************************************/ +#pragma once +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace homestore { +class IndexCPCallbacks : public CPCallbacks { +public: + IndexCPCallbacks(); + virtual ~IndexCPCallbacks() = default; + +public: + std::unique_ptr< CPContext > on_switchover_cp(CP* cur_cp, CP* new_cp) override; + folly::Future< bool > cp_flush(CP* cp) override; + void cp_cleanup(CP* cp) override; + int cp_progress_percent() override; + + void register_consumer(IndexStore::Type store_type, unique< CPCallbacks > store_cp_cbs); + +private: + std::vector< unique< CPCallbacks > > m_store_cp_callbacks; +}; + +struct IndexCPContext : public CPContext { +public: + std::vector< unique< CPContext > > m_store_contexts; + +public: + IndexCPContext(CP* cp, std::vector< unique< CPContext > > store_ctxs); + ~IndexCPContext() = default; + + template < typename T > + static T* convert(CPContext* ctx, IndexStore::Type store_type) { + return r_cast< T* >((r_cast< IndexCPContext* >(ctx))->m_store_contexts[uint32_cast(store_type)].get()); + } + + template < typename T > + static T* store_context(CP* cp, IndexStore::Type store_type) { + return convert< T >(cp->context(cp_consumer_t::INDEX_SVC), store_type); + } +}; +} // namespace homestore diff --git a/src/lib/index/index_service.cpp b/src/lib/index/index_service.cpp index db2e19172..738ba647d 100644 --- a/src/lib/index/index_service.cpp +++ b/src/lib/index/index_service.cpp @@ -15,347 +15,213 @@ *********************************************************************************/ #include #include -#include #include -#include "index/wb_cache.hpp" -#include "index/index_cp.hpp" + +#include #include "common/homestore_utils.hpp" #include "common/homestore_assert.hpp" #include "device/virtual_dev.hpp" #include "device/physical_dev.hpp" #include "device/chunk.h" +#include "index/cow_btree/cow_btree_store.h" +//#include "index/inplace_btree/inplace_btree_store.h" +#include "index/mem_btree/mem_btree_store.h" +#include "index/index_cp.h" namespace homestore { IndexService& index_service() { return hs()->index_service(); } -IndexService::IndexService(std::unique_ptr< IndexServiceCallbacks > cbs, shared< ChunkSelector > chunk_selector) : - m_svc_cbs{std::move(cbs)}, m_custom_chunk_selector{std::move(chunk_selector)} { +IndexService::IndexService(std::unique_ptr< IndexServiceCallbacks > cbs, + std::vector< ServiceSubType > const& sub_types) : + m_svc_cbs{std::move(cbs)} { m_ordinal_reserver = std::make_unique< sisl::IDReserver >(); meta_service().register_handler( - "index", + "index_table", [this](meta_blk* mblk, sisl::byte_view buf, size_t size) { - m_itable_sbs.emplace_back(std::pair{mblk, std::move(buf)}); + superblk< IndexSuperBlock > sb("index_table"); + sb.load(buf, mblk); + m_index_sbs.emplace_back(std::move(sb)); }, nullptr); meta_service().register_handler( - "wb_cache", - [this](meta_blk* mblk, sisl::byte_view buf, size_t size) { m_wbcache_sb = std::pair{mblk, std::move(buf)}; }, + "index_store", + [this](meta_blk* mblk, sisl::byte_view buf, size_t size) { + superblk< IndexStoreSuperBlock > sb("index_store"); + sb.load(buf, mblk); + m_store_sbs.emplace_back(std::move(sb)); + }, nullptr); } -void IndexService::create_vdev(uint64_t size, HSDevType devType, uint32_t num_chunks, - chunk_selector_type_t chunk_sel_type) { +void IndexService::create_vdev(ServiceSubType sub_type, uint64_t size, HSDevType devType, uint32_t num_chunks) { auto const 
atomic_page_size = hs()->device_mgr()->atomic_page_size(devType); hs_vdev_context vdev_ctx; vdev_ctx.type = hs_vdev_type_t::INDEX_VDEV; + vdev_ctx.sub_type = sub_type; hs()->device_mgr()->create_vdev(vdev_parameters{.vdev_name = "index", .vdev_size = size, .num_chunks = num_chunks, .blk_size = atomic_page_size, .dev_type = devType, - .alloc_type = blk_allocator_type_t::fixed, - .chunk_sel_type = chunk_sel_type, + .alloc_type = blk_allocator_type_t::varsize, + .chunk_sel_type = chunk_selector_type_t::ROUND_ROBIN, .multi_pdev_opts = vdev_multi_pdev_opts_t::ALL_PDEV_STRIPED, .context_data = vdev_ctx.to_blob()}); } -shared< VirtualDev > IndexService::open_vdev(const vdev_info& vinfo, bool load_existing) { - m_vdev = std::make_shared< VirtualDev >(*(hs()->device_mgr()), vinfo, nullptr /* event_cb */, - true /* auto_recovery */, m_custom_chunk_selector); - return m_vdev; +shared< VirtualDev > IndexService::open_vdev(ServiceSubType sub_type, const vdev_info& vinfo, bool load_existing) { + auto const vdev = + std::make_shared< VirtualDev >(*(hs()->device_mgr()), vinfo, nullptr /* event_cb */, false /* auto_recovery */); + m_vdevs.insert(std::make_pair(sub_type, vdev)); + return vdev; } -uint32_t IndexService::reserve_ordinal() { return m_ordinal_reserver->reserve(); } +void IndexService::start() { + cp_mgr().register_consumer(cp_consumer_t::INDEX_SVC, std::move(std::make_unique< IndexCPCallbacks >())); -bool IndexService::reserve_ordinal(uint32_t ordinal) { - if (m_ordinal_reserver->is_reserved(ordinal)) { - LOGERROR("ordinal {} is already reserved", ordinal); - return false; - } - m_ordinal_reserver->reserve(ordinal); - return true; -} + if (m_store_sbs.size()) { + // Segregate the index store super blocks based on the store type + std::unordered_map< IndexStore::Type, std::vector< superblk< IndexStoreSuperBlock > > > m; + for (auto& sb : m_store_sbs) { + m[sb->index_store_type].emplace_back(std::move(sb)); + } -bool IndexService::unreserve_ordinal(uint32_t ordinal) { - if (!m_ordinal_reserver->is_reserved(ordinal)) { - LOGERROR("ordinal {} doesn't exist", ordinal); - return false; + for (auto& [store_type, sbs] : m) { + lookup_or_create_store(store_type, std::move(sbs)); + } } - m_ordinal_reserver->unreserve(ordinal); - return true; -} - -void IndexService::start() { - // Start Writeback cache - m_wb_cache = std::make_unique< IndexWBCache >(m_vdev, m_wbcache_sb, hs()->evictor(), - hs()->device_mgr()->atomic_page_size(HSDevType::Fast)); // Load any index tables which are to loaded from meta blk - for (auto const& [meta_cookie, buf] : m_itable_sbs) { - superblk< index_table_sb > sb; - sb.load(buf, meta_cookie); - auto inode = sb->total_interior_nodes; - auto lnode = sb->total_leaf_nodes; - auto depth = sb->btree_depth; - LOGINFO("sb metrics interior {}, leaf: {} depth {}", inode, lnode, depth); - auto tbl = m_svc_cbs->on_index_table_found(std::move(sb)); - tbl->load_metrics(inode, lnode, depth); - reserve_ordinal(tbl->ordinal()); - add_index_table(tbl); + for (auto& sb : m_index_sbs) { + m_ordinal_reserver->reserve(sb->ordinal); + add_index_table(m_svc_cbs->on_index_table_found(std::move(sb))); } - // Recover the writeback cache, which in-turns recovers any index table nodes - m_wb_cache->recover(m_wbcache_sb.second); - - // Notify each table that we have completed recovery - std::unique_lock lg(m_index_map_mtx); - for (const auto& [_, tbl] : m_index_map) { - tbl->recovery_completed(); -#ifdef _PRERELEASE - tbl->audit_tree(); -#endif - } - // Force taking cp after recovery done. 
This makes sure that the index table is in consistent state and dirty - // buffer after recovery can be added to dirty list for flushing in the new cp - hs()->cp_mgr().trigger_cp_flush(true /* force */); -} - -void IndexService::write_sb(uint32_t ordinal) { - if (is_stopping()) return; - incr_pending_request_num(); + // Notify each index store that we have completed recovery std::unique_lock lg(m_index_map_mtx); - auto const it = m_ordinal_index_map.find(ordinal); - if (it != m_ordinal_index_map.cend()) { it->second->update_sb(); } - decr_pending_request_num(); -} - -IndexService::~IndexService() { m_wb_cache.reset(); } - -bool IndexService::sanity_check(const uint32_t index_ordinal, const IndexBufferPtrList& bufs) const { - auto tbl = get_index_table(index_ordinal); - if (!tbl) { - HS_DBG_ASSERT(false, "Index corresponding to ordinal={} has not been loaded yet, unexpected", index_ordinal); + for (auto& [type, store] : m_index_stores) { + store->on_recovery_completed(); } - return tbl->sanity_check(bufs); } void IndexService::stop() { - start_stopping(); - while (true) { - if (!get_pending_request_num()) break; - std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + m_index_map.clear(); + m_ordinal_index_map.clear(); + + for (auto& [type, store] : m_index_stores) { + store->stop(); + store.reset(); } - std::unique_lock lg(m_index_map_mtx); - for (auto& [_, table] : m_index_map) - table->stop(); } -uint64_t IndexService::num_tables() { - std::unique_lock lg(m_index_map_mtx); - return m_index_map.size(); +shared< VirtualDev > IndexService::get_vdev(ServiceSubType sub_type) { + auto it = m_vdevs.find(sub_type); + HS_REL_ASSERT(it != m_vdevs.end(), "Vdev not found for sub_type={}, vdev not created/opened?", sub_type); + return it->second; } -bool IndexService::add_index_table(const std::shared_ptr< IndexTableBase >& tbl) { - if (is_stopping()) return false; - incr_pending_request_num(); - std::unique_lock lg(m_index_map_mtx); - m_index_map.insert(std::make_pair(tbl->uuid(), tbl)); - m_ordinal_index_map.insert(std::make_pair(tbl->ordinal(), tbl)); - decr_pending_request_num(); - return true; +IndexStore* IndexService::lookup_store(IndexStore::Type store_type) { + auto it = m_index_stores.find(store_type); + return (it != m_index_stores.end()) ? it->second.get() : nullptr; } -bool IndexService::remove_index_table(const std::shared_ptr< IndexTableBase >& tbl) { - if (is_stopping()) return false; - incr_pending_request_num(); +shared< IndexStore > IndexService::lookup_or_create_store(IndexStore::Type store_type, + std::vector< superblk< IndexStoreSuperBlock > > sbs) { std::unique_lock lg(m_index_map_mtx); - if (!unreserve_ordinal(tbl->ordinal())) { - decr_pending_request_num(); - return false; - } - m_index_map.erase(tbl->uuid()); - m_ordinal_index_map.erase(tbl->ordinal()); - decr_pending_request_num(); - return true; -} + auto it = m_index_stores.find(store_type); + if (it != m_index_stores.end()) { return it->second; } -std::shared_ptr< IndexTableBase > IndexService::get_index_table(uuid_t uuid) const { - if (is_stopping()) return nullptr; - incr_pending_request_num(); - std::unique_lock lg(m_index_map_mtx); - auto const it = m_index_map.find(uuid); - auto ret = (it != m_index_map.cend()) ? 
it->second : nullptr; - decr_pending_request_num(); - return ret; -} + shared< IndexStore > store; -std::shared_ptr< IndexTableBase > IndexService::get_index_table(uint32_t ordinal) const { - if (is_stopping()) return nullptr; - incr_pending_request_num(); - std::unique_lock lg(m_index_map_mtx); - auto const it = m_ordinal_index_map.find(ordinal); - auto ret = (it != m_ordinal_index_map.cend()) ? it->second : nullptr; - decr_pending_request_num(); - return ret; -} + switch (store_type) { + case IndexStore::Type::COPY_ON_WRITE_BTREE: + store = std::make_shared< COWBtreeStore >(get_vdev(ServiceSubType::INDEX_BTREE_COPY_ON_WRITE), std::move(sbs)); + break; -void IndexService::repair_index_node(uint32_t ordinal, IndexBufferPtr const& node_buf) { - auto tbl = get_index_table(ordinal); - if (tbl) { - tbl->repair_node(node_buf); - } else { - HS_DBG_ASSERT(false, "Index corresponding to ordinal={} has not been loaded yet, unexpected", ordinal); - } -} + case IndexStore::Type::INPLACE_BTREE: +#if 0 + store = std::make_shared< InPlaceBtreeStore >(get_vdev(ServiceSubType::INDEX_BTREE_INPLACE), std::move(sbs), + hs()->evictor(), + hs()->device_mgr()->atomic_page_size(HSDevType::Fast)); +#endif + break; -void IndexService::parent_recover(uint32_t ordinal, IndexBufferPtr const& node_buf) { - auto tbl = get_index_table(node_buf->m_index_ordinal); - if (tbl) { - tbl->delete_stale_children(node_buf); - } else { - HS_DBG_ASSERT(false, "Index corresponding to ordinal={} has not been loaded yet, unexpected", - node_buf->m_index_ordinal); - } -} + case IndexStore::Type::MEM_BTREE: + store = std::make_shared< MemBtreeStore >(); + break; -void IndexService::update_root(uint32_t ordinal, IndexBufferPtr const& node_buf) { - auto tbl = get_index_table(ordinal); - if (tbl) { - tbl->repair_root_node(node_buf); - } else { - HS_DBG_ASSERT(false, "Index corresponding to ordinal={} has not been loaded yet, unexpected", ordinal); + default: + HS_REL_ASSERT(false, "Unsupported index store type {}", store_type); + break; } + m_index_stores.emplace(std::pair(store_type, store)); + return store; } -uint32_t IndexService::node_size() const { return m_vdev->atomic_page_size(); } - -uint64_t IndexService::used_size() const { - auto size{0}; - std::unique_lock lg{m_index_map_mtx}; - for (auto& [id, table] : m_index_map) { - size += table->used_size(); - } - return size; +void IndexService::add_index_table(const shared< Index >& index) { + std::unique_lock lg(m_index_map_mtx); + m_index_map.insert(std::make_pair(index->uuid(), index)); + m_ordinal_index_map.insert(std::make_pair(index->ordinal(), index)); } -/////////////////////// IndexBuffer methods ////////////////////////// -IndexBuffer::IndexBuffer(BlkId blkid, uint32_t buf_size, uint32_t align_size) : - m_blkid{blkid}, m_bytes{hs_utils::iobuf_alloc(buf_size, sisl::buftag::btree_node, align_size)} {} +folly::Future< folly::Unit > IndexService::destroy_index_table(const shared< Index >& index) { + auto const uuid = index->uuid(); + auto const ordinal = index->ordinal(); + auto fut = index->destroy(); -IndexBuffer::IndexBuffer(uint8_t* raw_bytes, BlkId blkid) : m_blkid(blkid), m_bytes{raw_bytes} {} - -IndexBuffer::~IndexBuffer() { - if (m_bytes) { hs_utils::iobuf_free(m_bytes, sisl::buftag::btree_node); } -} - -std::string IndexBuffer::to_string() const { - static std::vector< std::string > state_str = {"CLEAN", "DIRTY", "FLUSHING"}; - // store m_down_buffers in a string - std::string down_bufs = ""; -#ifndef NDEBUG + // We remove from the map right away for the following 
reason: + // Typically before a btree is destroyed, it could have done some IO or merging the nodes. So if IO is done, then + // btree is initiated a destroy and then CP is taken, underlying btree will request for all indexes and flush + // them before it starts processing the destroyed btrees. This is because maintaining a map of removed btrees and + // removing from flusing is slightly more expensive for something that is rare event (delete of a btree). So we + // remove the map right away to minimize this cost. { - std::lock_guard lg(m_down_buffers_mtx); - if (m_down_buffers.empty()) { - fmt::format_to(std::back_inserter(down_bufs), "EMPTY"); - } else { - for (auto const& down_buf : m_down_buffers) { - if (auto ptr = down_buf.lock()) { - fmt::format_to(std::back_inserter(down_bufs), "[{}]", voidptr_cast(ptr.get())); - } - } - fmt::format_to(std::back_inserter(down_bufs), " #down bufs={}", m_down_buffers.size()); - } - } -#endif - - if (m_is_meta_buf) { - return fmt::format("[Meta] Buf={} index={} state={} create/dirty_cp={}/{} down_wait#={}{} down={{{}}}", - voidptr_cast(const_cast< IndexBuffer* >(this)), m_index_ordinal, - state_str[int_cast(state())], m_created_cp_id, m_dirtied_cp_id, - m_wait_for_down_buffers.get(), m_node_freed ? " Freed" : "", down_bufs); - } else { - - return fmt::format( - "Buf={} index={} state={} create/dirty_cp={}/{} down_wait#={}{} up={} node=[{}] down={{{}}}", - voidptr_cast(const_cast< IndexBuffer* >(this)), m_index_ordinal, state_str[int_cast(state())], - m_created_cp_id, m_dirtied_cp_id, m_wait_for_down_buffers.get(), m_node_freed ? " Freed" : "", - voidptr_cast(const_cast< IndexBuffer* >(m_up_buffer.get())), - (m_bytes == nullptr) ? "not attached yet" : r_cast< persistent_hdr_t const* >(m_bytes)->to_compact_string(), - down_bufs); - } -} + std::unique_lock lg(m_index_map_mtx); + auto it = m_index_map.find(uuid); + if (it == m_index_map.end()) { return folly::makeFuture< folly::Unit >(folly::Unit{}); } -std::string IndexBuffer::to_string_dot() const { - auto str = fmt::format("IndexBuffer {} ", reinterpret_cast< void* >(const_cast< IndexBuffer* >(this))); - if (m_bytes == nullptr) { - fmt::format_to(std::back_inserter(str), " node_buf=nullptr "); - } else { - fmt::format_to(std::back_inserter(str), " node_buf={} {} created/dirtied={}/{} {} down_wait#={}", - static_cast< void* >(m_bytes), m_is_meta_buf ? "[META]" : "", m_created_cp_id, m_dirtied_cp_id, - m_node_freed ? "FREED" : "", m_wait_for_down_buffers.get()); + m_ordinal_index_map.erase(ordinal); + m_index_map.erase(it); } - return str; -} -void IndexBuffer::add_down_buffer(const IndexBufferPtr& buf) { - m_wait_for_down_buffers.increment(); -#ifndef NDEBUG - { - std::lock_guard lg(m_down_buffers_mtx); - m_down_buffers.push_back(buf); - } -#endif + // We cannot unreserve the ordinal, until we complete the destroy in underlying tree, otherwise, there could be 2 + // live btrees with same ordinal. 
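
(For illustration only, not part of the patch: a minimal, self-contained model of the two-phase teardown described in the comments above. Registry, live and reserved_ordinals are hypothetical names, not Homestore APIs; the point is that the lookup maps are unpublished synchronously, while the ordinal is released only in a continuation chained on the destroy future, so no second live index can claim the same ordinal while a destroy is still in flight.)

    #include <folly/futures/Future.h>
    #include <map>
    #include <mutex>
    #include <set>

    struct Registry {
        std::mutex mtx;
        std::map< uint32_t, int > live;         // ordinal -> index handle (simplified)
        std::set< uint32_t > reserved_ordinals; // ordinals that may not be handed out again

        folly::Future< folly::Unit > destroy(uint32_t ordinal, folly::Future< folly::Unit > destroy_fut) {
            {
                std::lock_guard lg(mtx);
                live.erase(ordinal); // 1. unpublish now: the next CP flush no longer sees this index
            }
            return std::move(destroy_fut).thenValue([this, ordinal](auto&&) {
                std::lock_guard lg(mtx);
                reserved_ordinals.erase(ordinal); // 2. recycle the ordinal only after destroy completes
                return folly::Unit{};
            });
        }
    };
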
+ return std::move(fut).thenValue([this, ordinal](auto&&) { + m_ordinal_reserver->unreserve(ordinal); + return folly::makeFuture< folly::Unit >(folly::Unit{}); + }); } -void IndexBuffer::remove_down_buffer(const IndexBufferPtr& buf) { - m_wait_for_down_buffers.decrement(); -#ifndef NDEBUG - bool found{false}; - { - std::lock_guard lg(m_down_buffers_mtx); - for (auto it = buf->m_up_buffer->m_down_buffers.begin(); it != buf->m_up_buffer->m_down_buffers.end(); ++it) { - if (it->lock() == buf) { - buf->m_up_buffer->m_down_buffers.erase(it); - found = true; - break; - } - } - } - HS_DBG_ASSERT(found, "Down buffer {} is linked to up_buf, but up_buf {} doesn't have down_buf in its list", - buf->to_string(), buf->m_up_buffer ? buf->m_up_buffer->to_string() : std::string("nulptr")); -#endif +shared< Index > IndexService::get_index_table(uuid_t uuid) const { + std::shared_lock lg(m_index_map_mtx); + auto const it = m_index_map.find(uuid); + return (it != m_index_map.cend()) ? it->second : nullptr; } -#ifndef NDEBUG -bool IndexBuffer::is_in_down_buffers(const IndexBufferPtr& buf) { - std::lock_guard< std::mutex > lg(m_down_buffers_mtx); - for (auto const& dbuf : m_down_buffers) { - if (dbuf.lock() == buf) { return true; } - } - return false; +shared< Index > IndexService::get_index_table(uint32_t ordinal) const { + std::shared_lock lg(m_index_map_mtx); + auto const it = m_ordinal_index_map.find(ordinal); + return (it != m_ordinal_index_map.cend()) ? it->second : nullptr; } -#endif -MetaIndexBuffer::MetaIndexBuffer(superblk< index_table_sb >& sb) : IndexBuffer{nullptr, BlkId{}}, m_sb{sb} { - m_is_meta_buf = true; +std::vector< shared< Index > > IndexService::get_all_index_tables() const { + std::shared_lock lg(m_index_map_mtx); + std::vector< shared< Index > > v; + std::transform(m_index_map.begin(), m_index_map.end(), std::back_inserter(v), + [](auto const& kv) { return kv.second; }); + return v; } -MetaIndexBuffer::MetaIndexBuffer(shared< MetaIndexBuffer > const& other) : - IndexBuffer{nullptr, BlkId{}}, m_sb{other->m_sb} { - m_is_meta_buf = true; - m_bytes = hs_utils::iobuf_alloc(m_sb.size(), sisl::buftag::metablk, meta_service().align_size()); - copy_sb_to_buf(); -} +uint32_t IndexService::reserve_ordinal() { return m_ordinal_reserver->reserve(); } -MetaIndexBuffer::~MetaIndexBuffer() { - if (m_bytes) { - hs_utils::iobuf_free(m_bytes, sisl::buftag::metablk); - m_bytes = nullptr; +uint64_t IndexService::space_occupied() const { + auto size{0}; + std::unique_lock lg{m_index_map_mtx}; + for (auto& [id, index] : m_index_map) { + size += index->space_occupied(); } - m_valid = false; + return size; } - -void MetaIndexBuffer::copy_sb_to_buf() { std::memcpy(m_bytes, m_sb.raw_buf()->cbytes(), m_sb.size()); } } // namespace homestore diff --git a/src/lib/index/README.md b/src/lib/index/inplace_btree/README.md similarity index 100% rename from src/lib/index/README.md rename to src/lib/index/inplace_btree/README.md diff --git a/src/lib/index/inplace_btree/index.hpp b/src/lib/index/inplace_btree/index.hpp new file mode 100644 index 000000000..2921fcb74 --- /dev/null +++ b/src/lib/index/inplace_btree/index.hpp @@ -0,0 +1,64 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at
+ *    https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed
+ * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+ * CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ *
+ *********************************************************************************/
+#pragma once
+
+#include
+#include
+#include
+#include
+
+namespace homestore {
+
+using bnodeid_t = uint64_t;
+class IndexTableBase;
+
+#pragma pack(1)
+struct index_table_sb {
+    uint64_t magic{indx_sb_magic};
+    uint32_t version{indx_sb_version};
+    uuid_t uuid;        // UUID of the index
+    uuid_t parent_uuid; // UUID of the parent container of index (controlled by user)
+
+    // Btree Section
+    bnodeid_t root_node{empty_bnodeid}; // Root Node ID
+    uint64_t root_link_version{0};      // Link version to btree root node
+    int64_t index_size{0};              // Size of the Index
+    // seq_id_t last_seq_id{-1};        // TODO: See if this is needed
+
+    uint32_t ordinal{0}; // Ordinal of the Index
+
+    uint32_t user_sb_size; // Size of the user superblk
+    uint8_t user_sb_bytes[0];
+};
+#pragma pack()
+
+class IndexStoreBase;
+
+// This class represents the base abstract class of an Index. At present, btree is the only implementation of Index.
+class Index {
+private:
+    superblk< index_table_sb > m_sb;
+    bool const m_is_ephemeral;
+
+public:
+    Index(bool is_ephemeral) : m_is_ephemeral{is_ephemeral} {}
+    virtual ~Index() = default;
+    virtual bool is_ephemeral() const { return m_is_ephemeral; }
+    virtual uuid_t uuid() const { return m_sb->uuid; }
+    virtual superblk< index_table_sb >& mutable_super_blk() { return m_sb; }
+    virtual const superblk< index_table_sb >& mutable_super_blk() const { return m_sb; }
+};
+
+} // namespace homestore
diff --git a/src/lib/index/inplace_btree/index_buffer.cpp b/src/lib/index/inplace_btree/index_buffer.cpp
new file mode 100644
index 000000000..09e9239a4
--- /dev/null
+++ b/src/lib/index/inplace_btree/index_buffer.cpp
@@ -0,0 +1,66 @@
+/////////////////////// IndexBuffer methods //////////////////////////
+IndexBuffer::IndexBuffer(BlkId blkid, uint32_t buf_size, uint32_t align_size) :
+        m_blkid{blkid}, m_bytes{hs_utils::iobuf_alloc(buf_size, sisl::buftag::btree_node, align_size)} {}
+
+IndexBuffer::IndexBuffer(uint8_t* raw_bytes, BlkId blkid) : m_blkid(blkid), m_bytes{raw_bytes} {}
+
+IndexBuffer::~IndexBuffer() {
+    if (m_bytes) { hs_utils::iobuf_free(m_bytes, sisl::buftag::btree_node); }
+}
+
+std::string IndexBuffer::to_string() const {
+    if (m_is_meta_buf) {
+        return fmt::format("Buf={} [Meta] index={} state={} create/dirty_cp={}/{} down_wait#={} freed={}",
+                           voidptr_cast(const_cast< IndexBuffer* >(this)), m_index_ordinal, int_cast(state()),
+                           m_created_cp_id, m_dirtied_cp_id, m_wait_for_down_buffers.get(), m_node_freed);
+    } else {
+        // store m_down_buffers in a string
+        std::string down_bufs = "";
+#ifndef NDEBUG
+        for (auto const& down_buf : m_down_buffers) {
+            if (auto ptr = down_buf.lock()) {
+                fmt::format_to(std::back_inserter(down_bufs), "[{}]", voidptr_cast(ptr.get()));
+            }
+        }
+#endif
+
+        return fmt::format("Buf={} index={} state={} create/dirty_cp={}/{} down_wait#={}{} up={} node=[{}] down=[{}]",
+                           voidptr_cast(const_cast< IndexBuffer* >(this)), m_index_ordinal, int_cast(state()),
+                           m_created_cp_id, m_dirtied_cp_id, m_wait_for_down_buffers.get(),
+                           m_node_freed ?
" Freed" : "", voidptr_cast(const_cast< IndexBuffer* >(m_up_buffer.get())), + (m_bytes == nullptr) ? "not attached yet" + : r_cast< PersistentHeader const* >(m_bytes)->to_compact_string(), + down_bufs); + } +} +std::string IndexBuffer::to_string_dot() const { + auto str = fmt::format("IndexBuffer {} ", reinterpret_cast< void* >(const_cast< IndexBuffer* >(this))); + if (m_bytes == nullptr) { + fmt::format_to(std::back_inserter(str), " node_buf=nullptr "); + } else { + fmt::format_to(std::back_inserter(str), " node_buf={} {} created/dirtied={}/{} {} down_wait#={}", + static_cast< void* >(m_bytes), m_is_meta_buf ? "[META]" : "", m_created_cp_id, m_dirtied_cp_id, + m_node_freed ? "FREED" : "", m_wait_for_down_buffers.get()); + } + return str; +} + +MetaIndexBuffer::MetaIndexBuffer(superblk< index_table_sb >& sb) : IndexBuffer{nullptr, BlkId{}}, m_sb{sb} { + m_is_meta_buf = true; +} + +MetaIndexBuffer::MetaIndexBuffer(shared< MetaIndexBuffer > const& other) : + IndexBuffer{nullptr, BlkId{}}, m_sb{other->m_sb} { + m_is_meta_buf = true; + m_bytes = hs_utils::iobuf_alloc(m_sb.size(), sisl::buftag::metablk, meta_service().align_size()); + copy_sb_to_buf(); +} + +MetaIndexBuffer::~MetaIndexBuffer() { + if (m_bytes) { + hs_utils::iobuf_free(m_bytes, sisl::buftag::metablk); + m_bytes = nullptr; + } +} + +void MetaIndexBuffer::copy_sb_to_buf() { std::memcpy(m_bytes, m_sb.raw_buf()->cbytes(), m_sb.size()); } \ No newline at end of file diff --git a/src/lib/index/inplace_btree/index_cp.cpp b/src/lib/index/inplace_btree/index_cp.cpp new file mode 100644 index 000000000..955bd523f --- /dev/null +++ b/src/lib/index/inplace_btree/index_cp.cpp @@ -0,0 +1,335 @@ +#include +#include + +#include +#include "index/index_cp.hpp" +#include "index/wb_cache.hpp" +#include "common/homestore_assert.hpp" + +namespace homestore { +IndexCPCallbacks::IndexCPCallbacks(IndexWBCache* wb_cache) : m_wb_cache{wb_cache} {} + +std::unique_ptr< CPContext > IndexCPCallbacks::on_switchover_cp(CP* cur_cp, CP* new_cp) { + return std::make_unique< IndexCPContext >(new_cp); +} + +folly::Future< bool > IndexCPCallbacks::cp_flush(CP* cp) { + auto ctx = s_cast< IndexCPContext* >(cp->context(cp_consumer_t::INDEX_SVC)); + return m_wb_cache->async_cp_flush(ctx); +} + +void IndexCPCallbacks::cp_cleanup(CP* cp) {} + +int IndexCPCallbacks::cp_progress_percent() { return 100; } + +/////////////////////// IndexCPContext section /////////////////////////// +IndexCPContext::IndexCPContext(CP* cp) : VDevCPContext(cp) {} + +void IndexCPContext::add_to_txn_journal(uint32_t index_ordinal, const IndexBufferPtr& parent_buf, + const IndexBufferPtr& left_child_buf, const IndexBufferPtrList& created_bufs, + const IndexBufferPtrList& freed_bufs) { + auto record_size = txn_record::size_for_num_ids(created_bufs.size() + freed_bufs.size() + (left_child_buf ? 1 : 0) + + (parent_buf ? 
1 : 0)); + std::unique_lock< iomgr::FiberManagerLib::mutex > lg{m_txn_journal_mtx}; + if (m_txn_journal_buf.bytes() == nullptr) { + m_txn_journal_buf = + std::move(sisl::io_blob_safe{std::max(sizeof(txn_journal), 512ul), 512, sisl::buftag::metablk}); + txn_journal* tj = new (m_txn_journal_buf.bytes()) txn_journal(); + tj->cp_id = id(); + } + + txn_journal* tj = r_cast< txn_journal* >(m_txn_journal_buf.bytes()); + if (m_txn_journal_buf.size() < tj->size + record_size) { + m_txn_journal_buf.buf_realloc(m_txn_journal_buf.size() + std::max(tj->size + record_size, 512u), 512, + sisl::buftag::metablk); + tj = r_cast< txn_journal* >(m_txn_journal_buf.bytes()); + } + + { + auto rec = tj->append_record(index_ordinal); + if (parent_buf) { + rec->append(op_t::parent_inplace, parent_buf->blkid()); + if (parent_buf->is_meta_buf()) { rec->is_parent_meta = 0x1; } + } + if (left_child_buf && (left_child_buf != parent_buf)) { + rec->append(op_t::child_inplace, left_child_buf->blkid()); + } + for (auto const& buf : created_bufs) { + rec->append(op_t::child_new, buf->blkid()); + } + for (auto const& buf : freed_bufs) { + rec->append(op_t::child_freed, buf->blkid()); + } + } +} + +void IndexCPContext::add_to_dirty_list(const IndexBufferPtr& buf) { + m_dirty_buf_list.push_back(buf); + buf->set_state(index_buf_state_t::DIRTY); + m_dirty_buf_count.increment(1); +} + +bool IndexCPContext::any_dirty_buffers() const { return !m_dirty_buf_count.testz(); } + +void IndexCPContext::prepare_flush_iteration() { m_dirty_buf_it = m_dirty_buf_list.begin(); } + +std::optional< IndexBufferPtr > IndexCPContext::next_dirty() { + if (m_dirty_buf_it == m_dirty_buf_list.end()) { return std::nullopt; } + IndexBufferPtr ret = *m_dirty_buf_it; + ++m_dirty_buf_it; + return ret; +} + +std::string IndexCPContext::to_string() { + std::string str{fmt::format("IndexCPContext cpid={} dirty_buf_count={} dirty_buf_list_size={}\n", m_cp->id(), + m_dirty_buf_count.get(), m_dirty_buf_list.size())}; + + // Mapping from a node to all its parents in the graph. + // Display all buffers and its dependencies and state. + std::unordered_map< IndexBuffer*, std::vector< IndexBuffer* > > parents; + + m_dirty_buf_list.foreach_entry([&parents](IndexBufferPtr buf) { + // Add this buf to his children. + parents[buf->m_up_buffer.get()].emplace_back(buf.get()); + }); + + m_dirty_buf_list.foreach_entry([&str, &parents](IndexBufferPtr buf) { + fmt::format_to(std::back_inserter(str), "{}", buf->to_string()); + auto first = true; + for (const auto& p : parents[buf.get()]) { + if (first) { + fmt::format_to(std::back_inserter(str), "\nDepends:"); + first = false; + } + fmt::format_to(std::back_inserter(str), " {}({})", r_cast< void* >(p), s_cast< int >(p->state())); + } + fmt::format_to(std::back_inserter(str), "\n"); + }); + return str; +} + +void IndexCPContext::to_string_dot(const std::string& filename) { + std::ofstream file(filename); + if (!file.is_open()) { throw std::runtime_error("Failed to open file: " + filename); } + + file << "digraph G {\n"; + + // Mapping from a node to all its parents in the graph. + std::unordered_map< IndexBuffer*, std::vector< IndexBuffer* > > parents; + + m_dirty_buf_list.foreach_entry([&parents](IndexBufferPtr buf) { + // Add this buf to his children. 
+        parents[buf->m_up_buffer.get()].emplace_back(buf.get());
+    });
+    m_dirty_buf_list.foreach_entry([&file, &parents, this](IndexBufferPtr buf) {
+        std::vector< std::string > colors = {"lightgreen", "lightcoral", "lightyellow"};
+        auto sbuf = BtreeNode::to_string_buf(buf->raw_buffer());
+        auto pos = sbuf.find("LEAF");
+        if (pos != std::string::npos) {
+            sbuf.insert(pos + 4, "<br/>");
+        } else {
+            pos = sbuf.find("INTERIOR");
+            if (pos != std::string::npos) { sbuf.insert(pos + 8, "<br/>"); }
+        }
+        file << fmt::format(
+            "\"{}\" [shape={}, label=< {}<br/>
{} >, fillcolor=\"{}\", style=\"filled\", fontname=\"bold\"];\n", + r_cast< void* >(buf.get()), m_cp->id() == buf->m_created_cp_id ? "ellipse" : "box", buf->to_string_dot(), + sbuf, colors[s_cast< int >(buf->state())]); + for (const auto& p : parents[buf.get()]) { + file << fmt::format("\"{}\" -> \"{}\";\n", r_cast< void* >(p), r_cast< void* >(buf.get())); + } + }); + file << "}\n"; + + file.close(); +} + +std::string IndexCPContext::to_string_with_dags() { + struct DagNode { + IndexBufferPtr buf; + std::vector< shared< DagNode > > down_nodes; + }; + std::vector< shared< DagNode > > group_roots; + std::unordered_map< IndexBufferPtr, shared< DagNode > > buf_to_dag_node; + + auto get_insert_buf = [&buf_to_dag_node](IndexBufferPtr buf) { + auto it = buf_to_dag_node.find(buf); + if (it == buf_to_dag_node.end()) { + auto dgn = std::make_shared< DagNode >(); + dgn->buf = buf; + buf_to_dag_node[buf] = dgn; + return dgn; + } + return it->second; + }; + + std::unique_lock lg{m_flush_buffer_mtx}; + // Create the graph + m_dirty_buf_list.foreach_entry([&get_insert_buf, &group_roots](IndexBufferPtr buf) { + if (buf->m_up_buffer == nullptr) { + auto dgn = get_insert_buf(buf); + group_roots.emplace_back(dgn); + } else { + auto dgn = get_insert_buf(buf); + auto up_dgn = get_insert_buf(buf->m_up_buffer); + up_dgn->down_nodes.emplace_back(dgn); + } + }); + + // Now walk through the list of graphs and prepare formatted string + std::string str{fmt::format("IndexCPContext cpid={} dirty_buf_count={} dirty_buf_list_size={} #_of_dags={}\n", + m_cp->id(), m_dirty_buf_count.get(), m_dirty_buf_list.size(), group_roots.size())}; + for (const auto& root : group_roots) { + std::vector< std::pair< std::shared_ptr< DagNode >, int > > stack; + stack.emplace_back(root, 0); + while (!stack.empty()) { + auto [node, level] = stack.back(); + stack.pop_back(); + fmt::format_to(std::back_inserter(str), "{}{} \n", std::string(level * 4, ' '), node->buf->to_string()); + for (const auto& d : node->down_nodes) { + stack.emplace_back(d, level + 1); + } + } + } + + return str; +} + +void IndexCPContext::log_dags() { + LOGINFO("{}", to_string_with_dags()); + sisl::logging::GetLogger()->flush(); +} + +std::map< BlkId, IndexBufferPtr > IndexCPContext::recover(sisl::byte_view sb) { + txn_journal const* tj = r_cast< txn_journal const* >(sb.bytes()); + if (tj->cp_id != id()) { + // On clean shutdown, cp_id would be lesser than the current cp_id, in that case ignore this sb + HS_DBG_ASSERT_LT(tj->cp_id, id(), "Persisted cp in wb txn journal is more than current cp"); + return {}; + } + HS_DBG_ASSERT_GT(tj->num_txns, 0, "Invalid txn_journal, num_txns is zero"); + HS_DBG_ASSERT_GT(tj->size, 0, "Invalid txn_journal, size of records is zero"); + + std::map< BlkId, IndexBufferPtr > buf_map; + uint8_t const* cur_ptr = r_cast< uint8_t const* >(tj) + sizeof(txn_journal); + + for (uint32_t t{0}; t < tj->num_txns; ++t) { + txn_record const* rec = r_cast< txn_record const* >(cur_ptr); + HS_DBG_ASSERT_GT(rec->total_ids(), 0, "Invalid txn_record, has no ids in it"); + + process_txn_record(rec, buf_map); + cur_ptr += rec->size(); + } + + return buf_map; +} + +void IndexCPContext::process_txn_record(txn_record const* rec, std::map< BlkId, IndexBufferPtr >& buf_map) { + auto cpg = cp_mgr().cp_guard(); + + auto const rec_to_buf = [&buf_map, &cpg](txn_record const* rec, bool is_meta, BlkId const& bid, + IndexBufferPtr const& up_buf) -> IndexBufferPtr { + IndexBufferPtr buf; + auto it = buf_map.find(bid); + if (it == buf_map.end()) { + if (is_meta) { + 
superblk< index_table_sb > tmp_sb; + buf = std::make_shared< MetaIndexBuffer >(tmp_sb); + } else { + buf = std::make_shared< IndexBuffer >(nullptr, bid); + } + + [[maybe_unused]] auto [it2, happened] = buf_map.insert(std::make_pair(bid, buf)); + DEBUG_ASSERT(happened, "buf_map insert failed"); + + buf->m_dirtied_cp_id = cpg->id(); + buf->m_index_ordinal = rec->index_ordinal; + } else { + buf = it->second; + } + + if (up_buf) { + DEBUG_ASSERT(((buf->m_up_buffer == nullptr) || (buf->m_up_buffer == up_buf)), "Inconsistent up buffer"); + auto real_up_buf = (up_buf->m_created_cp_id == cpg->id()) ? up_buf->m_up_buffer : up_buf; + +#ifndef NDEBUG + // if (!is_sibling_link || (buf->m_up_buffer == real_up_buf)) { return buf;} + // Already linked with same buf or its not a sibling link to override + bool found{false}; + for (auto const& dbuf : real_up_buf->m_down_buffers) { + if (dbuf.lock() == buf) { + found = true; + break; + } + } + if (found) { return buf; } + real_up_buf->m_down_buffers.emplace_back(buf); +#endif + + if (buf->m_up_buffer != real_up_buf) { + real_up_buf->m_wait_for_down_buffers.increment(1); + buf->m_up_buffer = real_up_buf; + } + } + return buf; + }; + + uint32_t cur_idx = 0; + IndexBufferPtr parent_buf{nullptr}; + if (rec->has_inplace_parent) { parent_buf = rec_to_buf(rec, rec->is_parent_meta, rec->blk_id(cur_idx++), nullptr); } + + IndexBufferPtr inplace_child_buf{nullptr}; + if (rec->has_inplace_child) { + inplace_child_buf = rec_to_buf(rec, false /* is_meta */, rec->blk_id(cur_idx++), parent_buf); + } + + for (uint8_t idx{0}; idx < rec->num_new_ids; ++idx) { + auto new_buf = rec_to_buf(rec, false /* is_meta */, rec->blk_id(cur_idx++), + inplace_child_buf ? inplace_child_buf : parent_buf); + new_buf->m_created_cp_id = cpg->id(); + } + + for (uint8_t idx{0}; idx < rec->num_freed_ids; ++idx) { + auto freed_buf = rec_to_buf(rec, false /* is_meta */, rec->blk_id(cur_idx++), + inplace_child_buf ? inplace_child_buf : parent_buf); + freed_buf->m_node_freed = true; + } +} + +void IndexCPContext::txn_journal::log_records() const { LOGINFO("{}", to_string()); } + +std::string IndexCPContext::txn_journal::to_string() const { + std::string str = fmt::format("cp_id={}, num_txns={}, size={}", cp_id, num_txns, size); + uint8_t const* cur_ptr = r_cast< uint8_t const* >(this) + sizeof(txn_journal); + for (uint32_t t{0}; t < num_txns; ++t) { + txn_record const* rec = r_cast< txn_record const* >(cur_ptr); + fmt::format_to(std::back_inserter(str), "\n {}: {}", t, rec->to_string()); + cur_ptr += rec->size(); + } + return str; +} + +std::string IndexCPContext::txn_record::to_string() const { + auto add_to_string = [this](std::string& str, uint8_t& idx, uint8_t id_count) { + if (id_count == 0) { + fmt::format_to(std::back_inserter(str), "empty]"); + } else { + for (uint8_t i{0}; i < id_count; ++i, ++idx) { + fmt::format_to(std::back_inserter(str), "[chunk={}, blk={}],", ids[idx].second, ids[idx].first); + } + fmt::format_to(std::back_inserter(str), "]"); + } + }; + + std::string str = fmt::format("ordinal={}, parent=[{}], in_place_child=[{}]", index_ordinal, parent_id_string(), + child_id_string(), num_new_ids, num_freed_ids); + + uint8_t idx = (has_inplace_parent == 0x1) ? 1 : 0 + (has_inplace_child == 0x1) ? 
1 : 0; + fmt::format_to(std::back_inserter(str), ", new_ids=["); + add_to_string(str, idx, num_new_ids); + + fmt::format_to(std::back_inserter(str), ", freed_ids=["); + add_to_string(str, idx, num_freed_ids); + return str; +} +} // namespace homestore diff --git a/src/lib/index/index_cp.hpp b/src/lib/index/inplace_btree/index_cp.hpp similarity index 96% rename from src/lib/index/index_cp.hpp rename to src/lib/index/inplace_btree/index_cp.hpp index b15cba892..b04b8f052 100644 --- a/src/lib/index/index_cp.hpp +++ b/src/lib/index/inplace_btree/index_cp.hpp @@ -24,8 +24,6 @@ #include #include "device/virtual_dev.hpp" -SISL_LOGGING_DECL(wbcache) - namespace homestore { class BtreeNode; struct IndexCPContext : public VDevCPContext { @@ -37,8 +35,7 @@ struct IndexCPContext : public VDevCPContext { uint8_t has_inplace_parent : 1; // Do we have parent_id in the list of ids. It will be first uint8_t has_inplace_child : 1; // Do we have child_id in the list of ids. It will be second uint8_t is_parent_meta : 1; // Is the parent buffer a meta buffer - uint8_t free_node_level : 4; // Free/created node level - uint8_t reserved1 : 1; + uint8_t reserved1 : 5; uint8_t num_new_ids; uint8_t num_freed_ids; uint8_t reserved{0}; @@ -49,7 +46,6 @@ struct IndexCPContext : public VDevCPContext { has_inplace_parent{0x0}, has_inplace_child{0x0}, is_parent_meta{0x0}, - free_node_level{0x0}, num_new_ids{0}, num_freed_ids{0}, index_ordinal{ordinal} {} @@ -101,7 +97,8 @@ struct IndexCPContext : public VDevCPContext { std::string child_id_string() const { auto const idx = (has_inplace_parent == 0x1) ? 1 : 0; - return (has_inplace_child == 0x1) ? fmt::format("{}", blk_id(idx).to_integer()) : "empty"; + return (has_inplace_child == 0x1) ? fmt::format("{}", blk_id(idx).to_integer()) + : "empty"; } std::string to_string() const; @@ -162,8 +159,6 @@ struct IndexCPContext : public VDevCPContext { void prepare_flush_iteration(); std::optional< IndexBufferPtr > next_dirty(); std::string to_string(); - std::string to_string_small(); - std::string to_string_free_list(); std::string to_string_with_dags(); uint16_t num_dags(); void to_string_dot(const std::string& filename); diff --git a/src/include/homestore/index/index_table.hpp b/src/lib/index/inplace_btree/inplace_btree_store.h similarity index 77% rename from src/include/homestore/index/index_table.hpp rename to src/lib/index/inplace_btree/inplace_btree_store.h index 3997a3ca3..63e141bda 100644 --- a/src/include/homestore/index/index_table.hpp +++ b/src/lib/index/inplace_btree/inplace_btree_store.h @@ -21,22 +21,43 @@ #include #include #include -#include #include #include #include #include -SISL_LOGGING_DECL(wbcache) - namespace homestore { +class BtreeStoreBase; + +#pragma pack(1) +struct index_table_sb { + uint64_t magic{indx_sb_magic}; + uint32_t version{indx_sb_version}; + uuid_t uuid; // UUID of the index + uuid_t parent_uuid; // UUID of the parent container of index (controlled by user) + + struct btree_sb_t { + bnodeid_t root_node{empty_bnodeid}; // Btree Root Node ID + uint64_t root_link_version{0}; // Link version to btree root node + int64_t index_size{0}; // Size of the Index + // seq_id_t last_seq_id{-1}; // TODO: See if this is needed + + uint32_t ordinal{0}; // Ordinal of the Index + BlkId full_map_location; // Location of any btree map (applicable for COWBtree only so far) + }; + + btree_sb_t btree_sb; + uint32_t user_sb_size; // Size of the user superblk + uint8_t user_sb_bytes[0]; +}; +#pragma pack() + template < typename K, typename V > class IndexTable : 
public IndexTableBase, public Btree< K, V > { private: superblk< index_table_sb > m_sb; shared< MetaIndexBuffer > m_sb_buffer; - static constexpr uint32_t INVALID_ORDINAL = std::numeric_limits< uint32_t >::max(); // graceful shutdown private: @@ -60,25 +81,12 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } } - IndexTable(uuid_t uuid, uuid_t parent_uuid, uint32_t user_sb_size, const BtreeConfig& cfg, - uint32_t ordinal = INVALID_ORDINAL, const std::vector< chunk_num_t >& chunk_ids = {}, - uint32_t pdev_id = 0) : + IndexTable(uuid_t uuid, uuid_t parent_uuid, uint32_t user_sb_size, const BtreeConfig& cfg) : Btree< K, V >{cfg}, m_sb{"index"} { - uint32_t ord_num = INVALID_ORDINAL; - if (ordinal != INVALID_ORDINAL) { - BT_LOG_ASSERT(!hs()->index_service().get_index_table(ordinal), "table with ordinal {} already exists", - ordinal); - hs()->index_service().reserve_ordinal(ordinal); - ord_num = ordinal; - } else { - ord_num = hs()->index_service().reserve_ordinal(); - } // Create a superblk for the index table and create MetaIndexBuffer corresponding to that - m_sb.create(sizeof(index_table_sb) + (chunk_ids.size() * sizeof(chunk_num_t))); - m_sb->init_chunks(chunk_ids); - m_sb->pdev_id = pdev_id; - m_sb->ordinal = ord_num; + m_sb.create(sizeof(index_table_sb)); m_sb->uuid = uuid; + m_sb->ordinal = hs()->index_service().reserve_ordinal(); m_sb->parent_uuid = parent_uuid; m_sb->user_sb_size = user_sb_size; m_sb.write(); @@ -113,32 +121,16 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } } - void audit_tree() const override { + void audit_tree() override { cp_mgr().cp_guard(); Btree< K, V >::sanity_sub_tree(); } - bool sanity_check(const IndexBufferPtrList& bufs) const override { - for (auto& buf : bufs) { - if (buf->is_meta_buf()) { continue; } - try { - Btree< K, V >::validate_node(buf->blkid().to_integer()); - } catch (const std::exception& e) { - LOGERROR("Exception during validation of node {}", buf->blkid().to_integer()); - return false; - } - } - return true; - } - btree_status_t destroy() override { if (is_stopping()) return btree_status_t::stopping; incr_pending_request_num(); - auto chunk_selector{hs()->index_service().get_chunk_selector()}; - if (!chunk_selector) { - auto cpg = cp_mgr().cp_guard(); - Btree< K, V >::destroy_btree(cpg.context(cp_consumer_t::INDEX_SVC)); - } + auto cpg = cp_mgr().cp_guard(); + Btree< K, V >::destroy_btree(cpg.context(cp_consumer_t::INDEX_SVC)); m_sb.destroy(); m_sb_buffer->m_valid = false; decr_pending_request_num(); @@ -161,10 +153,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { auto cpg = cp_mgr().cp_guard(); put_req.m_op_context = (void*)cpg.context(cp_consumer_t::INDEX_SVC); ret = Btree< K, V >::put(put_req); - if (ret == btree_status_t::cp_mismatch) { - LOGTRACEMOD(wbcache, "CP Mismatch, retrying put"); - COUNTER_INCREMENT(this->m_metrics, btree_retry_count, 1); - } + if (ret == btree_status_t::cp_mismatch) { LOGTRACEMOD(wbcache, "CP Mismatch, retrying put"); } } while (ret == btree_status_t::cp_mismatch); decr_pending_request_num(); return ret; @@ -179,10 +168,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { auto cpg = cp_mgr().cp_guard(); remove_req.m_op_context = (void*)cpg.context(cp_consumer_t::INDEX_SVC); ret = Btree< K, V >::remove(remove_req); - if (ret == btree_status_t::cp_mismatch) { - LOGTRACEMOD(wbcache, "CP Mismatch, retrying remove"); - COUNTER_INCREMENT(this->m_metrics, btree_retry_count, 1); - } + if (ret == btree_status_t::cp_mismatch) { 
LOGTRACEMOD(wbcache, "CP Mismatch, retrying remove"); } } while (ret == btree_status_t::cp_mismatch); decr_pending_request_num(); return ret; @@ -208,18 +194,9 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { static_cast< IndexBtreeNode* >(n)->attach_buf(idx_buf); auto edge_id = n->next_bnode(); - if (n->has_valid_edge() && hs()->has_fc_service()) { - auto const reason = - fmt::format("root {} already has a valid edge {}, so we should have found the new root node", - n->to_string(), n->get_edge_value().bnode_id()); - hs()->fc_service().trigger_fc(FaultContainmentEvent::ENTER, static_cast< void* >(&(m_sb->parent_uuid)), - reason); - return; - } else { - BT_REL_ASSERT(!n->has_valid_edge(), - "root {} already has a valid edge {}, so we should have found the new root node", - n->to_string(), n->get_edge_value().bnode_id()); - } + BT_DBG_ASSERT(!n->has_valid_edge(), + "root {} already has a valid edge {}, so we should have found the new root node", + n->to_string(), n->get_edge_value().bnode_id()); n->set_next_bnode(empty_bnodeid); n->set_edge_value(BtreeLinkInfo{edge_id, 0}); LOGTRACEMOD(wbcache, "change root node {}: edge updated to {} and invalidate the next node! ", n->node_id(), @@ -234,8 +211,6 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } void delete_stale_children(IndexBufferPtr const& idx_buf) override { - if (!idx_buf->is_meta_buf() && idx_buf->m_created_cp_id == -1) { - BtreeNode* n = this->init_node(idx_buf->raw_buffer(), idx_buf->blkid().to_integer(), false /* init_buf */, BtreeNode::identify_leaf_node(idx_buf->raw_buffer())); static_cast< IndexBtreeNode* >(n)->attach_buf(idx_buf); auto cpg = cp_mgr().cp_guard(); @@ -251,7 +226,6 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { void repair_node(IndexBufferPtr const& idx_buf) override { if (idx_buf->is_meta_buf()) { - // We cannot repair the meta buf on its own, we need to repair the root node which modifies the // meta_buf. It is ok to ignore this call, because repair will be done from root before meta_buf is // attempted to repair, which would have updated the meta_buf already. 
LOGTRACEMOD(wbcache, "Ignoring repair on meta buf {} root id {} ", idx_buf->to_string(), @@ -284,8 +258,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { protected: ////////////////// Override Implementation of underlying store requirements ////////////////// - BtreeNodePtr alloc_node(bool is_leaf) override { - return wb_cache().alloc_buf(ordinal(), [this, is_leaf](const IndexBufferPtr& idx_buf) -> BtreeNodePtr { + BtreeNodePtr create_node(bool is_leaf) override { + return wb_cache().alloc_buf([this, is_leaf](const IndexBufferPtr& idx_buf) -> BtreeNodePtr { BtreeNode* n = this->init_node(idx_buf->raw_buffer(), idx_buf->blkid().to_integer(), true, is_leaf); static_cast< IndexBtreeNode* >(n)->attach_buf(idx_buf); return BtreeNodePtr{n}; @@ -313,10 +287,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { "Writing on a node buffer which was currently in flushing state on cur_cp={} buffer_cp_id={}", cp_ctx->id(), idx_node->m_idx_buf->m_dirtied_cp_id); BT_DBG_ASSERT_EQ(idx_node->m_idx_buf->m_dirtied_cp_id, cp_ctx->id(), - "Writing a node which was not acquired by this cp"); } return btree_status_t::success; - } btree_status_t transact_nodes(const BtreeNodeList& new_nodes, const BtreeNodeList& freed_nodes, const BtreeNodePtr& left_child_node, const BtreeNodePtr& parent_node, @@ -335,7 +307,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { IndexBufferPtrList freed_node_bufs; for (const auto& freed_node : freed_nodes) { freed_node_bufs.push_back(s_cast< IndexBtreeNode* >(freed_node.get())->m_idx_buf); - this->free_node(freed_node, locktype_t::WRITE, context); + this->remove_node(freed_node, locktype_t::WRITE, context); } wb_cache().transact_bufs( @@ -375,10 +347,6 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { LOGTRACEMOD(wbcache, "root changed for index old_root={} new_root={}", m_sb->root_node, new_root->node_id()); m_sb->root_node = new_root->node_id(); m_sb->root_link_version = new_root->link_version(); - m_sb->btree_depth = new_root->level(); - m_sb->total_interior_nodes = this->m_total_interior_nodes; - m_sb->total_leaf_nodes = this->m_total_leaf_nodes; - std::tie(m_sb->total_interior_nodes, m_sb->total_leaf_nodes) = this->get_num_nodes(); if (!wb_cache().refresh_meta_buf(m_sb_buffer, r_cast< CPContext* >(context))) { LOGTRACEMOD(wbcache, "CP mismatch error - discard transact for meta node"); @@ -390,26 +358,6 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { return btree_status_t::success; } - void update_sb() override { - if (!this->m_sb_buffer || !this->m_sb_buffer->m_valid) { - LOGERROR("Attempting to update superblk when it is already invalid"); - return; - } - m_sb->total_interior_nodes = this->m_total_interior_nodes; - m_sb->total_leaf_nodes = this->m_total_leaf_nodes; - m_sb->btree_depth = this->m_btree_depth; - m_sb.write(); - } - - void load_metrics(uint64_t interior, uint64_t leaf, uint8_t depth) override { - this->m_total_leaf_nodes = leaf; - this->m_total_interior_nodes = interior; - this->m_btree_depth = depth; - COUNTER_INCREMENT(this->m_metrics, btree_int_node_count, interior); - COUNTER_INCREMENT(this->m_metrics, btree_leaf_node_count, leaf); - COUNTER_INCREMENT(this->m_metrics, btree_depth, depth); - } - btree_status_t delete_stale_links(BtreeNodePtr const& parent_node, void* cp_ctx) { LOGTRACEMOD(wbcache, "deleting stale links for parent node [{}]", parent_node->to_string()); BtreeNodeList free_nodes; @@ -508,58 +456,6 @@ class IndexTable : public IndexTableBase, public Btree< 
K, V > { return btree_status_t::success; } - bnodeid_t true_sibling_first_child(BtreeNodePtr const& parent_node) { - bnodeid_t sibling_first_child_id = empty_bnodeid; - if (!parent_node->is_leaf() && !parent_node->has_valid_edge()) { - BtreeNodePtr parent_right_sibling; - if (auto parent_right_sibling_id = find_true_sibling(parent_node); - parent_right_sibling_id != empty_bnodeid) { - if (auto ret = read_node_impl(parent_right_sibling_id, parent_right_sibling); - ret == btree_status_t::success) { - if (parent_right_sibling->total_entries() > 0) { - BtreeLinkInfo sibling_first_child_info; - parent_right_sibling->get_nth_value(0, &sibling_first_child_info, false); - sibling_first_child_id = sibling_first_child_info.bnode_id(); - } else if (parent_right_sibling->has_valid_edge()) { - // If the right sibling has an edge, we can use that as the first child - sibling_first_child_id = parent_right_sibling->get_edge_value().bnode_id(); - } - } - } - } - return sibling_first_child_id; - } - - void update_root(BtreeNodePtr const& left_child, BtreeNodeList& new_nodes, void* cp_ctx) { - auto new_root = this->alloc_interior_node(); - if (new_root == nullptr) { return; } - new_root->set_level(left_child->level() + 1); - auto cur_child = left_child; - uint32_t i = 0; - LOGTRACEMOD(wbcache, "Updating new root node={}", new_root->to_string()); - do { - LOGTRACEMOD(wbcache, "Processiog child {}", cur_child->to_string()); - if (cur_child->has_valid_edge()) { - new_root->set_edge_value(BtreeLinkInfo{cur_child->node_id(), cur_child->link_version()}); - } else { - auto child_last_key = cur_child->get_last_key< K >(); - new_root->insert(new_root->total_entries(), child_last_key, - BtreeLinkInfo{cur_child->node_id(), cur_child->link_version()}); - } - if (i == new_nodes.size()) { break; } - auto next_child_id = cur_child->next_bnode(); - cur_child = new_nodes[i++]; - DEBUG_ASSERT_EQ(next_child_id, cur_child->node_id(), - "Next node id {} does not match current child node id {}", next_child_id, - cur_child->node_id()); - } while (true); - - new_nodes.push_back(new_root); - LOGTRACEMOD(wbcache, "New root node created {}", new_root->to_string()); - on_root_changed(new_root, cp_ctx); - this->set_root_node_info(BtreeLinkInfo{new_root->node_id(), new_root->link_version()}); - } - // btree_status_t repair_links(BtreeNodePtr const& parent_node, void* cp_ctx) { LOGTRACEMOD(wbcache, "Repairing links for parent node [{}]", parent_node->to_string()); @@ -567,11 +463,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { // needs to be handled. Get the last key in the node auto last_parent_key = parent_node->get_last_key< K >(); - auto sibling_node_id = find_true_sibling(parent_node); - // during delete stale links, the current edge node can be deleted and its left sibling will become edge node. - // While repairing the left sibling, has_valid_edge() is false but we need to make it an edge node. - // So we check if the true_sibling is empty to determine if we need to make it an edge node. 
- auto const is_parent_edge_node = (sibling_node_id == empty_bnodeid); + auto const is_parent_edge_node = parent_node->has_valid_edge(); if ((parent_node->total_entries() == 0) && !is_parent_edge_node) { BT_LOG_ASSERT(false, "Parent node={} is empty and not an edge node but was asked to repair", parent_node->node_id()); @@ -584,8 +476,6 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { BtreeLinkInfo link_info; parent_node->get_nth_value(i, &link_info, true); orig_child_infos[link_info.bnode_id()] = parent_node->get_nth_key< K >(i, false /* copy */); - LOGTRACEMOD(wbcache, "Child node [{}] with key [{}] at index [{}]", link_info.bnode_id(), - orig_child_infos[link_info.bnode_id()].to_string(), i); } LOGTRACEMOD(wbcache, "Repairing node=[{}] with last_parent_key={}", parent_node->to_string(), last_parent_key.to_string()); @@ -603,51 +493,48 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } // update the last key of parent for issue - // 1- last key is X for parent (P) - // 2- check the non deleted last child (A) last key (here is Y) // start from first child and store the last key of the child node, then traverse to next sibling // 2-1- if this is greater than parent last key, traverse for sibling of parent until reaches to // siblings which has keys more than Y or end of list (name this parent sibling node F), // 2-2- Put last key of F to last key of P // 2-3 - set F as Next of A + BtreeNodeList siblings; BtreeNodePtr next_cur_child; BT_DBG_ASSERT(parent_node->has_valid_edge() || parent_node->total_entries(), "parent node {} doesn't have valid edge and no entries ", parent_node->to_string()); if (parent_node->total_entries() > 0) { + auto updated_last_key = last_parent_key; K last_child_last_key; K last_child_neighbor_key; - BtreeNodePtr cur_child = child_node; + BtreeNodePtr cur_child; + BtreeLinkInfo cur_child_info; - // We find the last child node by starting from the leftmost child and traversing through the - // next_bnode links until we reach the end or find a sibling first child. bool found_child = false; - auto sibling_first_child = true_sibling_first_child(parent_node); - LOGTRACEMOD(wbcache, "Sibling first child id is {}", sibling_first_child); - while (cur_child != nullptr) { - LOGTRACEMOD(wbcache, "Processing child node [{}]", cur_child->to_string()); - if (!cur_child->is_node_deleted() && cur_child->total_entries() > 0) { - last_child_last_key = cur_child->get_last_key< K >(); - found_child = true; - } - - next_cur_child = nullptr; - if (cur_child->next_bnode() == empty_bnodeid || - read_node_impl(cur_child->next_bnode(), next_cur_child) != btree_status_t::success) { - break; // No next child, so we can stop here - } - - if (sibling_first_child != empty_bnodeid && sibling_first_child == cur_child->next_bnode()) { - LOGTRACEMOD(wbcache, + uint32_t nentries = parent_node->total_entries() + parent_node->has_valid_edge() ? 
1 : 0; + + for (uint32_t i = nentries; i-- > 0;) { + parent_node->get_nth_value(i, &cur_child_info, false /* copy */); + if (auto ret = read_node_impl(cur_child_info.bnode_id(), cur_child); ret == btree_status_t::success) { + if (!cur_child->is_node_deleted() && cur_child->total_entries()) { + last_child_last_key = cur_child->get_last_key< K >(); + if (cur_child->next_bnode() != empty_bnodeid && + read_node_impl(cur_child->next_bnode(), next_cur_child) == btree_status_t::success) { + LOGTRACEMOD( + wbcache, "Last child last key {} for child_node [{}] parent node [{}], next neigbor is [{}]", last_child_last_key.to_string(), cur_child->to_string(), parent_node->to_string(), next_cur_child->to_string()); - break; + found_child = true; + break; + } + found_child = true; + break; + } + LOGTRACEMOD(wbcache, "PASSING child node {} so we need to check next child node", + cur_child->to_string()); } - cur_child = next_cur_child; } - // If we found a valid last child node, we adjust the parent_last_key by comparing it with the last - // child last key. if (found_child) { LOGTRACEMOD(wbcache, "Last child last key {} for parent node {}, child_node {}", last_child_last_key.to_string(), parent_node->to_string(), cur_child->to_string()); @@ -669,17 +556,47 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { // 2-1 traverse for sibling of parent until reaches to siblings which has keys more than 7563 // or end // of list (put all siblings in a list, here is F) , + BtreeNodePtr sibling; BtreeNodePtr true_sibling; - if (sibling_node_id != empty_bnodeid && - read_node_impl(sibling_node_id, true_sibling) == btree_status_t::success) { - last_parent_key = last_child_last_key; - parent_node->set_next_bnode(true_sibling->node_id()); + BtreeLinkInfo sibling_info; + + auto sibling_node_id = parent_node->next_bnode(); + while (sibling_node_id != empty_bnodeid) { + if (auto ret = read_node_impl(sibling_node_id, sibling); ret == btree_status_t::success) { + if (sibling->is_node_deleted()) { + // Do we need to free the sibling node here? 
+ siblings.push_back(sibling); + sibling_node_id = sibling->next_bnode(); + LOGTRACEMOD(wbcache, "Sibling node [{}] is deleted, continue to next sibling", + sibling->to_string()); + continue; + } + auto sibling_last_key = sibling->get_last_key< K >(); + if (next_cur_child && sibling_last_key.compare(last_child_neighbor_key) < 0) { + siblings.push_back(sibling); + sibling_node_id = sibling->next_bnode(); + } else { + true_sibling = sibling; + break; + } + } + } + if (true_sibling) { LOGTRACEMOD(wbcache, "True sibling [{}] for parent_node {}", true_sibling->to_string(), parent_node->to_string()); - } - if (!true_sibling) { + } else { LOGTRACEMOD(wbcache, "No true sibling found for parent_node [{}]", parent_node->to_string()); } + if (sibling_node_id != empty_bnodeid) { + last_parent_key = last_child_last_key; + parent_node->set_next_bnode(true_sibling->node_id()); + for (auto sibling : siblings) { + LOGTRACEMOD(wbcache, "Sibling list [{}]", sibling->to_string()); + } + LOGTRACEMOD(wbcache, "True sibling [{}]", true_sibling->to_string()); + BtreeLinkInfo first_child_info; + parent_node->get_nth_value(0, &first_child_info, false); + } } else { LOGTRACEMOD(wbcache, "No undeleted child found for parent_node [{}], keep normal repair (regular recovery)", @@ -691,17 +608,15 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { // Keep a copy of the node buffer, in case we need to revert back uint8_t* tmp_buffer = new uint8_t[this->m_node_size]; - std::memcpy(tmp_buffer, parent_node->m_phys_node_buf, this->m_node_size); // Remove all the entries in parent_node and let walk across child_nodes rebuild this node - parent_node->remove_all(this->m_bt_cfg); + parent_node->remove_all(); // Walk across all child nodes until it gets the last_parent_key and keep fixing them. 
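
(For illustration only: a bare-bones model of the rebuild walk that follows. Child, read and rebuild_parent are hypothetical stand-ins for the btree types: the parent is emptied and then re-populated with one entry per live child while following the child chain, stopping once the walk crosses the last_parent_key boundary.)

    #include <cstdint>
    #include <functional>
    #include <vector>

    struct Child {
        int last_key;
        uint64_t next; // 0 plays the role of empty_bnodeid
        bool deleted;
    };

    void rebuild_parent(std::vector< int >& parent_keys, uint64_t first_child, int last_parent_key,
                        std::function< const Child*(uint64_t) > const& read) {
        parent_keys.clear(); // analogous to parent_node->remove_all()
        for (uint64_t id = first_child; id != 0;) {
            const Child* c = read(id);
            if (c == nullptr) { break; }
            if (!c->deleted) {
                if (c->last_key > last_parent_key) { break; } // walked past this parent's range
                parent_keys.push_back(c->last_key);           // re-link the child under the parent
            }
            id = c->next;
        }
    }
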
auto cur_parent = parent_node; BtreeNodeList new_parent_nodes; do { if (child_node->has_valid_edge() || (child_node->is_leaf() && child_node->next_bnode() == empty_bnodeid)) { - LOGTRACEMOD(wbcache, "Child node [{}] is an edge node or a leaf with no next", child_node->to_string()); if (child_node->is_node_deleted()) { // Edge node is merged, we need to set the current last entry as edge if (cur_parent->total_entries() > 0) { @@ -719,14 +634,14 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } else { // Update edge and finish if (is_parent_edge_node) { - cur_parent->set_next_bnode(empty_bnodeid); cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); } else { - if (sibling_node_id != empty_bnodeid) { - cur_parent->set_next_bnode(sibling_node_id); + auto tsib_id = find_true_sibling(cur_parent); + if (tsib_id != empty_bnodeid) { + cur_parent->set_next_bnode(tsib_id); LOGTRACEMOD(wbcache, "True sibling [{}] for parent_node [{}], So don't add child [{}] here ", - sibling_node_id, cur_parent->to_string(), child_node->to_string()); + tsib_id, cur_parent->to_string(), child_node->to_string()); } else { cur_parent->set_next_bnode(empty_bnodeid); // if this child node previously belonged to this parent node, we need to add it but as edge @@ -742,14 +657,13 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { LOGTRACEMOD(wbcache, "No true sibling found for parent_node [{}]", cur_parent->to_string()); } - BT_REL_ASSERT(cur_parent->total_entries() != 0 || cur_parent->has_valid_edge(), "Parent node [{}] cannot be empty", cur_parent->to_string()); } } - LOGTRACEMOD(wbcache, "Repairing node=[{}], child_node=[{}] is an edge node, end loop", - cur_parent->to_string(), child_node->to_string()); + // + // } break; } break; @@ -766,11 +680,6 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { // last_parent_key. That's why here we have to check if the child node is one of the original child // nodes first. if (!is_parent_edge_node && !orig_child_infos.contains(child_node->node_id())) { - LOGTRACEMOD( - wbcache, - "Child node [{}] is not one of the original child nodes, so we need to check if it is beyond the " - "last parent key {}", - child_node->to_string(), last_parent_key.to_string()); if (child_last_key.compare(last_parent_key) > 0) { // We have reached a child beyond this parent, we can stop now // TODO this case if child last key is less than last parent key to update the parent node. 
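
(For illustration only: repair_links above also needs to re-derive a parent's "true" right sibling by skipping siblings that were merged away. This sketch models that scan with a hypothetical Node type and a read callback; 0 again stands in for empty_bnodeid.)

    #include <cstdint>
    #include <functional>

    struct Node {
        bool deleted;
        uint64_t next; // next sibling id, 0 == none
        int last_key;
    };

    // Returns the first live sibling whose keys reach at least neighbor_key, else 0.
    uint64_t find_live_sibling(uint64_t start, int neighbor_key,
                               std::function< const Node*(uint64_t) > const& read) {
        for (uint64_t id = start; id != 0;) {
            const Node* n = read(id);
            if (n == nullptr) { return 0; }
            if (n->deleted || n->last_key < neighbor_key) {
                id = n->next; // merged-away or stale sibling: keep walking the chain
                continue;
            }
            return id;
        }
        return 0;
    }
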
@@ -801,13 +710,13 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } if (valid_sibling != empty_bnodeid) { cur_parent->set_next_bnode(valid_sibling); - LOGTRACEMOD(wbcache, "Repairing node=[{}], child_node=[{}] is an edge node, end loop", - cur_parent->to_string(), child_node->to_string()); + LOGTRACEMOD(wbcache, "Repairing node={}, child_node=[{}] is an edge node, end loop", + cur_parent->node_id(), child_node->to_string()); } else { cur_parent->set_next_bnode(empty_bnodeid); - LOGTRACEMOD(wbcache, "Repairing node=[{}], child_node=[{}] is an edge node, end loop", - cur_parent->to_string(), child_node->to_string()); + LOGTRACEMOD(wbcache, "Repairing node={}, child_node=[{}] is an edge node, end loop", + cur_parent->node_id(), child_node->to_string()); } break; @@ -817,11 +726,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { if (!cur_parent->has_room_for_put(btree_put_type::INSERT, K::get_max_size(), BtreeLinkInfo::get_fixed_size())) { // No room in the parent_node, let us split the parent_node and continue - LOGTRACEMOD(wbcache, - "Repairing node={}, child_node=[{}] has no room for put, so we need to split the parent " - "node", - cur_parent->node_id(), child_node->to_string()); - auto new_parent = this->alloc_interior_node(); + auto new_parent = this->create_interior_node(); if (new_parent == nullptr) { ret = btree_status_t::space_not_avail; break; @@ -831,6 +736,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { cur_parent->set_next_bnode(new_parent->node_id()); new_parent->set_level(cur_parent->level()); cur_parent->inc_link_version(); + new_parent_nodes.push_back(new_parent); cur_parent = std::move(new_parent); } @@ -927,18 +833,14 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { // if last parent has the key less than the last child key, then we need to update the parent node with // the last child key if it doesn't have edge. 
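
(Worked example with hypothetical numbers, to make the fix-up below concrete: suppose the parent's last slot currently reads key=100 -> child C, but after an interrupted merge C's rightmost live key is 120 and the parent has no edge. The code that follows rewrites that slot to key=120 -> C; without it, a lookup for any key in (100, 120] would be routed past C and miss.)
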
auto last_parent = parent_node; - if (new_parent_nodes.size() > 0) { - last_parent = new_parent_nodes.back(); - // handle the case where we are splitting the root node - if (m_sb->root_node == parent_node->node_id()) { update_root(parent_node, new_parent_nodes, cp_ctx); } - } + if (new_parent_nodes.size() > 0) { last_parent = new_parent_nodes[new_parent_nodes.size() - 1]; } if (last_parent->total_entries() && !last_parent->has_valid_edge()) { if (last_parent->compare_nth_key(last_parent_key, last_parent->total_entries() - 1) < 0) { BtreeLinkInfo child_info; last_parent->get_nth_value(last_parent->total_entries() - 1, &child_info, false /* copy */); - last_parent->update(last_parent->total_entries() - 1, last_parent_key, child_info); + parent_node->update(parent_node->total_entries() - 1, last_parent_key, child_info); LOGTRACEMOD(wbcache, "Repairing parent node={} with last_parent_key={} and child_info={}", - last_parent->node_id(), last_parent_key.to_string(), child_info.to_string()); + parent_node->node_id(), last_parent_key.to_string(), child_info.to_string()); } // if last key of children is less than the last key of parent, then we need to update the last key of non // interior child @@ -984,15 +886,17 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { bnodeid_t find_true_sibling(BtreeNodePtr const& node) { if (node == nullptr) return empty_bnodeid; - bnodeid_t sibling_id = node->next_bnode(); + bnodeid_t sibling_id = empty_bnodeid; + if (node->has_valid_edge()) { + sibling_id = node->get_edge_value().bnode_id(); + } else { + sibling_id = node->next_bnode(); + } if (sibling_id == empty_bnodeid) { return empty_bnodeid; } else { BtreeNodePtr sibling_node; - if (read_node_impl(sibling_id, sibling_node) != btree_status_t::success) { - LOGTRACEMOD(wbcache, "Failed to read sibling node with id {}", sibling_id); - return empty_bnodeid; - } + if (read_node_impl(sibling_id, sibling_node) != btree_status_t::success) { return empty_bnodeid; } if (sibling_node->is_node_deleted()) { LOGTRACEMOD(wbcache, "Sibling node [{}] is not the sibling for parent_node {}", diff --git a/src/lib/index/inplace_btree/ss_btree_cache.cpp b/src/lib/index/inplace_btree/ss_btree_cache.cpp new file mode 100644 index 000000000..49524ff31 --- /dev/null +++ b/src/lib/index/inplace_btree/ss_btree_cache.cpp @@ -0,0 +1,697 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. 
+ * + *********************************************************************************/ +#include +#include +#include +#include +#include "device/chunk.h" +#include "common/homestore_assert.hpp" +#include "common/homestore_utils.hpp" + +#include "wb_cache.hpp" +#include "index_cp.hpp" +#include "device/virtual_dev.hpp" +#include "common/resource_mgr.hpp" + +#ifdef _PRERELEASE +#include "common/crash_simulator.hpp" +#endif + +namespace homestore { + +IndexWBCacheBase& wb_cache() { + try { + return index_service().wb_cache(); + } catch (const std::runtime_error& e) { + throw std::runtime_error(fmt::format("Failed to access wb_cache: {}", e.what())); + } +} + +SSBtreeCache::SSBtreeCache(const std::shared_ptr< VirtualDev >& vdev, std::pair< meta_blk*, sisl::byte_view > sb, + const std::shared_ptr< sisl::Evictor >& evictor, uint32_t node_size) : + m_vdev{vdev}, + m_cache{evictor, 100000, node_size, + [](const BtreeNodePtr& node) -> BlkId { + return static_cast< IndexBtreeNode* >(node.get())->m_idx_buf->m_blkid; + }, + [](const sisl::CacheRecord& rec) -> bool { + const auto& hnode = (sisl::SingleEntryHashNode< BtreeNodePtr >&)rec; + return (hnode.m_value->m_refcount.test_le(1)); + }}, + m_node_size{node_size}, + m_meta_blk{sb.first} { + start_flush_threads(); + + // We need to register the consumer first before recovery, so that recovery can use the cp_ctx created to add/track + // recovered new nodes. + cp_mgr().register_consumer(cp_consumer_t::INDEX_SVC, std::move(std::make_unique< IndexCPCallbacks >(this))); +} + +void SSBtreeCache::start_flush_threads() { + // Start WBCache flush threads + struct Context { + std::condition_variable cv; + std::mutex mtx; + int32_t thread_cnt{0}; + }; + auto ctx = std::make_shared< Context >(); + auto nthreads = std::max(1, HS_DYNAMIC_CONFIG(generic.cache_flush_threads)); + + for (int32_t i{0}; i < nthreads; ++i) { + iomanager.create_reactor("index_cp_flush" + std::to_string(i), iomgr::INTERRUPT_LOOP, 1u, + [this, ctx](bool is_started) { + if (is_started) { + { + std::unique_lock< std::mutex > lk{ctx->mtx}; + m_cp_flush_fibers.push_back(iomanager.iofiber_self()); + ++(ctx->thread_cnt); + } + ctx->cv.notify_one(); + } + }); + } + + { + std::unique_lock< std::mutex > lk{ctx->mtx}; + ctx->cv.wait(lk, [ctx, nthreads] { return (ctx->thread_cnt == nthreads); }); + } +} + +BtreeNodePtr SSBtreeCache::alloc_buf(node_initializer_t&& node_initializer) { + auto cpg = cp_mgr().cp_guard(); + auto cp_ctx = r_cast< IndexCPContext* >(cpg.context(cp_consumer_t::INDEX_SVC)); + + // Alloc a block of data from underlying vdev + BlkId blkid; + auto ret = m_vdev->alloc_contiguous_blks(1, blk_alloc_hints{}, blkid); + if (ret != BlkAllocStatus::SUCCESS) { return nullptr; } + + // Alloc buffer and initialize the node + auto idx_buf = std::make_shared< IndexBuffer >(blkid, m_node_size, m_vdev->align_size()); + idx_buf->m_created_cp_id = cpg->id(); + idx_buf->m_dirtied_cp_id = cpg->id(); + auto node = node_initializer(idx_buf); + + if (!m_in_recovery) { + // Add the node to the cache. Skip if we are in recovery mode. 
+ bool done = m_cache.insert(node); + HS_REL_ASSERT_EQ(done, true, "Unable to add alloc'd node to cache, low memory or duplicate inserts?"); + } + + // The entire index is updated in the commit path, so we alloc the blk and commit them right away + auto alloc_status = m_vdev->commit_blk(blkid); + // if any error happens when committing the blk to index service, we should assert and crash + if (alloc_status != BlkAllocStatus::SUCCESS) HS_REL_ASSERT(0, "Failed to commit blk: {}", blkid.to_string()); + return node; +} + +void IndexWBCache::write_buf(const BtreeNodePtr& node, const IndexBufferPtr& buf, CPContext* cp_ctx) { + // TODO upsert always returns false even if it succeeds. + if (m_in_recovery) { + if (buf->is_meta_buf()) { + auto const& sb = r_cast< MetaIndexBuffer* >(buf.get())->m_sb; + meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk()); + } else { + m_vdev->sync_write(r_cast< const char* >(buf->raw_buffer()), m_node_size, buf->m_blkid); + } + } else { + if (node != nullptr) { m_cache.upsert(node); } + LOGTRACEMOD(wbcache, "add to dirty list cp {} {}", cp_ctx->id(), buf->to_string()); + r_cast< IndexCPContext* >(cp_ctx)->add_to_dirty_list(buf); + resource_mgr().inc_dirty_buf_size(m_node_size); + } +} + +void IndexWBCache::read_buf(bnodeid_t id, BtreeNodePtr& node, node_initializer_t&& node_initializer) { + auto const blkid = BlkId{id}; + +retry: + // Check if the blkid is already in cache, if not load and put it into the cache + if (!m_in_recovery && m_cache.get(blkid, node)) { return; } + + // Read the buffer from virtual device + auto idx_buf = std::make_shared< IndexBuffer >(blkid, m_node_size, m_vdev->align_size()); + m_vdev->sync_read(r_cast< char* >(idx_buf->raw_buffer()), m_node_size, blkid); + + // Create the btree node out of buffer + node = node_initializer(idx_buf); + + // Push the node into cache + if (!m_in_recovery) { + bool done = m_cache.insert(node); + if (!done) { + // There is a race between 2 concurrent reads from vdev and other party won the race. Re-read from cache + goto retry; + } + } +} + +bool IndexWBCache::get_writable_buf(const BtreeNodePtr& node, CPContext* context) { + IndexCPContext* icp_ctx = r_cast< IndexCPContext* >(context); + auto& idx_buf = static_cast< IndexBtreeNode* >(node.get())->m_idx_buf; + if (idx_buf->m_dirtied_cp_id == icp_ctx->id()) { + return true; // For same cp, we don't need a copy, we can rewrite on the same buffer + } else if (idx_buf->m_dirtied_cp_id > icp_ctx->id()) { + return false; // We are asked to provide the buffer of an older CP, which is not possible + } + + // If buffer is in clean state, which means it is already flushed, we can reuse the same buffer, if not + // we must copy the buffer and return the new buffer. + if (!idx_buf->is_clean()) { + HS_DBG_ASSERT_EQ(idx_buf->m_dirtied_cp_id, icp_ctx->id() - 1, + "Buffer is dirty, but its dirtied_cp_id is neither current nor previous cp id"); + + // If its not clean, we do deep copy. 
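+    // To make the copy-on-write rule above concrete, a minimal sketch with simplified,
+    // hypothetical types (not the homestore API), disabled since it is for exposition only:
+#if 0
+    struct ToyBuf {
+        int64_t dirtied_cp_id{-1}; // cp that last modified this buffer
+        bool clean{true};          // true once the buffer has been flushed
+    };
+
+    // Returns true when the requesting cp may write in place, false when a deep copy is
+    // needed so that the previous cp can still flush its own version of the bytes.
+    bool can_write_in_place(ToyBuf const& b, int64_t cur_cp) {
+        if (b.dirtied_cp_id == cur_cp) { return true; } // same cp rewrites the same buffer
+        return b.clean;                                 // older but clean => safe to reuse
+    }
+#endif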
+ auto new_buf = std::make_shared< IndexBuffer >(idx_buf->m_blkid, m_node_size, m_vdev->align_size()); + new_buf->m_created_cp_id = idx_buf->m_created_cp_id; + std::memcpy(new_buf->raw_buffer(), idx_buf->raw_buffer(), m_node_size); + + node->update_phys_buf(new_buf->raw_buffer()); + LOGTRACEMOD(wbcache, "cp={} cur_buf={} for node={} is dirtied by cp={} copying new_buf={}", icp_ctx->id(), + static_cast< void* >(idx_buf.get()), node->node_id(), idx_buf->m_dirtied_cp_id, + static_cast< void* >(new_buf.get())); + idx_buf = std::move(new_buf); + } + idx_buf->m_dirtied_cp_id = icp_ctx->id(); + return true; +} + +bool IndexWBCache::refresh_meta_buf(shared< MetaIndexBuffer >& meta_buf, CPContext* cp_ctx) { + if (meta_buf->m_dirtied_cp_id > cp_ctx->id()) { + return false; // meta_buf modified by a newer CP, we shouldn't overwrite that + } else if (meta_buf->m_dirtied_cp_id == cp_ctx->id()) { + // Modified by the same cp, no need to create new index buffer, but we only copy the superblk to the buffer + meta_buf->copy_sb_to_buf(); + } else { + // We always create a new meta index buffer on every meta buf update, which copies the superblk + auto new_buf = std::make_shared< MetaIndexBuffer >(meta_buf); + new_buf->m_dirtied_cp_id = cp_ctx->id(); + write_buf(nullptr, new_buf, cp_ctx); + meta_buf = new_buf; // Replace the meta_buf with new buf + LOGTRACEMOD(wbcache, "meta buf {} is created in cp {}", meta_buf->to_string(), cp_ctx->id()); + } + return true; +} + +#ifdef _PRERELEASE +static void set_crash_flips(IndexBufferPtr const& parent_buf, IndexBufferPtr const& child_buf, + IndexBufferPtrList const& new_node_bufs, IndexBufferPtrList const& freed_node_bufs) { + // TODO: Need an API from flip to quickly check if flip is enabled, so this method doesn't check flip_enabled a + // bunch of times. 
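+    // For orientation, a sketch of how a crash flag set by this helper is consumed later at
+    // flush time (simplified, hypothetical names; the real path goes through do_flush_one_buf
+    // and the crash simulator):
+#if 0
+    struct ToyBuf { bool crash_flag{false}; };
+    static bool s_crashed{false};
+
+    void flush_one(ToyBuf& b) {
+        if (s_crashed) { return; } // crash already simulated, stop all further flushing
+        if (b.crash_flag) {        // this buffer was armed as the crash point by a flip
+            s_crashed = true;      // leaves an "unclean shutdown" image on disk to recover
+            return;
+        }
+        // ... otherwise issue the real vdev write here ...
+    }
+#endif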
+    if (parent_buf && parent_buf->is_meta_buf()) {
+        // Split or merge happening on root
+        if (iomgr_flip::instance()->test_flip("crash_flush_on_meta")) {
+            parent_buf->set_crash_flag();
+        } else if (iomgr_flip::instance()->test_flip("crash_flush_on_root")) {
+            child_buf->set_crash_flag();
+        }
+    } else if ((new_node_bufs.size() == 1) && freed_node_bufs.empty()) {
+        // Its a split node situation
+        if (iomgr_flip::instance()->test_flip("crash_flush_on_split_at_parent")) {
+            parent_buf->set_crash_flag();
+        } else if (iomgr_flip::instance()->test_flip("crash_flush_on_split_at_left_child")) {
+            child_buf->set_crash_flag();
+        } else if (iomgr_flip::instance()->test_flip("crash_flush_on_split_at_right_child")) {
+            new_node_bufs[0]->set_crash_flag();
+        }
+    } else if (!freed_node_bufs.empty() && (new_node_bufs.size() != freed_node_bufs.size())) {
+        // Its a merge nodes situation
+        if (iomgr_flip::instance()->test_flip("crash_flush_on_merge_at_parent")) {
+            parent_buf->set_crash_flag();
+        } else if (iomgr_flip::instance()->test_flip("crash_flush_on_merge_at_left_child")) {
+            child_buf->set_crash_flag();
+        } else if (iomgr_flip::instance()->test_flip("crash_flush_on_merge_at_right_child")) {
+            if (!new_node_bufs.empty()) { new_node_bufs[0]->set_crash_flag(); }
+        }
+    } else if (!freed_node_bufs.empty() && (new_node_bufs.size() == freed_node_bufs.size())) {
+        // Its a rebalance node situation
+        if (iomgr_flip::instance()->test_flip("crash_flush_on_rebalance_at_parent")) {
+            parent_buf->set_crash_flag();
+        } else if (iomgr_flip::instance()->test_flip("crash_flush_on_rebalance_at_left_child")) {
+            child_buf->set_crash_flag();
+        } else if (iomgr_flip::instance()->test_flip("crash_flush_on_rebalance_at_right_child")) {
+            if (!new_node_bufs.empty()) { new_node_bufs[0]->set_crash_flag(); }
+        }
+    }
+}
+#endif
+
+void IndexWBCache::transact_bufs(uint32_t index_ordinal, IndexBufferPtr const& parent_buf,
+                                 IndexBufferPtr const& child_buf, IndexBufferPtrList const& new_node_bufs,
+                                 IndexBufferPtrList const& freed_node_bufs, CPContext* cp_ctx) {
+    IndexCPContext* icp_ctx = r_cast< IndexCPContext* >(cp_ctx);
+    if (parent_buf) { link_buf(parent_buf, child_buf, false /* is_sibling_link */, cp_ctx); }
+
+#ifdef _PRERELEASE
+    set_crash_flips(parent_buf, child_buf, new_node_bufs, freed_node_bufs);
+#endif
+
+    for (auto const& buf : new_node_bufs) {
+        link_buf(child_buf, buf, true /* is_sibling_link */, cp_ctx);
+    }
+
+    for (auto const& buf : freed_node_bufs) {
+        if (!buf->m_wait_for_down_buffers.testz()) {
+            // This buffer has some down bufs depending on it. It can happen for an upper level interior node, where
+            // a lower level node (say a leaf) has split, causing it to write entries into this node, but this node
+            // is now merging with another node, causing it to be freed. In these rare instances, we link this node
+            // to the new node, so the up buf waits for all the down bufs to be flushed before it can flush (this
+            // buf is not written anyway)
+            link_buf(child_buf, buf, true /* is_sibling_link */, cp_ctx);
+        }
+    }
+
+    if (new_node_bufs.empty() && freed_node_bufs.empty()) {
+        // This is an update for meta, root transaction.
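+        // Conceptually, each transact_bufs() call appends one record to the cp's txn journal.
+        // A simplified, hypothetical layout for exposition only (the real encoding lives in
+        // IndexCPContext::add_to_txn_journal; assumes <vector> and <cstdint>):
+#if 0
+        struct TxnRecord {
+            uint32_t ordinal;                   // which index table the transaction belongs to
+            uint64_t up_blkid;                  // the "real" up buffer (parent or meta)
+            uint64_t inplace_child_blkid;       // the in-place child being modified
+            std::vector< uint64_t > new_blks;   // nodes created by this transaction
+            std::vector< uint64_t > freed_blks; // nodes freed by this transaction
+        };
+#endif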
+        if (child_buf->m_created_cp_id != -1) {
+            DEBUG_ASSERT_EQ(child_buf->m_created_cp_id, icp_ctx->id(),
+                            "Root buffer is not created by current cp (for split root), its not expected");
+        }
+        icp_ctx->add_to_txn_journal(index_ordinal, parent_buf, nullptr, {child_buf}, {});
+    } else {
+        icp_ctx->add_to_txn_journal(index_ordinal,          // Ordinal
+                                    child_buf->m_up_buffer, // real up buffer
+                                    new_node_bufs.empty() ? freed_node_bufs[0]->m_up_buffer
+                                                          : new_node_bufs[0]->m_up_buffer, // real in place child
+                                    new_node_bufs,                                         // new node bufs
+                                    freed_node_bufs                                        // free_node_bufs
+        );
+    }
+#if 0
+    static int id = 0;
+    auto filename = "transact_bufs_"+std::to_string(id++)+ "_" +std::to_string(rand()%100)+".dot";
+    LOGINFO("Transact cp is in cp\n{} and storing in {}\n\n\n", icp_ctx->to_string(), filename);
+    icp_ctx->to_string_dot(filename);
+#endif
+}
+
+void IndexWBCache::link_buf(IndexBufferPtr const& up_buf, IndexBufferPtr const& down_buf, bool is_sibling_link,
+                            CPContext* cp_ctx) {
+    HS_DBG_ASSERT_NE((void*)up_buf->m_up_buffer.get(), (void*)down_buf.get(), "Cyclic dependency detected");
+    IndexBufferPtr real_up_buf = up_buf;
+    IndexCPContext* icp_ctx = r_cast< IndexCPContext* >(cp_ctx);
+
+    // Condition 1: If the down buffer and up buffer are both created by the current cp_id, we unconditionally need
+    // to link it with the up_buffer's up_buffer. In other words, there should never be a link between down and up
+    // buffers created in the current generation (cp). In real terms, all new buffers can be flushed independently
+    // of each other; a dependency is needed only on buffers created in previous cps.
+    if (up_buf->m_created_cp_id == icp_ctx->id()) {
+        real_up_buf = up_buf->m_up_buffer;
+        HS_DBG_ASSERT(real_up_buf,
+                      "Up buffer is newly created in this cp, but it doesn't have its own up_buffer, its not expected");
+    }
+
+    // Condition 2: If down_buf already has an up_buf, we override it with the newly passed up_buf only in the case
+    // of a sibling link. Say there is a parent node P1 and children C0, C1 (all 3 created in previous cps). Consider
+    // these scenarios:
+    //
+    // Scenario 1: The following happens:
+    // 1. Child C1 first splits and thus the chain will have P1 <-- C1 <-- C2.
+    // 2. Child C2 splits further, creating C3 and writing to P1, so link_buf(P1, C2, is_sibling=false) will be
+    // called first. In this instance, we don't want to break the above chain, because C2 should rely on C1 for its
+    // repair. The link_buf calls will be
+    //  a) link_buf(P1, C2, is_sibling=false),  => P1 <-- C1 <-- C2 (because C2 has up_buffer C1 and not a sibling
+    //  so no override)
+    //  b) link_buf(C2, C3, is_sibling=true),  =>  P1 <--- C1 <-- { C2, C3 } (because of Condition 1,
+    //  where C2, C3 are created in this CP, so link C3 with C2's real_up_buf = C1)
+    //
+    // Scenario 2: The following happens:
+    // 1. Child C1 first splits and thus the chain will have P1 <-- C1 <-- C2.
+    // 2. Child C1 merges with C0, which means we create a new node C1' and free C1. The link_buf calls will be
+    //  a) link_buf(P1, C0, is_sibling=false),  => P1 <-- C0,  C1 <--- C2
+    //  b) link_buf(C0, C1', is_sibling=true),   =>
+    //                          P1
+    //                       C0,   C1
+    //                     C1'
+    //  c) link_buf(C0, C1, is_sibling=true), =>
+    //                          P1
+    //                       C0,
+    //                     C1'  C1
+    // This link is achieved by unconditionally changing the link to the passed up_buf when is_sibling=true, but
+    // doing it conditionally for a parent link: where a link already exists, we don't override it.
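+    // A toy rendering of Scenario 1 with simplified, hypothetical types (not the homestore
+    // structures), to show how Condition 1 shapes the dependency chain:
+#if 0
+    struct ToyNode {
+        int64_t created_cp{-1};
+        ToyNode* up{nullptr};
+    };
+
+    // Two nodes born in the same cp never depend on each other, so a new down node is
+    // re-targeted at the up node's own up pointer.
+    void toy_link(ToyNode* up, ToyNode* down, int64_t cur_cp) {
+        if (up->created_cp == cur_cp) { up = up->up; }
+        down->up = up;
+    }
+
+    // Scenario 1: start with P1 <-- C1 <-- C2, then C2 splits into C3 in the current cp.
+    // toy_link(&C2, &C3, cur_cp) re-targets C3 at C2's up buffer C1, yielding
+    // P1 <-- C1 <-- { C2, C3 }.
+#endif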
+    if (down_buf->m_up_buffer != nullptr) {
+        HS_DBG_ASSERT_LT(down_buf->m_up_buffer->m_created_cp_id, icp_ctx->id(),
+                         "down_buf=[{}] up_buffer=[{}] should never have been created on same cp",
+                         down_buf->to_string(), down_buf->m_up_buffer->to_string());
+
+        if (!is_sibling_link || (down_buf->m_up_buffer == real_up_buf)) {
+            // Already linked with the same buf, or it is not a sibling link to override; nothing to do other than
+            // the asserts
+            real_up_buf = down_buf->m_up_buffer;
+            HS_DBG_ASSERT(!real_up_buf->m_wait_for_down_buffers.testz(),
+                          "Up buffer waiting count is zero, whereas down buf is already linked to up buf");
+            HS_DBG_ASSERT((real_up_buf->m_dirtied_cp_id == down_buf->m_dirtied_cp_id) || (real_up_buf->is_meta_buf()),
+                          "Up buffer is not modified by current cp, but down buffer is linked to it");
+#ifndef NDEBUG
+            bool found{false};
+            for (auto const& dbuf : real_up_buf->m_down_buffers) {
+                if (dbuf.lock() == down_buf) {
+                    found = true;
+                    break;
+                }
+            }
+            HS_DBG_ASSERT(found, "Down buffer is linked to Up buf, but up_buf doesn't have down_buf in its list");
+#endif
+            return;
+        }
+    }
+
+    // Now we link the down_buffer to the real up_buffer
+    real_up_buf->m_wait_for_down_buffers.increment(1);
+    down_buf->m_up_buffer = real_up_buf;
+#ifndef NDEBUG
+    real_up_buf->m_down_buffers.emplace_back(down_buf);
+#endif
+}
+
+void IndexWBCache::free_buf(const IndexBufferPtr& buf, CPContext* cp_ctx) {
+    BtreeNodePtr node;
+    if (!m_in_recovery) {
+        bool done = m_cache.remove(buf->m_blkid, node);
+        HS_REL_ASSERT_EQ(done, true, "Race on cache removal of btree blkid?");
+    }
+
+    resource_mgr().inc_free_blk(m_node_size);
+    m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(cp_ctx));
+}
+
+//////////////////// Recovery Related section /////////////////////////////////
+void IndexWBCache::recover(sisl::byte_view sb) {
+    // If sb is empty, it is possibly a first-time boot.
+    if ((sb.bytes() == nullptr) || (sb.size() == 0)) {
+        m_vdev->recovery_completed();
+        return;
+    }
+
+    m_in_recovery = true; // For the entirety of this call, we mark ourselves as being in recovery.
+
+    // Recover the CP Context with the buf_map of all the buffers that were dirtied in the last cp, with its
+    // relationship (up/down buf links) as it was in the cp that was flushing the buffers prior to unclean shutdown.
+    auto cpg = cp_mgr().cp_guard();
+    auto icp_ctx = r_cast< IndexCPContext* >(cpg.context(cp_consumer_t::INDEX_SVC));
+    std::map< BlkId, IndexBufferPtr > bufs = icp_ctx->recover(std::move(sb));
+
+    LOGINFOMOD(wbcache, "Detected unclean shutdown, prior cp={} had to flush {} nodes, recovering... ", icp_ctx->id(),
+               bufs.size());
+
+    // At this point, we have the DAG structure (up/down dependency graph), exactly the same as prior to the crash,
+    // with one addition: all freed buffers are also put in the DAG structure.
+    //
+    // We do repair/recovery as 2 passes. At a quick glance it may look like we don't need 2 passes through all the
+    // buffers, but it is essential.
+    //
+    // In the first pass, we look for any new bufs and any freed bufs and commit/free their corresponding node blkids.
+    // This has to be done before doing any repair, because repair can allocate blkids and we don't want to allocate
+    // the same blkid, which could clash with the blkid next in the buf list.
+    //
+    // On the second pass, we only take the new nodes/bufs and then repair their up buffers, if needed.
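+    // The two passes, reduced to a skeleton with hypothetical helper names (the actual loops
+    // follow below):
+#if 0
+    // Pass 1: settle block ownership before any repair can allocate new blkids.
+    for (auto& buf : recovered_bufs) {
+        if (is_new_or_freed(buf) && committed(buf) && committed(buf->up)) {
+            buf->freed ? free_blk(buf) : commit_blk(buf);
+            l0.push_back(buf); // lowest-level starting points for pass 2
+        }
+    }
+    // Pass 2: walk up from the lowest level; any up buffer that never made it to disk is
+    // repaired from its committed children.
+    for (auto& buf : l0) { repair_upwards(buf->up); }
+#endif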
+ std::vector< IndexBufferPtr > l0_bufs; + for (auto const& [_, buf] : bufs) { + if (buf->m_node_freed || (buf->m_created_cp_id == icp_ctx->id())) { + if (was_node_committed(buf)) { + if (was_node_committed(buf->m_up_buffer)) { + if (buf->m_node_freed) { + // Up buffer was written, so this buffer can be freed and thus can free the blk. + m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(icp_ctx)); + } else { + m_vdev->commit_blk(buf->m_blkid); + } + l0_bufs.push_back(buf); + } else { + buf->m_up_buffer->m_wait_for_down_buffers.decrement(); + } + } + } + } + + LOGINFOMOD(wbcache, "Index Recovery detected {} nodes out of {} as new/freed nodes to be recovered in prev cp={}", + l0_bufs.size(), bufs.size(), icp_ctx->id()); + + auto detailed_log = [this](std::map< BlkId, IndexBufferPtr > const& bufs, + std::vector< IndexBufferPtr > const& l0_bufs) { + // Logs to detect down_waits are set correctly for up buffers list of all recovered bufs + std::string log = fmt::format("\trecovered bufs (#of bufs = {})\n", bufs.size()); + for (auto const& [_, buf] : bufs) { + fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); + } + + // list of new_bufs + fmt::format_to(std::back_inserter(log), "\n\tl0_bufs (#of bufs = {})\n", l0_bufs.size()); + for (auto const& buf : l0_bufs) { + fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); + } + return log; + }; + LOGTRACEMOD(wbcache, "All unclean bufs list\n{}", detailed_log(bufs, l0_bufs)); + + // Second iteration we start from the lowest levels (which are all new_bufs) and check if up_buffers need to be + // repaired. All L1 buffers are not needed to repair, because they are sibling nodes and so we pass false in + // do_repair flag. + for (auto const& buf : l0_bufs) { + recover_buf(buf->m_up_buffer); + } + m_in_recovery = false; + m_vdev->recovery_completed(); +} + +void IndexWBCache::recover_buf(IndexBufferPtr const& buf) { + if (!buf->m_wait_for_down_buffers.decrement_testz()) { return; } + + // All down buffers are completed and given a nod saying that they are committed. If this buffer is not committed, + // then we need to repair this node/buffer. After that we will keep going to the next up level to repair them if + // needed + if (!was_node_committed(buf)) { + LOGDEBUGMOD(wbcache, "Index Recovery detected uncommitted up node [{}], repairing it", buf->to_string()); + index_service().repair_index_node(buf->m_index_ordinal, buf); + } else { + LOGTRACEMOD(wbcache, "Index Recovery detected up node [{}] as committed no need to repair that", + buf->to_string()); + } + + if (buf->m_up_buffer) { recover_buf(buf->m_up_buffer); } +} + +bool IndexWBCache::was_node_committed(IndexBufferPtr const& buf) { + if (buf == nullptr) { return false; } + + // If the node is freed, then it can be considered committed as long as its up buffer was committed + if (buf->m_node_freed) { + HS_DBG_ASSERT(buf->m_up_buffer, "Buf was marked deleted, but doesn't have an up_buffer"); + return was_node_committed(buf->m_up_buffer); + } + + // All down_buf has indicated that they have seen this up buffer, now its time to repair them. 
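+    // The commit test below reduces to one comparison: a buffer reached disk iff the cp id
+    // stamped into its on-disk node header equals the cp that was flushing when the crash
+    // happened. A sketch with hypothetical stand-ins for the node header accessors:
+#if 0
+    bool is_valid_node(uint8_t const* bytes);        // checksum/magic check
+    int64_t modified_cp_id_of(uint8_t const* bytes); // cp id stamped at last write
+
+    bool toy_was_committed(uint8_t const* bytes, int64_t crashed_cp) {
+        if (!is_valid_node(bytes)) { return false; }   // torn or stale write
+        return modified_cp_id_of(bytes) == crashed_cp; // older id => write never landed
+    }
+#endif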
+ if (buf->m_bytes == nullptr) { + // Read the btree node and get its modified cp_id + buf->m_bytes = hs_utils::iobuf_alloc(m_node_size, sisl::buftag::btree_node, m_vdev->align_size()); + m_vdev->sync_read(r_cast< char* >(buf->m_bytes), m_node_size, buf->blkid()); + if (!BtreeNode::is_valid_node(sisl::blob{buf->m_bytes, m_node_size})) { return false; } + + buf->m_dirtied_cp_id = BtreeNode::get_modified_cp_id(buf->m_bytes); + } + auto cpg = cp_mgr().cp_guard(); + return (buf->m_dirtied_cp_id == cpg->id()); +} + +//////////////////// CP Related API section ///////////////////////////////// +folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) { + LOGTRACEMOD(wbcache, "Starting Index CP Flush with cp context={}", cp_ctx->to_string_with_dags()); + if (!cp_ctx->any_dirty_buffers()) { + if (cp_ctx->id() == 0) { + // For the first CP, we need to flush the journal buffer to the meta blk + LOGINFO("First time boot cp, we shall flush the vdev to ensure all cp information is created"); + m_vdev->cp_flush(cp_ctx); + } else { + CP_PERIODIC_LOG(DEBUG, cp_ctx->id(), "Btree does not have any dirty buffers to flush"); + } + return folly::makeFuture< bool >(true); // nothing to flush + } + +#ifdef _PRERELEASE + if (hs()->crash_simulator().is_crashed()) { + LOGINFOMOD(wbcache, "crash simulation is ongoing, so skip the cp flush"); + return folly::makeFuture< bool >(true); + } +#endif + + // First thing is to flush the new_blks created as part of the CP. + auto const& journal_buf = cp_ctx->journal_buf(); + if (journal_buf.size() != 0) { + if (m_meta_blk) { + meta_service().update_sub_sb(journal_buf.cbytes(), journal_buf.size(), m_meta_blk); + } else { + meta_service().add_sub_sb("wb_cache", journal_buf.cbytes(), journal_buf.size(), m_meta_blk); + } + } + + cp_ctx->prepare_flush_iteration(); + + for (auto& fiber : m_cp_flush_fibers) { + iomanager.run_on_forget(fiber, [this, cp_ctx]() { + IndexBufferPtrList buf_list; + get_next_bufs(cp_ctx, resource_mgr().get_dirty_buf_qd(), buf_list); + + for (auto& buf : buf_list) { + do_flush_one_buf(cp_ctx, buf, true); + } + m_vdev->submit_batch(); + }); + } + return std::move(cp_ctx->get_future()); +} + +void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const& buf, bool part_of_batch) { +#ifdef _PRERELEASE + if (buf->m_crash_flag_on) { + // std::string filename = "crash_buf_" + std::to_string(cp_ctx->id()) + ".dot"; + // LOGINFOMOD(wbcache, "Simulating crash while writing buffer {}, stored in file {}", buf->to_string(), + // filename); cp_ctx->to_string_dot(filename); + LOGINFOMOD(wbcache, "Simulating crash while writing buffer {}", buf->to_string()); + hs()->crash_simulator().crash(); + cp_ctx->complete(true); + return; + } else if (hs()->crash_simulator().is_crashed()) { + LOGINFOMOD(wbcache, "crash simulation is ongoing, aid simulation by not flushing"); + return; + } +#endif + + LOGTRACEMOD(wbcache, "cp={} {}", cp_ctx->id(), buf->to_string()); + buf->set_state(index_buf_state_t::FLUSHING); + + if (buf->is_meta_buf()) { + LOGTRACEMOD(wbcache, "flushing cp {} meta buf {} possibly because of root split", cp_ctx->id(), + buf->to_string()); + auto const& sb = r_cast< MetaIndexBuffer* >(buf.get())->m_sb; + meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk()); + process_write_completion(cp_ctx, buf); + } else if (buf->m_node_freed) { + LOGTRACEMOD(wbcache, "Not flushing buf {} as it was freed, its here for merely dependency", cp_ctx->id(), + buf->to_string()); + process_write_completion(cp_ctx, buf); + } else { + 
LOGTRACEMOD(wbcache, "flushing cp {} buf {} info: {}", cp_ctx->id(), buf->to_string(), + BtreeNode::to_string_buf(buf->raw_buffer())); + m_vdev->async_write(r_cast< const char* >(buf->raw_buffer()), m_node_size, buf->m_blkid, part_of_batch) + .thenValue([buf, cp_ctx](auto) { + try { + auto& pthis = s_cast< IndexWBCache& >(wb_cache()); + pthis.process_write_completion(cp_ctx, buf); + } catch (const std::runtime_error& e) { LOGERROR("Failed to access write-back cache: {}", e.what()); } + }); + + if (!part_of_batch) { m_vdev->submit_batch(); } + } +} + +void IndexWBCache::process_write_completion(IndexCPContext* cp_ctx, IndexBufferPtr const& buf) { +#ifdef _PRERELEASE + if (hs()->crash_simulator().is_crashed()) { + LOGINFOMOD(wbcache, "Crash simulation is ongoing, ignore all process_write_completion"); + return; + } +#endif + + LOGTRACEMOD(wbcache, "cp {} buf {}", cp_ctx->id(), buf->to_string()); + resource_mgr().dec_dirty_buf_size(m_node_size); + auto [next_buf, has_more] = on_buf_flush_done(cp_ctx, buf); + if (next_buf) { + do_flush_one_buf(cp_ctx, next_buf, false); + } else if (!has_more) { + // We are done flushing the buffers, We flush the vdev to persist the vdev bitmaps and free blks + // Pick a CP Manager blocking IO fiber to execute the cp flush of vdev + iomanager.run_on_forget(cp_mgr().pick_blocking_io_fiber(), [this, cp_ctx]() { + LOGTRACEMOD(wbcache, "Initiating CP flush"); + m_vdev->cp_flush(cp_ctx); // This is a blocking io call + cp_ctx->complete(true); + }); + } +} + +std::pair< IndexBufferPtr, bool > IndexWBCache::on_buf_flush_done(IndexCPContext* cp_ctx, IndexBufferPtr const& buf) { + if (m_cp_flush_fibers.size() > 1) { + std::unique_lock lg(m_flush_mtx); + return on_buf_flush_done_internal(cp_ctx, buf); + } else { + return on_buf_flush_done_internal(cp_ctx, buf); + } +} + +std::pair< IndexBufferPtr, bool > IndexWBCache::on_buf_flush_done_internal(IndexCPContext* cp_ctx, + IndexBufferPtr const& buf) { + IndexBufferPtrList buf_list; +#ifndef NDEBUG + buf->m_down_buffers.clear(); +#endif + buf->set_state(index_buf_state_t::CLEAN); + + if (cp_ctx->m_dirty_buf_count.decrement_testz()) { + return std::make_pair(nullptr, false); + } else { + get_next_bufs_internal(cp_ctx, 1u, buf, buf_list); + return std::make_pair((buf_list.size() ? 
buf_list[0] : nullptr), true); + } +} + +void IndexWBCache::get_next_bufs(IndexCPContext* cp_ctx, uint32_t max_count, IndexBufferPtrList& bufs) { + if (m_cp_flush_fibers.size() > 1) { + std::unique_lock lg(m_flush_mtx); + get_next_bufs_internal(cp_ctx, max_count, nullptr, bufs); + } else { + get_next_bufs_internal(cp_ctx, max_count, nullptr, bufs); + } +} + +void IndexWBCache::get_next_bufs_internal(IndexCPContext* cp_ctx, uint32_t max_count, + IndexBufferPtr const& prev_flushed_buf, IndexBufferPtrList& bufs) { + uint32_t count{0}; + + // First attempt to execute any follower buffer flush + if (prev_flushed_buf) { + auto next_buffer = prev_flushed_buf->m_up_buffer; + if (next_buffer && next_buffer->m_wait_for_down_buffers.decrement_testz()) { + HS_DBG_ASSERT(next_buffer->state() == index_buf_state_t::DIRTY, + "Trying to flush a up_buffer after down buffer is completed, but up_buffer is " + "not in dirty state, but in {} state", + (int)next_buffer->state()); + bufs.emplace_back(next_buffer); + ++count; + } +#ifndef NDEBUG + // Retain prev up buffer for debugging purposes + // prev_flushed_buf->m_prev_up_buffer = std::move(next_buffer); +#endif + prev_flushed_buf->m_up_buffer.reset(); + } + + // If we still have room to push the next buffer, take it from the main list + while (count < max_count) { + std::optional< IndexBufferPtr > buf = cp_ctx->next_dirty(); + if (!buf) { break; } // End of list + + if ((*buf)->m_wait_for_down_buffers.testz()) { + bufs.emplace_back(std::move(*buf)); + ++count; + } else { + // There is some leader buffer still flushing, once done its completion will flush this buffer + } + } +} + +/* +IndexBtreeNode* IndexBtreeNode::convert(BtreeNode* bt_node) { + return r_cast< IndexBtreeNode* >(bt_node->get_node_context()); +}*/ +} // namespace homestore diff --git a/src/lib/index/inplace_btree/ss_btree_cache.hpp b/src/lib/index/inplace_btree/ss_btree_cache.hpp new file mode 100644 index 000000000..6305ff952 --- /dev/null +++ b/src/lib/index/inplace_btree/ss_btree_cache.hpp @@ -0,0 +1,79 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. 
+ * + *********************************************************************************/ +#pragma once +#include + +#include +#include +#include +#include +#include "index/index_cp.hpp" + +namespace sisl { +template < typename T > +class ThreadVector; + +class Evictor; +} // namespace sisl + +namespace homestore { +class VirtualDev; + +class SeqStoreBtreeCache : public IndexWBCacheBase { +private: + std::shared_ptr< VirtualDev > m_vdev; + sisl::SimpleCache< BlkId, BtreeNodePtr > m_cache; + uint32_t m_node_size; + std::vector< iomgr::io_fiber_t > m_cp_flush_fibers; + std::mutex m_flush_mtx; + void* m_meta_blk; + +public: + SeqStoreBtreeCache(const std::shared_ptr< VirtualDev >& vdev, std::pair< meta_blk*, sisl::byte_view > sb, + const std::shared_ptr< sisl::Evictor >& evictor, uint32_t node_size); + + BtreeNodePtr alloc_buf(node_initializer_t&& node_initializer) override; + void write_buf(const BtreeNodePtr& node, const IndexBufferPtr& buf, CPContext* cp_ctx) override; + void read_buf(bnodeid_t id, BtreeNodePtr& node, node_initializer_t&& node_initializer) override; + + bool get_writable_buf(const BtreeNodePtr& node, CPContext* context) override; + void transact_bufs(uint32_t index_ordinal, IndexBufferPtr const& parent_buf, IndexBufferPtr const& child_buf, + IndexBufferPtrList const& new_node_bufs, IndexBufferPtrList const& freed_node_bufs, + CPContext* cp_ctx) override{}; + void free_buf(const IndexBufferPtr& buf, CPContext* cp_ctx) override; + bool refresh_meta_buf(shared< MetaIndexBuffer >& meta_buf, CPContext* cp_ctx) override; + + //////////////////// CP Related API section ///////////////////////////////// + folly::Future< bool > async_cp_flush(IndexCPContext* context); + void recover(sisl::byte_view sb) override; + +private: + void start_flush_threads(); + void process_write_completion(IndexCPContext* cp_ctx, IndexBufferPtr const& pbuf); + void do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const& buf, bool part_of_batch); + void link_buf(IndexBufferPtr const& up, IndexBufferPtr const& down, bool is_sibling_link, CPContext* cp_ctx); + + std::pair< IndexBufferPtr, bool > on_buf_flush_done(IndexCPContext* cp_ctx, IndexBufferPtr const& buf); + std::pair< IndexBufferPtr, bool > on_buf_flush_done_internal(IndexCPContext* cp_ctx, IndexBufferPtr const& buf); + + void get_next_bufs(IndexCPContext* cp_ctx, uint32_t max_count, IndexBufferPtrList& bufs); + void get_next_bufs_internal(IndexCPContext* cp_ctx, uint32_t max_count, IndexBufferPtr const& prev_flushed_buf, + IndexBufferPtrList& bufs); + + void recover_buf(IndexBufferPtr const& buf); + bool was_node_committed(IndexBufferPtr const& buf); +}; +} // namespace homestore diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/inplace_btree/wb_cache.cpp similarity index 80% rename from src/lib/index/wb_cache.cpp rename to src/lib/index/inplace_btree/wb_cache.cpp index 8cc2192c3..092dc9e5b 100644 --- a/src/lib/index/wb_cache.cpp +++ b/src/lib/index/inplace_btree/wb_cache.cpp @@ -30,8 +30,6 @@ #include "common/crash_simulator.hpp" #endif -SISL_LOGGING_DECL(wbcache) - namespace homestore { IndexWBCacheBase& wb_cache() { @@ -92,16 +90,13 @@ void IndexWBCache::start_flush_threads() { } } -BtreeNodePtr IndexWBCache::alloc_buf(uint32_t ordinal, node_initializer_t&& node_initializer) { +BtreeNodePtr IndexWBCache::alloc_buf(node_initializer_t&& node_initializer) { auto cpg = cp_mgr().cp_guard(); auto cp_ctx = r_cast< IndexCPContext* >(cpg.context(cp_consumer_t::INDEX_SVC)); // Alloc a block of data from underlying vdev - MultiBlkId blkid; - 
// Ordinal used as a hint in the case of custom chunk selector exists - blk_alloc_hints hints; - hints.application_hint = ordinal; - auto ret = m_vdev->alloc_contiguous_blks(1, hints, blkid); + BlkId blkid; + auto ret = m_vdev->alloc_contiguous_blks(1, blk_alloc_hints{}, blkid); if (ret != BlkAllocStatus::SUCCESS) { return nullptr; } // Alloc buffer and initialize the node @@ -109,7 +104,6 @@ BtreeNodePtr IndexWBCache::alloc_buf(uint32_t ordinal, node_initializer_t&& node idx_buf->m_created_cp_id = cpg->id(); idx_buf->m_dirtied_cp_id = cpg->id(); auto node = node_initializer(idx_buf); - idx_buf->m_node_level = node->level(); if (!m_in_recovery) { // Add the node to the cache. Skip if we are in recovery mode. @@ -131,7 +125,6 @@ void IndexWBCache::write_buf(const BtreeNodePtr& node, const IndexBufferPtr& buf auto const& sb = r_cast< MetaIndexBuffer* >(buf.get())->m_sb; meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk()); } else { - LOGTRACEMOD(wbcache, "write buf [{}] in recovery mode", buf->to_string()); m_vdev->sync_write(r_cast< const char* >(buf->raw_buffer()), m_node_size, buf->m_blkid); } } else { @@ -146,7 +139,7 @@ void IndexWBCache::read_buf(bnodeid_t id, BtreeNodePtr& node, node_initializer_t auto const blkid = BlkId{id}; retry: - // Check if the blkid is already in cache, if notL load and put it into the cache + // Check if the blkid is already in cache, if not load and put it into the cache if (!m_in_recovery && m_cache.get(blkid, node)) { return; } // Read the buffer from virtual device @@ -184,7 +177,6 @@ bool IndexWBCache::get_writable_buf(const BtreeNodePtr& node, CPContext* context // If its not clean, we do deep copy. auto new_buf = std::make_shared< IndexBuffer >(idx_buf->m_blkid, m_node_size, m_vdev->align_size()); new_buf->m_created_cp_id = idx_buf->m_created_cp_id; - new_buf->m_node_level = idx_buf->m_node_level; std::memcpy(new_buf->raw_buffer(), idx_buf->raw_buffer(), m_node_size); node->update_phys_buf(new_buf->raw_buffer()); @@ -305,11 +297,11 @@ void IndexWBCache::transact_bufs(uint32_t index_ordinal, IndexBufferPtr const& p if (new_node_bufs.empty() && freed_node_bufs.empty()) { // This is an update for meta, root transaction. - if (child_buf->m_created_cp_id < icp_ctx->id()) { - icp_ctx->add_to_txn_journal(index_ordinal, parent_buf, child_buf, {}, {}); - } else { - icp_ctx->add_to_txn_journal(index_ordinal, parent_buf, nullptr, {child_buf}, {}); + if (child_buf->m_created_cp_id != -1) { + DEBUG_ASSERT_EQ(child_buf->m_created_cp_id, icp_ctx->id(), + "Root buffer is not created by current cp (for split root), its not expected"); } + icp_ctx->add_to_txn_journal(index_ordinal, parent_buf, nullptr, {child_buf}, {}); } else { icp_ctx->add_to_txn_journal(index_ordinal, child_buf->m_up_buffer /* real up buffer */, child_buf, new_node_bufs, freed_node_bufs); @@ -318,14 +310,15 @@ void IndexWBCache::transact_bufs(uint32_t index_ordinal, IndexBufferPtr const& p // log new nodes and freed nodes and parent and child static uint32_t txn_id = 0; static int last_cp_id = -2; - std::string txn = ""; + static std::string txn = ""; if (last_cp_id != icp_ctx->id()) { last_cp_id = icp_ctx->id(); txn_id = 0; + txn = ""; } if (new_node_bufs.empty() && freed_node_bufs.empty()) { - fmt::format_to(std::back_inserter(txn), "{} - parent=[{}] child=[{}] new=[{}] freed=[{}]", txn_id, + fmt::format_to(std::back_inserter(txn), "\n{} - parent=[{}] child=[{}] new=[{}] freed=[{}]", txn_id, (parent_buf && parent_buf->blkid().to_integer() != 0) ? 
std::to_string(parent_buf->blkid().to_integer())
                                   : "empty",
@@ -346,10 +339,10 @@ void IndexWBCache::transact_bufs(uint32_t index_ordinal, IndexBufferPtr const& p
                               ? std::to_string(child_buf->blkid().to_integer())
                               : "empty";
 
-        fmt::format_to(std::back_inserter(txn), ": {} - parent={} child={} new=[{}] freed=[{}]", txn_id, parent_str,
+        fmt::format_to(std::back_inserter(txn), "\n{} - parent={} child={} new=[{}] freed=[{}]", txn_id, parent_str,
                        child_str, new_nodes, freed_nodes);
     }
-    LOGTRACEMOD(wbcache, "tranasction till now: cp: {} {}", icp_ctx->id(), txn);
+    LOGTRACEMOD(wbcache, "\ttransaction till now: cp: {} \n{}\n", icp_ctx->id(), txn);
     txn_id++;
 #endif
 #if 0
@@ -442,8 +435,6 @@ void IndexWBCache::free_buf(const IndexBufferPtr& buf, CPContext* cp_ctx) {
     buf->m_node_freed = true;
     resource_mgr().inc_free_blk(m_node_size);
     m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(cp_ctx));
-    LOGTRACEMOD(wbcache, "Freeing bkid = {}. Remove from cache?(aka not recovery mode) = {}", buf->m_blkid.to_integer(),
-                !m_in_recovery);
 }
 
 //////////////////// Recovery Related section /////////////////////////////////
@@ -455,7 +446,15 @@ void IndexWBCache::load_buf(IndexBufferPtr const& buf) {
     }
 }
 
-IndexWBCache::DagMap IndexWBCache::generate_dag_buffers(std::map< BlkId, IndexBufferPtr >& bufmap) {
+struct DagNode {
+    IndexBufferPtr buffer;
+    std::vector< shared< DagNode > > children;
+};
+
+using DagPtr = std::shared_ptr< DagNode >;
+using DagMap = std::map< IndexBufferPtr, DagPtr >;
+
+static DagMap generate_dag_buffers(std::map< BlkId, IndexBufferPtr >& bufmap) {
     std::vector< IndexBufferPtr > bufs;
     std::ranges::transform(bufmap, std::back_inserter(bufs), [](const auto& pair) { return pair.second; });
 
@@ -497,7 +496,7 @@ IndexWBCache::DagMap IndexWBCache::generate_dag_buffers(std::map< BlkId, IndexBu
     return generateDagMap(bufs);
 }
 
-std::string IndexWBCache::to_string_dag_bufs(DagMap& dags, cp_id_t cp_id) {
+static std::string to_string_dag_bufs(DagMap& dags, cp_id_t cp_id = 0) {
     std::string str{fmt::format("#_of_dags={}\n", dags.size())};
     int cnt = 1;
     for (const auto& [_, dag] : dags) {
@@ -508,7 +507,6 @@ std::string IndexWBCache::to_string_dag_bufs(DagMap& dags, cp_id_t cp_id) {
         stack.pop_back();
         auto snew = node->buffer->m_created_cp_id == cp_id ? "NEW" : "";
         auto sfree = node->buffer->m_node_freed ?
"FREED" : ""; - load_buf(node->buffer); fmt::format_to(std::back_inserter(str), "{}{}-{} {} {}\n", std::string(level * 4, ' '), index, node->buffer->to_string(), snew, sfree); int c = node->children.size(); @@ -520,26 +518,6 @@ std::string IndexWBCache::to_string_dag_bufs(DagMap& dags, cp_id_t cp_id) { return str; } -void IndexWBCache::prune_up_buffers(IndexBufferPtr const& buf, std::vector< IndexBufferPtr >& pruned_bufs_to_repair) { - auto up_buf = buf->m_up_buffer; - auto grand_up_buf = up_buf->m_up_buffer; - if (!up_buf || !up_buf->m_wait_for_down_buffers.testz()) { return; } - - // if up buffer has up buffer, then we need to decrement its wait_for_down_buffers - LOGINFOMOD(wbcache, "\n\npruning up_buffer due to zero dependency of child\n up buffer {}\n buffer {}", - up_buf->to_string(), buf->to_string()); - update_up_buffer_counters(up_buf); - - pruned_bufs_to_repair.push_back(up_buf); - if (grand_up_buf && !grand_up_buf->is_meta_buf() && grand_up_buf->m_wait_for_down_buffers.testz()) { - LOGTRACEMOD( - wbcache, - "\nadding grand_buffer to repair list due to zero dependency of child\n grand buffer {}\n buffer {}", - grand_up_buf->to_string(), buf->to_string()); - pruned_bufs_to_repair.push_back(grand_up_buf); - } -} - void IndexWBCache::recover(sisl::byte_view sb) { // If sb is empty, its possible a first time boot. if ((sb.bytes() == nullptr) || (sb.size() == 0)) { @@ -577,8 +555,10 @@ void IndexWBCache::recover(sisl::byte_view sb) { return log; }; + std::string log = fmt::format("Recovering bufs (#of bufs = {}) before processing them\n", bufs.size()); + LOGTRACEMOD(wbcache, "{}\n{}", log, detailed_log(bufs, {})); auto dags = generate_dag_buffers(bufs); - LOGTRACEMOD(wbcache, "before processing recovery DAGS:\n {}\n\n\n\n", to_string_dag_bufs(dags, icp_ctx->id())); + LOGTRACEMOD(wbcache, "Before recovery: {}", to_string_dag_bufs(dags, icp_ctx->id())); #endif // At this point, we have the DAG structure (up/down dependency graph), exactly the same as prior to crash, with one @@ -594,36 +574,24 @@ void IndexWBCache::recover(sisl::byte_view sb) { // On the second pass, we only take part of the parents/siblings and then repair them, if needed. std::vector< IndexBufferPtr > pending_bufs; std::vector< IndexBufferPtr > deleted_bufs; - std::multiset< IndexBufferPtr, bool (*)(const IndexBufferPtr&, const IndexBufferPtr&) > - potential_parent_recovered_bufs( - [](const IndexBufferPtr& a, const IndexBufferPtr& b) { return a->m_node_level < b->m_node_level; }); - - std::vector< IndexBufferPtr > pruned_bufs_to_repair; - std::set< IndexBufferPtr > bufs_to_skip_sanity_check; - LOGTRACEMOD(wbcache, "\n\n\nRecovery processing begins\n\n\n"); for (auto const& [_, buf] : bufs) { - load_buf(buf); - if (buf->m_node_freed) { - LOGTRACEMOD(wbcache, "recovering free buf {}", buf->to_string()); + // Freed node + load_buf(buf); if (was_node_committed(buf)) { // Mark this buffer as deleted, so that we can avoid using it anymore when repairing its parent's link r_cast< persistent_hdr_t* >(buf->m_bytes)->node_deleted = true; - write_buf(nullptr, buf, icp_ctx); // no need to write it here !! 
+                        write_buf(nullptr, buf, icp_ctx);
                 deleted_bufs.push_back(buf);
                 pending_bufs.push_back(buf->m_up_buffer);
-                LOGINFOMOD(wbcache, "Freeing deleted buf {} and adding up buffer to pending {}", buf->to_string(),
-                           buf->m_up_buffer->to_string());
             } else {
                 // (Up) buffer is not committed; the node needs to be kept and (potentially) repaired later
-                if (buf->m_created_cp_id != icp_ctx->id()) {
-                    LOGTRACEMOD(wbcache,
-                                "NOT FREE committing buffer {} node deleted is false reason: node commited?= {} "
-                                "up committed? {}",
-                                buf->to_string(), was_node_committed(buf), was_node_committed(buf->m_up_buffer));
-                    buf->m_node_freed = false;
-                    r_cast< persistent_hdr_t* >(buf->m_bytes)->node_deleted = false;
+                buf->m_node_freed = false;
+                if (buf->m_created_cp_id == icp_ctx->id()) {
+                    // New nodes need to be committed first
                     m_vdev->commit_blk(buf->m_blkid);
                 // it can happen when children moved to one of the right parent siblings and then the previous node
                 // is deleted but not committed during the crash (up buffer is not committed), but its children are
                 // already committed and freed (or changed)
@@ -639,42 +607,56 @@
                     LOGTRACEMOD(wbcache, "remove_down_buffer {} from up buffer {}", buf->to_string(),
                                 buf->m_up_buffer->to_string());
                     buf->m_up_buffer->remove_down_buffer(buf);
-                    prune_up_buffers(buf, pruned_bufs_to_repair);
+                    if (buf->m_up_buffer->m_wait_for_down_buffers.testz()) {
+                        // if the up buffer has an up buffer, then we need to decrement its wait_for_down_buffers
+                        LOGINFOMOD(wbcache,
+                                   "\n\npruning up_buffer due to zero dependency of child\n up buffer {}\n buffer {}",
+                                   buf->m_up_buffer ? buf->m_up_buffer->to_string() : std::string("nullptr"),
+                                   buf->to_string());
+                        update_up_buffer_counters(buf->m_up_buffer /*,visited_bufs*/);
+                    }
                     buf->m_up_buffer = nullptr;
                 }
+                pending_bufs.push_back(buf);
+                buf->m_wait_for_down_buffers.increment(1); // Purely for recover_buf() counter consistency
             }
         } else if (buf->m_created_cp_id == icp_ctx->id()) {
-            LOGTRACEMOD(wbcache, "recovering new buf {}", buf->to_string());
             // New node
             if (was_node_committed(buf) && was_node_committed(buf->m_up_buffer)) {
                 // Both the current and up buffer are committed, we can safely commit the current block
-                LOGTRACEMOD(wbcache, "New buffer {} and the up buffer {} are committed", buf->to_string(),
-                            buf->m_up_buffer->to_string());
                 m_vdev->commit_blk(buf->m_blkid);
                 pending_bufs.push_back(buf->m_up_buffer);
             } else {
                 // Up buffer is not committed, we need to repair it first
-                LOGTRACEMOD(wbcache, "The up buffer {} is not committed for the new buffer {}",
-                            buf->m_up_buffer->to_string(), buf->to_string());
                 buf->m_up_buffer->remove_down_buffer(buf);
-                prune_up_buffers(buf, pruned_bufs_to_repair);
-                // Skip the sanity check on this buf as we do not keep it
-                bufs_to_skip_sanity_check.insert(buf);
+                // buf->m_up_buffer = nullptr;
+                if (buf->m_up_buffer->m_wait_for_down_buffers.testz()) {
+                    // if the up buffer has an up buffer, then we need to decrement its wait_for_down_buffers
+                    update_up_buffer_counters(buf->m_up_buffer);
+                }
            }
        }
    }
-    LOGTRACEMOD(wbcache, "\n\n\nRecovery processing Ends\n\n\n");
+
 #ifdef _PRERELEASE
     LOGINFOMOD(wbcache, "Index Recovery
detected {} nodes out of {} as new/freed nodes to be recovered in prev cp={}",
               pending_bufs.size(), bufs.size(), icp_ctx->id());
-    // add deleted bufs to logs here as well
-    auto modified_dags = generate_dag_buffers(bufs);
-    LOGTRACEMOD(wbcache, "All unclean bufs list\n{}", detailed_log({}, pending_bufs));
-    LOGTRACEMOD(wbcache, "All pruned bufs for recovery\n{}", detailed_log({}, pruned_bufs_to_repair));
-    LOGTRACEMOD(wbcache, "After recovery: {}", to_string_dag_bufs(modified_dags, icp_ctx->id()));
-
+    LOGTRACEMOD(wbcache, "All unclean bufs list\n{}", detailed_log(bufs, pending_bufs));
+    LOGTRACEMOD(wbcache, "After recovery: {}", to_string_dag_bufs(dags, icp_ctx->id()));
 #endif
+
+    for (auto const& buf : pending_bufs) {
+        recover_buf(buf);
+        if (buf->m_bytes != nullptr && r_cast< persistent_hdr_t* >(buf->m_bytes)->node_deleted) {
+            // This buffer was marked as deleted during repair, so we also need to free it
+            deleted_bufs.push_back(buf);
+        }
+    }
-    // let all unfreed buffers to be repaired first. This is important to let detect and remove all stale links first
-    // and then repair them before actual repair (due to dependency of finding true siblings)
-    for (auto const& buf : buffers_to_repair) {
-        LOGTRACEMOD(wbcache, "recover and repairing unfreed non-stale link interior node buf {}", buf->to_string());
-        index_service().repair_index_node(buf->m_index_ordinal, buf);
-    }
-    // actual recover is done here in recovery path
-    for (auto const& buf : pending_bufs) {
-        LOGTRACEMOD(wbcache, "recover and repairing up_buffer buf {}", buf->to_string());
-        recover_buf(buf);
-    }
-
-    // When we prune a buffer due to zero down dependency, there is a case where the key range of the parent needs to be
-    // adjusted. This can happen when a child is merged and its right sibling is flushed before the parent is flushed.
-    // And during recovery, we prune the node and keep the deleted child and keep the parent as is.
-    // We need to call repair_links directly on them as the recovery_buf() path will not trigger it.
- for (auto const& buf : pruned_bufs_to_repair) { - LOGTRACEMOD(wbcache, "pruned buf {} is repaired", buf->to_string()); - index_service().repair_index_node(buf->m_index_ordinal, buf); - } for (auto const& buf : deleted_bufs) { - LOGTRACEMOD(wbcache, "freeing buf after repairing (last step) {}", buf->to_string()); m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(icp_ctx)); } - if (pending_bufs.empty()) { - LOGTRACEMOD(wbcache, "No buffers to repair, recovery completed"); - } else { - std::map< uint32_t, IndexBufferPtrList > changed_bufs; - for (auto const& [_, buf] : bufs) { - LOGTRACEMOD(wbcache, "{}", buf->to_string()); - if (!buf->m_node_freed && !bufs_to_skip_sanity_check.contains(buf)) { - changed_bufs[buf->m_index_ordinal].push_back(buf); - } - } - for (auto const& [index_ordinal, bufs] : changed_bufs) { - LOGTRACEMOD(wbcache, "Sanity checking buffers for index ordinal {}: # of bufs {}", index_ordinal, - bufs.size()); - auto ret = index_service().sanity_check(index_ordinal, bufs); - if (ret) { - LOGTRACEMOD(wbcache, "Sanity check for index ordinal {} passed", index_ordinal); - } else { - LOGERRORMOD(wbcache, "Sanity check for index ordinal {} failed", index_ordinal); -#ifdef _PRELEASE - HS_DBG_ASSERT(true, "sanity failed: {}", ret); -#else - // TODO: make this index table offline and let others work - HS_REL_ASSERT(0, "sanity failed: {}", ret); -#endif - } - } - } m_in_recovery = false; m_vdev->recovery_completed(); } -void IndexWBCache::parent_recover(IndexBufferPtr const& buf) { - index_service().parent_recover(buf->m_index_ordinal, buf); -} // if buf->m_wait_for_down_buffers.testz() is true (which means that it has no dependency on any other buffer) then we // can decrement the wait_for_down_buffers of its up buffer. If the up buffer has up buffer, then we need to decrement // its wait_for_down_buffers. If the up buffer of up buffer has wait_for_down_buffers as 0, then we need to decrement @@ -762,16 +694,14 @@ void IndexWBCache::parent_recover(IndexBufferPtr const& buf) { // wait_for_down_buffers as 0, then we need to decrement its wait_for_down_buffers. 
void IndexWBCache::update_up_buffer_counters(IndexBufferPtr const& buf) { if (buf == nullptr || !buf->m_wait_for_down_buffers.testz() || buf->m_up_buffer == nullptr) { - LOGINFOMOD(wbcache, "Finish decrementing wait_for_down_buffers\n"); + LOGINFOMOD(wbcache, "Finish decrementing wait_for_down_buffers"); return; } auto grand_buf = buf->m_up_buffer; + grand_buf->remove_down_buffer(buf); LOGINFOMOD(wbcache, - "Decrementing wait_for_down_buffers due to zero dependency of child for grand_buffer {} up_buffer {}, " - "Keep going up", + "Decrementing wait_for_down_buffers for buffer {} due to zero dependency of child {}, Keep going up", grand_buf->to_string(), buf->to_string()); - grand_buf->remove_down_buffer(buf); - buf->m_up_buffer = nullptr; update_up_buffer_counters(grand_buf); } @@ -806,7 +736,7 @@ bool IndexWBCache::was_node_committed(IndexBufferPtr const& buf) { // If the node is freed, then it can be considered committed as long as its up buffer was committed if (buf->m_node_freed) { - HS_DBG_ASSERT(buf->m_up_buffer, "Buf {} was marked deleted, but doesn't have an up_buffer", buf->to_string()); + HS_DBG_ASSERT(buf->m_up_buffer, "Buf was marked deleted, but doesn't have an up_buffer"); return was_node_committed(buf->m_up_buffer); } @@ -818,7 +748,8 @@ bool IndexWBCache::was_node_committed(IndexBufferPtr const& buf) { //////////////////// CP Related API section ///////////////////////////////// folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) { - LOGTRACEMOD(wbcache, "Starting Index CP Flush with cp \ndag={}", cp_ctx->to_string_with_dags()); + LOGTRACEMOD(wbcache, "Starting Index CP Flush with cp \ndag={}\n\n cp context {}", cp_ctx->to_string_with_dags(), + cp_ctx->to_string()); // #ifdef _PRERELEASE // static int id = 0; // auto filename = "cp_" + std::to_string(id++) + "_" + std::to_string(rand() % 100) + ".dot"; @@ -857,7 +788,7 @@ folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) { } cp_ctx->prepare_flush_iteration(); - m_updated_ordinals.clear(); + for (auto& fiber : m_cp_flush_fibers) { iomanager.run_on_forget(fiber, [this, cp_ctx]() { IndexBufferPtrList buf_list; @@ -873,8 +804,8 @@ folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) { } void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const& buf, bool part_of_batch) { - static std::once_flag flag; #ifdef _PRERELEASE + static std::once_flag flag; if (hs()->crash_simulator().is_crashed()) { std::call_once(flag, []() { LOGINFO("Crash simulation is ongoing; aid simulation by not flushing."); }); return; @@ -892,38 +823,28 @@ void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const buf->set_state(index_buf_state_t::FLUSHING); if (buf->is_meta_buf()) { - LOGTRACEMOD(wbcache, "Flushing cp {} meta buf {}", cp_ctx->id(), buf->to_string()); + LOGTRACEMOD(wbcache, "Flushing cp {} meta buf {} possibly because of root split", cp_ctx->id(), + buf->to_string()); auto const sb_buf = r_cast< MetaIndexBuffer* >(buf.get()); if (sb_buf->m_valid) { auto const& sb = sb_buf->m_sb; - if (!sb.is_empty()) { - meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk()); - } else { - LOGTRACEMOD(wbcache, "Skipping flushing meta buf {} as sb is empty", buf->to_string()); - } - } else { - LOGTRACEMOD(wbcache, "Skipping flushing meta buf {} as it is not valid", buf->to_string()); + if (!sb.is_empty()) { meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk()); } } process_write_completion(cp_ctx, buf); } else if 
(buf->m_node_freed) { - LOGTRACEMOD(wbcache, "cp {} Not flushing buf {} as it was freed, its here for merely dependency", cp_ctx->id(), + LOGTRACEMOD(wbcache, "Not flushing buf {} as it was freed, its here for merely dependency", cp_ctx->id(), buf->to_string()); process_write_completion(cp_ctx, buf); } else { - if (buf->m_created_cp_id == cp_ctx->id()) { - LOGTRACEMOD(wbcache, "Flushing cp {} new node buf {} blkid {}", cp_ctx->id(), buf->to_string(), - buf->blkid().to_string()); - } + LOGTRACEMOD(wbcache, "Flushing cp {} buf {}", cp_ctx->id(), buf->to_string()); m_vdev->async_write(r_cast< const char* >(buf->raw_buffer()), m_node_size, buf->m_blkid, part_of_batch) .thenValue([buf, cp_ctx](auto) { try { auto& pthis = s_cast< IndexWBCache& >(wb_cache()); pthis.process_write_completion(cp_ctx, buf); - } catch (const std::runtime_error& e) { - std::call_once(flag, - []() { LOGERROR("Crash simulation is ongoing; aid simulation by not flushing."); }); - } + } catch (const std::runtime_error& e) { LOGERROR("Failed to access write-back cache: {}", e.what()); } }); + if (!part_of_batch) { m_vdev->submit_batch(); } } } @@ -938,28 +859,18 @@ void IndexWBCache::process_write_completion(IndexCPContext* cp_ctx, IndexBufferP } #endif - LOGTRACEMOD(wbcache, "cp {} completed flushed for buf {} blkid {}", cp_ctx->id(), buf->to_string(), - buf->blkid().to_string()); + LOGTRACEMOD(wbcache, "cp {} buf {}", cp_ctx->id(), buf->to_string()); resource_mgr().dec_dirty_buf_size(m_node_size); - m_updated_ordinals.insert(buf->m_index_ordinal); auto [next_buf, has_more] = on_buf_flush_done(cp_ctx, buf); if (next_buf) { do_flush_one_buf(cp_ctx, next_buf, false); } else if (!has_more) { - for (const auto& ordinal : m_updated_ordinals) { - LOGTRACEMOD(wbcache, "Updating sb for ordinal {}", ordinal); - index_service().write_sb(ordinal); - } - // We are done flushing the buffers, We flush the vdev to persist the vdev bitmaps and free blks // Pick a CP Manager blocking IO fiber to execute the cp flush of vdev iomanager.run_on_forget(cp_mgr().pick_blocking_io_fiber(), [this, cp_ctx]() { - auto cp_id = cp_ctx->id(); - LOGTRACEMOD(wbcache, "Initiating CP {} flush", cp_id); + LOGTRACEMOD(wbcache, "Initiating CP flush"); m_vdev->cp_flush(cp_ctx); // This is a blocking io call - LOGTRACEMOD(wbcache, "CP {} freed blkids: \n{}", cp_id, cp_ctx->to_string_free_list()); cp_ctx->complete(true); - LOGTRACEMOD(wbcache, "Completed CP {} flush", cp_id); }); } } @@ -982,13 +893,12 @@ std::pair< IndexBufferPtr, bool > IndexWBCache::on_buf_flush_done_internal(Index buf->m_down_buffers.clear(); } #endif + buf->set_state(index_buf_state_t::CLEAN); if (cp_ctx->m_dirty_buf_count.decrement_testz()) { - buf->set_state(index_buf_state_t::CLEAN); return std::make_pair(nullptr, false); } else { get_next_bufs_internal(cp_ctx, 1u, buf, buf_list); - buf->set_state(index_buf_state_t::CLEAN); return std::make_pair((buf_list.size() ? buf_list[0] : nullptr), true); } } @@ -1029,11 +939,7 @@ void IndexWBCache::get_next_bufs_internal(IndexCPContext* cp_ctx, uint32_t max_c std::optional< IndexBufferPtr > buf = cp_ctx->next_dirty(); if (!buf) { break; } // End of list - // If a buffer is reused during overlapping cp, there is a possibility that - // the buffer which is already flushed in cp x is dirtied by cp x + 1 - // and is picked up again to flush by cp x through this code path. 
- if ((*buf)->state() == index_buf_state_t::DIRTY && (*buf)->m_dirtied_cp_id == cp_ctx->id() && - (*buf)->m_wait_for_down_buffers.testz()) { + if ((*buf)->state() == index_buf_state_t::DIRTY && (*buf)->m_wait_for_down_buffers.testz()) { bufs.emplace_back(std::move(*buf)); ++count; } else { diff --git a/src/lib/index/wb_cache.hpp b/src/lib/index/inplace_btree/wb_cache.hpp similarity index 85% rename from src/lib/index/wb_cache.hpp rename to src/lib/index/inplace_btree/wb_cache.hpp index bf04dbc67..7d10d7f54 100644 --- a/src/lib/index/wb_cache.hpp +++ b/src/lib/index/inplace_btree/wb_cache.hpp @@ -41,13 +41,12 @@ class IndexWBCache : public IndexWBCacheBase { std::mutex m_flush_mtx; void* m_meta_blk; bool m_in_recovery{false}; - std::unordered_set< uint32_t > m_updated_ordinals; public: IndexWBCache(const std::shared_ptr< VirtualDev >& vdev, std::pair< meta_blk*, sisl::byte_view > sb, const std::shared_ptr< sisl::Evictor >& evictor, uint32_t node_size); - BtreeNodePtr alloc_buf(uint32_t ordinal, node_initializer_t&& node_initializer) override; + BtreeNodePtr alloc_buf(node_initializer_t&& node_initializer) override; void write_buf(const BtreeNodePtr& node, const IndexBufferPtr& buf, CPContext* cp_ctx) override; void read_buf(bnodeid_t id, BtreeNodePtr& node, node_initializer_t&& node_initializer) override; @@ -62,13 +61,6 @@ class IndexWBCache : public IndexWBCacheBase { folly::Future< bool > async_cp_flush(IndexCPContext* context); IndexBufferPtr copy_buffer(const IndexBufferPtr& cur_buf, const CPContext* cp_ctx) const; void recover(sisl::byte_view sb) override; - struct DagNode { - IndexBufferPtr buffer; - std::vector< shared< DagNode > > children; - }; - - using DagPtr = std::shared_ptr< DagNode >; - using DagMap = std::map< IndexBufferPtr, DagPtr >; private: void start_flush_threads(); @@ -85,12 +77,8 @@ class IndexWBCache : public IndexWBCacheBase { IndexBufferPtrList& bufs); void recover_buf(IndexBufferPtr const& buf); - void parent_recover(IndexBufferPtr const& buf); - std::string to_string_dag_bufs(DagMap& dags, cp_id_t cp_id = 0); - DagMap generate_dag_buffers(std::map< BlkId, IndexBufferPtr >& bufmap); bool was_node_committed(IndexBufferPtr const& buf); void load_buf(IndexBufferPtr const& buf); void update_up_buffer_counters(IndexBufferPtr const& buf); - void prune_up_buffers(IndexBufferPtr const& buf, std::vector< IndexBufferPtr >& bufs_to_repair); }; } // namespace homestore diff --git a/src/include/homestore/index/wb_cache_base.hpp b/src/lib/index/inplace_btree/wb_cache_base.hpp similarity index 93% rename from src/include/homestore/index/wb_cache_base.hpp rename to src/lib/index/inplace_btree/wb_cache_base.hpp index 3fb33d79b..4624f9444 100644 --- a/src/include/homestore/index/wb_cache_base.hpp +++ b/src/lib/index/inplace_btree/wb_cache_base.hpp @@ -36,10 +36,9 @@ class IndexWBCacheBase { /// @brief Allocate the buffer and initialize the btree node. It adds the node to the wb cache. 
/// @tparam K Key type of the Index - /// @param ordinal The index table ordinal used when custom index chunk selector exists /// @param node_initializer Callback to be called upon which buffer is turned into btree node /// @return Node which was created by the node_initializer - virtual BtreeNodePtr alloc_buf(uint32_t ordinal, node_initializer_t&& node_initializer) = 0; + virtual BtreeNodePtr alloc_buf(node_initializer_t&& node_initializer) = 0; /// @brief Write buffer /// @param buf diff --git a/src/lib/index/mem_btree/CMakeLists.txt b/src/lib/index/mem_btree/CMakeLists.txt new file mode 100644 index 000000000..49a04ef06 --- /dev/null +++ b/src/lib/index/mem_btree/CMakeLists.txt @@ -0,0 +1,10 @@ +cmake_minimum_required(VERSION 3.11) + +include_directories (BEFORE ../) + +add_library(hs_mem_btree OBJECT) +target_sources(hs_mem_btree PRIVATE + mem_btree_store.cpp + ) +target_link_libraries(hs_mem_btree hs_common ${COMMON_DEPS}) +#add_library(hs_cow_btree OBJECT ${COW_BTREE_SOURCE_FILES}) diff --git a/src/lib/index/mem_btree/mem_btree_store.cpp b/src/lib/index/mem_btree/mem_btree_store.cpp new file mode 100644 index 000000000..449c153e4 --- /dev/null +++ b/src/lib/index/mem_btree/mem_btree_store.cpp @@ -0,0 +1,54 @@ +#include "index/mem_btree/mem_btree_store.h" +#include +#include + +namespace homestore { +unique< UnderlyingBtree > MemBtreeStore::create_underlying_btree(BtreeBase& btree, bool load_existing) { + // We don't need any mem specific btree portion, everything can be accomplished from common store class + return std::make_unique< MemBtree >(btree); +} + +MemBtree::MemBtree(BtreeBase& btree) : m_base_btree{btree} {} + +BtreeNodePtr MemBtree::create_node(bool is_leaf, CPContext*) { + // std::shared_ptr< uint8_t[] > ptr(new uint8_t[m_base_btree.node_size()]); + // node_buf_ptr_vec.emplace_back(ptr); + auto node = m_base_btree.new_node(bnodeid_t{0}, is_leaf, BtreeNode::Allocator::default_token); + node->set_node_id(bnodeid_t{r_cast< std::uintptr_t >(node.get())}); + node->m_refcount.increment(); + return node; +} + +btree_status_t MemBtree::write_node(BtreeNodePtr const& node, CPContext*) { return btree_status_t::success; } + +btree_status_t MemBtree::read_node(bnodeid_t id, BtreeNodePtr& node) const { + node.reset(r_cast< BtreeNode* >(id)); + return btree_status_t::success; +} + +btree_status_t MemBtree::refresh_node(BtreeNodePtr const& node, bool for_read_modify_write, CPContext*) { + return btree_status_t::success; +} + +void MemBtree::remove_node(BtreeNodePtr const& node, CPContext*) { intrusive_ptr_release(node.get()); } + +btree_status_t MemBtree::transact_nodes(BtreeNodeList const& new_nodes, BtreeNodeList const& freed_nodes, + BtreeNodePtr const& left_child_node, BtreeNodePtr const& parent_node, + CPContext* context) { + for (auto const& node : new_nodes) { + m_base_btree.write_node(node, context); + } + m_base_btree.write_node(left_child_node, context); + m_base_btree.write_node(parent_node, context); + + for (auto const& node : freed_nodes) { + m_base_btree.remove_node(node, locktype_t::WRITE, context); + } + return btree_status_t::success; +} + +BtreeLinkInfo MemBtree::load_root_node_id() { return BtreeLinkInfo{empty_bnodeid, 0}; } + +btree_status_t MemBtree::on_root_changed(BtreeNodePtr const&, CPContext*) { return btree_status_t::success; } + +} // namespace homestore \ No newline at end of file diff --git a/src/lib/index/mem_btree/mem_btree_store.h b/src/lib/index/mem_btree/mem_btree_store.h new file mode 100644 index 000000000..12cd2c9bb --- /dev/null +++ 
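Aside: MemBtree::create_node above never allocates a disk blkid; it encodes the heap address of the node itself as the bnodeid, which makes read_node a plain cast. A standalone sketch of that pointer-as-id technique (Node/NodeId are illustrative stand-ins; ids are only meaningful within one process lifetime, which is fine for an ephemeral store):

    #include <cassert>
    #include <cstdint>

    struct Node { int payload{42}; };
    using NodeId = std::uintptr_t;

    NodeId create_node() {
        Node* n = new Node{};               // the heap IS the store
        return reinterpret_cast<NodeId>(n); // id encodes the address
    }

    Node* read_node(NodeId id) {
        return reinterpret_cast<Node*>(id); // O(1) lookup, no mapping table
    }

    void remove_node(NodeId id) { delete read_node(id); }

    int main() {
        NodeId id = create_node();
        assert(read_node(id)->payload == 42);
        remove_node(id);                    // caller owns reclamation
    }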
b/src/lib/index/mem_btree/mem_btree_store.h @@ -0,0 +1,47 @@ +#pragma once + +#include +#include + +#include +#include + +namespace homestore { +class MemBtreeStore : public BtreeStore { +public: + MemBtreeStore() = default; + virtual ~MemBtreeStore() = default; + + void stop() override {} + std::string store_type() const override { return "MEM_BTREE"; } + void on_recovery_completed() override {} + + unique< UnderlyingBtree > create_underlying_btree(BtreeBase& btree, bool load_existing) override; + folly::Future< folly::Unit > destroy_underlying_btree(BtreeBase&) override { return folly::makeFuture(); } + // void on_node_freed(BtreeNode* node) override; + bool is_fast_destroy_supported() const override { return true; } + bool is_ephemeral() const override { return true; } + uint32_t max_node_size() const override { return 4096u; } +}; + +class MemBtree : public UnderlyingBtree { +public: + MemBtree(BtreeBase& btree); + BtreeNodePtr create_node(bool is_leaf, CPContext* context) override; + btree_status_t write_node(BtreeNodePtr const& node, CPContext* context) override; + btree_status_t read_node(bnodeid_t id, BtreeNodePtr& node) const override; + btree_status_t refresh_node(BtreeNodePtr const& node, bool for_read_modify_write, CPContext* context) override; + void remove_node(BtreeNodePtr const& node, CPContext* context) override; + btree_status_t transact_nodes(BtreeNodeList const& new_nodes, BtreeNodeList const& freed_nodes, + BtreeNodePtr const& left_child_node, BtreeNodePtr const& parent_node, + CPContext* context) override; + BtreeLinkInfo load_root_node_id() override; + btree_status_t on_root_changed(BtreeNodePtr const&, CPContext*) override; + uint64_t space_occupied() const override { return 0; } + +private: + BtreeBase& m_base_btree; + // std::vector< std::shared_ptr< uint8_t[] > > node_buf_ptr_vec; +}; + +} // namespace homestore \ No newline at end of file diff --git a/src/lib/logging.cpp b/src/lib/logging.cpp deleted file mode 100644 index 2ba71ef04..000000000 --- a/src/lib/logging.cpp +++ /dev/null @@ -1,4 +0,0 @@ -#include -#include - -SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp index 764259756..2b3f88c30 100644 --- a/src/lib/logstore/log_dev.cpp +++ b/src/lib/logstore/log_dev.cpp @@ -34,8 +34,6 @@ namespace homestore { -SISL_LOGGING_DECL(logstore) - #define THIS_LOGDEV_LOG(level, msg, ...) HS_SUBMOD_LOG(level, logstore, , "log_dev", m_logdev_id, msg, __VA_ARGS__) #define THIS_LOGDEV_PERIODIC_LOG(level, msg, ...) \ HS_PERIODIC_DETAILED_LOG(level, logstore, "log_dev", m_logdev_id, , , msg, __VA_ARGS__) @@ -47,6 +45,8 @@ LogDev::LogDev(logdev_id_t id, flush_mode_t flush_mode) : m_logdev_id{id}, m_flu m_flush_size_multiple = HS_DYNAMIC_CONFIG(logstore->flush_size_multiple_logdev); } +LogDev::~LogDev() = default; + void LogDev::start(bool format, std::shared_ptr< JournalVirtualDev > vdev) { // Each logdev has one journal descriptor. 
m_vdev = vdev; @@ -60,6 +60,7 @@ void LogDev::start(bool format, std::shared_ptr< JournalVirtualDev > vdev) { m_log_group_pool[i].start(m_flush_size_multiple, m_vdev->align_size()); } m_log_records = std::make_unique< sisl::StreamTracker< log_record > >(); + m_stopped = false; // First read the info block if (format) { @@ -86,9 +87,6 @@ void LogDev::start(bool format, std::shared_ptr< JournalVirtualDev > vdev) { m_last_flush_idx = m_log_idx - 1; } - // Now that we have create/load logdev metablk, so the log dev is ready to be used - m_is_ready = true; - if (allow_timer_flush()) start_timer(); handle_unopened_log_stores(format); @@ -107,15 +105,33 @@ void LogDev::start(bool format, std::shared_ptr< JournalVirtualDev > vdev) { } } -LogDev::~LogDev() { +void LogDev::stop() { THIS_LOGDEV_LOG(INFO, "Logdev stopping id {}", m_logdev_id); HS_LOG_ASSERT((m_pending_flush_size.load() == 0), "LogDev stop attempted while writes to logdev are pending completion"); + { + std::unique_lock lg = flush_guard(); + m_stopped = true; + // waiting under lock to make sure no new flush is started + while (m_pending_callback.load() > 0) { + THIS_LOGDEV_LOG(INFO, "Waiting for pending callbacks to complete, pending callbacks {}", + m_pending_callback.load()); + std::this_thread::sleep_for(std::chrono::milliseconds{1000}); + } + } + // after we call stop, we need to do any pending device truncations + truncate(); + + if (allow_timer_flush()) stop_timer(); + + { + folly::SharedMutexWritePriority::WriteHolder holder(m_store_map_mtx); + m_id_logstore_map.clear(); + } - m_log_records.reset(nullptr); + m_log_records = nullptr; m_logdev_meta.reset(); m_log_idx.store(0); - m_is_ready = false; m_pending_flush_size.store(0); m_last_flush_idx = -1; m_last_flush_ld_key = logdev_key{0, 0}; @@ -130,6 +146,7 @@ LogDev::~LogDev() { m_hs.reset(); } +#if 0 void LogDev::stop() { start_stopping(); while (true) { @@ -151,9 +168,6 @@ void LogDev::stop() { store.log_store->stop(); } - // trigger a new flush to make sure all pending writes are flushed - flush_under_guard(); - // after we call stop, we need to do any pending device truncations truncate(); m_id_logstore_map.clear(); @@ -162,6 +176,7 @@ void LogDev::stop() { std::move(f).get(); } } +#endif void LogDev::destroy() { THIS_LOGDEV_LOG(INFO, "Logdev destroy metablks log_dev={}", m_logdev_id); @@ -274,21 +289,14 @@ void LogDev::assert_next_pages(log_stream_reader& lstream) { int64_t LogDev::append_async(logstore_id_t store_id, logstore_seq_num_t seq_num, const sisl::io_blob& data, void* cb_context) { - if (is_stopping()) return -1; - incr_pending_request_num(); - m_stream_tracker_mtx.lock_shared(); const auto idx = m_log_idx.fetch_add(1, std::memory_order_acq_rel); m_pending_flush_size.fetch_add(data.size(), std::memory_order_relaxed); m_log_records->create(idx, store_id, seq_num, data, cb_context); - m_stream_tracker_mtx.unlock_shared(); if (allow_inline_flush()) flush_if_necessary(); - decr_pending_request_num(); return idx; } log_buffer LogDev::read(const logdev_key& key) { - if (is_stopping()) return -1; - incr_pending_request_num(); std::unique_lock lg = flush_guard(); auto buf = sisl::make_byte_array(initial_read_size, m_flush_size_multiple, sisl::buftag::logread); auto ec = m_vdev_jd->sync_pread(buf->bytes(), initial_read_size, key.dev_offset); @@ -313,13 +321,11 @@ log_buffer LogDev::read(const logdev_key& key) { m_vdev_jd->sync_pread(new_buf->bytes(), rounded_size, key.dev_offset + rounded_data_offset); ret_view = sisl::byte_view{new_buf, s_cast< uint32_t >(data_offset - 
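Aside: the new LogDev::stop above quiesces in three moves: take the same lock every flush takes, flip m_stopped under it so no new flush can start, then drain in-flight completions before the final truncate. A minimal sketch of that pattern, assuming flushers only ever take the lock with try_lock and re-check the stopped flag (so the drain loop below cannot deadlock them):

    #include <atomic>
    #include <chrono>
    #include <mutex>
    #include <thread>

    std::mutex flush_mtx;                       // the lock every flush acquires
    std::atomic<bool> stopped{false};
    std::atomic<uint64_t> pending_callbacks{0}; // in-flight completion count

    void stop() {
        {
            std::unique_lock lg(flush_mtx);     // no new flush past this point
            stopped = true;
            while (pending_callbacks.load() > 0) {  // drain what is in flight
                std::this_thread::sleep_for(std::chrono::milliseconds(100));
            }
        }
        // once drained, final maintenance (e.g. a last truncate) is safe
    }

    int main() { stop(); }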
rounded_data_offset), record_header->size}; } - decr_pending_request_num(); + return ret_view; } void LogDev::read_record_header(const logdev_key& key, serialized_log_record& return_record_header) { - if (is_stopping()) return; - incr_pending_request_num(); std::unique_lock lg = flush_guard(); auto buf = sisl::make_byte_array(initial_read_size, m_flush_size_multiple, sisl::buftag::logread); auto ec = m_vdev_jd->sync_pread(buf->bytes(), initial_read_size, key.dev_offset); @@ -332,7 +338,6 @@ void LogDev::read_record_header(const logdev_key& key, serialized_log_record& re return_record_header = serialized_log_record(record_header->size, record_header->offset, record_header->get_inlined(), record_header->store_seq_num, record_header->store_id); - decr_pending_request_num(); } void LogDev::verify_log_group_header(const logid_t idx, const log_group_header* header) { @@ -370,9 +375,7 @@ void LogDev::unreserve_store_id(logstore_id_t store_id) { m_garbage_store_ids.emplace(log_id, store_id); } -bool LogDev::get_registered_store_ids(std::vector< logstore_id_t >& registered, std::vector< logstore_id_t >& garbage) { - if (is_stopping()) return false; - incr_pending_request_num(); +void LogDev::get_registered_store_ids(std::vector< logstore_id_t >& registered, std::vector< logstore_id_t >& garbage) { std::unique_lock lg{m_meta_mutex}; for (const auto& id : m_logdev_meta.reserved_store_ids()) { registered.push_back(id); @@ -382,8 +385,6 @@ bool LogDev::get_registered_store_ids(std::vector< logstore_id_t >& registered, for (const auto& elem : m_garbage_store_ids) { garbage.push_back(elem.second); } - decr_pending_request_num(); - return true; } /* @@ -421,12 +422,9 @@ bool LogDev::can_flush_in_this_thread() { } bool LogDev::flush_if_necessary(int64_t threshold_size) { - if (is_stopping()) return false; - incr_pending_request_num(); if (!can_flush_in_this_thread()) { iomanager.run_on_forget(logstore_service().flush_thread(), [this, threshold_size]() { flush_if_necessary(threshold_size); }); - decr_pending_request_num(); return false; } @@ -442,11 +440,10 @@ bool LogDev::flush_if_necessary(int64_t threshold_size) { if (flush_by_size || flush_by_time) { std::unique_lock lck(m_flush_mtx, std::try_to_lock); if (lck.owns_lock()) { - decr_pending_request_num(); + if (m_stopped) return false; return flush(); } } - decr_pending_request_num(); return false; } @@ -465,10 +462,6 @@ bool LogDev::flush_under_guard() { } bool LogDev::flush() { - if (!is_ready()) { - THIS_LOGDEV_LOG(INFO, "LogDev is not ready to flush, log_dev={}", m_logdev_id); - return false; - } m_last_flush_time = Clock::now(); // We were able to win the flushing competition and now we gather all the flush data and reserve a slot. auto new_idx = m_log_idx.load(std::memory_order_acquire) - 1; @@ -477,9 +470,9 @@ bool LogDev::flush() { return false; } - // the amount of logs which one logGroup can flush has a upper limit. here we want to make sure all the logs - // that need to be flushed will definitely be flushed to physical dev, so we need this loop to create multiple - // log groups if necessary + // the amount of logs which one logGroup can flush has a upper limit. 
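Aside: flush_if_necessary combines two triggers (accumulated bytes, elapsed time) with a try_lock so at most one thread wins the flush and everyone else returns immediately. A sketch of that shape, keeping the stopped re-check under the lock as in the patch (the globals and thresholds are illustrative):

    #include <atomic>
    #include <chrono>
    #include <mutex>

    std::mutex flush_mtx;
    std::atomic<int64_t> pending_bytes{0};
    std::atomic<bool> stopped{false};
    auto last_flush = std::chrono::steady_clock::now(); // only written under the lock

    bool flush_if_necessary(int64_t threshold_bytes, std::chrono::milliseconds max_age) {
        bool by_size = pending_bytes.load(std::memory_order_relaxed) >= threshold_bytes;
        bool by_time = std::chrono::steady_clock::now() - last_flush >= max_age;
        if (!by_size && !by_time) { return false; }

        std::unique_lock lck(flush_mtx, std::try_to_lock); // one flusher wins
        if (!lck.owns_lock()) { return false; }            // someone else is flushing
        if (stopped.load()) { return false; }              // re-checked under the lock
        last_flush = std::chrono::steady_clock::now();
        pending_bytes.store(0);
        // ... gather pending records into log groups and write them ...
        return true;
    }

    int main() {
        pending_bytes = 1024;
        return flush_if_necessary(512, std::chrono::milliseconds(300)) ? 0 : 1;
    }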
here we want to make sure all the logs that + // need to be flushed will definitely be flushed to physical dev, so we need this loop to create multiple log groups + // if necessary for (; m_last_flush_idx < new_idx;) { LogGroup* lg = prepare_flush(new_idx - m_last_flush_idx + 4); // Estimate 4 more extra in case of parallel writes @@ -525,32 +518,12 @@ void LogDev::on_flush_completion(LogGroup* lg) { auto upto_indx = lg->m_flush_log_idx_upto; auto dev_offset = lg->m_log_dev_offset; for (auto idx = from_indx; idx <= upto_indx; ++idx) { - logstore_req* req; - logstore_id_t store_id; -#ifdef _PRERELEASE - uint64_t lock_latency; - auto lock_start_time = Clock::now(); -#endif - { - // both flush completion and async_append can happen in parallel and - // during async_append stream tracker create log entry can cause - // resize and realloc of memory. So take a lock so that log records - // point to valid memory. - folly::SharedMutexWritePriority::WriteHolder holder(m_stream_tracker_mtx); -#ifdef _PRERELEASE - lock_latency = get_elapsed_time_us(lock_start_time); -#endif - auto& record = m_log_records->at(idx); - req = s_cast< logstore_req* >(record.context); - store_id = record.store_id; - } + auto& record = m_log_records->at(idx); + logstore_req* req = s_cast< logstore_req* >(record.context); HomeLogStore* log_store = req->log_store; - HS_LOG_ASSERT_EQ(log_store->get_store_id(), store_id, + HS_LOG_ASSERT_EQ(log_store->get_store_id(), record.store_id, "Expecting store id in log store and flush completion to match"); HISTOGRAM_OBSERVE(logstore_service().m_metrics, logstore_append_latency, get_elapsed_time_us(req->start_time)); -#ifdef _PRERELEASE - HISTOGRAM_OBSERVE(logstore_service().m_metrics, logstore_stream_tracker_lock_latency, lock_latency); -#endif log_store->on_write_completion(req, logdev_key{idx, dev_offset}, logdev_key{from_indx, dev_offset}); req_map[idx] = req; } @@ -577,12 +550,6 @@ void LogDev::on_flush_completion(LogGroup* lg) { } uint64_t LogDev::truncate() { - if (!is_ready()) { - THIS_LOGDEV_LOG(INFO, "LogDev is not ready to truncate, log_dev={}", m_logdev_id); - return 0; - } - auto stopping = is_stopping(); - incr_pending_request_num(); // Order of this lock has to be preserved. We take externally visible lock which is flush lock first. This // prevents any further update to tail_lsn and also flushes conurrently with truncation. Then we take the store // map lock, which is contained in this class and then meta_mutex. 
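Aside: on_flush_completion walks the just-persisted index range [from, upto], resolves each record back to its originating request, and notifies it; with the stream-tracker lock removed, the records are read directly. A toy model of that resolve-and-notify loop (Req and the map are illustrative stand-ins):

    #include <cstdio>
    #include <map>

    struct Req {
        int id;
        void complete() const { std::printf("req %d done\n", id); }
    };

    // After log group [from, upto] lands on disk, map each flushed index back
    // to its request and fire the per-record completion.
    void on_flush_completion(std::map<long, Req*>& records, long from, long upto) {
        for (long idx = from; idx <= upto; ++idx) {
            auto it = records.find(idx);
            if (it == records.end()) { continue; }
            it->second->complete();
            records.erase(it);
        }
    }

    int main() {
        Req a{1}, b{2};
        std::map<long, Req*> records{{10, &a}, {11, &b}};
        on_flush_completion(records, 10, 11);
    }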
Reason for this is, we take meta_mutex under @@ -597,13 +564,15 @@ uint64_t LogDev::truncate() { auto lstore = store.log_store; if (lstore == nullptr) { continue; } auto const [trunc_lsn, trunc_ld_key, tail_lsn] = lstore->truncate_info(); - m_logdev_meta.update_store_superblk(store_id, logstore_superblk(trunc_lsn + 1), stopping /* persist_now */); + m_logdev_meta.update_store_superblk(store_id, logstore_superblk(trunc_lsn + 1), m_stopped /* persist_now */); // We found a new minimum logdev_key that we can truncate to if (trunc_ld_key.idx < min_safe_ld_key.idx) { min_safe_ld_key = trunc_ld_key; } } // All log stores are empty, we can truncate logs depends on the last flushed logdev_key - if (min_safe_ld_key == logdev_key::out_of_bound_ld_key()) { min_safe_ld_key = m_last_flush_ld_key; } + if (min_safe_ld_key == logdev_key::out_of_bound_ld_key()) { + min_safe_ld_key = m_last_flush_ld_key; + } // There are no writes or no truncation called for any of the store, so we can't truncate anything if (min_safe_ld_key.idx <= 0 || min_safe_ld_key.idx <= m_last_truncate_idx) { @@ -618,7 +587,6 @@ uint64_t LogDev::truncate() { // 6. Follower1 is killed again, after restart, its start index remains 0, misinterpreting the range as // [1,2500]. m_logdev_meta.persist(); - decr_pending_request_num(); return 0; } @@ -629,7 +597,7 @@ uint64_t LogDev::truncate() { // Update the start offset to be read upon restart m_last_truncate_idx = min_safe_ld_key.idx; - m_logdev_meta.set_start_dev_offset(min_safe_ld_key.dev_offset, min_safe_ld_key.idx, stopping /* persist_now */); + m_logdev_meta.set_start_dev_offset(min_safe_ld_key.dev_offset, min_safe_ld_key.idx, m_stopped /* persist_now */); // When a logstore is removed, it unregisteres the store and keeps the store id in garbage list. We can capture // these store_ids upto the log_idx which is truncated and then unreserve those. 
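Aside: the truncate path computes one device-wide safe point: each store reports how far it no longer needs the log, and the device may only truncate to the minimum of those; if every store is empty, it falls back to the last flushed key. A numeric sketch of that min computation (plain indices stand in for logdev_key):

    #include <algorithm>
    #include <cstdio>
    #include <limits>
    #include <vector>

    long min_safe_truncation_idx(const std::vector<long>& per_store_trunc_idx,
                                 long last_flush_idx) {
        long min_idx = std::numeric_limits<long>::max(); // out_of_bound analogue
        for (long idx : per_store_trunc_idx) { min_idx = std::min(min_idx, idx); }
        // no store holds anything back => truncate up to the last flush
        return (min_idx == std::numeric_limits<long>::max()) ? last_flush_idx : min_idx;
    }

    int main() {
        std::printf("%ld\n", min_safe_truncation_idx({120, 75, 300}, 400)); // 75
        std::printf("%ld\n", min_safe_truncation_idx({}, 400));             // 400
    }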
Now on we can re-use the @@ -639,27 +607,22 @@ uint64_t LogDev::truncate() { HS_PERIODIC_LOG(DEBUG, logstore, "Garbage collecting log_store={} in log_dev={} log_idx={}", it->second, m_logdev_id, it->first); - m_logdev_meta.unreserve_store(it->second, stopping /* persist_now */); + m_logdev_meta.unreserve_store(it->second, m_stopped /* persist_now */); it = m_garbage_store_ids.erase(it); } // We can remove the rollback records of those upto which logid is getting truncated - m_logdev_meta.remove_rollback_record_upto(min_safe_ld_key.idx, stopping /* persist_now */); + m_logdev_meta.remove_rollback_record_upto(min_safe_ld_key.idx, m_stopped /* persist_now */); THIS_LOGDEV_LOG(DEBUG, "LogDev::truncate remove rollback {}", min_safe_ld_key.idx); // All logdev meta information is updated in-memory, persist now m_logdev_meta.persist(); - decr_pending_request_num(); return num_records_to_truncate; } -bool LogDev::rollback(logstore_id_t store_id, logid_range_t id_range) { - if (is_stopping()) return false; - incr_pending_request_num(); +void LogDev::rollback(logstore_id_t store_id, logid_range_t id_range) { std::unique_lock lg{m_meta_mutex}; m_logdev_meta.add_rollback_record(store_id, id_range, true); - decr_pending_request_num(); - return true; } /////////////////////////////// LogStore Section /////////////////////////////////////// @@ -687,8 +650,6 @@ void LogDev::handle_unopened_log_stores(bool format) { } std::shared_ptr< HomeLogStore > LogDev::create_new_log_store(bool append_mode) { - if (is_stopping()) return nullptr; - incr_pending_request_num(); auto const store_id = reserve_store_id(); std::shared_ptr< HomeLogStore > lstore; lstore = std::make_shared< HomeLogStore >(shared_from_this(), store_id, append_mode, 0); @@ -700,7 +661,6 @@ std::shared_ptr< HomeLogStore > LogDev::create_new_log_store(bool append_mode) { m_id_logstore_map.insert(std::pair(store_id, logstore_info{.log_store = lstore, .append_mode = append_mode})); } HS_LOG(DEBUG, logstore, "Created log store log_dev={} log_store={}", m_logdev_id, store_id); - decr_pending_request_num(); return lstore; } @@ -723,22 +683,17 @@ folly::Future< shared< HomeLogStore > > LogDev::open_log_store(logstore_id_t sto return it->second.promise.getFuture(); } -bool LogDev::remove_log_store(logstore_id_t store_id) { - if (is_stopping()) return false; - incr_pending_request_num(); +void LogDev::remove_log_store(logstore_id_t store_id) { LOGINFO("Removing log_dev={} log_store={}", m_logdev_id, store_id); { folly::SharedMutexWritePriority::WriteHolder holder(m_store_map_mtx); auto ret = m_id_logstore_map.erase(store_id); if (ret == 0) { LOGWARN("try to remove invalid store_id {}-{}", m_logdev_id, store_id); - decr_pending_request_num(); - return false; + return; } } unreserve_store_id(store_id); - decr_pending_request_num(); - return true; } void LogDev::on_log_store_found(logstore_id_t store_id, const logstore_superblk& sb) { @@ -814,7 +769,7 @@ nlohmann::json LogDev::get_status(int verbosity) const { js["last_truncate_log_idx"] = m_last_truncate_idx; js["time_since_last_log_flush_ns"] = get_elapsed_time_ns(m_last_flush_time); if (verbosity == 2) { - js["logdev_stopped?"] = is_stopping(); + js["logdev_stopped?"] = m_stopped; js["logdev_sb_start_offset"] = m_logdev_meta.get_start_dev_offset(); js["logdev_sb_num_stores_reserved"] = m_logdev_meta.num_stores_reserved(); } diff --git a/src/lib/logstore/log_dev.hpp b/src/lib/logstore/log_dev.hpp index 5b18f981b..f3cc03f1d 100644 --- a/src/lib/logstore/log_dev.hpp +++ b/src/lib/logstore/log_dev.hpp @@ 
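Aside: create_new_log_store/remove_log_store pair a monotonically reserved store id with a map guarded by a writer-priority shared mutex, and removal of an unknown id is tolerated with a warning rather than asserted. A condensed sketch of that registry discipline (types and names are illustrative):

    #include <cstdio>
    #include <map>
    #include <memory>
    #include <shared_mutex>

    struct LogStore { unsigned id{0}; };

    std::shared_mutex map_mtx;
    std::map<unsigned, std::shared_ptr<LogStore>> id_store_map;
    unsigned next_store_id = 0;

    std::shared_ptr<LogStore> create_new_log_store() {
        auto store = std::make_shared<LogStore>();
        std::unique_lock lock(map_mtx);       // exclusive side for writers
        store->id = next_store_id++;          // reserve_store_id analogue
        id_store_map.emplace(store->id, store);
        return store;
    }

    bool remove_log_store(unsigned id) {
        std::unique_lock lock(map_mtx);
        return id_store_map.erase(id) != 0;   // false: unknown/already removed
    }

    int main() {
        auto s = create_new_log_store();
        std::printf("removed=%d\n", remove_log_store(s->id));
    }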
-600,11 +600,17 @@ class LogDev : public std::enable_shared_from_this< LogDev > { void start(bool format, std::shared_ptr< JournalVirtualDev > vdev); /** - * @brief Stop the logdev. it waits for all the pending writes to be completed and reject new api calls. + * @brief Stop the logdev. It resets all the parameters it is using and thus can be started later * */ void stop(); + /** + * @brief return whether the logdev is stopped or not + * + */ + bool is_stopped(); + /** * @brief Destroy the logdev metablks. * @@ -668,7 +674,7 @@ class LogDev : public std::enable_shared_from_this< LogDev > { * @param store_id : Store id whose logids are to be rolled back or invalidated * @param id_range : Log id range to rollback/invalidate */ - bool rollback(logstore_id_t store_id, logid_range_t id_range); + void rollback(logstore_id_t store_id, logid_range_t id_range); /** * @brief This method get all the store ids that are registered already and out of them which are being garbaged @@ -677,7 +683,7 @@ class LogDev : public std::enable_shared_from_this< LogDev > { * @param registered out - Reference to the vector where all registered ids are pushed * @param garbage out - Reference to the vector where all garbage ids */ - bool get_registered_store_ids(std::vector< logstore_id_t >& registered, std::vector< logstore_id_t >& garbage); + void get_registered_store_ids(std::vector< logstore_id_t >& registered, std::vector< logstore_id_t >& garbage); nlohmann::json dump_log_store(const log_dump_req& dum_req); nlohmann::json get_status(int verbosity) const; @@ -706,7 +712,7 @@ class LogDev : public std::enable_shared_from_this< LogDev > { /// @brief Remove the log store and its associated resources /// @param store_id Store id that was created/opened - bool remove_log_store(logstore_id_t store_id); + void remove_log_store(logstore_id_t store_id); /// @return externally visible lock to avoid flush concurrently auto flush_guard() { return std::unique_lock(m_flush_mtx); } @@ -723,8 +729,6 @@ class LogDev : public std::enable_shared_from_this< LogDev > { void start_timer(); folly::Future< int > stop_timer(); - bool is_ready() const { return m_is_ready.load(); } - bool allow_inline_flush() const { return uint32_cast(m_flush_mode) & uint32_cast(flush_mode_t::INLINE); } bool allow_timer_flush() const { return uint32_cast(m_flush_mode) & uint32_cast(flush_mode_t::TIMER); } bool allow_explicit_flush() const { return uint32_cast(m_flush_mode) & uint32_cast(flush_mode_t::EXPLICIT); } @@ -773,6 +777,7 @@ class LogDev : public std::enable_shared_from_this< LogDev > { std::unique_ptr< sisl::StreamTracker< log_record > > m_log_records; // Container stores all in-memory log records std::atomic< logid_t > m_log_idx{0}; // Generator of log idx std::atomic< int64_t > m_pending_flush_size{0}; // How much flushable logs are pending + bool m_stopped{false}; // Is Logdev stopped. We don't need lock here, because it is updated under flush lock logdev_id_t m_logdev_id; std::shared_ptr< JournalVirtualDev > m_vdev; shared< JournalVirtualDev::Descriptor > m_vdev_jd; // Journal descriptor. 
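Aside: flush_guard() above hands callers a scoped std::unique_lock instead of exposing the mutex, so the guard's lifetime is exactly the window in which concurrent flushes are excluded. The accessor in miniature:

    #include <mutex>

    class LogDevLike {
        std::mutex m_flush_mtx;
    public:
        auto flush_guard() { return std::unique_lock(m_flush_mtx); }
    };

    int main() {
        LogDevLike ld;
        {
            auto lg = ld.flush_guard(); // flushes excluded within this scope
            // ... mutate flush-visible state ...
        }                               // released automatically
    }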
@@ -786,9 +791,9 @@ class LogDev : public std::enable_shared_from_this< LogDev > { std::multimap< logid_t, logstore_id_t > m_garbage_store_ids; Clock::time_point m_last_flush_time; - logid_t m_last_flush_idx{-1}; // Track last flushed, last device offset and truncated log idx - logdev_key m_last_flush_ld_key{0, 0}; // Left interval of the last flush, 0 indicates the very beginning of logdev - logid_t m_last_truncate_idx{-1}; // Logdev truncate up to this idx + logid_t m_last_flush_idx{-1}; // Track last flushed, last device offset and truncated log idx + logdev_key m_last_flush_ld_key{0,0}; // Left interval of the last flush, 0 indicates the very beginning of logdev + logid_t m_last_truncate_idx{-1}; // Logdev truncate up to this idx crc32_t m_last_crc{INVALID_CRC32_VALUE}; // LogDev Info block related fields @@ -807,24 +812,6 @@ class LogDev : public std::enable_shared_from_this< LogDev > { // same thread. iomgr::FiberManagerLib::mutex m_flush_mtx; std::atomic_uint64_t m_pending_callback{0}; - folly::SharedMutexWritePriority m_stream_tracker_mtx; - - // This is used to ensure that the logdev meta is created/loaded - // to avoid other threads accessing it before it is ready (e.g., resource_mgr's device truncate thread) - std::atomic_bool m_is_ready{false}; - -private: - // graceful shutdown related fields - std::atomic_bool m_stopping{false}; - mutable std::atomic_uint64_t pending_request_num{0}; - - bool is_stopping() const { return m_stopping.load(); } - void start_stopping() { m_stopping = true; } - - uint64_t get_pending_request_num() const { return pending_request_num.load(); } - - void incr_pending_request_num() const { pending_request_num++; } - void decr_pending_request_num() const { pending_request_num--; } }; // LogDev } // namespace homestore diff --git a/src/lib/logstore/log_group.cpp b/src/lib/logstore/log_group.cpp index 597d97849..3b36eb9da 100644 --- a/src/lib/logstore/log_group.cpp +++ b/src/lib/logstore/log_group.cpp @@ -20,7 +20,6 @@ #include "log_dev.hpp" namespace homestore { -SISL_LOGGING_DECL(logstore) LogGroup::LogGroup() = default; void LogGroup::start(const uint64_t flush_multiple_size, const uint32_t align_size) { diff --git a/src/lib/logstore/log_store.cpp b/src/lib/logstore/log_store.cpp index f4eee9760..1e3a1bea6 100644 --- a/src/lib/logstore/log_store.cpp +++ b/src/lib/logstore/log_store.cpp @@ -26,7 +26,6 @@ #include "log_dev.hpp" namespace homestore { -SISL_LOGGING_DECL(logstore) #define THIS_LOGSTORE_LOG(level, msg, ...) HS_SUBMOD_LOG(level, logstore, , "log_store", m_fq_name, msg, __VA_ARGS__) #define THIS_LOGSTORE_PERIODIC_LOG(level, msg, ...) \ @@ -44,9 +43,7 @@ HomeLogStore::HomeLogStore(std::shared_ptr< LogDev > logdev, logstore_id_t id, b m_fq_name{fmt::format("{} log_dev={}", id, logdev->get_id())}, m_metrics{logstore_service().metrics()} {} -logstore_seq_num_t HomeLogStore::write_async(logstore_req* req, const log_req_comp_cb_t& cb) { - if (is_stopping()) return 0; - incr_pending_request_num(); +void HomeLogStore::write_async(logstore_req* req, const log_req_comp_cb_t& cb) { HS_LOG_ASSERT((cb || m_comp_cb), "Expected either cb is not null or default cb registered"); req->cb = (cb ? 
cb : m_comp_cb); req->start_time = Clock::now(); @@ -60,59 +57,43 @@ logstore_seq_num_t HomeLogStore::write_async(logstore_req* req, const log_req_co m_records.create(req->seq_num); COUNTER_INCREMENT(m_metrics, logstore_append_count, 1); HISTOGRAM_OBSERVE(m_metrics, logstore_record_size, req->data.size()); - auto ret = m_logdev->append_async(m_store_id, req->seq_num, req->data, static_cast< void* >(req)); - decr_pending_request_num(); - return ret; + m_logdev->append_async(m_store_id, req->seq_num, req->data, static_cast< void* >(req)); } -logstore_seq_num_t HomeLogStore::write_async(logstore_seq_num_t seq_num, const sisl::io_blob& b, void* cookie, - const log_write_comp_cb_t& cb) { - if (is_stopping()) return 0; - incr_pending_request_num(); +void HomeLogStore::write_async(logstore_seq_num_t seq_num, const sisl::io_blob& b, void* cookie, + const log_write_comp_cb_t& cb) { // Form an internal request and issue the write auto* req = logstore_req::make(this, seq_num, b); req->cookie = cookie; - auto ret = write_async(req, [cb](logstore_req* req, logdev_key written_lkey) { + write_async(req, [cb](logstore_req* req, logdev_key written_lkey) { if (cb) { cb(req->seq_num, req->data, written_lkey, req->cookie); } logstore_req::free(req); }); - decr_pending_request_num(); - return ret; } logstore_seq_num_t HomeLogStore::append_async(const sisl::io_blob& b, void* cookie, const log_write_comp_cb_t& cb) { - if (is_stopping()) return 0; - incr_pending_request_num(); HS_DBG_ASSERT_EQ(m_append_mode, true, "append_async can be called only on append only mode"); const auto seq_num = m_next_lsn.fetch_add(1, std::memory_order_acq_rel); write_async(seq_num, b, cookie, cb); - decr_pending_request_num(); return seq_num; } -logstore_seq_num_t HomeLogStore::write_and_flush(logstore_seq_num_t seq_num, const sisl::io_blob& b) { - if (is_stopping()) return 0; - incr_pending_request_num(); +void HomeLogStore::write_and_flush(logstore_seq_num_t seq_num, const sisl::io_blob& b) { HS_LOG_ASSERT(iomanager.am_i_sync_io_capable(), "Write and flush is a blocking IO, which can't run in this thread, please reschedule to a fiber"); if (seq_num > m_next_lsn.load(std::memory_order_relaxed)) m_next_lsn.store(seq_num + 1, std::memory_order_relaxed); - auto ret = write_async(seq_num, b, nullptr /* cookie */, nullptr /* cb */); + write_async(seq_num, b, nullptr /* cookie */, nullptr /* cb */); m_logdev->flush_under_guard(); - decr_pending_request_num(); - return ret; } log_buffer HomeLogStore::read_sync(logstore_seq_num_t seq_num) { - if (is_stopping()) return log_buffer{}; - incr_pending_request_num(); HS_LOG_ASSERT(iomanager.am_i_sync_io_capable(), "Read sync is a blocking IO, which can't run in this thread, reschedule to a fiber"); // If seq_num has not been flushed yet, but issued, then we flush them before reading auto const s = m_records.status(seq_num); if (s.is_out_of_range || s.is_hole) { - decr_pending_request_num(); throw std::out_of_range("key not valid since it has been truncated"); } else if (!s.is_completed) { THIS_LOGSTORE_LOG(TRACE, "Reading lsn={}:{} before flushed, doing flush first", m_store_id, seq_num); @@ -123,7 +104,6 @@ log_buffer HomeLogStore::read_sync(logstore_seq_num_t seq_num) { const logdev_key ld_key = record.m_dev_key; if (!ld_key.is_valid()) { THIS_LOGSTORE_LOG(ERROR, "ld_key not valid {}", seq_num); - decr_pending_request_num(); throw std::out_of_range("key not valid"); } @@ -131,7 +111,6 @@ log_buffer HomeLogStore::read_sync(logstore_seq_num_t seq_num) { COUNTER_INCREMENT(m_metrics, 
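Aside: the public write_async above wraps the payload in an internal logstore_req whose completion callback first notifies the caller and then frees the request, so ownership never leaks across the async boundary. A toy version of that wrap-and-self-free pattern (Req is an illustrative stand-in):

    #include <cstdio>
    #include <functional>

    struct Req {
        int seq_num{0};
        std::function<void(Req*)> cb;
        static Req* make(int seq) { return new Req{seq, nullptr}; }
        static void free(Req* r) { delete r; }
    };

    void write_async(int seq_num, const std::function<void(int)>& user_cb) {
        Req* req = Req::make(seq_num);
        req->cb = [user_cb](Req* r) {
            if (user_cb) { user_cb(r->seq_num); } // caller's completion first
            Req::free(r);                         // then the request frees itself
        };
        req->cb(req); // in the real path this fires on flush completion
    }

    int main() {
        write_async(42, [](int seq) { std::printf("lsn %d written\n", seq); });
    }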
logstore_read_count, 1); const auto b = m_logdev->read(ld_key); HISTOGRAM_OBSERVE(m_metrics, logstore_read_latency, get_elapsed_time_us(start_time)); - decr_pending_request_num(); return b; } @@ -195,15 +174,8 @@ void HomeLogStore::on_log_found(logstore_seq_num_t seq_num, const logdev_key& ld if (m_found_cb != nullptr) { m_found_cb(seq_num, buf, nullptr); } } -bool HomeLogStore::truncate(logstore_seq_num_t upto_lsn, bool in_memory_truncate_only) { - if (is_stopping()) return false; - incr_pending_request_num(); - if (upto_lsn < m_start_lsn) { - decr_pending_request_num(); - THIS_LOGSTORE_LOG(WARN, "Truncating logstore upto lsn={} , start_lsn={}, upto_lsn < m_start_lsn", upto_lsn, - m_start_lsn.load(std::memory_order_relaxed)); - return false; - } +void HomeLogStore::truncate(logstore_seq_num_t upto_lsn, bool in_memory_truncate_only) { + if (upto_lsn < m_start_lsn) { return; } flush(); #ifndef NDEBUG auto cs = get_contiguous_completed_seq_num(0); @@ -221,8 +193,9 @@ bool HomeLogStore::truncate(logstore_seq_num_t upto_lsn, bool in_memory_truncate // In baseline resync path, we truncate all entries up to upto_lsn, and update m_tail_lsn and m_next_lsn // to make sure logstore's idx is always = raft's idx - 1. if (upto_lsn > m_tail_lsn) { - THIS_LOGSTORE_LOG(WARN, "Truncating issued on lsn={} which is greater than tail_lsn={}", upto_lsn, - m_tail_lsn.load(std::memory_order_relaxed)); + THIS_LOGSTORE_LOG(WARN, + "Truncating issued on lsn={} which is greater than tail_lsn={}", + upto_lsn, m_tail_lsn.load(std::memory_order_relaxed)); // update m_tail_lsn if it is less than upto_lsn auto current_tail_lsn = m_tail_lsn.load(std::memory_order_relaxed); while (current_tail_lsn < upto_lsn && @@ -244,8 +217,6 @@ bool HomeLogStore::truncate(logstore_seq_num_t upto_lsn, bool in_memory_truncate m_records.truncate(upto_lsn); m_start_lsn.store(upto_lsn + 1); if (!in_memory_truncate_only) { m_logdev->truncate(); } - decr_pending_request_num(); - return true; } std::tuple< logstore_seq_num_t, logdev_key, logstore_seq_num_t > HomeLogStore::truncate_info() const { @@ -258,30 +229,16 @@ std::tuple< logstore_seq_num_t, logdev_key, logstore_seq_num_t > HomeLogStore::t : std::make_tuple(trunc_lsn, m_trunc_ld_key, tail_lsn); } -bool HomeLogStore::fill_gap(logstore_seq_num_t seq_num) { - if (is_stopping()) return false; - incr_pending_request_num(); +void HomeLogStore::fill_gap(logstore_seq_num_t seq_num) { HS_DBG_ASSERT_EQ(m_records.status(seq_num).is_hole, true, "Attempted to fill gap lsn={} which has valid data", seq_num); logdev_key empty_ld_key; m_records.create_and_complete(seq_num, logstore_record(empty_ld_key, empty_ld_key)); - decr_pending_request_num(); - return true; -} - -void HomeLogStore::stop() { - start_stopping(); - while (true) { - if (!get_pending_request_num()) break; - std::this_thread::sleep_for(std::chrono::milliseconds(1000)); - } } nlohmann::json HomeLogStore::dump_log_store(const log_dump_req& dump_req) { nlohmann::json json_dump{}; // create root object - if (is_stopping()) return json_dump; - incr_pending_request_num(); json_dump["store_id"] = this->m_store_id; int64_t start_idx = std::max(dump_req.start_seq_num, start_lsn()); @@ -314,19 +271,14 @@ nlohmann::json HomeLogStore::dump_log_store(const log_dump_req& dump_req) { }); json_dump["log_records"] = std::move(json_records); - decr_pending_request_num(); return json_dump; } -bool HomeLogStore::foreach (int64_t start_idx, const std::function< bool(logstore_seq_num_t, log_buffer) >& cb) { - if (is_stopping()) return false; - 
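Aside: when baseline resync truncates past the current tail, the code above raises m_tail_lsn with a compare_exchange loop so it only ever moves forward, even against concurrent writers. The monotonic-raise idiom in isolation:

    #include <atomic>
    #include <cassert>

    std::atomic<long> tail_lsn{100};

    void raise_tail_to(long upto) {
        long cur = tail_lsn.load(std::memory_order_relaxed);
        while (cur < upto &&
               !tail_lsn.compare_exchange_weak(cur, upto, std::memory_order_relaxed)) {
            // on failure compare_exchange_weak reloads 'cur'; the loop re-decides
        }
    }

    int main() {
        raise_tail_to(250);
        assert(tail_lsn.load() == 250);
        raise_tail_to(200);             // no-op: the tail never moves backwards
        assert(tail_lsn.load() == 250);
    }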
incr_pending_request_num(); +void HomeLogStore::foreach (int64_t start_idx, const std::function< bool(logstore_seq_num_t, log_buffer) >& cb) { m_records.foreach_all_completed(start_idx, [&](int64_t cur_idx, homestore::logstore_record& record) -> bool { auto log_buf = m_logdev->read(record.m_dev_key); return cb(cur_idx, log_buf); }); - decr_pending_request_num(); - return true; } logstore_seq_num_t HomeLogStore::get_contiguous_issued_seq_num(logstore_seq_num_t from) const { @@ -337,27 +289,24 @@ logstore_seq_num_t HomeLogStore::get_contiguous_completed_seq_num(logstore_seq_n return (logstore_seq_num_t)m_records.completed_upto(from + 1); } -bool HomeLogStore::flush(logstore_seq_num_t upto_lsn) { - if (is_stopping()) return false; - incr_pending_request_num(); +void HomeLogStore::flush(logstore_seq_num_t upto_lsn) { + if (!m_logdev->allow_explicit_flush()) { + HS_LOG_ASSERT(false, + "Explicit flush is turned off or calling flush on wrong thread for this logdev, ignoring flush"); + return; + } + m_logdev->flush_under_guard(); - decr_pending_request_num(); - return true; } bool HomeLogStore::rollback(logstore_seq_num_t to_lsn) { - if (is_stopping()) return false; - incr_pending_request_num(); - // Fast path + //Fast path if (to_lsn == m_tail_lsn.load()) { - decr_pending_request_num(); - return true; + return true; } if (to_lsn > m_tail_lsn.load() || to_lsn < m_start_lsn.load()) { - HS_LOG_ASSERT(false, "Attempted to rollback to {} which is not in the range of [{}, {}]", to_lsn, - m_start_lsn.load(), m_tail_lsn.load()); - decr_pending_request_num(); + HS_LOG_ASSERT(false, "Attempted to rollback to {} which is not in the range of [{}, {}]", to_lsn, m_start_lsn.load(), m_tail_lsn.load()); return false; } @@ -393,21 +342,17 @@ bool HomeLogStore::rollback(logstore_seq_num_t to_lsn) { if (do_flush) m_logdev->flush_under_guard(); } while (do_flush); - decr_pending_request_num(); return true; } nlohmann::json HomeLogStore::get_status(int verbosity) const { nlohmann::json js; - if (is_stopping()) return js; - incr_pending_request_num(); js["append_mode"] = m_append_mode; js["start_lsn"] = m_start_lsn.load(std::memory_order_relaxed); js["next_lsn"] = m_next_lsn.load(std::memory_order_relaxed); js["tail_lsn"] = m_tail_lsn.load(std::memory_order_relaxed); js["logstore_records"] = m_records.get_status(verbosity); js["logstore_sb_first_lsn"] = m_logdev->log_dev_meta().store_superblk(m_store_id).m_first_seq_num; - decr_pending_request_num(); return js; } diff --git a/src/lib/logstore/log_store_service.cpp b/src/lib/logstore/log_store_service.cpp index abb266101..7270a6184 100644 --- a/src/lib/logstore/log_store_service.cpp +++ b/src/lib/logstore/log_store_service.cpp @@ -32,7 +32,6 @@ #include "log_dev.hpp" namespace homestore { -SISL_LOGGING_DECL(logstore) LogStoreService& logstore_service() { return hs()->logstore_service(); } @@ -120,20 +119,14 @@ void LogStoreService::start(bool format) { } void LogStoreService::stop() { - start_stopping(); - while (true) { - if (!get_pending_request_num()) break; - std::this_thread::sleep_for(std::chrono::milliseconds(1000)); - } - + // device_truncate(nullptr, true, false); for (auto& [id, logdev] : m_id_logdev_map) { logdev->stop(); } -} - -LogStoreService::~LogStoreService() { - folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); - m_id_logdev_map.clear(); + { + folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); + m_id_logdev_map.clear(); + } } logdev_id_t LogStoreService::get_next_logdev_id() { @@ -143,22 +136,18 @@ logdev_id_t 
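Aside: LogStoreService::stop above stops every logdev before clearing the registry, and only takes the exclusive map lock for the final clear so that child stop() calls can still perform IO. The ordering in miniature:

    #include <map>
    #include <memory>
    #include <shared_mutex>

    struct LogDevLike {
        void stop() { /* drain pending flushes, quiesce timers */ }
    };

    std::shared_mutex logdev_map_mtx;
    std::map<int, std::shared_ptr<LogDevLike>> id_logdev_map;

    void service_stop() {
        for (auto& [id, logdev] : id_logdev_map) { logdev->stop(); } // children first
        std::unique_lock holder(logdev_map_mtx);
        id_logdev_map.clear();                                       // then the registry
    }

    int main() {
        id_logdev_map.emplace(1, std::make_shared<LogDevLike>());
        service_stop();
    }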
LogStoreService::get_next_logdev_id() { } logdev_id_t LogStoreService::create_new_logdev(flush_mode_t flush_mode) { - if (is_stopping()) return 0; - incr_pending_request_num(); folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); logdev_id_t logdev_id = get_next_logdev_id(); auto logdev = create_new_logdev_internal(logdev_id, flush_mode); logdev->start(true /* format */, m_logdev_vdev); COUNTER_INCREMENT(m_metrics, logdevs_count, 1); HS_LOG(INFO, logstore, "Created log_dev={}", logdev_id); - decr_pending_request_num(); return logdev_id; } void LogStoreService::destroy_log_dev(logdev_id_t logdev_id) { - if (is_stopping()) return; HS_LOG(INFO, logstore, "Destroying logdev {}", logdev_id); - incr_pending_request_num(); + folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); const auto it = m_id_logdev_map.find(logdev_id); if (it == m_id_logdev_map.end()) { @@ -168,18 +157,20 @@ void LogStoreService::destroy_log_dev(logdev_id_t logdev_id) { // Stop the logdev and release all the chunks from the journal vdev. auto& logdev = it->second; + // if (!logdev->is_stopped()) { + // Stop the logdev if its started. logdev->stop(); + //} - // First release all chunks. - m_logdev_vdev->destroy(logdev_id); + // First release all chunks. + m_logdev_vdev->destroy(logdev_id); - // Destroy the metablks for logdev. - logdev->destroy(); + // Destroy the metablks for logdev. + logdev->destroy(); - m_id_logdev_map.erase(it); - COUNTER_DECREMENT(m_metrics, logdevs_count, 1); - HS_LOG(INFO, logstore, "Removed log_dev={}", logdev_id); - decr_pending_request_num(); + m_id_logdev_map.erase(it); + COUNTER_DECREMENT(m_metrics, logdevs_count, 1); + HS_LOG(INFO, logstore, "Removed log_dev={}", logdev_id); } void LogStoreService::delete_unopened_logdevs() { @@ -212,15 +203,11 @@ void LogStoreService::open_logdev(logdev_id_t logdev_id, flush_mode_t flush_mode } std::vector< std::shared_ptr< LogDev > > LogStoreService::get_all_logdevs() { - std::vector< std::shared_ptr< LogDev > > res; - if (is_stopping()) return res; - incr_pending_request_num(); folly::SharedMutexWritePriority::ReadHolder holder(m_logdev_map_mtx); - + std::vector< std::shared_ptr< LogDev > > res; for (auto& [id, logdev] : m_id_logdev_map) { res.push_back(logdev); } - decr_pending_request_num(); return res; } @@ -281,15 +268,11 @@ void LogStoreService::rollback_super_blk_found(const sisl::byte_view& buf, void* } std::shared_ptr< HomeLogStore > LogStoreService::create_new_log_store(logdev_id_t logdev_id, bool append_mode) { - if (is_stopping()) return nullptr; - incr_pending_request_num(); folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); COUNTER_INCREMENT(m_metrics, logstores_count, 1); const auto it = m_id_logdev_map.find(logdev_id); HS_REL_ASSERT((it != m_id_logdev_map.end()), "logdev id {} doesnt exist", logdev_id); - auto ret = it->second->create_new_log_store(append_mode); - decr_pending_request_num(); - return ret; + return it->second->create_new_log_store(append_mode); } folly::Future< shared< HomeLogStore > > LogStoreService::open_log_store(logdev_id_t logdev_id, logstore_id_t store_id, @@ -303,9 +286,8 @@ folly::Future< shared< HomeLogStore > > LogStoreService::open_log_store(logdev_i } void LogStoreService::remove_log_store(logdev_id_t logdev_id, logstore_id_t store_id) { - if (is_stopping()) return; HS_LOG(INFO, logstore, "Removing logstore {} from logdev {}", store_id, logdev_id); - incr_pending_request_num(); + folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); 
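Aside: destroy_log_dev above is a strictly ordered teardown: quiesce IO, release the journal chunks, drop the on-disk metablks, and only then forget the in-memory handle. A sketch of that sequence with stub steps (all names are illustrative):

    #include <cstdio>
    #include <map>
    #include <memory>

    struct LogDevLike {
        void stop() { std::puts("drained pending flushes"); }
        void destroy_meta() { std::puts("removed logdev metablks"); }
    };
    struct VdevLike {
        void destroy(int id) { std::printf("released chunks of logdev %d\n", id); }
    };

    std::map<int, std::shared_ptr<LogDevLike>> id_logdev_map;
    VdevLike vdev;

    void destroy_log_dev(int id) {
        auto it = id_logdev_map.find(id);
        if (it == id_logdev_map.end()) { return; } // unknown id: nothing to do
        it->second->stop();                        // 1. quiesce IO
        vdev.destroy(id);                          // 2. release storage
        it->second->destroy_meta();                // 3. drop metadata
        id_logdev_map.erase(it);                   // 4. forget the handle
    }

    int main() {
        id_logdev_map.emplace(7, std::make_shared<LogDevLike>());
        destroy_log_dev(7);
    }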
COUNTER_INCREMENT(m_metrics, logstores_count, 1); const auto it = m_id_logdev_map.find(logdev_id); @@ -315,25 +297,20 @@ void LogStoreService::remove_log_store(logdev_id_t logdev_id, logstore_id_t stor } it->second->remove_log_store(store_id); HS_LOG(INFO, logstore, "Successfully removed logstore {} from logdev {}", store_id, logdev_id); - decr_pending_request_num(); + COUNTER_DECREMENT(m_metrics, logstores_count, 1); } void LogStoreService::device_truncate() { // TODO: make device_truncate_under_lock return future and do collectAllFutures; - if (is_stopping()) return; - incr_pending_request_num(); for (auto& [id, logdev] : m_id_logdev_map) logdev->truncate(); - decr_pending_request_num(); } void LogStoreService::flush() { - if (is_stopping()) return; - incr_pending_request_num(); - for (auto& [id, logdev] : m_id_logdev_map) + for (auto& [id, logdev] : m_id_logdev_map) { logdev->flush_under_guard(); - decr_pending_request_num(); + } } void LogStoreService::start_threads() { @@ -364,8 +341,6 @@ void LogStoreService::start_threads() { nlohmann::json LogStoreService::dump_log_store(const log_dump_req& dump_req) { nlohmann::json json_dump{}; // create root object - if (is_stopping()) return json_dump; - incr_pending_request_num(); if (dump_req.log_store == nullptr) { for (auto& [id, logdev] : m_id_logdev_map) { json_dump[logdev->get_id()] = logdev->dump_log_store(dump_req); @@ -376,18 +351,14 @@ nlohmann::json LogStoreService::dump_log_store(const log_dump_req& dump_req) { nlohmann::json val = logdev->dump_log_store(dump_req); json_dump[logdev->get_id()] = std::move(val); } - decr_pending_request_num(); return json_dump; } nlohmann::json LogStoreService::get_status(const int verbosity) const { nlohmann::json js; - if (is_stopping()) return js; - incr_pending_request_num(); for (auto& [id, logdev] : m_id_logdev_map) { js[logdev->get_id()] = logdev->get_status(verbosity); } - decr_pending_request_num(); return js; } @@ -403,10 +374,6 @@ LogStoreServiceMetrics::LogStoreServiceMetrics() : sisl::MetricsGroup("LogStores REGISTER_COUNTER(logstore_read_count, "Total number of read requests to log stores", "logstore_op_count", {"op", "read"}); REGISTER_HISTOGRAM(logstore_append_latency, "Logstore append latency", "logstore_op_latency", {"op", "write"}); -#ifdef _PRERELEASE - REGISTER_HISTOGRAM(logstore_stream_tracker_lock_latency, "Logstore stream tracker lock latency", - "logstore_stream_tracker_lock_latency"); -#endif REGISTER_HISTOGRAM(logstore_read_latency, "Logstore read latency", "logstore_op_latency", {"op", "read"}); REGISTER_HISTOGRAM(logdev_flush_size_distribution, "Distribution of flush data size", HistogramBucketsType(ExponentialOfTwoBuckets)); diff --git a/src/lib/logstore/log_stream.cpp b/src/lib/logstore/log_stream.cpp index d67121dd1..d6c3fa42f 100644 --- a/src/lib/logstore/log_stream.cpp +++ b/src/lib/logstore/log_stream.cpp @@ -21,7 +21,6 @@ #include "device/journal_vdev.hpp" namespace homestore { -SISL_LOGGING_DECL(logstore) log_stream_reader::log_stream_reader(off_t device_cursor, shared< JournalVirtualDev > vdev, shared< JournalVirtualDev::Descriptor > vdev_jd, uint64_t read_size_multiple) : diff --git a/src/lib/meta/meta_blk_service.cpp b/src/lib/meta/meta_blk_service.cpp index 1b93dc203..ee8e4eb84 100644 --- a/src/lib/meta/meta_blk_service.cpp +++ b/src/lib/meta/meta_blk_service.cpp @@ -39,8 +39,6 @@ #include "blkalloc/blk_allocator.h" #include "meta_sb.hpp" -SISL_LOGGING_DECL(metablk) - namespace homestore { MetaBlkService& meta_service() { return hs()->meta_service(); } @@ 
-608,8 +606,6 @@ meta_blk* MetaBlkService::init_meta_blk(BlkId& bid, meta_sub_type type, const ui // void MetaBlkService::write_meta_blk_ovf(BlkId& out_obid, const uint8_t* context_data, uint64_t sz, const std::string& type) { - HS_DBG_ASSERT(m_meta_mtx.try_lock() == false, "mutex should be already be locked"); - // allocate data blocks static thread_local std::vector< BlkId > context_data_blkids{}; context_data_blkids.clear(); diff --git a/src/lib/replication/log_store/home_raft_log_store.cpp b/src/lib/replication/log_store/home_raft_log_store.cpp index 1dc9fb199..37ef04bee 100644 --- a/src/lib/replication/log_store/home_raft_log_store.cpp +++ b/src/lib/replication/log_store/home_raft_log_store.cpp @@ -22,8 +22,6 @@ using namespace homestore; -SISL_LOGGING_DECL(replication) - #define REPL_STORE_LOG(level, msg, ...) \ LOG##level##MOD_FMT(replication, ([&](fmt::memory_buffer& buf, const char* msgcb, auto&&... args) -> bool { \ fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}:{}] "}, \ diff --git a/src/lib/replication/log_store/repl_log_store.cpp b/src/lib/replication/log_store/repl_log_store.cpp index 41de00b6e..f9b3d454e 100644 --- a/src/lib/replication/log_store/repl_log_store.cpp +++ b/src/lib/replication/log_store/repl_log_store.cpp @@ -114,12 +114,8 @@ std::string ReplLogStore::rdev_name() const { return m_rd.rdev_name(); } std::string ReplLogStore::identify_str() const { return m_rd.identify_str(); } bool ReplLogStore::compact(ulong compact_upto_lsn) { - auto truncation_upper_limit = m_rd.get_truncation_upper_limit(); - auto effective_compact_lsn = std::min(static_cast< repl_lsn_t >(compact_upto_lsn), truncation_upper_limit); - RD_LOGD(NO_TRACE_ID, - "Raft Channel: effective_compact_lsn={}, raft compact_to_lsn={}, local truncation_upper_limit={}", - effective_compact_lsn, compact_upto_lsn, truncation_upper_limit); - m_rd.on_compact(effective_compact_lsn); - return HomeRaftLogStore::compact(effective_compact_lsn); + RD_LOGD(NO_TRACE_ID, "Raft Channel: compact_to_lsn={}", compact_upto_lsn); + m_rd.on_compact(compact_upto_lsn); + return HomeRaftLogStore::compact(compact_upto_lsn); } } // namespace homestore diff --git a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp index fbc732775..6b8ce122b 100644 --- a/src/lib/replication/repl_dev/common.cpp +++ b/src/lib/replication/repl_dev/common.cpp @@ -155,6 +155,7 @@ ReplServiceError repl_req_ctx::alloc_local_blks(cshared< ReplDevListener >& list if (status != BlkAllocStatus::SUCCESS) { LOGWARNMOD(replication, "[traceID={}] block allocation failure, repl_key=[{}], status=[{}]", rkey().traceID, rkey(), status); + DEBUG_ASSERT_EQ(status, BlkAllocStatus::SUCCESS, "Unable to allocate blks"); return ReplServiceError::NO_SPACE_LEFT; } @@ -265,8 +266,7 @@ std::string repl_req_ctx::to_string() const { } std::string repl_req_ctx::to_compact_string() const { - if (m_op_code == journal_type_t::HS_CTRL_DESTROY || m_op_code == journal_type_t::HS_CTRL_START_REPLACE || - m_op_code == journal_type_t::HS_CTRL_COMPLETE_REPLACE) { + if (m_op_code == journal_type_t::HS_CTRL_DESTROY || m_op_code == journal_type_t::HS_CTRL_START_REPLACE || m_op_code == journal_type_t::HS_CTRL_COMPLETE_REPLACE) { return fmt::format("term={} lsn={} op={}", m_rkey.term, m_lsn, enum_name(m_op_code)); } diff --git a/src/lib/replication/repl_dev/common.h b/src/lib/replication/repl_dev/common.h index 3085a9d3c..c3433083f 100644 --- a/src/lib/replication/repl_dev/common.h +++ b/src/lib/replication/repl_dev/common.h @@ -78,7 +78,6 @@ struct 
repl_dev_superblk { rdev_name[max_name_len - 1] = '\0'; } }; - #pragma pack() template < class V = folly::Unit > diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 154178737..2303fda68 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -138,49 +138,43 @@ bool RaftReplDev::join_group() { } // All the steps in the implementation should be idempotent and retryable. -AsyncReplResult<> RaftReplDev::start_replace_member(std::string& task_id, const replica_member_info& member_out, +AsyncReplResult<> RaftReplDev::start_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum, uint64_t trace_id) { if (is_stopping()) { RD_LOGI(trace_id, "repl dev is being shutdown!"); return make_async_error<>(ReplServiceError::STOPPING); } - if (get_stage() != repl_dev_stage_t::ACTIVE) { - RD_LOGE(trace_id, "repl dev is not ready, stage={}", static_cast< int >(get_stage())); - return make_async_error<>(ReplServiceError::UNREADY_STATE); - } incr_pending_request_num(); - RD_LOGI(trace_id, "Start replace member, task_id={}, member_out={} member_in={}", task_id, - boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); + RD_LOGI(trace_id, "Start replace member, member_out={} member_in={}", boost::uuids::to_string(member_out.id), + boost::uuids::to_string(member_in.id)); - // Step1, validate request - // TODO support rollback, this could happen when the first task failed, and we want to launch a new task to - // remediate it. Need to rollback the first task. And for the same task, it's reentrant and idempotent. - auto existing_task_id = get_replace_member_task_id(); - if (!existing_task_id.empty() && existing_task_id != task_id) { - RD_LOGE(trace_id, "Step1. Replace member, task_id={} is not the same as existing task_id={}", task_id, - existing_task_id); - decr_pending_request_num(); - return make_async_error<>(ReplServiceError::REPLACE_MEMBER_TASK_MISMATCH); + if (commit_quorum >= 1) { + // Two members are down and leader cant form the quorum. Reduce the quorum size. + reset_quorum_size(commit_quorum, trace_id); } + // Step1, validate request auto out_srv_cfg = raft_server()->get_config()->get_server(nuraft_mesg::to_server_id(member_out.id)); if (!out_srv_cfg) { auto in_srv_cfg = raft_server()->get_config()->get_server(nuraft_mesg::to_server_id(member_in.id)); if (in_srv_cfg) { - RD_LOGI(trace_id, - "Step1. Replace member, the intent has already been fulfilled, ignore it, task_id={}, " - "member_out={} member_in={}", - task_id, boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); + RD_LOGI( + trace_id, + "Step1. Replace member, the intent has already been fulfilled, ignore it, member_out={} member_in={}", + boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); + reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_success<>(); } - RD_LOGE(trace_id, "Step1. Replace member invalid parameter, out member is not found, task_id={}", task_id); + RD_LOGE(trace_id, "Step1. 
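Aside: step 1 above is deliberately idempotent; if the outgoing member is already gone and the incoming one is present, a previous attempt fulfilled the intent and the retried request succeeds without re-proposing anything. The decision table in miniature (names are illustrative, not the raft config API):

    #include <cstdio>
    #include <set>
    #include <string>

    enum class Err { Ok, ServerNotFound };

    Err validate_replace(const std::set<std::string>& config,
                         const std::string& member_out, const std::string& member_in) {
        if (config.count(member_out) == 0) {
            // out member gone: either the intent was already fulfilled (in
            // member present) or the request names an unknown server
            return config.count(member_in) ? Err::Ok : Err::ServerNotFound;
        }
        return Err::Ok; // out member still present: proceed with the steps
    }

    int main() {
        std::set<std::string> cfg{"S1", "S2", "S4"};   // S3 already replaced by S4
        std::printf("%d\n", (int)validate_replace(cfg, "S3", "S4")); // 0: Ok
    }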
Replace member invalid parameter, out member is not found"); + reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); } if (m_my_repl_id != get_leader_id()) { + reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_error<>(ReplServiceError::NOT_LEADER); } @@ -190,40 +184,34 @@ AsyncReplResult<> RaftReplDev::start_replace_member(std::string& task_id, const // until the successor finishes the catch-up of the latest log, and then resign. Return NOT_LEADER and let // client retry. raft_server()->yield_leadership(false /* immediate */, -1 /* successor */); - RD_LOGI(trace_id, "Step1. Replace member, leader is the member_out so yield leadership, task_id={}", task_id); + RD_LOGI(trace_id, "Step1. Replace member, leader is the member_out so yield leadership"); + reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_error<>(ReplServiceError::NOT_LEADER); } // quorum safety check. TODO currently only consider lsn, need to check last response time. auto active_peers = get_active_peers(); // active_peers doesn't include leader itself. - auto active_num = active_peers.size() + 1; + auto quorum = active_peers.size() + 1; for (const auto& p : active_peers) { - active_num = p == member_out.id ? active_num - 1 : active_num; - active_num = p == member_in.id ? active_num - 1 : active_num; + quorum = p == member_out.id ? quorum - 1 : quorum; + quorum = p == member_in.id ? quorum - 1 : quorum; } RD_LOGD(trace_id, "Step1. Replace member, quorum safety check, active_peers={}, active_peers_exclude_out/in_member={}, " - "commit_quorum={}, task_id={}", - active_peers.size(), active_num, commit_quorum, task_id); + "commit_quorum={}", + active_peers.size(), quorum, commit_quorum); // commit_quorum=0 means actual commit quorum is the majority. In this case, active normal member count should be - // >= majority. To be more specific, if we have S1(leader), S2, S3(out), S4(in), we don't allow + // greater than 1. To be more specific, if we have S1(leader), S2, S3(out), S4(in), we don't allow // replace_member(S3, S4) if S2 is down or laggy. Needs to recover S2 first or retry with commit_quorum=1. - auto quorum = get_quorum_for_commit(); - if (active_num < quorum && commit_quorum == 0) { - RD_LOGE(trace_id, - "Step1. Replace member, quorum safety check failed, active_peers={}, " - "active_peers_exclude_out/in_member={}, required_quorum={}, commit_quorum={}, task_id={}", - active_peers.size(), active_num, quorum, commit_quorum, task_id); + if (quorum <= 1 && commit_quorum == 0) { + RD_LOGE(trace_id, "Step1. Replace member, quorum safety check failed, active_peers={}, active_peers_exclude_out/in_member={}, commit_quorum={}", + active_peers.size(), quorum, commit_quorum); + reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_error<>(ReplServiceError::QUORUM_NOT_MET); } - if (commit_quorum >= 1) { - // Two members are down and leader cant form the quorum. Reduce the quorum size. - reset_quorum_size(commit_quorum, trace_id); - } - // Step 2: Handle out member. #ifdef _PRERELEASE if (iomgr_flip::instance()->test_flip("replace_member_set_learner_failure")) { @@ -231,24 +219,25 @@ AsyncReplResult<> RaftReplDev::start_replace_member(std::string& task_id, const return make_async_error(ReplServiceError::FAILED); } #endif - RD_LOGI(trace_id, "Step2. Replace member, flip out member to learner, task_id={}", task_id); + RD_LOGI(trace_id, "Step2. 
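Aside: the safety check above counts the members that would actually serve the quorum if the replacement proceeds: the leader plus its active peers, minus the outgoing member and the still-catching-up incoming one; with the default commit_quorum of 0 that count must exceed one. The arithmetic in isolation:

    #include <cstdio>
    #include <string>
    #include <vector>

    bool replace_is_quorum_safe(const std::vector<std::string>& active_peers,
                                const std::string& member_out,
                                const std::string& member_in) {
        size_t active = active_peers.size() + 1; // +1: the leader itself
        for (const auto& p : active_peers) {
            if (p == member_out || p == member_in) { --active; }
        }
        return active > 1; // leader plus only the out/in members is not enough
    }

    int main() {
        // S1 leader, S2 down, S3 out, S4 in: only S3/S4 look active => unsafe
        std::printf("%d\n", replace_is_quorum_safe({"S3", "S4"}, "S3", "S4"));       // 0
        std::printf("%d\n", replace_is_quorum_safe({"S2", "S3", "S4"}, "S3", "S4")); // 1
    }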
Replace member, flip out member to learner"); auto learner_ret = do_flip_learner(member_out, true, true, trace_id); if (learner_ret != ReplServiceError::OK) { - RD_LOGE(trace_id, "Step2. Replace member, failed to flip out member to learner {}, task_id={}", learner_ret, - task_id); + RD_LOGE(trace_id, "Step2. Replace member, failed to flip out member to learner {}", learner_ret); reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_error(std::move(learner_ret)); } - RD_LOGI(trace_id, "Step2. Replace member, flip out member to learner and set priority to 0, task_id={}", task_id); + RD_LOGI(trace_id, "Step2. Replace member, flip out member to learner and set priority to 0"); // Step 3. Append log entry to mark the old member is out and new member is added. - RD_LOGI(trace_id, "Step3. Replace member, propose to raft for HS_CTRL_START_REPLACE req, group_id={}, task_id={}", - task_id, group_id_str()); + RD_LOGI(trace_id, "Step3. Replace member, propose to raft for HS_CTRL_START_REPLACE req, group_id={}", + group_id_str()); auto rreq = repl_req_ptr_t(new repl_req_ctx{}); - auto ctx = replace_member_ctx(task_id, member_out, member_in); + replace_member_ctx members; + members.replica_out = member_out; + members.replica_in = member_in; - sisl::blob header(r_cast< uint8_t* >(&ctx), sizeof(replace_member_ctx)); + sisl::blob header(r_cast< uint8_t* >(&members), sizeof(replace_member_ctx)); rreq->init(repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1), @@ -257,9 +246,7 @@ AsyncReplResult<> RaftReplDev::start_replace_member(std::string& task_id, const auto err = m_state_machine->propose_to_raft(std::move(rreq)); if (err != ReplServiceError::OK) { - RD_LOGE(trace_id, - "Step3. Replace member, propose to raft for HS_CTRL_START_REPLACE req failed, task_id={}, err={}", - task_id, err); + RD_LOGE(trace_id, "Step3. Replace member, propose to raft for HS_CTRL_START_REPLACE req failed {}", err); reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_error<>(std::move(err)); @@ -272,39 +259,31 @@ AsyncReplResult<> RaftReplDev::start_replace_member(std::string& task_id, const return make_async_error(ReplServiceError::FAILED); } #endif - RD_LOGI(trace_id, "Step4. Replace member, propose to raft to add new member, group_id={}, task_id={}", - group_id_str(), task_id); - replica_member_info member_to_add = member_in; - member_to_add.priority = out_srv_cfg.get()->get_priority(); - auto ret = do_add_member(member_to_add, trace_id); + RD_LOGI(trace_id, "Step4. Replace member, propose to raft to add new member, group_id={}", group_id_str()); + auto ret = do_add_member(member_in, trace_id); if (ret != ReplServiceError::OK) { - RD_LOGE(trace_id, "Step4. Replace member, add member failed, err={}, task_id={}", ret, task_id); + RD_LOGE(trace_id, "Step4. Replace member, add member failed {}", ret); reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_error<>(std::move(ret)); } - RD_LOGI(trace_id, "Step4. Replace member, proposed to raft to add member, task_id={}, member={}", task_id, - boost::uuids::to_string(member_in.id)); + RD_LOGI(trace_id, "Step4. 
Replace member, proposed to raft to add member, member={}", boost::uuids::to_string(member_in.id));
     reset_quorum_size(0, trace_id);
     decr_pending_request_num();
     return make_async_success<>();
 }
 
-AsyncReplResult<> RaftReplDev::complete_replace_member(std::string& task_id, const replica_member_info& member_out,
+AsyncReplResult<> RaftReplDev::complete_replace_member(const replica_member_info& member_out,
                                                        const replica_member_info& member_in, uint32_t commit_quorum,
                                                        uint64_t trace_id) {
     if (is_stopping()) {
         RD_LOGI(trace_id, "repl dev is being shutdown!");
         return make_async_error<>(ReplServiceError::STOPPING);
     }
-    if (get_stage() != repl_dev_stage_t::ACTIVE) {
-        RD_LOGE(trace_id, "repl dev is not ready, stage={}", static_cast< int >(get_stage()));
-        return make_async_error<>(ReplServiceError::UNREADY_STATE);
-    }
     incr_pending_request_num();
-    RD_LOGI(trace_id, "Complete replace member, task_id={}, member_out={}, member_in={}", task_id,
-            boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id));
+    RD_LOGI(trace_id, "Complete replace member, member_out={} member_in={}", boost::uuids::to_string(member_out.id),
+            boost::uuids::to_string(member_in.id));
 
     if (commit_quorum >= 1) {
         // Two members are down and leader cant form the quorum. Reduce the quorum size.
@@ -312,8 +291,7 @@ AsyncReplResult<> RaftReplDev::complete_replace_member(std::string& task_id, con
     }
 
     // Step 5: Remove member
-    RD_LOGI(trace_id, "Step5. Replace member, remove old member, task_id={}, member={}", task_id,
-            boost::uuids::to_string(member_out.id));
+    RD_LOGI(trace_id, "Step5. Replace member, remove old member, member={}", boost::uuids::to_string(member_out.id));
 #ifdef _PRERELEASE
     if (iomgr_flip::instance()->test_flip("replace_member_remove_member_failure")) {
         RD_LOGE(trace_id, "Simulating remove member failure");
@@ -322,13 +300,13 @@ AsyncReplResult<> RaftReplDev::complete_replace_member(std::string& task_id, con
 #endif
     auto ret = do_remove_member(member_out, trace_id);
     if (ret != ReplServiceError::OK) {
-        RD_LOGE(trace_id, "Step5. Replace member, failed to remove member, task_id={}, member={}, err={}", task_id,
+        RD_LOGE(trace_id, "Step5. Replace member, failed to remove member, member={}, err={}",
                 boost::uuids::to_string(member_out.id), ret);
         reset_quorum_size(0, trace_id);
         decr_pending_request_num();
         return make_async_error<>(std::move(ret));
     }
-    RD_LOGI(trace_id, "Step5. Replace member, proposed to raft to remove member, task_id={}, member={}", task_id,
+    RD_LOGI(trace_id, "Step5. Replace member, proposed to raft to remove member, member={}",
            boost::uuids::to_string(member_out.id));
     auto timeout = HS_DYNAMIC_CONFIG(consensus.wait_for_config_change_ms);
     // TODO Move wait logic to nuraft_mesg
@@ -348,19 +326,20 @@ AsyncReplResult<> RaftReplDev::complete_replace_member(std::string& task_id, con
                 timeout);
        // If the member_out is down, leader will force remove it after
        // leave_timeout=leave_limit_(default=5)*heart_beat_interval_, it's better for client to retry it.
-        return make_async_error<>(ReplServiceError::RETRY_REQUEST);
+        return make_async_error<>(ReplServiceError::CANCELLED);
     }
-    RD_LOGD(trace_id, "Step5. Replace member, old member is removed, task_id={}, member={}", task_id,
+    RD_LOGD(trace_id, "Step5. Replace member, old member is removed, member={}",
            boost::uuids::to_string(member_out.id));
 
     // Step 2. Append log entry to complete replace member
-    RD_LOGI(trace_id,
-            "Step6. Replace member, propose to raft for HS_CTRL_COMPLETE_REPLACE req, group_id={}, task_id={}", task_id,
+    RD_LOGI(trace_id, "Step6. 
Replace member, propose to raft for HS_CTRL_COMPLETE_REPLACE req, group_id={}", group_id_str()); auto rreq = repl_req_ptr_t(new repl_req_ctx{}); - auto ctx = replace_member_ctx(task_id, member_out, member_in); + replace_member_ctx members; + members.replica_out = member_out; + members.replica_in = member_in; - sisl::blob header(r_cast< uint8_t* >(&ctx), sizeof(replace_member_ctx)); + sisl::blob header(r_cast< uint8_t* >(&members), sizeof(replace_member_ctx)); rreq->init(repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1), @@ -369,9 +348,8 @@ AsyncReplResult<> RaftReplDev::complete_replace_member(std::string& task_id, con auto err = m_state_machine->propose_to_raft(std::move(rreq)); if (err != ReplServiceError::OK) { - RD_LOGE(trace_id, - "Step6. Replace member, propose to raft for HS_CTRL_COMPLETE_REPLACE req failed , task_id={}, err={}", - task_id, err); + RD_LOGE(trace_id, "Step6. Replace member, propose to raft for HS_CTRL_COMPLETE_REPLACE req failed , err={}", + err); reset_quorum_size(0, trace_id); decr_pending_request_num(); return make_async_error<>(std::move(err)); @@ -379,88 +357,9 @@ AsyncReplResult<> RaftReplDev::complete_replace_member(std::string& task_id, con reset_quorum_size(0, trace_id); decr_pending_request_num(); - RD_LOGI(trace_id, "Complete replace member done, group_id={}, task_id={}, member_out={} member_in={}", - group_id_str(), task_id, boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); - return make_async_success<>(); -} - -ReplaceMemberStatus RaftReplDev::get_replace_member_status(std::string& task_id, const replica_member_info& member_out, - const replica_member_info& member_in, - const std::vector< replica_member_info >& others, - uint64_t trace_id) { - if (is_stopping()) { - RD_LOGI(trace_id, "repl dev is being shutdown!"); - return ReplaceMemberStatus::UNKNOWN; - } - incr_pending_request_num(); - - if (!m_repl_svc_ctx || !is_leader()) { - decr_pending_request_num(); - return ReplaceMemberStatus::NOT_LEADER; - } - - auto peers = get_replication_status(); - peer_info out_peer_info; - bool found_out = false; - bool found_in = false; - for (auto p : peers) { - if (p.id_ == member_out.id) { - out_peer_info = p; - found_out = true; - } else if (p.id_ == member_in.id) { - found_in = true; - } - } - - bool intent_completed = !found_out && found_in; - auto persisted_task_id = get_replace_member_task_id(); - if (persisted_task_id.empty()) { - if (intent_completed) { - // If caller doesn't give others, won't check it. 
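// A minimal sketch, not part of the patch, of the "others match" validation in
// the removed block below: after a fulfilled intent, the raft config must hold
// exactly the leader plus every peer the caller expects, and an empty
// expectation list skips the check entirely. `PeerId` is a hypothetical alias
// standing in for the member uuid type used here.
#include <set>
#include <string>
using PeerId = std::string;
bool others_match(std::set< PeerId > const& expected_others, std::set< PeerId > const& raft_peers) {
    if (expected_others.empty()) { return true; } // caller opted out of the membership check
    if (expected_others.size() + 1 != raft_peers.size()) { return false; } // +1 accounts for the leader itself
    for (auto const& id : expected_others) {
        if (raft_peers.count(id) == 0) { return false; } // an expected member is missing from the group
    }
    return true;
}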
-            bool others_match = others.size() == 0 || others.size() + 1 == peers.size();
-            auto detail = std::string{};
-            for (const auto& other : others) {
-                if (!raft_server()->get_srv_config(nuraft_mesg::to_server_id(other.id))) {
-                    others_match = false;
-                    detail = fmt::format("member {} is not found in raft group", boost::uuids::to_string(other.id));
-                    break;
-                }
-            }
-            if (!others_match) {
-                RD_LOGE(trace_id,
-                        "get_replace_member_status failed, other membership mismatch, task_id={}, detail={}, "
-                        "others.size={}, "
-                        "all_peers.size={}",
-                        task_id, detail, others.size(), peers.size());
-                decr_pending_request_num();
-                return ReplaceMemberStatus::UNKNOWN;
-            }
-            decr_pending_request_num();
-            return ReplaceMemberStatus::COMPLETED;
-        }
-        decr_pending_request_num();
-        return ReplaceMemberStatus::TASK_NOT_FOUND;
-    }
-    if (m_rd_sb->replace_member_task.task_id != task_id) {
-        RD_LOGE(trace_id, "get_replace_member_status failed, task_id mismatch, persisted={}, received={}",
-                persisted_task_id, task_id);
-        decr_pending_request_num();
-        return ReplaceMemberStatus::TASK_ID_MISMATCH;
-    }
-    // If the first attempt to remove out_member fails because out_member is down or leader crashes between Step5(remove
-    // member) and Step6(HS_CTRL_COMPLETE_REPLACE mesg). Replace member intent might be already fulfilled but
-    // replace_member_task sb still exists. In this case, we honor task sb, return IN_PROGRESS, and wait for reaper
-    // thread to trigger complete_replace_member again to cleanup the sb.
-    if (intent_completed) {
-        RD_LOGI(trace_id,
-                "Member replacement fulfilled, but task still exists, wait for reaper thread to retry "
-                "complete_replace_member. task_id={}, out_member={}, in_member={}",
-                persisted_task_id, boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id));
-    }
-    RD_LOGD(trace_id, "Member replacement is in progress. task_id={}, out_member={}, in_member={}", task_id,
+    RD_LOGI(trace_id, "Complete replace member done, group_id={}, member_out={} member_in={}", group_id_str(),
            boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id));
-    decr_pending_request_num();
-    return ReplaceMemberStatus::IN_PROGRESS;
+    return make_async_success<>();
 }
 
 ReplServiceError RaftReplDev::do_add_member(const replica_member_info& member, uint64_t trace_id) {
@@ -470,27 +369,20 @@ ReplServiceError RaftReplDev::do_add_member(const replica_member_info& member, u
     }
     auto ret = retry_when_config_changing(
        [&] {
-            auto srv_config = nuraft::srv_config(nuraft_mesg::to_server_id(member.id), 0,
-                                                 boost::uuids::to_string(member.id), "", false, member.priority);
-            auto add_ret = m_msg_mgr.add_member(m_group_id, srv_config)
+            auto add_ret = m_msg_mgr.add_member(m_group_id, member.id)
                               .via(&folly::InlineExecutor::instance())
                               .thenValue([this, member, trace_id](auto&& e) -> nuraft::cmd_result_code {
                                   return e.hasError() ? e.error() : nuraft::cmd_result_code::OK;
                               });
            return add_ret.value();
        },
        trace_id);
     if (ret == nuraft::cmd_result_code::SERVER_ALREADY_EXISTS) {
         RD_LOGW(trace_id, "Ignoring error returned from nuraft add_member, member={}, err={}",
                 boost::uuids::to_string(member.id), ret);
-    } else if (ret == nuraft::cmd_result_code::CANCELLED) {
-        // nuraft mesg will return cancelled if the change is not commited after waiting for
-        // raft_leader_change_timeout_ms(default 3200). 
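// A minimal sketch, assuming only the headers this file already uses
// (homestore/replication/repl_decls.h and the nuraft headers), of the error
// folding do_add_member converges on once the special CANCELLED branch here is
// dropped: SERVER_ALREADY_EXISTS is tolerated because the flow is idempotent,
// and any other failure maps to RETRY_REQUEST so the caller can safely resend.
// The helper name is hypothetical.
ReplServiceError fold_add_member_result(nuraft::cmd_result_code ret) {
    if (ret == nuraft::cmd_result_code::OK) { return ReplServiceError::OK; }
    if (ret == nuraft::cmd_result_code::SERVER_ALREADY_EXISTS) {
        return ReplServiceError::OK; // member already in the cluster config; nothing to do
    }
    return ReplServiceError::RETRY_REQUEST; // replace_member is idempotent, so a retry is safe
}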
-        RD_LOGE(trace_id, "Add member failed, member={}, err={}", boost::uuids::to_string(member.id), ret);
-        return ReplServiceError::CANCELLED;
     } else if (ret != nuraft::cmd_result_code::OK) {
         // It's ok to retry this request as the request
-        // replace member is idempotent.
+        // of replace member is idempotent.
         RD_LOGE(trace_id, "Add member failed, member={}, err={}", boost::uuids::to_string(member.id), ret);
         return ReplServiceError::RETRY_REQUEST;
     }
@@ -562,6 +454,13 @@ ReplServiceError RaftReplDev::do_flip_learner(const replica_member_info& member,
         RD_LOGI(trace_id, "flip learner flag failed, not leader");
         return ReplServiceError::NOT_LEADER;
     }
+    if (!target && member.priority == 0) {
+        // If the intent is to take the learner back to a normal member, then priority should not be 0 (it would
+        // never have a chance to become leader). Clients need to track the peers' priorities and provide a
+        // meaningful value; the current defaults in the quorum are leader=100, follower=66.
+        RD_LOGI(trace_id, "clear learner flag failed, priority is 0, member={}", boost::uuids::to_string(member.id));
+        return ReplServiceError::BAD_REQUEST;
+    }
 
     // 2. Flip learner
     RD_LOGI(trace_id, "flip learner flag to {}, member={}", target, boost::uuids::to_string(member.id));
@@ -586,17 +485,32 @@ ReplServiceError RaftReplDev::do_flip_learner(const replica_member_info& member,
                 boost::uuids::to_string(member.id));
     }
 
-    // 3. Verification
+    // 3. Set priority
+    // Based on the current nuraft implementation, a learner could be elected as leader, so we set priority to 0 to
+    // avoid it. And in turn, we need to revert the priority change if the member is going to become a normal member.
+    // FIXME after nuraft fixes the bug, we can remove this logic.
+    auto priority = target ? 0 : member.priority;
+    RD_LOGI(trace_id, "Set the priority of the member to {}, member={}", priority, boost::uuids::to_string(member.id));
+    if (srv_cfg->get_priority() != priority) {
+        auto priority_ret = set_priority(member.id, priority);
+        if (priority_ret != ReplServiceError::OK) { return ReplServiceError::NOT_LEADER; }
+    } else {
+        RD_LOGD(trace_id, "Priority has already been set to {}, skip, member={}", priority,
+                boost::uuids::to_string(member.id));
+    }
+
+    // 4. 
Verification if (wait_and_verify) { auto timeout = HS_DYNAMIC_CONFIG(consensus.wait_for_config_change_ms); if (!wait_and_check( [&]() { auto srv_conf = raft_server()->get_srv_config(nuraft_mesg::to_server_id(member.id)); - return srv_conf->is_learner(); + return srv_conf->is_learner() && srv_conf->get_priority() == 0; }, timeout)) { - RD_LOGD(trace_id, "Wait for flipping learner timed out, please retry, timeout: {}", timeout); - return ReplServiceError::RETRY_REQUEST; + RD_LOGD(trace_id, "Wait for learner and priority config change timed out, cancel the request, timeout: {}", + timeout); + return ReplServiceError::CANCELLED; } } @@ -604,7 +518,7 @@ ReplServiceError RaftReplDev::do_flip_learner(const replica_member_info& member, } nuraft::cmd_result_code RaftReplDev::retry_when_config_changing(const std::function< nuraft::cmd_result_code() >& func, - uint64_t trace_id) { + uint64_t trace_id) { auto ret = nuraft::cmd_result_code::OK; int32_t retries = HS_DYNAMIC_CONFIG(consensus.config_changing_error_retries); for (auto i = 0; i < retries; i++) { @@ -701,56 +615,10 @@ void RaftReplDev::on_create_snapshot(nuraft::snapshot& s, nuraft::async_result< auto null_except = std::shared_ptr< std::exception >(); HS_REL_ASSERT(result.hasError() == false, "Not expecting creating snapshot to return false. "); - // propose truncate boundary on leader if needed - if (is_leader()) { propose_truncate_boundary(); } - auto ret_val{true}; if (when_done) { when_done(ret_val, null_except); } } -void RaftReplDev::propose_truncate_boundary() { - incr_pending_request_num(); - auto repl_status = get_replication_status(); - repl_lsn_t leader_commit_idx = m_commit_upto_lsn.load(); - repl_lsn_t minimum_repl_idx = leader_commit_idx; - for (auto p : repl_status) { - if (p.id_ == m_my_repl_id) { continue; } - RD_LOGD(NO_TRACE_ID, "peer_repl_idx={}, minimum_repl_idx={}", p.replication_idx_, minimum_repl_idx); - minimum_repl_idx = std::min(minimum_repl_idx, static_cast< repl_lsn_t >(p.replication_idx_)); - } - repl_lsn_t raft_logstore_reserve_threshold = HS_DYNAMIC_CONFIG(resource_limits.raft_logstore_reserve_threshold); - repl_lsn_t truncation_upper_limit = std::max(leader_commit_idx - raft_logstore_reserve_threshold, minimum_repl_idx); - RD_LOGD(NO_TRACE_ID, - "calculated truncation_upper_limit={}, " - "leader_commit_idx={}, raft_logstore_reserve_threshold={}, minimum_repl_idx={}", - truncation_upper_limit, leader_commit_idx, raft_logstore_reserve_threshold, minimum_repl_idx); - if (truncation_upper_limit > 0) { - auto rreq = repl_req_ptr_t(new repl_req_ctx{}); - auto ctx = truncate_ctx(truncation_upper_limit); - - sisl::blob header(r_cast< uint8_t* >(&ctx), sizeof(truncate_ctx)); - rreq->init(repl_key{.server_id = server_id(), - .term = raft_server()->get_term(), - .dsn = m_next_dsn.fetch_add(1), - .traceID = std::numeric_limits< uint64_t >::max()}, - journal_type_t::HS_CTRL_UPDATE_TRUNCATION_BOUNDARY, true, header, sisl::blob{}, 0, m_listener); - - auto err = m_state_machine->propose_to_raft(std::move(rreq)); - if (err != ReplServiceError::OK) { - // failed to propose to raft to update truncation boundary - // the update will be retried next create_snapshot, so we just log the error - RD_LOGW(NO_TRACE_ID, "propose to raft for HS_CTRL_UPDATE_TRUNCATION_BOUNDARY req failed, err={}", err); - } - } - decr_pending_request_num(); -} - -// 1 before repl_dev.stop() is called, the upper layer should make sure that there is no pending request. 
so graceful -// shutdown can consider when stopping repl_dev, there is no pending request. -// 2 before the log is appended to log store, repl_dev will guarantee the corresponding data is persisted on disk. so -// even if we do not care about this when stop, it will be ok, since log will replayed after restart. - -// we do not have shutdown for async_alloc_write according to the two points above. void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& data, repl_req_ptr_t rreq, bool part_of_batch, trace_id_t tid) { if (!rreq) { auto rreq = repl_req_ptr_t(new repl_req_ctx{}); } @@ -760,9 +628,8 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& if (auto const stage = *guard.get(); stage != repl_dev_stage_t::ACTIVE) { RD_LOGW(tid, "Raft channel: Not ready to accept writes, stage={}", enum_name(stage)); handle_error(rreq, - (stage == repl_dev_stage_t::INIT) ? ReplServiceError::SERVER_IS_JOINING - : (stage == repl_dev_stage_t::UNREADY) ? ReplServiceError::UNREADY_STATE - : ReplServiceError::SERVER_IS_LEAVING); + (stage == repl_dev_stage_t::INIT) ? ReplServiceError::SERVER_IS_JOINING + : ReplServiceError::SERVER_IS_LEAVING); return; } } @@ -993,8 +860,7 @@ repl_req_ptr_t RaftReplDev::applier_create_req(repl_key const& rkey, journal_typ auto rreq = it->second; if (!happened) { - // We already have the entry in the map, reset its start time to prevent it from being incorrectly gc during - // use. + // We already have the entry in the map, reset its start time to prevent it from being incorrectly gc during use. rreq->set_created_time(); // Check if we are already allocated the blk by previous caller, in that case we need to return the req. if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { @@ -1008,15 +874,17 @@ repl_req_ptr_t RaftReplDev::applier_create_req(repl_key const& rkey, journal_typ } // rreq->init will allocate the block if it has linked data. - auto status = - init_req_ctx(rreq, rkey, code, m_raft_server_id == rkey.server_id, user_header, key, data_size, m_listener); + auto status = init_req_ctx(rreq, rkey, code, false /* is_proposer */, user_header, key, data_size, m_listener); if (status != ReplServiceError::OK) { RD_LOGD(rkey.traceID, "For Repl_key=[{}] alloc hints returned error={}, failing this req", rkey.to_string(), status); if (status == ReplServiceError::NO_SPACE_LEFT && !is_data_channel && !rreq->is_proposer()) { - RD_LOGD(rkey.traceID, "Repl_key=[{}] got no_space_left error on follower as lsn={}", rkey.to_string(), lsn); - m_listener->on_no_space_left(lsn, user_header); + const auto& chunk_id = rreq->local_blkid().chunk_num(); + RD_LOGD(rkey.traceID, + "For Repl_key=[{}] alloc hints returned error={} when trying to allocate blk on chunk={}", + rkey.to_string(), status, chunk_id); + m_listener->on_no_space_left(lsn, chunk_id); } else { RD_LOGD( rkey.traceID, @@ -1433,6 +1301,11 @@ void RaftReplDev::handle_rollback(repl_req_ptr_t rreq) { void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { if (!rreq->has_state(repl_req_state_t::DATA_COMMITTED)) { commit_blk(rreq); } + // Remove the request from repl_key map. + m_repl_key_req_map.erase(rreq->rkey()); + // Remove the request from lsn map. 
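// Just below, the committed req is unlinked from the lsn map and m_next_dsn is
// advanced past the committed dsn with a lock-free loop. A minimal standalone
// rendering of that loop (hypothetical helper, assuming a uint64_t dsn type):
// compare_exchange_strong refreshes `cur` on failure, so the loop terminates
// once next_dsn is observed, or made, strictly greater than committed_dsn.
#include <atomic>
#include <cstdint>
void advance_next_dsn(std::atomic< uint64_t >& next_dsn, uint64_t committed_dsn) {
    auto cur = next_dsn.load(std::memory_order_relaxed);
    while (cur <= committed_dsn && !next_dsn.compare_exchange_strong(cur, committed_dsn + 1)) {
        // `cur` now holds the freshly observed value; the bound is re-checked.
    }
}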
+ m_state_machine->unlink_lsn_to_req(rreq->lsn(), rreq); + auto cur_dsn = m_next_dsn.load(std::memory_order_relaxed); while (cur_dsn <= rreq->dsn()) { m_next_dsn.compare_exchange_strong(cur_dsn, rreq->dsn() + 1); @@ -1445,8 +1318,6 @@ void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { start_replace_member(rreq); } else if (rreq->op_code() == journal_type_t::HS_CTRL_COMPLETE_REPLACE) { complete_replace_member(rreq); - } else if (rreq->op_code() == journal_type_t::HS_CTRL_UPDATE_TRUNCATION_BOUNDARY) { - update_truncation_boundary(rreq); } else { m_listener->on_commit(rreq->lsn(), rreq->header(), rreq->key(), {rreq->local_blkid()}, rreq); } @@ -1458,16 +1329,6 @@ void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { rreq->lsn(), prev_lsn); } - // Remove the request from repl_key map only after the listener operation is completed. - // This prevents unnecessary block allocation in the following scenario: - // 1. The follower processes a commit for LSN 100 and remove rreq from rep_key map before listener commit - // 2. The follower receives a duplicate append request from leader and attempts to localize it in 'raft_event' step - // 3. since the old rreq has been removed, the follower alloc new blks for LSN 100, resulting in unnecessary garbage - // By deferring the removal of the request until after the listener's commit, the listener can recognize that - // data already exist for duplicated requests, preventing the unnecessary allocation described in step 3. - m_repl_key_req_map.erase(rreq->rkey()); - // Remove the request from lsn map. - m_state_machine->unlink_lsn_to_req(rreq->lsn(), rreq); if (!rreq->is_proposer()) rreq->clear(); } @@ -1476,7 +1337,7 @@ void RaftReplDev::handle_config_commit(const repl_lsn_t lsn, raft_cluster_config // since we didn't create repl req for config change, we just need to update m_commit_upto_lsn here. 
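// Unlike the retrying dsn loop in handle_commit, the config-commit path below
// makes a single compare_exchange attempt: if the commit LSN has already moved
// at or past this config's LSN, that is an ordering violation to be logged,
// not retried. A hedged standalone rendering (helper name is hypothetical):
#include <atomic>
#include <cstdint>
bool try_advance_commit_lsn(std::atomic< int64_t >& commit_upto, int64_t config_lsn) {
    auto prev = commit_upto.load(std::memory_order_relaxed);
    // false => a log entry at or beyond config_lsn committed first, which the
    // caller reports as "log committed before config committed".
    return prev < config_lsn && commit_upto.compare_exchange_strong(prev, config_lsn);
}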
RD_LOGD(NO_TRACE_ID, "config commit on lsn {}", lsn); // keep this variable in case it is needed later - (void)new_conf; + (void) new_conf; auto prev_lsn = m_commit_upto_lsn.load(std::memory_order_relaxed); if (prev_lsn >= lsn || !m_commit_upto_lsn.compare_exchange_strong(prev_lsn, lsn)) { RD_LOGE(NO_TRACE_ID, "Raft Channel: unexpected log {} commited before config {} committed", prev_lsn, lsn); @@ -1544,70 +1405,32 @@ void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) } void RaftReplDev::start_replace_member(repl_req_ptr_t rreq) { - auto ctx = r_cast< const replace_member_ctx* >(rreq->header().cbytes()); + auto members = r_cast< const replace_member_ctx* >(rreq->header().cbytes()); - RD_LOGI(rreq->traceID(), "Raft repl start_replace_member commit, task_id={} member_out={} member_in={}", - ctx->task_id, boost::uuids::to_string(ctx->replica_out.id), boost::uuids::to_string(ctx->replica_in.id)); + RD_LOGI(rreq->traceID(), "Raft repl start_replace_member commit member_out={} member_in={}", + boost::uuids::to_string(members->replica_out.id), boost::uuids::to_string(members->replica_in.id)); - m_listener->on_start_replace_member(ctx->task_id, ctx->replica_out, ctx->replica_in, rreq->traceID()); + m_listener->on_start_replace_member(members->replica_out, members->replica_in, rreq->traceID()); // record the replace_member intent std::unique_lock lg{m_sb_mtx}; - std::strncpy(m_rd_sb->replace_member_task.task_id, ctx->task_id, max_replace_member_task_id_len); - m_rd_sb->replace_member_task.replica_in = ctx->replica_in.id; - m_rd_sb->replace_member_task.replica_out = ctx->replica_out.id; + m_rd_sb->replace_member_ctx.replica_in = members->replica_in.id; + m_rd_sb->replace_member_ctx.replica_out = members->replica_out.id; m_rd_sb.write(); } void RaftReplDev::complete_replace_member(repl_req_ptr_t rreq) { - auto ctx = r_cast< const replace_member_ctx* >(rreq->header().cbytes()); - auto task_id = std::string(ctx->task_id); - RD_LOGI(rreq->traceID(), "Raft repl complete_replace_member commit, task_id={} member_out={} member_in={}", task_id, - boost::uuids::to_string(ctx->replica_out.id), boost::uuids::to_string(ctx->replica_in.id)); + auto members = r_cast< const replace_member_ctx* >(rreq->header().cbytes()); - m_listener->on_complete_replace_member(ctx->task_id, ctx->replica_out, ctx->replica_in, rreq->traceID()); + RD_LOGI(rreq->traceID(), "Raft repl complete_replace_member commit member_out={} member_in={}", + boost::uuids::to_string(members->replica_out.id), boost::uuids::to_string(members->replica_in.id)); + + m_listener->on_complete_replace_member(members->replica_out, members->replica_in, rreq->traceID()); // clear the replace_member intent std::unique_lock lg{m_sb_mtx}; - auto persisted_task_id = get_replace_member_task_id(); - if (!persisted_task_id.empty()) { - RD_DBG_ASSERT(persisted_task_id == task_id, - "Invalid task_id in complete_replace_member message, received {}, expected {}", ctx->task_id, - m_rd_sb->replace_member_task.task_id); - m_rd_sb->replace_member_task = replace_member_task_superblk{}; - m_rd_sb.write(); - } - RD_LOGI(rreq->traceID(), "Raft repl replace_member_task has been cleared."); -} - -void RaftReplDev::update_truncation_boundary(repl_req_ptr_t rreq) { - repl_lsn_t cur_checkpoint_lsn = 0; - { - std::unique_lock lg{m_sb_mtx}; - cur_checkpoint_lsn = m_rd_sb->checkpoint_lsn; - } - // expected truncation_upper_limit should not larger than the current checkpoint_lsn, this is to ensure that - // when a crash happens before index flushed to disk, 
all the logs larger than checkpoint_lsn are still available - // to replay. - auto ctx = r_cast< const truncate_ctx* >(rreq->header().cbytes()); - auto exp_truncation_upper_limit = std::min(ctx->truncation_upper_limit, cur_checkpoint_lsn); - auto cur_truncation_upper_limit = m_truncation_upper_limit.load(); - // exp_truncation_upper_limit might be less or equal to cur_truncation_upper_limit after Baseline Re-sync, - // we should skip update to ensure the truncation_upper_limit is always increasing. - // for example: - // T1: Leader commits upto 10000, truncate logs upto 5000, while one of followers F1 is lagging behind with lsn 100 - // T2: F1 receives a snapshot with lsn 10000, start catching up - // T3: Leader commits upto 11000, propose truncation_upper_limit as 6000 - // T4: F1 catches up and commits upto 10000, this time truncation_upper_limit is updated as 10000 - // T5: F1 doing incremental re-sync, applies the log with truncation_upper_limit=6000, which is less than 10000 - if (exp_truncation_upper_limit <= cur_truncation_upper_limit) { - RD_LOGW(NO_TRACE_ID, "exp_truncation_upper_limit {} is no larger than cur_truncation_upper_limit {}", - exp_truncation_upper_limit, cur_truncation_upper_limit); - return; - } - - while (cur_truncation_upper_limit < exp_truncation_upper_limit && - !m_truncation_upper_limit.compare_exchange_weak(cur_truncation_upper_limit, exp_truncation_upper_limit)) {} - RD_LOGI(NO_TRACE_ID, "Raft repl update truncation_upper_limit to {}", exp_truncation_upper_limit); + m_rd_sb->replace_member_ctx = replace_member_ctx_superblk{}; + m_rd_sb.write(); + RD_LOGI(rreq->traceID(), "Raft repl replace_member_ctx has been cleared."); } static bool blob_equals(sisl::blob const& a, sisl::blob const& b) { @@ -1621,68 +1444,42 @@ repl_req_ptr_t RaftReplDev::repl_key_to_req(repl_key const& rkey) const { return it->second; } -// async_read and async_free_blks graceful shutdown will be handled by data_service. - folly::Future< std::error_code > RaftReplDev::async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, bool part_of_batch, trace_id_t tid) { if (is_stopping()) { LOGINFO("repl dev is being shutdown!"); return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); } - if (get_stage() != repl_dev_stage_t::ACTIVE) { - LOGINFO("repl dev is not active!"); - return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); - } return data_service().async_read(bid, sgs, size, part_of_batch); } folly::Future< std::error_code > RaftReplDev::async_free_blks(int64_t, MultiBlkId const& bid, trace_id_t tid) { // TODO: For timeline consistency required, we should retain the blkid that is changed and write that to another // journal. 
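// The is_stopping()/incr_pending_request_num()/decr_pending_request_num()
// triplets dropped throughout the hunks below form a manual request gate, and
// every early return must remember the matching decrement. A hedged sketch of
// an RAII guard (hypothetical, not part of this codebase) that would make the
// pairing automatic:
#include <atomic>
#include <cstdint>
class PendingRequestGuard {
public:
    explicit PendingRequestGuard(std::atomic< int64_t >& counter) : m_counter{counter} { ++m_counter; }
    ~PendingRequestGuard() { --m_counter; } // runs on every exit path, early returns included
    PendingRequestGuard(PendingRequestGuard const&) = delete;
    PendingRequestGuard& operator=(PendingRequestGuard const&) = delete;
private:
    std::atomic< int64_t >& m_counter;
};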
-    if (is_stopping()) {
-        LOGINFO("repl dev is being shutdown!");
-        return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled));
-    }
-    if (get_stage() != repl_dev_stage_t::ACTIVE) {
-        LOGINFO("repl dev is not active!");
-        return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled));
-    }
    return data_service().async_free_blk(bid);
 }
 
 AsyncReplResult<> RaftReplDev::become_leader() {
-    if (is_stopping()) {
-        LOGINFO("repl dev is being shutdown!");
-        return make_async_error<>(ReplServiceError::STOPPING);
-    }
-    incr_pending_request_num();
-
     return m_msg_mgr.become_leader(m_group_id).via(&folly::InlineExecutor::instance()).thenValue([this](auto&& e) {
         if (e.hasError()) {
             RD_LOGE(NO_TRACE_ID, "Error in becoming leader: {}", e.error());
             decr_pending_request_num();
             return make_async_error<>(RaftReplService::to_repl_error(e.error()));
         }
-        decr_pending_request_num();
         return make_async_success<>();
     });
 }
 
-bool RaftReplDev::is_leader() const { return m_repl_svc_ctx && m_repl_svc_ctx->is_raft_leader(); }
+bool RaftReplDev::is_leader() const { return m_repl_svc_ctx->is_raft_leader(); }
 
 replica_id_t RaftReplDev::get_leader_id() const {
     static replica_id_t empty_uuid = boost::uuids::nil_uuid();
-    if (!m_repl_svc_ctx) { return empty_uuid; }
     auto leader = m_repl_svc_ctx->raft_leader_id();
     return leader.empty() ? empty_uuid : boost::lexical_cast< replica_id_t >(leader);
 }
 
 std::vector< peer_info > RaftReplDev::get_replication_status() const {
     std::vector< peer_info > pi;
-    if (!m_repl_svc_ctx) {
-        RD_LOGD(NO_TRACE_ID, "m_repl_svc_ctx doesn't exist, returning empty peer info");
-        return pi;
-    }
     auto rep_status = m_repl_svc_ctx->get_raft_status();
     for (auto const& pinfo : rep_status) {
         pi.emplace_back(peer_info{.id_ = boost::lexical_cast< replica_id_t >(pinfo.id_),
@@ -1698,13 +1495,12 @@ std::set< replica_id_t > RaftReplDev::get_active_peers() const {
     auto repl_status = get_replication_status();
     std::set< replica_id_t > res;
     auto my_committed_idx = m_commit_upto_lsn.load();
    auto laggy = HS_DYNAMIC_CONFIG(consensus.laggy_threshold);
     uint64_t least_active_repl_idx = my_committed_idx > HS_DYNAMIC_CONFIG(consensus.laggy_threshold)
        ? my_committed_idx - HS_DYNAMIC_CONFIG(consensus.laggy_threshold)
        : 0;
-    // peer's last log idx should also >= leader's start_index-1(ensure existence), otherwise leader can't append log
-    // entries to it and baseline resync will be triggerred. Try to avoid conflict between baseline resync and normal
-    // replication.
+    // peer's last log idx should also >= leader's start_index-1 (ensure existence), otherwise leader can't append log entries to it
+    // and baseline resync will be triggered. Try to avoid conflict between baseline resync and normal replication. 
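// A standalone rendering of the cutoff computed below, assuming uint64_t
// indexes and a log start_index >= 1: a peer counts as active only if its
// replicated index is within laggy_threshold of the leader's commit index and
// not below start_index - 1, the oldest entry the leader can still append
// from its log store (anything older requires baseline resync instead).
#include <algorithm>
#include <cstdint>
uint64_t active_peer_floor(uint64_t leader_committed_idx, uint64_t laggy_threshold, uint64_t log_start_index) {
    uint64_t lag_floor = leader_committed_idx > laggy_threshold ? leader_committed_idx - laggy_threshold : 0;
    return std::max(lag_floor, log_start_index - 1);
}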
least_active_repl_idx = std::max(least_active_repl_idx, m_data_journal->start_index() - 1); for (auto p : repl_status) { if (p.id_ == m_my_repl_id) { continue; } @@ -1724,15 +1520,6 @@ std::set< replica_id_t > RaftReplDev::get_active_peers() const { return res; } -uint32_t RaftReplDev::get_quorum_for_commit() const { - auto peers = get_replication_status(); - auto quorum = 0; - for (auto& p : peers) { - if (p.can_vote) { quorum++; } - } - return quorum / 2 + 1; -} - uint32_t RaftReplDev::get_blk_size() const { return data_service().get_blk_size(); } nuraft_mesg::repl_service_ctx* RaftReplDev::group_msg_service() { return m_repl_svc_ctx.get(); } @@ -1859,12 +1646,6 @@ int32_t RaftReplDev::server_id() { return m_raft_server_id; } bool RaftReplDev::is_destroy_pending() const { return (*m_stage.access().get() == repl_dev_stage_t::DESTROYED); } bool RaftReplDev::is_destroyed() const { return (*m_stage.access().get() == repl_dev_stage_t::PERMANENT_DESTROYED); } -repl_dev_stage_t RaftReplDev::get_stage() const { return *m_stage.access().get(); } - -void RaftReplDev::set_stage(repl_dev_stage_t stage) { - m_stage.update([stage](auto* s) { *s = stage; }); -} - /////////////////////////////////// nuraft_mesg::mesg_state_mgr overrides //////////////////////////////////// void RaftReplDev::become_ready() { m_stage.update([](auto* stage) { *stage = repl_dev_stage_t::ACTIVE; }); @@ -2019,75 +1800,61 @@ void RaftReplDev::flush_durable_commit_lsn() { m_rd_sb.write(); } -void RaftReplDev::monitor_replace_member_replication_status() { - if (is_destroyed() || get_stage() == repl_dev_stage_t::UNREADY) { - RD_LOGI(NO_TRACE_ID, "Raft repl dev is destroyed or unready, ignore check replace member status"); +void RaftReplDev::check_replace_member_status() { + if (is_destroyed()) { + RD_LOGI(NO_TRACE_ID, "Raft repl dev is destroyed, ignore check replace member status"); return; } if (!m_repl_svc_ctx || !is_leader()) { return; } - if (m_rd_sb->replace_member_task.replica_in == boost::uuids::nil_uuid() || - m_rd_sb->replace_member_task.replica_out == boost::uuids::nil_uuid()) { + if (m_rd_sb->replace_member_ctx.replica_in == boost::uuids::nil_uuid() || + m_rd_sb->replace_member_ctx.replica_out == boost::uuids::nil_uuid()) { RD_LOGT(NO_TRACE_ID, "No replace member in progress, return"); return; } auto peers = get_replication_status(); - auto task_id = std::string(m_rd_sb->replace_member_task.task_id); - auto replica_in = m_rd_sb->replace_member_task.replica_in; - auto replica_out = m_rd_sb->replace_member_task.replica_out; + auto replica_in = m_rd_sb->replace_member_ctx.replica_in; + auto replica_out = m_rd_sb->replace_member_ctx.replica_out; repl_lsn_t in_lsn = 0; repl_lsn_t out_lsn = 0; repl_lsn_t laggy = HS_DYNAMIC_CONFIG(consensus.laggy_threshold); - auto in_member_found = false; for (auto& peer : peers) { if (peer.id_ == replica_out) { out_lsn = peer.replication_idx_; RD_LOGD(NO_TRACE_ID, "Replica out {} with lsn {}", boost::uuids::to_string(peer.id_), out_lsn); } else if (peer.id_ == replica_in) { - in_member_found = true; in_lsn = peer.replication_idx_; RD_LOGD(NO_TRACE_ID, "Replica in {} with lsn {}", boost::uuids::to_string(peer.id_), in_lsn); } } - if (!in_member_found) { - RD_LOGW(NO_TRACE_ID, - "Checking replace member status, task_id={}, Replica in {} not found in the peers, add_member might " - "fail, wait for users to retry or rollback the task", - task_id, boost::uuids::to_string(replica_in)); - return; - } // TODO optimize the condition bool catch_up = in_lsn + laggy >= out_lsn; if (!catch_up) { - 
RD_LOGD(NO_TRACE_ID, - "Checking replace member status, task_id={},replica_in={} with lsn={}, replica_out={} with lsn={}", - task_id, boost::uuids::to_string(replica_in), in_lsn, boost::uuids::to_string(replica_out), out_lsn); + RD_LOGD(NO_TRACE_ID, "Checking replace member status, replica_in={} with lsn={}, replica_out={} with lsn={}", + boost::uuids::to_string(replica_in), in_lsn, boost::uuids::to_string(replica_out), out_lsn); return; } RD_LOGD(NO_TRACE_ID, - "Checking replace member status, new member has caught up, task_id={}, replica_in={} with lsn={}, " - "replica_out={} with " + "Checking replace member status, new member has caught up, replica_in={} with lsn={}, replica_out={} with " "lsn={}", - task_id, boost::uuids::to_string(replica_in), in_lsn, boost::uuids::to_string(replica_out), out_lsn); + boost::uuids::to_string(replica_in), in_lsn, boost::uuids::to_string(replica_out), out_lsn); trace_id_t trace_id = generateRandomTraceId(); - RD_LOGD(trace_id, "Trigger complete_replace_member, task_id={}, replica_in={}, replica_out={}", - boost::uuids::to_string(replica_in), boost::uuids::to_string(replica_in), - boost::uuids::to_string(replica_out)); + RD_LOGD(trace_id, "Trigger complete_replace_member, replica_in={}, replica_out={}", + boost::uuids::to_string(replica_in), boost::uuids::to_string(replica_out)); replica_member_info out{replica_out, ""}; replica_member_info in{replica_in, ""}; - auto ret = complete_replace_member(task_id, out, in, 0, trace_id).get(); + auto ret = complete_replace_member(out, in, 0, trace_id).get(); if (ret.hasError()) { - RD_LOGE(trace_id, "Failed to complete replace member, next time will retry it, task_id={}, error={}", task_id, - ret.error()); + RD_LOGE(trace_id, "Failed to complete replace member, next time will retry it, error={}", ret.error()); return; } - RD_LOGI(trace_id, "Complete replace member, task_id={}, replica_in={}, replica_out={}", task_id, + RD_LOGI(trace_id, "Complete replace member, replica_in={}, replica_out={}", boost::uuids::to_string(replica_in), boost::uuids::to_string(replica_out)) } @@ -2195,18 +1962,9 @@ void RaftReplDev::gc_repl_reqs() { if (removing_rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { auto blkid = removing_rreq->local_blkid(); data_service().async_free_blk(blkid).thenValue([this, blkid, removing_rreq](auto&& err) { - if (!err) { - RD_LOGD(removing_rreq->traceID(), "GC rreq: Releasing blkid={} freed successfully", - blkid.to_string()); - } else if (err == std::make_error_code(std::errc::operation_canceled)) { - // The gc reaper thread stops after the data service has been stopped, - // leading to a scenario where it attempts to free the blkid while the data service is inactive. - // In this case, we ignore the error and simply log a warning. - RD_LOGW(removing_rreq->traceID(), "GC rreq: Releasing blkid={} canceled", blkid.to_string()); - } else { - HS_LOG_ASSERT(false, "[traceID={}] freeing blkid={} upon error failed, potential to cause blk leak", - removing_rreq->traceID(), blkid.to_string()); - } + HS_LOG_ASSERT(!err, "freeing blkid={} upon error failed, potential to cause blk leak", + blkid.to_string()); + RD_LOGD(removing_rreq->traceID(), "GC rreq: Releasing blkid={} freed successfully", blkid.to_string()); }); } // 2. 
remove from the m_repl_key_req_map @@ -2219,10 +1977,6 @@ void RaftReplDev::gc_repl_reqs() { void RaftReplDev::set_log_store_last_durable_lsn(store_lsn_t lsn) { m_data_journal->set_last_durable_lsn(lsn); } void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx) { - if (get_stage() == repl_dev_stage_t::UNREADY) { - RD_LOGI(NO_TRACE_ID, "Raft Channel: repl dev is in UNREADY stage, skip log replay."); - return; - } auto repl_lsn = to_repl_lsn(lsn); if (need_skip_processing(repl_lsn)) { RD_LOGI(NO_TRACE_ID, @@ -2482,17 +2236,4 @@ bool RaftReplDev::is_ready_for_traffic() const { } return ready; } - -void RaftReplDev::pause_state_machine(size_t timeout) { - RD_LOGI(NO_TRACE_ID, "Pause state machine for group_id={}", group_id_str()); - raft_server()->pause_state_machine_execution(timeout); -} - -bool RaftReplDev::is_state_machine_paused() { return raft_server()->is_state_machine_execution_paused(); } - -void RaftReplDev::resume_state_machine() { - RD_LOGI(NO_TRACE_ID, "Resume state machine execution for group_id={}", group_id_str()); - raft_server()->resume_state_machine_execution(); -} - } // namespace homestore diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 60245dfa4..abede36bf 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -15,11 +15,7 @@ #include "replication/log_store/repl_log_store.h" namespace homestore { - -static constexpr uint64_t max_replace_member_task_id_len = 64; - -struct replace_member_task_superblk { - char task_id[max_replace_member_task_id_len]; +struct replace_member_ctx_superblk { replica_id_t replica_out; replica_id_t replica_in; }; @@ -34,7 +30,7 @@ struct raft_repl_dev_superblk : public repl_dev_superblk { uint64_t last_applied_dsn; // Last applied data sequence number uint8_t destroy_pending; // Flag to indicate whether the group is in destroy pending state repl_lsn_t last_snapshot_lsn; // Last snapshot LSN follower received from leader - replace_member_task_superblk replace_member_task; // Replace members task, used to track the replace member status + replace_member_ctx_superblk replace_member_ctx; // Replace members context, used to track the replace member status uint32_t get_raft_sb_version() const { return raft_sb_version; } }; @@ -43,26 +39,11 @@ struct raft_repl_dev_superblk : public repl_dev_superblk { using raft_buf_ptr_t = nuraft::ptr< nuraft::buffer >; using raft_cluster_config_ptr_t = nuraft::ptr< nuraft::cluster_config >; +ENUM(repl_dev_stage_t, uint8_t, INIT, ACTIVE, DESTROYING, DESTROYED, PERMANENT_DESTROYED); + struct replace_member_ctx { - char task_id[max_replace_member_task_id_len]; replica_member_info replica_out; replica_member_info replica_in; - - replace_member_ctx() = default; - replace_member_ctx(const std::string& id, const replica_member_info& out, const replica_member_info& in) { - auto len = std::min(id.length(), max_replace_member_task_id_len - 1); - std::strncpy(task_id, id.c_str(), len); - task_id[len] = '\0'; - replica_out = out; - replica_in = in; - } -}; - -struct truncate_ctx { - repl_lsn_t truncation_upper_limit = 0; - - truncate_ctx() = default; - explicit truncate_ctx(repl_lsn_t limit) : truncation_upper_limit(limit) {} }; class RaftReplDevMetrics : public sisl::MetricsGroup { @@ -219,7 +200,6 @@ class RaftReplDev : public ReplDev, // the state machine should committed to before accepting traffic. 
This threshold ensures that // all potential committed log be committed before handling incoming requests. std::atomic< repl_lsn_t > m_traffic_ready_lsn{0}; - std::atomic< repl_lsn_t > m_truncation_upper_limit{0}; // LSN upto which it can truncate the logs in log store std::mutex m_sb_mtx; // Lock to protect the repl dev superblock @@ -249,16 +229,11 @@ class RaftReplDev : public ReplDev, bool bind_data_service(); bool join_group(); - AsyncReplResult<> start_replace_member(std::string& task_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum = 0, - uint64_t trace_id = 0); - AsyncReplResult<> complete_replace_member(std::string& task_id, const replica_member_info& member_out, + AsyncReplResult<> start_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, + uint32_t commit_quorum = 0, uint64_t trace_id = 0); + AsyncReplResult<> complete_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum = 0, uint64_t trace_id = 0); - ReplaceMemberStatus get_replace_member_status(std::string& task_id, const replica_member_info& member_out, - const replica_member_info& member_in, - const std::vector< replica_member_info >& others, - uint64_t trace_id = 0); AsyncReplResult<> flip_learner_flag(const replica_member_info& member, bool target, uint32_t commit_quorum, bool wait_and_verify = true, uint64_t trace_id = 0); ReplServiceError do_add_member(const replica_member_info& member, uint64_t trace_id = 0); @@ -267,11 +242,9 @@ class RaftReplDev : public ReplDev, uint64_t trace_id = 0); ReplServiceError set_priority(const replica_id_t& member, int32_t priority, uint64_t trace_id = 0); nuraft::cmd_result_code retry_when_config_changing(const std::function< nuraft::cmd_result_code() >& func, - uint64_t trace_id = 0); + uint64_t trace_id = 0); bool wait_and_check(const std::function< bool() >& check_func, uint32_t timeout_ms, uint32_t interval_ms = 100); - std::string get_replace_member_task_id() const { return {m_rd_sb->replace_member_task.task_id}; } - folly::SemiFuture< ReplServiceError > destroy_group(); //////////////// All ReplDev overrides/implementation /////////////////////// @@ -318,20 +291,13 @@ class RaftReplDev : public ReplDev, repl_lsn_t get_last_commit_lsn() const override { return m_commit_upto_lsn.load(); } void set_last_commit_lsn(repl_lsn_t lsn) { m_commit_upto_lsn.store(lsn); } repl_lsn_t get_last_append_lsn() override { return raft_server()->get_last_log_idx(); } - repl_lsn_t get_truncation_upper_limit() const { return m_truncation_upper_limit.load(); } bool is_destroy_pending() const; bool is_destroyed() const; - void set_stage(repl_dev_stage_t stage); - repl_dev_stage_t get_stage() const; - uint32_t get_quorum_for_commit() const; Clock::time_point destroyed_time() const { return m_destroyed_time; } bool is_ready_for_traffic() const override; // purge all resources (e.g., logs in logstore) is a very dangerous operation, it is not supported yet. 
void purge() override { RD_REL_ASSERT(false, "NOT SUPPORTED YET"); } - void pause_state_machine(size_t timeout) override; - void resume_state_machine() override; - bool is_state_machine_paused() override; std::shared_ptr< snapshot_context > deserialize_snapshot_context(sisl::io_blob_safe& snp_ctx) override { return std::make_shared< nuraft_snapshot_context >(snp_ctx); @@ -406,10 +372,9 @@ class RaftReplDev : public ReplDev, void flush_durable_commit_lsn(); /** - * Monitor the replace_member replication status, if the new member is fully synced up and ready to take over, - * remove the old member. + * Check the replace_member status, if the new member is fully synced up and ready to take over, remove the old member. */ - void monitor_replace_member_replication_status(); + void check_replace_member_status(); /** * \brief This method is called during restart to notify the upper layer @@ -484,9 +449,6 @@ class RaftReplDev : public ReplDev, void create_snp_resync_data(raft_buf_ptr_t& data_out); bool save_snp_resync_data(nuraft::buffer& data, nuraft::snapshot& s); - void update_truncation_boundary(repl_req_ptr_t rreq); - void propose_truncate_boundary(); - void report_blk_metrics_if_needed(repl_req_ptr_t rreq); ReplServiceError init_req_ctx(repl_req_ptr_t rreq, repl_key rkey, journal_type_t op_code, bool is_proposer, sisl::blob const& user_header, sisl::blob const& key, uint32_t data_size, diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index 2217f3e3e..c0f910741 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -12,8 +12,6 @@ #include "common/homestore_config.hpp" #include "common/crash_simulator.hpp" -SISL_LOGGING_DECL(replication) - namespace homestore { RaftStateMachine::RaftStateMachine(RaftReplDev& rd) : m_rd{rd} { @@ -135,17 +133,13 @@ repl_req_ptr_t RaftStateMachine::localize_journal_entry_finish(nuraft::log_entry // If we are able to locate that req in the map for this entry, it could be one of // a) This is an inline data and don't need any localization // b) This is a proposer and thus don't need any localization - // c) This is a proposer but term has changed. This can happen if the leader re-election happen between - // saving req and proposing it to raft. - // d) This is an indirect data and we received raft entry append from leader and localized the journal entry. - // e) This is an indirect data and we received only on data channel, but no raft entry append from leader. This + // c) This is an indirect data and we received raft entry append from leader and localized the journal entry. + // d) This is an indirect data and we received only on data channel, but no raft entry append from leader. This // would mean _prepare is never called but directly finish is called. This can happen if that the leader is not // the original proposer (perhaps unsupported scenario at this time) // - // On case a), b), we return the rreq as is. - // For case c), we localize the actual term and then finish them as proposer. - // For case d), we just need to localize the actual server_id as well (as finishing step). - // For case e), we prepare the localization of journal entry and then finish them + // On case a), b), we return the rreq as is. For case c), we just need to localize the actual server_id as well (as + // finishing step). 
For case d), we prepare the localization of journal entry and then finish them // // // If we are not able to locate that req in the map for this entry, it means that no entry from raft leader is @@ -401,9 +395,6 @@ void RaftStateMachine::save_logical_snp_obj(nuraft::snapshot& s, ulong& obj_id, m_rd.m_listener->write_snapshot_obj(snp_ctx, snp_data); if (is_last_obj) { - // Nuraft will compact and truncate all logs when processeing the last obj. - // Update the truncation upper limit here to ensure all stale logs are truncated. - m_rd.m_truncation_upper_limit.exchange(s_cast< repl_lsn_t >(s.get_last_log_idx())); hs()->cp_mgr().trigger_cp_flush(true).wait(); // ensure DSN is flushed to disk } @@ -413,7 +404,7 @@ void RaftStateMachine::save_logical_snp_obj(nuraft::snapshot& s, ulong& obj_id, #ifdef _PRERELEASE if (iomgr_flip::instance()->test_flip("baseline_resync_restart_new_follower")) { LOGINFO("Hit flip baseline_resync_restart_new_follower crashing"); - hs()->crash_simulator().crash(); + hs()->crash_simulator().crash_now(); } #endif } diff --git a/src/lib/replication/repl_dev/solo_repl_dev.cpp b/src/lib/replication/repl_dev/solo_repl_dev.cpp index 2bb38a4df..03b540184 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.cpp +++ b/src/lib/replication/repl_dev/solo_repl_dev.cpp @@ -1,4 +1,3 @@ -#include #include #include "replication/repl_dev/solo_repl_dev.h" #include "replication/repl_dev/common.h" @@ -7,13 +6,9 @@ #include #include #include "common/homestore_assert.hpp" -#include "common/homestore_config.hpp" -#include - -SISL_LOGGING_DECL(solorepl) namespace homestore { -SoloReplDev::SoloReplDev(superblk< solo_repl_dev_superblk >&& rd_sb, bool load_existing) : +SoloReplDev::SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existing) : m_rd_sb{std::move(rd_sb)}, m_group_id{m_rd_sb->group_id} { if (load_existing) { m_logdev_id = m_rd_sb->logdev_id; @@ -26,13 +21,11 @@ SoloReplDev::SoloReplDev(superblk< solo_repl_dev_superblk >&& rd_sb, bool load_e m_data_journal->register_log_found_cb(bind_this(SoloReplDev::on_log_found, 3)); m_is_recovered = true; }); - m_commit_upto = m_rd_sb->durable_commit_lsn; } else { m_logdev_id = logstore_service().create_new_logdev(flush_mode_t::TIMER); m_data_journal = logstore_service().create_new_log_store(m_logdev_id, true /* append_mode */); m_rd_sb->logstore_id = m_data_journal->get_store_id(); m_rd_sb->logdev_id = m_logdev_id; - m_rd_sb->checkpoint_lsn = -1; m_rd_sb.write(); m_is_recovered = true; } @@ -42,7 +35,7 @@ void SoloReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& repl_req_ptr_t rreq, bool part_of_batch, trace_id_t tid) { if (!rreq) { auto rreq = repl_req_ptr_t(new repl_req_ctx{}); } - incr_pending_request_num(); + // incr_pending_request_num(); auto status = rreq->init(repl_key{.server_id = 0, .term = 1, .dsn = 1, .traceID = tid}, value.size ? 
journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true, header, key, value.size, m_listener); @@ -54,9 +47,7 @@ void SoloReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& HS_REL_ASSERT(!err, "Error in writing data"); // TODO: Find a way to return error to the Listener write_journal(std::move(rreq)); }); - } else { - write_journal(std::move(rreq)); - } + } else { write_journal(std::move(rreq)); } } // destroy is only called in worker thread; @@ -88,38 +79,38 @@ void SoloReplDev::write_journal(repl_req_ptr_t rreq) { data_service().commit_blk(blkid); } m_listener->on_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq->local_blkids(), rreq); - decr_pending_request_num(); + // decr_pending_request_num(); }); } std::error_code SoloReplDev::alloc_blks(uint32_t data_size, const blk_alloc_hints& hints, std::vector< MultiBlkId >& out_blkids) { - if (is_stopping()) { return std::make_error_code(std::errc::operation_canceled); } + // if (is_stopping()) { return std::make_error_code(std::errc::operation_canceled); } - incr_pending_request_num(); + // incr_pending_request_num(); std::vector< BlkId > blkids; auto status = data_service().alloc_blks(sisl::round_up(uint32_cast(data_size), data_service().get_blk_size()), hints, blkids); if (status != BlkAllocStatus::SUCCESS) { DEBUG_ASSERT_EQ(status, BlkAllocStatus::SUCCESS, "Unable to allocate blks"); - decr_pending_request_num(); + // decr_pending_request_num(); return std::make_error_code(std::errc::no_space_on_device); } for (auto& blkid : blkids) { out_blkids.emplace_back(blkid); } - decr_pending_request_num(); + // decr_pending_request_num(); return std::error_code{}; } folly::Future< std::error_code > SoloReplDev::async_write(const std::vector< MultiBlkId >& blkids, sisl::sg_list const& value, bool part_of_batch, trace_id_t tid) { - if (is_stopping()) { + /*if (is_stopping()) { return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); - } + }*/ - incr_pending_request_num(); + // incr_pending_request_num(); HS_REL_ASSERT_GT(blkids.size(), 0, "Empty blkid vec"); std::vector< folly::Future< std::error_code > > futs; futs.reserve(blkids.size()); @@ -141,21 +132,21 @@ folly::Future< std::error_code > SoloReplDev::async_write(const std::vector< Mul } return folly::collectAllUnsafe(futs).thenValue([this](auto&& v_res) { - decr_pending_request_num(); for (const auto& err_c : v_res) { if (sisl_unlikely(err_c.value())) { return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::io_error)); } } + // decr_pending_request_num(); return folly::makeFuture< std::error_code >(std::error_code{}); }); } void SoloReplDev::async_write_journal(const std::vector< MultiBlkId >& blkids, sisl::blob const& header, sisl::blob const& key, uint32_t data_size, repl_req_ptr_t rreq, trace_id_t tid) { - if (is_stopping()) { return; } - incr_pending_request_num(); + // if (is_stopping()) { return; } + // incr_pending_request_num(); // We expect clients to provide valid repl req ctx with blocks allocated. 
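// A hedged caller-side sketch of the contract stated above: with
// async_write_journal(), the client first allocates blocks and writes the data
// itself, then hands a pre-populated repl req to the journal step. The helper
// name is hypothetical, error handling is trimmed, and the homestore/sisl
// headers included by this file are assumed.
void write_with_preallocated_blks(SoloReplDev& dev, sisl::blob const& header, sisl::blob const& key,
                                  sisl::sg_list const& value, repl_req_ptr_t rreq, trace_id_t tid) {
    std::vector< MultiBlkId > blkids;
    if (dev.alloc_blks(uint32_cast(value.size), blk_alloc_hints{}, blkids)) { return; } // non-zero error_code => bail
    auto err = dev.async_write(blkids, value, false /* part_of_batch */, tid).get(); // wait for the data write
    if (!err) { dev.async_write_journal(blkids, header, key, uint32_cast(value.size), rreq, tid); }
}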
HS_REL_ASSERT(rreq, "Invalid repl req ctx"); @@ -202,31 +193,27 @@ void SoloReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx auto cur_lsn = m_commit_upto.load(); if (cur_lsn < lsn) { m_commit_upto.compare_exchange_strong(cur_lsn, lsn); } - for (const auto& blkid : blkids) { - data_service().commit_blk(blkid); - } - m_listener->on_commit(lsn, header, key, blkids, nullptr); } folly::Future< std::error_code > SoloReplDev::async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, bool part_of_batch, trace_id_t tid) { - if (is_stopping()) { + /*if (is_stopping()) { return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); - } - incr_pending_request_num(); + }*/ + // incr_pending_request_num(); auto result = data_service().async_read(bid, sgs, size, part_of_batch); - decr_pending_request_num(); + // decr_pending_request_num(); return result; } folly::Future< std::error_code > SoloReplDev::async_free_blks(int64_t, MultiBlkId const& bid, trace_id_t tid) { - if (is_stopping()) { + /*if (is_stopping()) { return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); - } - incr_pending_request_num(); + }*/ + // incr_pending_request_num(); auto result = data_service().async_free_blk(bid); - decr_pending_request_num(); + // decr_pending_request_num(); return result; } @@ -235,34 +222,11 @@ uint32_t SoloReplDev::get_blk_size() const { return data_service().get_blk_size( void SoloReplDev::cp_flush(CP*) { auto lsn = m_commit_upto.load(); m_rd_sb->durable_commit_lsn = lsn; - // Store the LSN's for last 3 checkpoints - m_rd_sb->last_checkpoint_lsn_2 = m_rd_sb->last_checkpoint_lsn_1; - m_rd_sb->last_checkpoint_lsn_1 = m_rd_sb->checkpoint_lsn; m_rd_sb->checkpoint_lsn = lsn; - HS_LOG(TRACE, solorepl, "dev={} cp flush cp_lsn={} cp_lsn_1={} cp_lsn_2={}", boost::uuids::to_string(group_id()), - lsn, m_rd_sb->last_checkpoint_lsn_1, m_rd_sb->last_checkpoint_lsn_2); m_rd_sb.write(); } -void SoloReplDev::truncate() { - // Ignore truncate when HS is initializing. And we need atleast 3 checkpoints to start truncating. - - if (homestore::hs()->is_initializing() || m_rd_sb->last_checkpoint_lsn_2 <= 0) { return; } - - // Truncate is safe anything below last_checkpoint_lsn - 2 as all the free blks - // before that will be flushed in the last_checkpoint. - HS_LOG(TRACE, solorepl, "dev={} truncating at lsn={}", boost::uuids::to_string(group_id()), - m_rd_sb->last_checkpoint_lsn_2); - m_data_journal->truncate(m_rd_sb->last_checkpoint_lsn_2); -} - -void SoloReplDev::cp_cleanup(CP*) { -#ifdef _PRERELEASE - if (iomgr_flip::instance()->test_flip("solo_repl_dev_manual_truncate")) { return; } -#endif - // cp_cleanup is called after all components' CP flush is done. - // We call truncate during cp clean up. 
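// A minimal standalone model of the checkpoint-window rule behind the
// truncate() call removed below (field names mirror the removed superblk
// members): each CP flush shifts the last three checkpoint LSNs down one slot,
// and truncation is only safe up to the oldest slot, since every block freed
// before that LSN was flushed by an already-completed checkpoint.
#include <cstdint>
struct cp_lsn_window {
    int64_t checkpoint_lsn{-1};
    int64_t last_checkpoint_lsn_1{-1}; // LSN at last checkpoint - 1
    int64_t last_checkpoint_lsn_2{-1}; // LSN at last checkpoint - 2
};
inline void rotate_on_cp_flush(cp_lsn_window& w, int64_t durable_lsn) {
    w.last_checkpoint_lsn_2 = w.last_checkpoint_lsn_1;
    w.last_checkpoint_lsn_1 = w.checkpoint_lsn;
    w.checkpoint_lsn = durable_lsn;
}
inline int64_t safe_truncation_lsn(cp_lsn_window const& w) {
    return w.last_checkpoint_lsn_2; // <= 0 means not enough checkpoints yet, so skip truncation
}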
- truncate(); +void SoloReplDev::cp_cleanup(CP*) { /* m_data_journal->truncate(m_rd_sb->checkpoint_lsn); */ } } // namespace homestore diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index f4707124d..9cf41dcce 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -18,7 +18,6 @@ #include #include -#include #include #include #include @@ -29,29 +28,17 @@ namespace homestore { class CP; -#pragma pack(1) - -struct solo_repl_dev_superblk : public repl_dev_superblk { - // Store the last 2 checkpoint lsn's where - // last_checkpoint_lsn_2 < last_checkpoint_lsn_1 < checkpoint_lsn - repl_lsn_t last_checkpoint_lsn_1{-1}; // LSN at last_checkpoint - 1 - repl_lsn_t last_checkpoint_lsn_2{-1}; // LSN at last_checkpoint - 2 -}; - -#pragma pack() - class SoloReplDev : public ReplDev { private: logdev_id_t m_logdev_id; std::shared_ptr< HomeLogStore > m_data_journal{nullptr}; - superblk< solo_repl_dev_superblk > m_rd_sb; + superblk< repl_dev_superblk > m_rd_sb; uuid_t m_group_id; std::atomic< logstore_seq_num_t > m_commit_upto{-1}; std::atomic< bool > m_is_recovered{false}; - std::atomic< bool > m_paused{false}; public: - SoloReplDev(superblk< solo_repl_dev_superblk >&& rd_sb, bool load_existing); + SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existing); virtual ~SoloReplDev() = default; virtual std::error_code alloc_blks(uint32_t data_size, const blk_alloc_hints& hints, @@ -79,16 +66,8 @@ class SoloReplDev : public ReplDev { peer_info{.id_ = m_group_id, .replication_idx_ = 0, .last_succ_resp_us_ = 0, .priority_ = 1}}; } bool is_ready_for_traffic() const override { return true; } - void set_stage(repl_dev_stage_t stage) override {} - repl_dev_stage_t get_stage() const override { return repl_dev_stage_t::ACTIVE; } void purge() override {} - void pause_state_machine(size_t timeout) override { m_paused.store(true); } - - void resume_state_machine() override { m_paused.store(false); } - - bool is_state_machine_paused() override { return m_paused.load(); } - std::shared_ptr< snapshot_context > deserialize_snapshot_context(sisl::io_blob_safe& snp_ctx) override { return nullptr; } @@ -115,7 +94,6 @@ class SoloReplDev : public ReplDev { void cp_cleanup(CP* cp); void destroy(); - void truncate(); private: void write_journal(repl_req_ptr_t rreq); diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index b5966f239..6f3861d59 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -46,9 +46,11 @@ GenericReplService::GenericReplService(cshared< ReplApplication >& repl_app) : m nullptr); } -GenericReplService::~GenericReplService() { - std::unique_lock lg{m_rd_map_mtx}; - m_rd_map.clear(); +void GenericReplService::stop() { + { + std::unique_lock lg{m_rd_map_mtx}; + m_rd_map.clear(); + } } ReplResult< shared< ReplDev > > GenericReplService::get_repl_dev(group_id_t group_id) const { @@ -79,7 +81,6 @@ hs_stats GenericReplService::get_cap_stats() const { ///////////////////// SoloReplService specializations and CP Callbacks ///////////////////////////// SoloReplService::SoloReplService(cshared< ReplApplication >& repl_app) : GenericReplService{repl_app} {} -SoloReplService::~SoloReplService() {}; void SoloReplService::start() { for (auto const& [buf, mblk] : m_sb_bufs) { @@ -98,12 +99,12 @@ void SoloReplService::start() { } void SoloReplService::stop() { - start_stopping(); 
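
The GenericReplService hunk above moves teardown out of the destructor into an explicit stop(). The usual motivation for that shape, sketched below with stand-in types (not this patch's classes), is that an explicit stop() runs while members and virtual dispatch are still fully alive and lets the owner order shutdown across services; the destructor then only checks the contract.

    #include <cassert>
    #include <map>
    #include <memory>
    #include <mutex>

    // Stand-in sketch of the explicit-stop shape.
    class Service {
    public:
        virtual ~Service() { assert(m_stopped && "stop() must run before destruction"); }

        virtual void stop() {
            std::lock_guard lg{m_map_mtx};
            m_devices.clear(); // drop shared refs while the service is still intact
            m_stopped = true;
        }

    protected:
        std::mutex m_map_mtx;
        std::map< int, std::shared_ptr< int > > m_devices;
        bool m_stopped{false};
    };
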
+ /*start_stopping(); while (true) { auto pending_request_num = get_pending_request_num(); if (!pending_request_num) break; std::this_thread::sleep_for(std::chrono::milliseconds(1000)); - } + }*/ // stop all repl_devs { @@ -119,7 +120,7 @@ void SoloReplService::stop() { AsyncReplResult< shared< ReplDev > > SoloReplService::create_repl_dev(group_id_t group_id, std::set< replica_id_t > const& members) { - superblk< solo_repl_dev_superblk > rd_sb{get_meta_blk_name()}; + superblk< repl_dev_superblk > rd_sb{get_meta_blk_name()}; rd_sb.create(); rd_sb->group_id = group_id; auto rdev = std::make_shared< SoloReplDev >(std::move(rd_sb), false /* load_existing */); @@ -127,7 +128,7 @@ AsyncReplResult< shared< ReplDev > > SoloReplService::create_repl_dev(group_id_t auto listener = m_repl_app->create_repl_dev_listener(group_id); listener->set_repl_dev(rdev); rdev->attach_listener(std::move(listener)); - incr_pending_request_num(); + // incr_pending_request_num(); { std::unique_lock lg(m_rd_map_mtx); @@ -135,12 +136,12 @@ AsyncReplResult< shared< ReplDev > > SoloReplService::create_repl_dev(group_id_t if (!happened) { // We should never reach here, as we have failed to emplace in map, but couldn't find entry DEBUG_ASSERT(false, "Unable to put the repl_dev in rd map"); - decr_pending_request_num(); + // decr_pending_request_num(); return make_async_error< shared< ReplDev > >(ReplServiceError::SERVER_ALREADY_EXISTS); } } - decr_pending_request_num(); + // decr_pending_request_num(); return make_async_success< shared< ReplDev > >(rdev); } @@ -174,7 +175,7 @@ folly::SemiFuture< ReplServiceError > SoloReplService::remove_repl_dev(group_id_ } void SoloReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) { - superblk< solo_repl_dev_superblk > rd_sb{get_meta_blk_name()}; + superblk< repl_dev_superblk > rd_sb{get_meta_blk_name()}; rd_sb.load(buf, meta_cookie); HS_DBG_ASSERT_EQ(rd_sb->get_magic(), repl_dev_superblk::REPL_DEV_SB_MAGIC, "Invalid rdev metablk, magic mismatch"); HS_DBG_ASSERT_EQ(rd_sb->get_version(), repl_dev_superblk::REPL_DEV_SB_VERSION, "Invalid version of rdev metablk"); @@ -193,27 +194,17 @@ void SoloReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki } } -AsyncReplResult<> SoloReplService::replace_member(group_id_t group_id, std::string& task_id, - const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum, - uint64_t trace_id) const { +AsyncReplResult<> SoloReplService::replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum, + uint64_t trace_id) const { return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); } -AsyncReplResult<> SoloReplService::flip_learner_flag(group_id_t group_id, const replica_member_info& member, - bool target, uint32_t commit_quorum, bool wait_and_verify, - uint64_t trace_id) const { +AsyncReplResult<> SoloReplService::flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, + uint32_t commit_quorum, bool wait_and_verify, uint64_t trace_id) const { return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); } -ReplaceMemberStatus SoloReplService::get_replace_member_status(group_id_t group_id, std::string& task_id, - const replica_member_info& member_out, - const replica_member_info& member_in, - const std::vector< replica_member_info >& others, - uint64_t trace_id) const { - return ReplaceMemberStatus::UNKNOWN; -} - std::unique_ptr< CPContext > 
SoloReplServiceCPHandler::on_switchover_cp(CP* cur_cp, CP* new_cp) { return std::make_unique< CPContext >(new_cp); } diff --git a/src/lib/replication/service/generic_repl_svc.h b/src/lib/replication/service/generic_repl_svc.h index 91aba9f80..cd63a8866 100644 --- a/src/lib/replication/service/generic_repl_svc.h +++ b/src/lib/replication/service/generic_repl_svc.h @@ -47,8 +47,8 @@ class GenericReplService : public ReplicationService { static std::shared_ptr< GenericReplService > create(cshared< ReplApplication >& repl_app); GenericReplService(cshared< ReplApplication >& repl_app); - virtual ~GenericReplService(); virtual void start() = 0; + virtual void stop(); meta_sub_type get_meta_blk_name() const override { return "repl_dev"; } ReplResult< shared< ReplDev > > get_repl_dev(group_id_t group_id) const override; @@ -57,33 +57,15 @@ class GenericReplService : public ReplicationService { hs_stats get_cap_stats() const override; replica_id_t get_my_repl_uuid() const { return m_my_uuid; } // void resource_audit() override; - virtual void stop() = 0; - - repl_impl_type get_impl_type() const { return m_repl_app->get_impl_type(); } protected: virtual void add_repl_dev(group_id_t group_id, shared< ReplDev > rdev); virtual void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) = 0; - - // graceful shutdown related -protected: - std::atomic_bool m_stopping{false}; - mutable std::atomic_uint64_t pending_request_num{0}; - - bool is_stopping() const { return m_stopping.load(); } - void start_stopping() { m_stopping = true; } - - uint64_t get_pending_request_num() const { return pending_request_num.load(); } - - void incr_pending_request_num() const { pending_request_num++; } - void decr_pending_request_num() const { pending_request_num--; } }; -// TODO: implement graceful shutdown for soloReplService class SoloReplService : public GenericReplService { public: SoloReplService(cshared< ReplApplication >& repl_app); - ~SoloReplService() override; void start() override; void stop() override; @@ -91,17 +73,12 @@ class SoloReplService : public GenericReplService { std::set< replica_id_t > const& members) override; folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) override; void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; - AsyncReplResult<> replace_member(group_id_t group_id, std::string& task_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum = 0, - uint64_t trace_id = 0) const override; + AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum = 0, + uint64_t trace_id = 0) const override; AsyncReplResult<> flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, uint32_t commit_quorum, bool wait_and_verify = true, uint64_t trace_id = 0) const override; - ReplaceMemberStatus get_replace_member_status(group_id_t group_id, std::string& task_id, - const replica_member_info& member_out, - const replica_member_info& member_in, - const std::vector< replica_member_info >& others, - uint64_t trace_id = 0) const override; }; class SoloReplServiceCPHandler : public CPCallbacks { diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index 3af6908bf..8df5d5e6a 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -65,8 +65,7 @@ int32_t 
RaftReplService::compute_raft_follower_priority() { auto max_wait_round = std::min(raft_priority_election_round_upper_limit, HS_DYNAMIC_CONFIG(consensus.max_wait_rounds_of_priority_election)); if (max_wait_round == 0) { return raft_leader_priority; } - auto priority = 1 + - static_cast< int32_t >( + auto priority = 1 + static_cast< int32_t >( std::ceil(raft_leader_priority * std::pow(raft_priority_decay_coefficient, max_wait_round))); return priority; } @@ -93,8 +92,7 @@ void RaftReplService::start() { .token_verifier_ = std::dynamic_pointer_cast< sisl::GrpcTokenVerifier >(ioenvironment.get_token_verifier()), .token_client_ = std::dynamic_pointer_cast< sisl::GrpcTokenClient >(ioenvironment.get_token_client()), .max_receive_message_size_ = HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size), - .max_send_message_size_ = HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size), - .enable_console_log_ = HS_DYNAMIC_CONFIG(consensus.enable_console_log)}; + .max_send_message_size_ = HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size)}; m_msg_mgr = nuraft_mesg::init_messaging(params, weak_from_this(), true /* with_data_channel */); LOGINFO("Starting RaftReplService with server_uuid={} port={}", boost::uuids::to_string(params.server_uuid_), @@ -150,8 +148,6 @@ void RaftReplService::start() { } m_config_sb_bufs.clear(); LOGINFO("Repl devs load completed, calling upper layer on_repl_devs_init_completed"); - // The upper layer(m_repl_app) can leverage this cb to initiate and recover its data. - // If some errors occurs, m_repl_app can set back the stage of repl_dev to repl_dev_stage_t::UNREADY. m_repl_app->on_repl_devs_init_completed(); // Step 5: Start the data and logstore service now. This step is essential before we can ask Raft to join groups etc @@ -176,10 +172,6 @@ void RaftReplService::start() { // Step 6: Iterate all the repl devs and ask each one of them to join the raft group concurrently. std::vector< std::future< bool > > join_group_futures; for (const auto& [_, repl_dev] : m_rd_map) { - if (repl_dev->get_stage() == repl_dev_stage_t::UNREADY) { - LOGINFO("Repl dev is unready, skip join group, group_id={}", boost::uuids::to_string(repl_dev->group_id())); - continue; - } join_group_futures.emplace_back(std::async(std::launch::async, [&repl_dev]() { auto rdev = std::dynamic_pointer_cast< RaftReplDev >(repl_dev); rdev->wait_for_logstore_ready(); @@ -205,12 +197,14 @@ void RaftReplService::start() { } void RaftReplService::stop() { +#if 0 start_stopping(); while (true) { auto pending_request_num = get_pending_request_num(); if (!pending_request_num) break; std::this_thread::sleep_for(std::chrono::milliseconds(1000)); } +#endif // stop all repl_devs { @@ -228,15 +222,7 @@ void RaftReplService::stop() { // 3 Cancel all scheduler tasks. // after m_msg_mgr is reset , no further data will hit data service and no futher log will hit log store. 
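
The compute_raft_follower_priority() hunk at the top of this file's diff only reflows the expression; the formula itself is 1 + ceil(leader_priority * decay^max_wait_round). A runnable example with assumed constants (the real values come from dynamic config):

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Same formula as compute_raft_follower_priority(), with the config lookups
    // replaced by parameters. The constants in main() are illustrative only.
    int32_t follower_priority(int32_t leader_priority, double decay, uint32_t max_wait_round) {
        if (max_wait_round == 0) { return leader_priority; }
        return 1 + static_cast< int32_t >(std::ceil(leader_priority * std::pow(decay, max_wait_round)));
    }

    int main() {
        // e.g. leader=100, decay=0.8, 2 wait rounds: 100 * 0.64 = 64, so the
        // follower joins with priority 65, below the leader's 100.
        std::printf("%d\n", follower_priority(100, 0.8, 2));
        return 0;
    }
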
m_msg_mgr.reset(); - hs()->logstore_service().stop(); - hs()->data_service().stop(); -} - -RaftReplService::~RaftReplService() { - stop_reaper_thread(); - - // the base class destructor will clear the m_rd_map } void RaftReplService::monitor_cert_changes() { @@ -350,8 +336,6 @@ shared< nuraft_mesg::mesg_state_mgr > RaftReplService::create_state_mgr(int32_t AsyncReplResult< shared< ReplDev > > RaftReplService::create_repl_dev(group_id_t group_id, std::set< replica_id_t > const& members) { - if (is_stopping()) return make_async_error< shared< ReplDev > >(ReplServiceError::STOPPING); - incr_pending_request_num(); // TODO: All operations are made sync here for convenience to caller. However, we should attempt to make this async // and do deferValue to a seperate dedicated hs thread for these kind of operations and wakeup the caller. It // probably needs iomanager executor for deferValue. @@ -359,7 +343,6 @@ AsyncReplResult< shared< ReplDev > > RaftReplService::create_repl_dev(group_id_t // Create a new RAFT group and add all members. create_group() will call the create_state_mgr which will create // the repl_dev instance and add it to the map. if (auto const status = m_msg_mgr->create_group(group_id, "homestore_replication").get(); !status) { - decr_pending_request_num(); return make_async_error< shared< ReplDev > >(to_repl_error(status.error())); } @@ -369,17 +352,16 @@ AsyncReplResult< shared< ReplDev > > RaftReplService::create_repl_dev(group_id_t for (auto& member : members) { if (member == my_id) { continue; } // Skip myself do { - auto srv_config = nuraft::srv_config(nuraft_mesg::to_server_id(member), 0, - boost::uuids::to_string(member), "", false, follower_priority); + auto srv_config = nuraft::srv_config(nuraft_mesg::to_server_id(member), 0, boost::uuids::to_string(member), "", + false, follower_priority); auto const result = m_msg_mgr->add_member(group_id, srv_config).get(); if (result) { - LOGINFOMOD(replication, "Groupid={}, new member={} added with priority={}", - boost::uuids::to_string(group_id), boost::uuids::to_string(member), follower_priority); + LOGINFOMOD(replication, "Groupid={}, new member={} added with priority={}", boost::uuids::to_string(group_id), + boost::uuids::to_string(member), follower_priority); break; } else if (result.error() != nuraft::CONFIG_CHANGING) { LOGWARNMOD(replication, "Groupid={}, add member={} failed with error={}", boost::uuids::to_string(group_id), boost::uuids::to_string(member), result.error()); - decr_pending_request_num(); return make_async_error< shared< ReplDev > >(to_repl_error(result.error())); } else { LOGWARNMOD(replication, @@ -392,7 +374,6 @@ AsyncReplResult< shared< ReplDev > > RaftReplService::create_repl_dev(group_id_t } auto result = get_repl_dev(group_id); - decr_pending_request_num(); return result ? make_async_success< shared< ReplDev > >(result.value()) : make_async_error< shared< ReplDev > >(ReplServiceError::SERVER_NOT_FOUND); } @@ -425,18 +406,12 @@ AsyncReplResult< shared< ReplDev > > RaftReplService::create_repl_dev(group_id_t // purge any unopened logstores. 
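
create_repl_dev() above retries add_member only while the group reports a configuration change in flight; any other error aborts the whole call. The shape of that loop as a self-contained sketch, with stand-in result types (ErrCode/Result are illustrative, not nuraft's API):

    // Stand-in types for illustration only.
    enum class ErrCode { OK, CONFIG_CHANGING, OTHER };
    struct Result {
        ErrCode code;
        explicit operator bool() const { return code == ErrCode::OK; }
    };

    template < typename AddOnceFn >
    Result add_member_with_retry(AddOnceFn&& add_once) {
        while (true) {
            Result const r = add_once();
            if (r) { return r; }                                  // member added
            if (r.code != ErrCode::CONFIG_CHANGING) { return r; } // hard failure: abort
            // A previous membership change is still committing; loop and retry.
        }
    }
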
// folly::SemiFuture< ReplServiceError > RaftReplService::remove_repl_dev(group_id_t group_id) { - if (is_stopping()) return folly::makeSemiFuture< ReplServiceError >(ReplServiceError::STOPPING); - incr_pending_request_num(); - auto rdev_result = get_repl_dev(group_id); - if (!rdev_result) { - decr_pending_request_num(); - return folly::makeSemiFuture< ReplServiceError >(ReplServiceError::SERVER_NOT_FOUND); - } + if (!rdev_result) { return folly::makeSemiFuture< ReplServiceError >(ReplServiceError::SERVER_NOT_FOUND); } auto ret = std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value())->destroy_group(); - decr_pending_request_num(); + // decr_pending_request_num(); return ret; } @@ -490,66 +465,45 @@ void RaftReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki // In this function, it only invokes replDev start_replace_member. There is // a background reaper thread helps periodically check the member_in replication status, after in_member has caught up, // will trigger replDev complete_replace_member. -AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, std::string& task_id, - const replica_member_info& member_out, +AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, const replica_member_info& member_out, const replica_member_info& member_in, uint32_t commit_quorum, uint64_t trace_id) const { - if (is_stopping()) return make_async_error<>(ReplServiceError::STOPPING); - incr_pending_request_num(); + // if (is_stopping()) return make_async_error<>(ReplServiceError::STOPPING); + // incr_pending_request_num(); auto rdev_result = get_repl_dev(group_id); - if (!rdev_result) { - decr_pending_request_num(); - return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); - } + if (!rdev_result) { return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); } return std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value()) - ->start_replace_member(task_id, member_out, member_in, commit_quorum, trace_id) + ->start_replace_member(member_out, member_in, commit_quorum, trace_id) .via(&folly::InlineExecutor::instance()) .thenValue([this](auto&& e) mutable { if (e.hasError()) { - decr_pending_request_num(); + // decr_pending_request_num(); return make_async_error<>(e.error()); } - decr_pending_request_num(); + // decr_pending_request_num(); return make_async_success<>(); }); } -AsyncReplResult<> RaftReplService::flip_learner_flag(group_id_t group_id, const replica_member_info& member, - bool target, uint32_t commit_quorum, bool wait_and_verify, - uint64_t trace_id) const { - if (is_stopping()) return make_async_error<>(ReplServiceError::STOPPING); - incr_pending_request_num(); +AsyncReplResult<> RaftReplService::flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, uint32_t commit_quorum, + bool wait_and_verify, uint64_t trace_id) const { + // if (is_stopping()) return make_async_error<>(ReplServiceError::STOPPING); + // incr_pending_request_num(); auto rdev_result = get_repl_dev(group_id); if (!rdev_result) { - decr_pending_request_num(); + // decr_pending_request_num(); return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); } return std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value()) ->flip_learner_flag(member, target, commit_quorum, wait_and_verify, trace_id) .via(&folly::InlineExecutor::instance()) .thenValue([this](auto&& e) mutable { - if (e.hasError()) { - decr_pending_request_num(); - return make_async_error<>(e.error()); - } - decr_pending_request_num(); + if (e.hasError()) { 
return make_async_error<>(e.error()); } return make_async_success<>(); }); } -// This query should always be called on leader to avoid misleading results due to lagging status on some followers. -ReplaceMemberStatus RaftReplService::get_replace_member_status(group_id_t group_id, std::string& task_id, - const replica_member_info& member_out, - const replica_member_info& member_in, - const std::vector< replica_member_info >& others, - uint64_t trace_id) const { - auto rdev_result = get_repl_dev(group_id); - if (!rdev_result) { return ReplaceMemberStatus::UNKNOWN; } - return std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value()) - ->get_replace_member_status(task_id, member_out, member_in, others, trace_id); -} - ////////////////////// Reaper Thread related ////////////////////////////////// void RaftReplService::start_reaper_thread() { folly::Promise< folly::Unit > p; @@ -581,11 +535,11 @@ void RaftReplService::start_reaper_thread() { HS_DYNAMIC_CONFIG(consensus.flush_durable_commit_interval_ms) * 1000 * 1000, true /* recurring */, nullptr, [this](void*) { flush_durable_commit_lsn(); }); - // Check replace_member sync status to see a new member is fully synced up and ready to remove the old - // member + // Check replace_member sync status to see a new member is fully synced up and ready to remove the old member m_replace_member_sync_check_timer_hdl = iomanager.schedule_thread_timer( HS_DYNAMIC_CONFIG(consensus.replace_member_sync_check_interval_ms) * 1000 * 1000, true /* recurring */, - nullptr, [this](void*) { monitor_replace_member_replication_status(); }); + nullptr, [this](void*) { check_replace_member_status(); }); + p.setValue(); } else { @@ -634,13 +588,13 @@ void RaftReplService::gc_repl_reqs() { } void RaftReplService::gc_repl_devs() { - incr_pending_request_num(); + /* incr_pending_request_num(); // Skip gc when raft repl service is stopping to avoid concurrency issues between repl_dev's stop and destroy ops. if (is_stopping()) { LOGINFOMOD(replication, "ReplSvc is stopping, skipping GC"); decr_pending_request_num(); return; - } + } */ std::vector< group_id_t > groups_to_leave; { @@ -670,7 +624,7 @@ void RaftReplService::gc_repl_devs() { m_rd_map.erase(group_id); } } - decr_pending_request_num(); + // decr_pending_request_num(); } void RaftReplService::flush_durable_commit_lsn() { @@ -678,20 +632,15 @@ void RaftReplService::flush_durable_commit_lsn() { for (auto& rdev_parent : m_rd_map) { // FIXUP: is it safe to access rdev_parent here? 
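
Both periodic jobs above run on recurring iomanager thread timers whose intervals come from dynamic config in milliseconds, scaled by 1000 * 1000 to nanoseconds. Conceptually the reaper is just a periodic loop; a portable, self-contained equivalent (not the iomanager API) might look like:

    #include <chrono>
    #include <condition_variable>
    #include <mutex>
    #include <thread>

    // Portable stand-in for the recurring reaper timers (illustration only).
    class Reaper {
    public:
        void start(std::chrono::milliseconds interval) {
            m_thr = std::thread([this, interval] {
                std::unique_lock lk{m_mtx};
                // wait_for returns true only once stop() sets the flag.
                while (!m_cv.wait_for(lk, interval, [this] { return m_stop; })) {
                    flush_durable_commit_lsn();
                    check_replace_member_status();
                }
            });
        }

        void stop() {
            {
                std::lock_guard lg{m_mtx};
                m_stop = true;
            }
            m_cv.notify_all();
            m_thr.join();
        }

    private:
        void flush_durable_commit_lsn() {}    // placeholder for the real periodic job
        void check_replace_member_status() {} // placeholder for the real periodic job

        std::thread m_thr;
        std::mutex m_mtx;
        std::condition_variable m_cv;
        bool m_stop{false};
    };
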
auto rdev = std::dynamic_pointer_cast< RaftReplDev >(rdev_parent.second); - if (rdev->get_stage() == repl_dev_stage_t::UNREADY) { - LOGINFOMOD(replication, "ReplDev group_id={} is UNREADY, skip flushing durable commit lsn", - boost::uuids::to_string(rdev->group_id())); - continue; - } rdev->flush_durable_commit_lsn(); } } -void RaftReplService::monitor_replace_member_replication_status() { +void RaftReplService::check_replace_member_status() { std::unique_lock lg(m_rd_map_mtx); for (auto& rdev_parent : m_rd_map) { auto rdev = std::dynamic_pointer_cast< RaftReplDev >(rdev_parent.second); - rdev->monitor_replace_member_replication_status(); + rdev->check_replace_member_status(); } } diff --git a/src/lib/replication/service/raft_repl_service.h b/src/lib/replication/service/raft_repl_service.h index 2f8acfb2f..aa9550c4f 100644 --- a/src/lib/replication/service/raft_repl_service.h +++ b/src/lib/replication/service/raft_repl_service.h @@ -58,7 +58,6 @@ class RaftReplService : public GenericReplService, public: RaftReplService(cshared< ReplApplication >& repl_app); - ~RaftReplService() override; static ReplServiceError to_repl_error(nuraft::cmd_result_code code); int32_t compute_raft_follower_priority(); @@ -79,20 +78,14 @@ class RaftReplService : public GenericReplService, std::set< replica_id_t > const& members) override; folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) override; void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; - AsyncReplResult<> replace_member(group_id_t group_id, std::string& task_id, const replica_member_info& member_out, - const replica_member_info& member_in, uint32_t commit_quorum = 0, - uint64_t trace_id = 0) const override; + AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum = 0, + uint64_t trace_id = 0) const override; AsyncReplResult<> flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, uint32_t commit_quorum, bool wait_and_verify = true, uint64_t trace_id = 0) const override; - ReplaceMemberStatus get_replace_member_status(group_id_t group_id, std::string& task_id, - const replica_member_info& member_out, - const replica_member_info& member_in, - const std::vector< replica_member_info >& others, - uint64_t trace_id = 0) const override; - private: RaftReplDev* raft_group_config_found(sisl::byte_view const& buf, void* meta_cookie); void start_reaper_thread(); @@ -101,7 +94,7 @@ class RaftReplService : public GenericReplService, void gc_repl_devs(); void gc_repl_reqs(); void flush_durable_commit_lsn(); - void monitor_replace_member_replication_status(); + void check_replace_member_status(); void monitor_cert_changes(); void restart_raft_svc(const std::string filepath, const bool deleted); bool wait_for_cert(const std::string& filepath); diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index 916364b41..dece4b36e 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -32,12 +32,6 @@ if (${build_nonio_tests}) target_link_libraries(test_btree_node ${COMMON_TEST_DEPS} GTest::gtest) add_test(NAME BtreeNode COMMAND test_btree_node) - set(TEST_MEMBTREE_SOURCE_FILES test_mem_btree.cpp) - add_executable(test_mem_btree ${TEST_MEMBTREE_SOURCE_FILES}) - target_link_libraries(test_mem_btree homestore ${COMMON_TEST_DEPS} GTest::gtest) - add_test(NAME MemBtree COMMAND test_mem_btree) - set_tests_properties(MemBtree PROPERTIES TIMEOUT 1200) - 
add_executable(test_blk_read_tracker) target_sources(test_blk_read_tracker PRIVATE test_blk_read_tracker.cpp ../lib/blkdata_svc/blk_read_tracker.cpp ../lib/blkalloc/blk.cpp) target_link_libraries(test_blk_read_tracker ${COMMON_TEST_DEPS} GTest::gtest) @@ -68,18 +62,20 @@ endif() can_build_io_tests(io_tests) if (${io_tests}) - set(TEST_INDEXBTREE_SOURCE_FILES test_index_btree.cpp) - add_executable(test_index_btree ${TEST_INDEXBTREE_SOURCE_FILES}) - target_link_libraries(test_index_btree homestore ${COMMON_TEST_DEPS} GTest::gtest) - add_test(NAME IndexBtree COMMAND test_index_btree) - set_property(TEST IndexBtree PROPERTY ENVIRONMENT "ASAN_OPTIONS=detect_stack_use_after_return=true") - set_tests_properties(IndexBtree PROPERTIES TIMEOUT 1200) - - set(TEST_RECOVERY_INDEX_SOURCE_FILES test_index_crash_recovery.cpp) - add_executable(test_index_crash_recovery ${TEST_RECOVERY_INDEX_SOURCE_FILES}) - target_link_libraries(test_index_crash_recovery homestore ${COMMON_TEST_DEPS} GTest::gtest) - add_test(NAME IndexCrashRecovery COMMAND test_index_crash_recovery) - set_property(TEST IndexCrashRecovery PROPERTY ENVIRONMENT "ASAN_OPTIONS=detect_stack_use_after_return=true") + set(TEST_BTREE_SOURCE_FILES test_btree.cpp) + add_executable(test_btree ${TEST_BTREE_SOURCE_FILES}) + target_link_libraries(test_btree homestore ${COMMON_TEST_DEPS} GTest::gtest) + add_test(NAME Btree COMMAND test_btree) + set_property(TEST Btree PROPERTY ENVIRONMENT "ASAN_OPTIONS=detect_stack_use_after_return=true") + set_tests_properties(Btree PROPERTIES TIMEOUT 1200) + + set(TEST_COW_BTREE_RECOVERY_SOURCE_FILES test_cow_btree_recovery.cpp) + add_executable(test_cow_btree_recovery ${TEST_COW_BTREE_RECOVERY_SOURCE_FILES}) + target_link_libraries(test_cow_btree_recovery homestore ${COMMON_TEST_DEPS} GTest::gtest) + add_test(NAME COWBtreeRecovery COMMAND test_cow_btree_recovery) + set_property(TEST COWBtreeRecovery PROPERTY ENVIRONMENT "ASAN_OPTIONS=detect_stack_use_after_return=true") + set_tests_properties(COWBtreeRecovery PROPERTIES TIMEOUT 1200) + set_tests_properties(COWBtreeRecovery PROPERTIES DEPENDS Btree) add_executable(test_data_service) target_sources(test_data_service PRIVATE test_data_service.cpp) @@ -106,6 +102,41 @@ if (${io_tests}) target_link_libraries(test_cp_mgr homestore ${COMMON_TEST_DEPS} GTest::gtest) add_test(NAME CPMgr COMMAND test_cp_mgr) + can_build_epoll_io_tests(epoll_tests) + if(${epoll_tests}) + add_test(NAME LogDev-Epoll COMMAND test_log_dev) + add_test(NAME LogStore-Epoll COMMAND test_log_store) + add_test(NAME MetaBlkMgr-Epoll COMMAND test_meta_blk_mgr) + add_test(NAME DataService-Epoll COMMAND test_data_service) + endif() + + can_build_spdk_io_tests(spdk_tests) + if(${spdk_tests}) + add_test(NAME LogStore-Spdk COMMAND test_log_store -- --spdk "true") + add_test(NAME LogDev-Spdk COMMAND test_log_dev -- --spdk "true") + add_test(NAME MetaBlkMgr-Spdk COMMAND test_meta_blk_mgr -- --spdk "true") + add_test(NAME DataSerice-Spdk COMMAND test_data_service -- --spdk "true") + if(${epoll_tests}) + SET_TESTS_PROPERTIES(MetaBlkMgr-Spdk PROPERTIES DEPENDS LogStore-Spdk) + SET_TESTS_PROPERTIES(DataService-Spdk PROPERTIES DEPENDS MetaBlkMgr-Spdk) + endif() + endif() +endif() + +can_build_repl_tests(repl_tests) +if (${repl_tests}) + add_executable(test_repl_service) + target_sources(test_repl_service PRIVATE test_repl_service.cpp) + target_link_libraries(test_repl_service homestore ${COMMON_TEST_DEPS} GTest::gmock) + + add_executable(test_repl_log_store) + target_sources(test_repl_log_store PRIVATE 
test_repl_log_store.cpp) + target_link_libraries(test_repl_log_store hs_logdev homestore ${COMMON_TEST_DEPS} GTest::gmock) + + add_executable(test_repl_data_service) + target_sources(test_repl_data_service PRIVATE test_repl_data_service.cpp) + target_link_libraries(test_repl_data_service homestore ${COMMON_TEST_DEPS} GTest::gmock) + add_executable(test_solo_repl_dev) target_sources(test_solo_repl_dev PRIVATE test_solo_repl_dev.cpp) target_link_libraries(test_solo_repl_dev homestore ${COMMON_TEST_DEPS} GTest::gmock) @@ -124,30 +155,18 @@ if (${io_tests}) can_build_epoll_io_tests(epoll_tests) if(${epoll_tests}) - add_test(NAME LogDev-Epoll COMMAND test_log_dev) - add_test(NAME LogStore-Epoll COMMAND test_log_store) - add_test(NAME HomeRaftLogStore-Epoll COMMAND test_home_raft_logstore) - add_test(NAME MetaBlkMgr-Epoll COMMAND test_meta_blk_mgr) - add_test(NAME DataService-Epoll COMMAND test_data_service) - # add_test(NAME RaftReplDev-Epoll COMMAND test_raft_repl_dev) - # add_test(NAME RaftReplDevDynamic-Epoll COMMAND test_raft_repl_dev_dynamic --override_config homestore_config.consensus.replace_member_sync_check_interval_ms=1000) + add_test(NAME RaftReplDev-Epoll COMMAND test_raft_repl_dev) + add_test(NAME RaftReplDevDynamic-Epoll COMMAND test_raft_repl_dev_dynamic --override_config homestore_config.consensus.replace_member_sync_check_interval_ms=1000) add_test(NAME SoloReplDev-Epoll COMMAND test_solo_repl_dev) + add_test(NAME HomeRaftLogStore-Epoll COMMAND test_home_raft_logstore) endif() can_build_spdk_io_tests(spdk_tests) if(${spdk_tests}) - add_test(NAME LogStore-Spdk COMMAND test_log_store -- --spdk "true") - add_test(NAME LogDev-Spdk COMMAND test_log_dev -- --spdk "true") - add_test(NAME MetaBlkMgr-Spdk COMMAND test_meta_blk_mgr -- --spdk "true") - add_test(NAME DataSerice-Spdk COMMAND test_data_service -- --spdk "true") - add_test(NAME SoloReplDev-Spdk COMMAND test_solo_repl_dev -- --spdk "true") - add_test(NAME HomeRaftLogStore-Spdk COMMAND test_home_raft_logstore -- --spdk "true") add_test(NAME RaftReplDev-Spdk COMMAND test_raft_repl_dev -- --spdk "true") - add_test(NAME RaftReplDevDynamic-Spdk COMMAND test_raft_repl_dev_dynamic -- --spdk "true") - if(${epoll_tests}) - SET_TESTS_PROPERTIES(MetaBlkMgr-Spdk PROPERTIES DEPENDS LogStore-Spdk) - SET_TESTS_PROPERTIES(DataService-Spdk PROPERTIES DEPENDS MetaBlkMgr-Spdk) - endif() + add_test(NAME RaftReplDevDynamic-Spdk COMMAND test_raft_repl_dev_dynamic -- --spdk "true" --override_config homestore_config.consensus.replace_member_sync_check_interval_ms=1000) + add_test(NAME SoloReplDev-Spdk COMMAND test_solo_repl_dev -- --spdk "true") + add_test(NAME HomeRaftLogStore-Spdk COMMAND test_home_raft_logstore -- --spdk "true") endif() endif() @@ -157,8 +176,4 @@ if (${non_coverage_build}) add_executable(log_store_benchmark) target_sources(log_store_benchmark PRIVATE log_store_benchmark.cpp) target_link_libraries(log_store_benchmark hs_logdev homestore ${COMMON_TEST_DEPS} benchmark::benchmark) - - add_executable(index_btree_benchmark) - target_sources(index_btree_benchmark PRIVATE index_btree_benchmark.cpp) - target_link_libraries(index_btree_benchmark homestore ${COMMON_TEST_DEPS} benchmark::benchmark) endif() diff --git a/src/tests/btree_helpers/btree_decls.h b/src/tests/btree_helpers/btree_decls.h index 132e1553e..9c094b6c8 100644 --- a/src/tests/btree_helpers/btree_decls.h +++ b/src/tests/btree_helpers/btree_decls.h @@ -16,44 +16,55 @@ *********************************************************************************/ #pragma once -#include 
+#include +#include +template < IndexStore::Type StoreType > struct FixedLenBtree { - using BtreeType = IndexTable< TestFixedKey, TestFixedValue >; + using BtreeType = Btree< TestFixedKey, TestFixedValue >; using KeyType = TestFixedKey; using ValueType = TestFixedValue; static constexpr btree_node_type leaf_node_type = btree_node_type::FIXED; static constexpr btree_node_type interior_node_type = btree_node_type::FIXED; + static constexpr IndexStore::Type store_type = StoreType; }; +template < IndexStore::Type StoreType > struct VarKeySizeBtree { - using BtreeType = IndexTable< TestVarLenKey, TestFixedValue >; + using BtreeType = Btree< TestVarLenKey, TestFixedValue >; using KeyType = TestVarLenKey; using ValueType = TestFixedValue; static constexpr btree_node_type leaf_node_type = btree_node_type::VAR_KEY; static constexpr btree_node_type interior_node_type = btree_node_type::VAR_KEY; + static constexpr IndexStore::Type store_type = StoreType; }; +template < IndexStore::Type StoreType > struct VarValueSizeBtree { - using BtreeType = IndexTable< TestFixedKey, TestVarLenValue >; + using BtreeType = Btree< TestFixedKey, TestVarLenValue >; using KeyType = TestFixedKey; using ValueType = TestVarLenValue; static constexpr btree_node_type leaf_node_type = btree_node_type::VAR_VALUE; static constexpr btree_node_type interior_node_type = btree_node_type::FIXED; + static constexpr IndexStore::Type store_type = StoreType; }; +template < IndexStore::Type StoreType > struct VarObjSizeBtree { - using BtreeType = IndexTable< TestVarLenKey, TestVarLenValue >; + using BtreeType = Btree< TestVarLenKey, TestVarLenValue >; using KeyType = TestVarLenKey; using ValueType = TestVarLenValue; static constexpr btree_node_type leaf_node_type = btree_node_type::VAR_OBJECT; - static constexpr btree_node_type interior_node_type = btree_node_type::VAR_OBJECT; + static constexpr btree_node_type interior_node_type = btree_node_type::VAR_KEY; + static constexpr IndexStore::Type store_type = StoreType; }; +template < IndexStore::Type StoreType > struct PrefixIntervalBtree { - using BtreeType = IndexTable< TestIntervalKey, TestIntervalValue >; + using BtreeType = Btree< TestIntervalKey, TestIntervalValue >; using KeyType = TestIntervalKey; using ValueType = TestIntervalValue; - static constexpr btree_node_type leaf_node_type = btree_node_type::PREFIX; + static constexpr btree_node_type leaf_node_type = btree_node_type::FIXED_PREFIX; static constexpr btree_node_type interior_node_type = btree_node_type::FIXED; + static constexpr IndexStore::Type store_type = StoreType; }; \ No newline at end of file diff --git a/src/tests/btree_helpers/btree_test_helper.hpp b/src/tests/btree_helpers/btree_test_helper.hpp index 04357afc0..9b2b07c52 100644 --- a/src/tests/btree_helpers/btree_test_helper.hpp +++ b/src/tests/btree_helpers/btree_test_helper.hpp @@ -25,12 +25,21 @@ #include #include #include -#include "common/homestore_config.hpp" +#include + #include "test_common/range_scheduler.hpp" #include "shadow_map.hpp" static constexpr uint32_t g_node_size{4096}; +struct BtreeTestOptions { + uint32_t num_entries; + uint32_t preload_size; + uint32_t num_ios; + uint32_t run_time_secs; + bool disable_merge{false}; +}; + template < typename TestType > struct BtreeTestHelper { using T = TestType; @@ -39,19 +48,24 @@ struct BtreeTestHelper { using mutex = iomgr::FiberManagerLib::shared_mutex; using op_func_t = std::function< void(void) >; - BtreeTestHelper() : m_shadow_map{SISL_OPTIONS["num_entries"].as< uint32_t >()} {} - - void SetUp() { + 
BtreeTestHelper(BtreeTestOptions options) : m_options{std::move(options)}, m_shadow_map{options.num_entries} { m_cfg.m_leaf_node_type = T::leaf_node_type; m_cfg.m_int_node_type = T::interior_node_type; - m_cfg.m_max_merge_level = SISL_OPTIONS["max_merge_level"].as< uint8_t >(); - if (SISL_OPTIONS.count("disable_merge")) { m_cfg.m_merge_turned_on = false; } + m_cfg.m_store_type = T::store_type; + } - m_max_range_input = SISL_OPTIONS["num_entries"].as< uint32_t >(); + virtual void SetUp(std::shared_ptr< Btree< K, V > > bt, bool load, bool is_multi_threaded = false) { + m_bt = std::move(bt); + m_shadow_filename = fmt::format("/tmp/btree_{}_shadow_map", m_bt->ordinal()); + + if (!load) { std::filesystem::remove(m_shadow_filename); } + m_max_range_input = m_options.num_entries; + m_is_multi_threaded = is_multi_threaded; + if (m_options.disable_merge) { m_cfg.m_merge_turned_on = false; } if (m_is_multi_threaded) { std::mutex mtx; - m_run_time = SISL_OPTIONS["run_time"].as< uint32_t >(); + m_fibers.clear(); iomanager.run_on_wait(iomgr::reactor_regex::all_worker, [this, &mtx]() { auto fv = iomanager.sync_io_capable_fibers(); std::unique_lock lg(mtx); @@ -68,13 +82,15 @@ struct BtreeTestHelper { void TearDown() {} +public: + std::shared_ptr< Btree< K, V > > m_bt; + BtreeConfig m_cfg; + protected: - std::shared_ptr< typename T::BtreeType > m_bt; + BtreeTestOptions const m_options; ShadowMap< K, V > m_shadow_map; - BtreeConfig m_cfg{g_node_size}; uint32_t m_max_range_input{1000}; bool m_is_multi_threaded{false}; - uint32_t m_run_time{0}; std::map< std::string, op_func_t > m_operations; std::vector< iomgr::io_fiber_t > m_fibers; @@ -82,6 +98,9 @@ struct BtreeTestHelper { std::condition_variable m_test_done_cv; std::random_device m_re; std::atomic< uint32_t > m_num_ops{0}; + Clock::time_point m_start_time; + std::string m_shadow_filename; + #ifdef _PRERELEASE flip::FlipClient m_fc{iomgr_flip::instance()}; #endif @@ -101,6 +120,7 @@ struct BtreeTestHelper { LOGINFO("Flip {} reset", flip_name); } #endif + void preload(uint32_t preload_size) { if (preload_size == 0) { LOGINFO("Preload Skipped"); @@ -112,40 +132,17 @@ struct BtreeTestHelper { const auto last_chunk_size = preload_size % chunk_size ?: chunk_size; auto test_count = n_fibers; + LOGINFO("Btree{}: {} entries will be preloaded in {} fibers in parallel", m_bt->ordinal(), preload_size, + m_fibers.size()); for (std::size_t i = 0; i < n_fibers; ++i) { const auto start_range = i * chunk_size; const auto end_range = start_range + ((i == n_fibers - 1) ? 
last_chunk_size : chunk_size) - 1; auto fiber_id = i; iomanager.run_on_forget(m_fibers[i], [this, start_range, end_range, &test_count, fiber_id, preload_size]() { - double progress_interval = - (double)(end_range - start_range) / 20; // 5% of the total number of iterations - double progress_thresh = progress_interval; // threshold for progress interval - double elapsed_time, progress_percent, last_progress_time = 0; - auto m_start_time = Clock::now(); - + m_start_time = Clock::now(); for (uint32_t i = start_range; i < end_range; i++) { put(i, btree_put_type::INSERT); - if (fiber_id == 0) { - elapsed_time = get_elapsed_time_sec(m_start_time); - progress_percent = (double)(i - start_range) / (end_range - start_range) * 100; - - // check progress every 5% of the total number of iterations or every 30 seconds - bool print_time = false; - if (i >= progress_thresh) { - progress_thresh += progress_interval; - print_time = true; - } - if (elapsed_time - last_progress_time > 30) { - last_progress_time = elapsed_time; - print_time = true; - } - if (print_time) { - LOGINFO("Progress: iterations completed ({:.2f}%)- Elapsed time: {:.0f} seconds- " - "populated entries: {} ({:.2f}%)", - progress_percent, elapsed_time, m_shadow_map.size(), - m_shadow_map.size() * 100.0 / preload_size); - } - } + track_progress(preload_size, "Preload"); } { std::unique_lock lg(m_test_done_mtx); @@ -159,11 +156,35 @@ struct BtreeTestHelper { m_test_done_cv.wait(lk, [&]() { return test_count == 0; }); } - LOGINFO("Preload Done"); + LOGINFO("Btree{}: Preload Done", m_bt->ordinal()); } uint32_t get_op_num() const { return m_num_ops.load(); } + void track_progress(uint32_t max_ops, std::string_view work_type) { + static Clock::time_point last_print_time{Clock::now()}; + + bool print{false}; + auto completed = m_num_ops.fetch_add(1) + 1; + + auto elapsed_time = get_elapsed_time_sec(last_print_time); + if (elapsed_time > 30) { + // Print percent every 30 seconds no matter what + print = true; + } else if ((completed % (max_ops / 10) == 0) && (elapsed_time > 1)) { + // 10% completed and at least 1 second after last print time, we can print again + print = true; + } + + if (print) { + auto map_size = m_shadow_map.size(); + LOGINFO("Progress=({:.2f}%) IOsCompleted={} ElapsedTime={} seconds {} EntriesFilled={} ({:.2f}%)", + completed * 100.0 / max_ops, completed, get_elapsed_time_sec(m_start_time), work_type, map_size, + map_size * 100.0 / m_max_range_input); + last_print_time = Clock::now(); + } + } + ////////////////////// All put operation variants /////////////////////////////// void put(uint64_t k, btree_put_type put_type, bool expect = true) { do_put(k, put_type, V::generate_rand(), expect); @@ -180,23 +201,31 @@ struct BtreeTestHelper { auto existing_v = std::make_unique< V >(); K key = K{k}; V value = V::generate_rand(); - auto sreq = BtreeSinglePutRequest{&key, &value, btree_put_type::UPSERT, existing_v.get()}; - sreq.enable_route_tracing(); - auto const ret = m_bt->put(sreq); + auto const ret = m_bt->put_one(key, value, btree_put_type::UPSERT, existing_v.get()); ASSERT_EQ(ret, btree_status_t::success) << "Upsert key=" << k << " failed with error=" << enum_name(ret); m_shadow_map.force_put(k, value); } + void put_delta(uint64_t k) { + K key{k}; + auto it = m_shadow_map.map_const().find(key); + ASSERT_TRUE(it != m_shadow_map.map_const().cend()) + << "Asked to put_delta for key=" << k << " but its not in the map"; + + auto existing_v = std::make_unique< V >(); + auto const ret = m_bt->put_one(key, it->second, 
btree_put_type::UPSERT, existing_v.get()); + ASSERT_EQ(ret, btree_status_t::success) << "Upsert key=" << k << " failed with error=" << enum_name(ret); + } + void range_put(uint32_t start_k, uint32_t end_k, V const& value, bool update) { K start_key = K{start_k}; K end_key = K{end_k}; auto const nkeys = end_k - start_k + 1; - auto preq = BtreeRangePutRequest< K >{BtreeKeyRange< K >{start_key, true, end_key, true}, - update ? btree_put_type::UPDATE : btree_put_type::UPSERT, &value}; - preq.enable_route_tracing(); - ASSERT_EQ(m_bt->put(preq), btree_status_t::success) << "range_put failed for " << start_k << "-" << end_k; + auto const [ret, cookie] = m_bt->put_range(BtreeKeyRange< K >{start_key, true, end_key, true}, + update ? btree_put_type::UPDATE : btree_put_type::UPSERT, value); + ASSERT_EQ(ret, btree_status_t::success) << "range_put failed for " << start_k << "-" << end_k; if (update) { m_shadow_map.range_update(start_key, nkeys, value); @@ -223,10 +252,7 @@ struct BtreeTestHelper { auto existing_v = std::make_unique< V >(); auto pk = std::make_unique< K >(k); - auto rreq = BtreeSingleRemoveRequest{pk.get(), existing_v.get()}; - rreq.enable_route_tracing(); - bool removed = (m_bt->remove(rreq) == btree_status_t::success); - + bool removed = (m_bt->remove_one(*pk, existing_v.get()) == btree_status_t::success); if (care_success) { ASSERT_EQ(removed, m_shadow_map.exists(*pk)) << "Removal of key " << pk->key() << " status doesn't match with shadow"; @@ -249,70 +275,6 @@ struct BtreeTestHelper { do_range_remove(start_k, end_key.key(), true /* removing_all_existing */); } - void move_to_tombstone(uint64_t k, btree_status_t expected_status = btree_status_t::success) { - auto existing_v = std::make_unique< V >(); - K key = K{k}; - V value = V::zero(); - put_filter_cb_t filter_cb = [](BtreeKey const& key, BtreeValue const& existing_value, BtreeValue const& value) { - if (static_cast< const V& >(existing_value) == static_cast< const V& >(value)) { - return put_filter_decision::keep; - } - return put_filter_decision::replace; - }; - auto sreq = BtreeSinglePutRequest{&key, &value, btree_put_type::UPDATE, existing_v.get(), filter_cb}; - sreq.enable_route_tracing(); - - const auto ret = m_bt->put(sreq); - ASSERT_EQ(ret, expected_status) << "UPDATING key=" << k << " failed with error=" << enum_name(ret); - } - - void move_to_tombstone(uint64_t start_key, uint64_t end_key, std::vector< std::pair< K, V > >& previous_entities, - btree_status_t expected_status = btree_status_t::success) { - auto existing_v = std::make_unique< V >(); - V value = V::zero(); - previous_entities.clear(); - put_filter_cb_t filter_cb = [&previous_entities](BtreeKey const& key, BtreeValue const& existing_value, - BtreeValue const& value) { - if (static_cast< const V& >(existing_value) == static_cast< const V& >(value)) { - return put_filter_decision::keep; - } - previous_entities.push_back( - std::make_pair(static_cast< const K& >(key), static_cast< const V& >(existing_value))); - return put_filter_decision::replace; - }; - auto preq = BtreeRangePutRequest< K >{BtreeKeyRange< K >{start_key, true, end_key, true}, - btree_put_type::UPDATE, - &value, - nullptr, - std::numeric_limits< uint32_t >::max(), - filter_cb}; - preq.enable_route_tracing(); - const auto ret = m_bt->put(preq); - - ASSERT_EQ(ret, expected_status) << "UPDATING key=[" << start_key << ", " << end_key - << "] failed with error=" << enum_name(ret); - } - - void remove_tombstone(uint64_t start_key, uint64_t end_key, std::vector< std::pair< K, V > >& previous_entities, 
- btree_status_t expected_status = btree_status_t::success) { - previous_entities.clear(); - auto rreq = BtreeRangeRemoveRequest< K >{ - BtreeKeyRange< K >{start_key, true, end_key, true}, nullptr, std::numeric_limits< uint32_t >::max(), - [&previous_entities](BtreeKey const& key, BtreeValue const& value) mutable -> bool { - if (static_cast< const V& >(value) == V::zero()) { return true; } - previous_entities.push_back( - std::make_pair(static_cast< const K& >(key), static_cast< const V& >(value))); - return false; - }}; - - rreq.enable_route_tracing(); - const auto ret = m_bt->remove(rreq); - - LOGDEBUG("Range remove from {} to {} returned {}", start_key, end_key, enum_name(ret)); - ASSERT_EQ(ret, expected_status) << "GC key=[" << start_key << ", " << end_key - << "] failed with error=" << enum_name(ret); - } - void range_remove_existing_random() { static std::uniform_int_distribution< uint32_t > s_rand_range_generator{2, 50}; @@ -325,11 +287,9 @@ struct BtreeTestHelper { } ////////////////////// All query operation variants /////////////////////////////// - void query_all() { do_query(0u, SISL_OPTIONS["num_entries"].as< uint32_t >() - 1, UINT32_MAX); } + void query_all() { do_query(0u, m_options.num_entries - 1, UINT32_MAX); } - void query_all_paginate(uint32_t batch_size) { - do_query(0u, SISL_OPTIONS["num_entries"].as< uint32_t >() - 1, batch_size); - } + void query_all_paginate(uint32_t batch_size) { do_query(0u, m_options.num_entries - 1, batch_size); } void do_query(uint32_t start_k, uint32_t end_k, uint32_t batch_size) { std::vector< std::pair< K, V > > out_vector; @@ -337,13 +297,20 @@ struct BtreeTestHelper { uint32_t remaining = m_shadow_map.num_elems_in_range(start_k, end_k); auto it = m_shadow_map.map_const().lower_bound(K{start_k}); - BtreeQueryRequest< K > qreq{BtreeKeyRange< K >{K{start_k}, true, K{end_k}, true}, - BtreeQueryType::SWEEP_NON_INTRUSIVE_PAGINATION_QUERY, batch_size}; + btree_status_t ret; + QueryPaginateCookie< K > cookie; + while (remaining > 0) { out_vector.clear(); - qreq.enable_route_tracing(); - auto const ret = m_bt->query(qreq, out_vector); + auto const expected_count = std::min(remaining, batch_size); + if (!cookie) { + std::tie(ret, cookie) = m_bt->query(BtreeKeyRange< K >{K{start_k}, true, K{end_k}, true}, out_vector, + batch_size, BtreeQueryType::SWEEP_NON_INTRUSIVE_PAGINATION_QUERY); + } else { + ret = m_bt->query_next(cookie, out_vector); + } + // this->print_keys(); ASSERT_EQ(out_vector.size(), expected_count) << "Received incorrect value on query pagination"; @@ -364,7 +331,7 @@ struct BtreeTestHelper { } } out_vector.clear(); - auto ret = m_bt->query(qreq, out_vector); + ret = m_bt->query_next(cookie, out_vector); ASSERT_EQ(ret, btree_status_t::success) << "Expected success on query"; ASSERT_EQ(out_vector.size(), 0) << "Received incorrect value on empty query pagination"; @@ -385,77 +352,61 @@ struct BtreeTestHelper { ////////////////////// All get operation variants /////////////////////////////// void get_all() const { m_shadow_map.foreach ([this](K key, V value) { - auto copy_key = std::make_unique< K >(); - *copy_key = key; auto out_v = std::make_unique< V >(); - auto req = BtreeSingleGetRequest{copy_key.get(), out_v.get()}; - req.enable_route_tracing(); - const auto ret = m_bt->get(req); - ASSERT_EQ(ret, btree_status_t::success) - << "Missing key " << key << " in btree but present in shadow map" << " - status=" << enum_name(ret); - ASSERT_EQ((const V&)req.value(), value) - << "Found value in btree doesn't return correct data for key=" << 
key; + const auto ret = m_bt->get_one(key, out_v.get()); + + ASSERT_EQ(ret, btree_status_t::success) << "Missing key " << key << " in btree but present in shadow map"; + ASSERT_EQ((const V&)*out_v, value) << "Found value in btree doesn't return correct data for key=" << key; }); } void get_specific(uint32_t k) const { - auto pk = std::make_unique< K >(k); + K key = K{k}; auto out_v = std::make_unique< V >(); - auto req = BtreeSingleGetRequest{pk.get(), out_v.get()}; - req.enable_route_tracing(); - const auto status = m_bt->get(req); + const auto status = m_bt->get_one(key, out_v.get()); + if (status == btree_status_t::success) { - m_shadow_map.validate_data(req.key(), (const V&)req.value()); + m_shadow_map.validate_data(key, (const V&)*out_v); } else { - ASSERT_EQ(m_shadow_map.exists(req.key()), false) << "Node key " << k << " is missing in the btree"; + ASSERT_EQ(m_shadow_map.exists(key), false) << "Node key " << k << " is missing in the btree"; } } void get_any(uint32_t start_k, uint32_t end_k) const { auto out_k = std::make_unique< K >(); auto out_v = std::make_unique< V >(); - auto req = - BtreeGetAnyRequest< K >{BtreeKeyRange< K >{K{start_k}, true, K{end_k}, true}, out_k.get(), out_v.get()}; - req.enable_route_tracing(); - const auto status = m_bt->get(req); + auto const status = + m_bt->get_any(BtreeKeyRange< K >{K{start_k}, true, K{end_k}, true}, out_k.get(), out_v.get()); if (status == btree_status_t::success) { - ASSERT_EQ(m_shadow_map.exists_in_range(*(K*)req.m_outkey, start_k, end_k), true) - << "Get Any returned key=" << *(K*)req.m_outkey << " which is not in range " << start_k << "-" << end_k + ASSERT_EQ(m_shadow_map.exists_in_range(*out_k, start_k, end_k), true) + << "Get Any returned key=" << *out_k << " which is not in range " << start_k << "-" << end_k << "according to shadow map"; - m_shadow_map.validate_data(*(K*)req.m_outkey, *(V*)req.m_outval); + m_shadow_map.validate_data(*out_k, *out_v); } else { - ASSERT_EQ(m_shadow_map.exists_in_range(*(K*)req.m_outkey, start_k, end_k), false) + ASSERT_EQ(m_shadow_map.exists_in_range(*out_k, start_k, end_k), false) << "Get Any couldn't find key in the range " << start_k << "-" << end_k << " but it present in shadow map"; } } - void multi_op_execute(const std::vector< std::pair< std::string, int > >& op_list, bool skip_preload = false) { - if (!skip_preload) { - auto preload_size = SISL_OPTIONS["preload_size"].as< uint32_t >(); - auto const num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); - if (preload_size > num_entries / 2) { + void multi_op_execute(const std::vector< std::pair< std::string, int > >& op_list) { + if (m_shadow_map.size() == 0) { + auto preload_size = m_options.preload_size; + if (preload_size > m_options.num_entries / 2) { LOGWARN("Preload size={} is more than half of num_entries, setting preload_size to {}", preload_size, - num_entries / 2); - preload_size = num_entries / 2; + m_options.num_entries / 2); + preload_size = m_options.num_entries / 2; } preload(preload_size); } + LOGINFO("Btree{}: {} IOs will be executed in {} fibers in parallel", m_bt->ordinal(), m_options.num_ios, + m_fibers.size()); run_in_parallel(op_list); + LOGINFO("Btree{}: {} IOs completed", m_bt->ordinal(), m_options.num_ios); } - std::tuple< uint64_t, uint64_t, uint8_t > get_btree_metrics(const nlohmann::json& metrics_json) { - const auto& counters = metrics_json.at("Counters"); - - uint64_t int_cnt = counters.at("Btree Interior node count").get< uint64_t >(); - uint64_t leaf_cnt = counters.at("Btree Leaf node count").get< 
uint64_t >(); - uint8_t depth = counters.at("Depth of btree").get< uint8_t >(); - - return std::make_tuple(int_cnt, leaf_cnt, depth); - } - - void dump_to_file(const std::string& file = "") const { m_bt->dump_tree_to_file(file); } + void dump_to_file(const std::string& file = "") const { m_bt->dump(file); } void print_keys(const std::string& preamble = "") const { auto print_key_range = [](std::vector< std::pair< K, V > > const& kvs) -> std::string { uint32_t start = 0; @@ -477,7 +428,7 @@ struct BtreeTestHelper { LOGINFO("{}{}", preamble.empty() ? "" : preamble + ":\n", m_bt->to_custom_string(print_key_range)); } - void visualize_keys(const std::string& file) const { /*m_bt->visualize_tree_keys(file);*/ } + void visualize_keys(const std::string& file) const { m_bt->visualize_tree_keys(file); } void compare_files(const std::string& before, const std::string& after) { std::ifstream b(before, std::ifstream::ate); @@ -508,13 +459,37 @@ struct BtreeTestHelper { } } + ///////////////////////// All crash recovery methods /////////////////////////////////// + void save_snapshot() { this->m_shadow_map.save(m_shadow_filename); } + + void reapply_after_crash() { + ShadowMap< K, V > snapshot_map{m_shadow_map.max_keys()}; + snapshot_map.load(m_shadow_filename); + LOGDEBUG("Btree:{} Snapshot before crash\n{}", m_bt->ordinal(), snapshot_map.to_string()); + + auto diff = m_shadow_map.diff(snapshot_map); + std::string dif_str; + for (const auto& [k, delta] : diff) { + dif_str += fmt::format("[{}-{}] ", k.key(), enum_name(delta)); + } + LOGDEBUG("Btree:{} Diff between shadow map and snapshot map\n{}\n", m_bt->ordinal(), dif_str); + + for (const auto& [k, delta] : diff) { + if ((delta == ShadowMapDelta::Added) || (delta == ShadowMapDelta::Updated)) { + this->put_delta(k.key()); + } else if (delta == ShadowMapDelta::Removed) { + this->remove_one(k.key(), false); + } + } + } + private: void do_put(uint64_t k, btree_put_type put_type, V const& value, bool expect_success = true) { auto existing_v = std::make_unique< V >(); K key = K{k}; - auto sreq = BtreeSinglePutRequest{&key, &value, put_type, existing_v.get()}; - sreq.enable_route_tracing(); - bool done = expect_success == (m_bt->put(sreq) == btree_status_t::success); + auto ret = m_bt->put_one(key, value, put_type, existing_v.get()); + bool done = expect_success ? 
(ret == btree_status_t::success) : (ret == btree_status_t::put_failed); + if (put_type == btree_put_type::INSERT) { ASSERT_EQ(done, !m_shadow_map.exists(key)); } else if (put_type == btree_put_type::UPDATE) { @@ -527,10 +502,7 @@ struct BtreeTestHelper { K start_key = K{start_k}; K end_key = K{end_k}; - auto rreq = BtreeRangeRemoveRequest< K >{BtreeKeyRange< K >{start_key, true, end_key, true}}; - rreq.enable_route_tracing(); - const auto ret = m_bt->remove(rreq); - + auto [ret, cookie] = m_bt->remove_range(BtreeKeyRange< K >{start_key, true, end_key, true}); if (all_existing) { m_shadow_map.range_erase(start_key, end_key); ASSERT_EQ((ret == btree_status_t::success), true) @@ -541,61 +513,31 @@ struct BtreeTestHelper { } } -protected: +public: void run_in_parallel(const std::vector< std::pair< std::string, int > >& op_list) { auto test_count = m_fibers.size(); - const auto total_iters = SISL_OPTIONS["num_iters"].as< uint32_t >(); - const auto num_iters_per_thread = total_iters / m_fibers.size(); - const auto extra_iters = total_iters % num_iters_per_thread; - LOGINFO("number of fibers {} num_iters_per_thread {} extra_iters {} ", m_fibers.size(), num_iters_per_thread, - extra_iters); + const auto num_ios_per_thread = m_options.num_ios / m_fibers.size(); + const auto extra_ios = m_options.num_ios % num_ios_per_thread; + m_num_ops = 0; // Reset the ops counter for (uint32_t fiber_id = 0; fiber_id < m_fibers.size(); ++fiber_id) { - auto num_iters_this_fiber = num_iters_per_thread + (fiber_id < extra_iters ? 1 : 0); - iomanager.run_on_forget(m_fibers[fiber_id], [this, fiber_id, &test_count, op_list, num_iters_this_fiber]() { + auto num_ios_this_fiber = num_ios_per_thread + (fiber_id < extra_ios ? 1 : 0); + iomanager.run_on_forget(m_fibers[fiber_id], [this, fiber_id, &test_count, op_list, num_ios_this_fiber]() { std::random_device g_rd{}; std::default_random_engine re{g_rd()}; std::vector< uint32_t > weights; std::transform(op_list.begin(), op_list.end(), std::back_inserter(weights), [](const auto& pair) { return pair.second; }); - double progress_interval = (double)num_iters_this_fiber / 20; // 5% of the total number of iterations - double progress_thresh = progress_interval; // threshold for progress interval - double elapsed_time, progress_percent, last_progress_time = 0; - // Construct a weighted distribution based on the input frequencies std::discrete_distribution< uint32_t > s_rand_op_generator(weights.begin(), weights.end()); - auto m_start_time = Clock::now(); - auto time_to_stop = [this, m_start_time]() { - return (get_elapsed_time_sec(m_start_time) > m_run_time); - }; + m_start_time = Clock::now(); + auto time_to_stop = [this]() { return (get_elapsed_time_sec(m_start_time) > m_options.run_time_secs); }; - for (uint32_t i = 0; i < num_iters_this_fiber && !time_to_stop(); i++) { + for (uint32_t i = 0; i < num_ios_this_fiber && !time_to_stop(); i++) { uint32_t op_idx = s_rand_op_generator(re); (this->m_operations[op_list[op_idx].first])(); - m_num_ops.fetch_add(1); - - if (fiber_id == 0) { - elapsed_time = get_elapsed_time_sec(m_start_time); - progress_percent = (double)i / num_iters_this_fiber * 100; - - // check progress every 5% of the total number of iterations or every 30 seconds - bool print_time = false; - if (i >= progress_thresh) { - progress_thresh += progress_interval; - print_time = true; - } - if (elapsed_time - last_progress_time > 30) { - last_progress_time = elapsed_time; - print_time = true; - } - if (print_time) { - LOGINFO("Progress: iterations completed ({:.2f}%)- 
Elapsed time: {:.0f} seconds of total " - "{} ({:.2f}%) - total entries: {} ({:.2f}%)", - progress_percent, elapsed_time, m_run_time, elapsed_time * 100.0 / m_run_time, - m_shadow_map.size(), m_shadow_map.size() * 100.0 / m_max_range_input); - } - } + track_progress(m_options.num_ios, "Workload"); } { std::unique_lock lg(m_test_done_mtx); @@ -608,7 +550,6 @@ struct BtreeTestHelper { std::unique_lock< std::mutex > lk(m_test_done_mtx); m_test_done_cv.wait(lk, [&]() { return test_count == 0; }); } - LOGINFO("ALL parallel jobs joined"); } std::vector< std::pair< std::string, int > > build_op_list(std::vector< std::string > const& input_ops) { diff --git a/src/tests/btree_helpers/btree_test_kvs.hpp b/src/tests/btree_helpers/btree_test_kvs.hpp index e4a8e39bb..c1baa8f38 100644 --- a/src/tests/btree_helpers/btree_test_kvs.hpp +++ b/src/tests/btree_helpers/btree_test_kvs.hpp @@ -22,9 +22,9 @@ #include #include -#include -#include -#include +#include +#include +#include static constexpr uint32_t g_max_keysize{100}; // for node size = 512 : free space : 442 => 100+100+6(record size) = 46% static constexpr uint32_t g_max_valsize{100}; @@ -330,10 +330,7 @@ class TestIntervalKey : public BtreeIntervalKey { static uint32_t get_fixed_size() { return sizeof(TestIntervalKey); } /////////////////// Overriding methods of BtreeIntervalKey ///////////////// - void shift(int n, void* app_ctx) override { - if (willAdditionOverflow< uint32_t >(m_offset, n)) { m_base++; } - m_offset += n; - } + void shift(int n) override { m_offset += n; } int distance(BtreeKey const& f) const override { TestIntervalKey const& from = s_cast< TestIntervalKey const& >(f); @@ -406,7 +403,6 @@ class TestFixedValue : public BtreeValue { virtual ~TestFixedValue() = default; static TestFixedValue generate_rand() { return TestFixedValue{g_randval_generator(g_re)}; } - static TestFixedValue zero() { return TestFixedValue{uint32_t(0)}; } TestFixedValue& operator=(const TestFixedValue& other) { m_val = other.m_val; @@ -466,7 +462,6 @@ class TestVarLenValue : public BtreeValue { } static TestVarLenValue generate_rand() { return TestVarLenValue{gen_random_string(rand_val_size())}; } - static TestVarLenValue zero() { return TestVarLenValue{""}; } sisl::blob serialize() const override { sisl::blob b{r_cast< const uint8_t* >(m_val.c_str()), uint32_cast(m_val.size())}; @@ -520,7 +515,6 @@ class TestIntervalValue : public BtreeIntervalValue { static TestIntervalValue generate_rand() { return TestIntervalValue{g_randval_generator(g_re), s_cast< uint16_t >(0)}; } - static TestIntervalValue zero() { return TestIntervalValue{0, 0}; } ///////////////////////////// Overriding methods of BtreeValue ////////////////////////// TestIntervalValue& operator=(const TestIntervalValue& other) = default; @@ -555,10 +549,7 @@ class TestIntervalValue : public BtreeIntervalValue { } ///////////////////////////// Overriding methods of BtreeIntervalValue ////////////////////////// - void shift(int n, void* app_ctx) override { - if (willAdditionOverflow< uint32_t >(m_offset, n)) { m_base_val++; } - m_offset += n; - } + void shift(int n) override { m_offset += n; } sisl::blob serialize_prefix() const override { return sisl::blob{uintptr_cast(const_cast< uint32_t* >(&m_base_val)), uint32_cast(sizeof(uint32_t))}; diff --git a/src/tests/btree_helpers/shadow_map.hpp b/src/tests/btree_helpers/shadow_map.hpp index 6e7310c3f..7d2070e04 100644 --- a/src/tests/btree_helpers/shadow_map.hpp +++ b/src/tests/btree_helpers/shadow_map.hpp @@ -3,6 +3,8 @@ #include "btree_test_kvs.hpp" 
+ENUM(ShadowMapDelta, uint8_t, Added, Removed, Updated); + template < typename K, typename V > class ShadowMap { private: @@ -11,11 +13,11 @@ class ShadowMap { uint32_t m_max_keys; using mutex = iomgr::FiberManagerLib::shared_mutex; mutex m_mutex; - // #define SHOWM(X) cout << #X " = " << (X) << endl - // void testPrint(std::map< uint32_t, std::string >& m_map, int i) { - // SHOWM(m[i]); - // SHOWM(m.find(i)->first); - // } +//#define SHOWM(X) cout << #X " = " << (X) << endl +// void testPrint(std::map< uint32_t, std::string >& m_map, int i) { +// SHOWM(m[i]); +// SHOWM(m.find(i)->first); +// } public: ShadowMap(uint32_t num_keys) : m_range_scheduler(num_keys), m_max_keys{num_keys} {} @@ -40,7 +42,7 @@ class ShadowMap { for (uint32_t i{0}; i < count; ++i) { K key{start_k + i}; V range_value{val}; - if constexpr (std::is_same_v< V, TestIntervalValue >) { range_value.shift(i, nullptr); } + if constexpr (std::is_same_v< V, TestIntervalValue >) { range_value.shift(i); } m_map.insert_or_assign(key, range_value); } m_range_scheduler.put_keys(start_k, start_k + count - 1); @@ -141,33 +143,34 @@ class ShadowMap { m_range_scheduler.remove_keys(start_key.key(), end_key.key()); } - std::vector< std::pair< K, bool > > diff(ShadowMap< K, V > const& other) { + std::vector< std::pair< K, ShadowMapDelta > > diff(ShadowMap< K, V > const& other) { auto it1 = m_map.begin(); auto it2 = other.m_map.begin(); - std::vector< std::pair< K, bool > > ret_diff; + std::vector< std::pair< K, ShadowMapDelta > > ret_diff; while ((it1 != m_map.end()) && (it2 != other.m_map.end())) { auto const x = it1->first.compare(it2->first); if (x == 0) { + if (it1->second != it2->second) { ret_diff.emplace_back(it1->first, ShadowMapDelta::Updated); } ++it1; ++it2; } else if (x < 0) { // Has in current map, add it to addition - ret_diff.emplace_back(it1->first, true /* addition */); + ret_diff.emplace_back(it1->first, ShadowMapDelta::Added); ++it1; } else { - ret_diff.emplace_back(it2->first, false /* addition */); + ret_diff.emplace_back(it2->first, ShadowMapDelta::Removed); ++it2; } } while (it1 != m_map.end()) { - ret_diff.emplace_back(it1->first, true /* addition */); + ret_diff.emplace_back(it1->first, ShadowMapDelta::Added); ++it1; } while (it2 != other.m_map.end()) { - ret_diff.emplace_back(it2->first, false /* addition */); + ret_diff.emplace_back(it2->first, ShadowMapDelta::Removed); ++it2; } return ret_diff; @@ -236,7 +239,7 @@ class ShadowMap { std::lock_guard lock{m_mutex}; std::ofstream file(filename); for (const auto& [key, value] : m_map) { - file << key << " " << value << '\n'; + file << key.key() << " " << value << '\n'; } file.close(); LOGINFO("Saved shadow map to file: {}", filename); @@ -247,11 +250,12 @@ class ShadowMap { std::ifstream file(filename); if (file.is_open()) { m_map.clear(); - K key; + uint64_t k; V value; - while (file >> key >> value) { + while (file >> k >> value) { + K key{k}; m_map.emplace(key, std::move(value)); - m_range_scheduler.put_key(key.key()); + m_range_scheduler.put_key(k); } file.close(); } diff --git a/src/tests/index_btree_benchmark.cpp b/src/tests/index_btree_benchmark.cpp index 02cb065f9..589433524 100644 --- a/src/tests/index_btree_benchmark.cpp +++ b/src/tests/index_btree_benchmark.cpp @@ -40,7 +40,6 @@ using namespace homestore; // this is used to splite the setup and teardown from the benchmark to get a more accurate result void* g_btree_helper{nullptr}; -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) SISL_OPTIONS_ENABLE(logging, index_btree_benchmark, iomgr, test_common_setup) 
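The ShadowMap::diff() rework above replaces the old boolean "addition" flag with a three-way ShadowMapDelta, which is what BtreeTestHelper::reapply_after_crash() (earlier in this patch) switches on to re-issue a put for Added/Updated keys and a remove for Removed keys. The classification itself is a standard merge walk over two sorted maps; below is a minimal self-contained sketch of the same logic, using plain uint64_t keys and int values in place of the test's key/value classes:

    #include <cstdint>
    #include <map>
    #include <utility>
    #include <vector>

    enum class Delta : uint8_t { Added, Removed, Updated };

    // 'mine' plays the role of the live shadow map, 'snap' the snapshot saved before the crash.
    std::vector< std::pair< uint64_t, Delta > > diff(std::map< uint64_t, int > const& mine,
                                                     std::map< uint64_t, int > const& snap) {
        std::vector< std::pair< uint64_t, Delta > > out;
        auto it1 = mine.begin();
        auto it2 = snap.begin();
        while ((it1 != mine.end()) && (it2 != snap.end())) {
            if (it1->first == it2->first) {
                // Key present on both sides; report only if the value changed.
                if (it1->second != it2->second) { out.emplace_back(it1->first, Delta::Updated); }
                ++it1;
                ++it2;
            } else if (it1->first < it2->first) {
                out.emplace_back(it1->first, Delta::Added); // only in the live map
                ++it1;
            } else {
                out.emplace_back(it2->first, Delta::Removed); // only in the snapshot
                ++it2;
            }
        }
        for (; it1 != mine.end(); ++it1) { out.emplace_back(it1->first, Delta::Added); }
        for (; it2 != snap.end(); ++it2) { out.emplace_back(it2->first, Delta::Removed); }
        return out;
    }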
SISL_OPTION_GROUP(index_btree_benchmark, diff --git a/src/tests/log_dev_benchmark.cpp b/src/tests/log_dev_benchmark.cpp index a16b616e7..dc1f3e3ba 100644 --- a/src/tests/log_dev_benchmark.cpp +++ b/src/tests/log_dev_benchmark.cpp @@ -28,7 +28,7 @@ #include "logstore/log_dev.hpp" -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) + RCU_REGISTER_INIT static constexpr size_t ITERATIONS{100000}; diff --git a/src/tests/log_store_benchmark.cpp b/src/tests/log_store_benchmark.cpp index 986ab1cc7..c34db76a3 100644 --- a/src/tests/log_store_benchmark.cpp +++ b/src/tests/log_store_benchmark.cpp @@ -35,7 +35,7 @@ #include "test_common/homestore_test_common.hpp" using namespace homestore; -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) + SISL_OPTIONS_ENABLE(logging, log_store_benchmark, iomgr, test_common_setup) SISL_OPTION_GROUP(log_store_benchmark, diff --git a/src/tests/test_append_blkalloc.cpp b/src/tests/test_append_blkalloc.cpp index a1f3f515b..09050fcf6 100644 --- a/src/tests/test_append_blkalloc.cpp +++ b/src/tests/test_append_blkalloc.cpp @@ -44,9 +44,8 @@ //////////////////////////////////////////////////////////////////////////////////////////////////// RCU_REGISTER_INIT -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) + SISL_OPTIONS_ENABLE(logging, test_append_blkalloc, iomgr, test_common_setup) -SISL_LOGGING_DECL(test_append_blkalloc) constexpr uint64_t Ki{1024}; constexpr uint64_t Mi{Ki * Ki}; diff --git a/src/tests/test_blk_cache_queue.cpp b/src/tests/test_blk_cache_queue.cpp index e4b75ef26..c74199279 100644 --- a/src/tests/test_blk_cache_queue.cpp +++ b/src/tests/test_blk_cache_queue.cpp @@ -28,7 +28,7 @@ #include "blkalloc/varsize_blk_allocator.h" #include "blkalloc/blk_cache_queue.h" -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) + using namespace homestore; @@ -50,8 +50,8 @@ struct BlkCacheQueueTest : public ::testing::Test { virtual ~BlkCacheQueueTest() override = default; protected: - virtual void SetUp() override {}; - virtual void TearDown() override {}; + virtual void SetUp() override{}; + virtual void TearDown() override{}; void SetUp(const uint32_t nslabs, const uint32_t count_per_slab) { m_nslabs = nslabs; diff --git a/src/tests/test_blk_read_tracker.cpp b/src/tests/test_blk_read_tracker.cpp index e1d3ec00c..8c372cf55 100644 --- a/src/tests/test_blk_read_tracker.cpp +++ b/src/tests/test_blk_read_tracker.cpp @@ -25,8 +25,7 @@ using namespace homestore; -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) -SISL_OPTIONS_ENABLE(logging, test_blk_read_tracker, nuraft_mesg) +SISL_OPTIONS_ENABLE(logging, test_blk_read_tracker) VENUM(op_type_t, uint8_t, insert = 0, remove = 1, wait_on = 2, max_op = 3); class BlkReadTrackerTest : public testing::Test { diff --git a/src/tests/test_blkalloc.cpp b/src/tests/test_blkalloc.cpp index 1860441f1..51fecf9e5 100644 --- a/src/tests/test_blkalloc.cpp +++ b/src/tests/test_blkalloc.cpp @@ -41,7 +41,7 @@ #include "blkalloc/fixed_blk_allocator.h" #include "blkalloc/varsize_blk_allocator.h" -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) + using namespace homestore; @@ -434,7 +434,8 @@ struct VarsizeBlkAllocatorTest : public ::testing::Test, BlkAllocatorTest { void create_allocator(const bool use_slabs = true, uint64_t size = 0) { if (size == 0) { size = static_cast< uint64_t >(m_total_count); } - VarsizeBlkAllocConfig cfg{4096, 4096, 4096u, size * 4096, false, "", use_slabs}; + VarsizeBlkAllocConfig cfg{4096, 4096, 4096u, size * 4096, + false, "", use_slabs}; m_allocator = std::make_unique< VarsizeBlkAllocator >(cfg, true, 0); } @@ -455,7 +456,6 @@ struct VarsizeBlkAllocatorTest : public ::testing::Test, 
BlkAllocatorTest { return false; } if (ret == BlkAllocStatus::SUCCESS) { -#if 0 if (is_contiguous) { if (bids.size() != 1) { { @@ -466,7 +466,6 @@ struct VarsizeBlkAllocatorTest : public ::testing::Test, BlkAllocatorTest { return false; } } -#endif blk_count_t sz{0}; for (auto& bid : bids) { @@ -640,15 +639,18 @@ namespace { void alloc_free_var_contiguous_unirandsize(VarsizeBlkAllocatorTest* const block_test_pointer, uint64_t capacity) { const auto nthreads{ std::clamp< uint32_t >(std::thread::hardware_concurrency(), 2, SISL_OPTIONS["num_threads"].as< uint32_t >())}; - auto max_rand_size{std::max(capacity / 4096, uint64_t(2))}; + auto max_rand_size{std::max(capacity/4096, uint64_t(2))}; std::uniform_int_distribution< blk_count_t > s_rand_size_generator{1, static_cast< blk_count_t >(max_rand_size)}; - auto rand_func = [&s_rand_size_generator]() -> blk_count_t { return s_rand_size_generator(g_re); }; + auto rand_func = [&s_rand_size_generator]() -> blk_count_t { + return s_rand_size_generator(g_re); + }; const uint8_t prealloc_pct{5}; LOGINFO("Step 1: Pre allocate {}% of total blks which is {} blks in {} threads", prealloc_pct, capacity * prealloc_pct / 100, nthreads); [[maybe_unused]] const auto preload_alloced{ - block_test_pointer->preload(capacity * prealloc_pct / 100, true /* is_contiguous */, rand_func, true)}; + block_test_pointer->preload(capacity * prealloc_pct / 100, true /* is_contiguous */, + rand_func, true)}; auto num_iters{SISL_OPTIONS["iters"].as< uint64_t >()}; const uint64_t divisor{1024}; @@ -660,7 +662,8 @@ void alloc_free_var_contiguous_unirandsize(VarsizeBlkAllocatorTest* const block_ const uint8_t runtime_pct{10}; LOGINFO("Step 2: Do alloc/free contiguous blks with completely random size ratio_range=[{}-{}] threads={} iters={}", prealloc_pct, runtime_pct, nthreads, num_iters); - const auto result{block_test_pointer->do_alloc_free(num_iters, true /* is_contiguous */, rand_func, runtime_pct, + const auto result{block_test_pointer->do_alloc_free(num_iters, true /* is_contiguous */, + rand_func, runtime_pct, false /* round_blks */, true)}; } } // namespace diff --git a/src/tests/test_blkid.cpp b/src/tests/test_blkid.cpp index 0a6307d32..93a1813b8 100644 --- a/src/tests/test_blkid.cpp +++ b/src/tests/test_blkid.cpp @@ -7,8 +7,7 @@ #include -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) -SISL_OPTIONS_ENABLE(logging, test_blkid, nuraft_mesg) +SISL_OPTIONS_ENABLE(logging, test_blkid) SISL_OPTION_GROUP(test_blkid, (num_iterations, "", "num_iterations", "number of iterations", diff --git a/src/tests/test_btree.cpp b/src/tests/test_btree.cpp new file mode 100644 index 000000000..730e13cc5 --- /dev/null +++ b/src/tests/test_btree.cpp @@ -0,0 +1,532 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. 
+ * + *********************************************************************************/ +#include +#include + +#include +#include "common/homestore_config.hpp" +#include "common/resource_mgr.hpp" +#include "test_common/homestore_test_common.hpp" +#include "test_common/range_scheduler.hpp" +#include "btree_helpers/btree_test_helper.hpp" +#include "btree_helpers/btree_test_kvs.hpp" +#include "btree_helpers/btree_decls.h" + +using namespace homestore; + +SISL_OPTIONS_ENABLE(logging, test_btree, iomgr, test_common_setup) + +// TODO Add tests to do write, remove after recovery. +// TODO Test with var len key when io mgr page size is 512. + +SISL_OPTION_GROUP( + test_btree, + (test_type, "", "test_type", "What type of test, [unit | functional | stress ]", + ::cxxopts::value< std::string >()->default_value("unit"), "string"), + (num_ios, "", "num_ios", "[override] number of io operations to test", ::cxxopts::value< uint32_t >(), "number"), + (num_entries, "", "num_entries", "[override] number of entries per btree", ::cxxopts::value< uint32_t >(), + "number"), + (run_time, "", "run_time", "[override] run time for io", ::cxxopts::value< uint32_t >(), "seconds"), + (disable_merge, "", "disable_merge", "disable_merge", ::cxxopts::value< bool >()->default_value("0"), ""), + (preload_size, "", "preload_size", "[override] number of entries to preload tree with", + ::cxxopts::value< uint32_t >(), "number"), + (operation_list, "", "operation_list", "operation list instead of default, given as operation followed by percentage", + ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), + (seed, "", "seed", "random engine seed, use random if not defined", + ::cxxopts::value< uint64_t >()->default_value("0"), "number")) + +void log_obj_life_counter() { + std::string str; + sisl::ObjCounterRegistry::foreach ([&str](const std::string& name, int64_t created, int64_t alive) { + fmt::format_to(std::back_inserter(str), "{}: created={} alive={}\n", name, created, alive); + }); + LOGINFO("Object Life Counter:\n{}", str); +} + +BtreeTestOptions g_opts; + +static void set_options() { + if (SISL_OPTIONS["test_type"].as< std::string >() == "unit") { + g_opts.num_entries = 5000; + g_opts.preload_size = 2500; + g_opts.num_ios = 500; + g_opts.run_time_secs = 36000; // Limit is on ios rather than time + } else if (SISL_OPTIONS["test_type"].as< std::string >() == "functional") { + g_opts.num_entries = 50000; + g_opts.preload_size = 25000; + g_opts.num_ios = 50000; + g_opts.run_time_secs = 36000; // Limit is on ios rather than time + } + + if (SISL_OPTIONS.count("num_entries")) { g_opts.num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); } + if (SISL_OPTIONS.count("preload_size")) { g_opts.preload_size = SISL_OPTIONS["preload_size"].as< uint32_t >(); } + if (SISL_OPTIONS.count("num_ios")) { g_opts.num_ios = SISL_OPTIONS["num_ios"].as< uint32_t >(); } + if (SISL_OPTIONS.count("run_time")) { g_opts.run_time_secs = SISL_OPTIONS["run_time"].as< uint32_t >(); } + if (SISL_OPTIONS.count("disable_merge")) { g_opts.disable_merge = SISL_OPTIONS["disable_merge"].as< bool >(); } + + if (SISL_OPTIONS.count("seed")) { + LOGINFO("Using seed {} to seed the random generator", SISL_OPTIONS["seed"].as< uint64_t >()); + g_re.seed(SISL_OPTIONS["seed"].as< uint64_t >()); + } +} + +template < typename TestType > +struct BtreeTest : public BtreeTestHelper< TestType >, public ::testing::Test { + using T = TestType; + using K = typename TestType::KeyType; + using V = typename TestType::ValueType; + + class TestIndexServiceCallbacks : public 
IndexServiceCallbacks { + public: + TestIndexServiceCallbacks(BtreeTest* test) : m_test(test) {} + std::shared_ptr< Index > on_index_table_found(superblk< IndexSuperBlock >&& sb) override { + LOGINFO("Index table recovered"); + m_test->SetUp(std::make_shared< Btree< K, V > >(m_test->m_cfg, std::move(sb)), true /* load */, + m_test->m_multi_threaded); + return m_test->m_bt; + } + + private: + BtreeTest* m_test; + }; + + BtreeTest() : BtreeTestHelper< TestType >::BtreeTestHelper(g_opts), testing::Test() {} + + using BtreeTestHelper< TestType >::SetUp; + + void SetUp() override { + if (TestType::store_type == IndexStore::Type::MEM_BTREE) { + m_helper.start_homestore( + "test_btree", + {{ServiceType::META, {.size_pct = 100.0}}, + {ServiceType::INDEX, {.size_pct = 0.0, .index_svc_cbs = new TestIndexServiceCallbacks(this)}}}, + nullptr, + {homestore::dev_info{"", homestore::HSDevType::Data, 64 * 1024 * 1024}, + homestore::dev_info{"", homestore::HSDevType::Data, 64 * 1024 * 1024}}); + // For mem btree, create only small devices + } else { + m_helper.start_homestore( + "test_btree", + {{ServiceType::META, {.size_pct = 10.0}}, + {ServiceType::INDEX, {.size_pct = 70.0, .index_svc_cbs = new TestIndexServiceCallbacks(this)}}}, + nullptr, {homestore::dev_info{"", homestore::HSDevType::Fast, 0}}); + // For persistent btree, we try to create a default size, but with only 1 device explicitly, since these tests + // restart homestore several times and it is better to always use 1 disk. + } + + auto uuid = boost::uuids::random_generator()(); + auto parent_uuid = boost::uuids::random_generator()(); + + // Test cp flush of write back. + HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { + s.generic.cache_max_throttle_cnt = 10000; + HS_SETTINGS_FACTORY().save(); + }); + homestore::hs()->resource_mgr().reset_dirty_buf_qd(); + + // Create index table and attach to index service. 
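+ // (Fresh-start path: the table is created here and registered below; after restart_homestore() the index service instead replays the persisted superblock and calls on_index_table_found() above, which rebuilds the Btree from the superblock and returns it so the service can re-attach it.)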
+ m_multi_threaded = + (testing::UnitTest::GetInstance()->current_test_info()->name() == std::string("ConcurrentMultiOps")); + BtreeTestHelper< TestType >::SetUp(std::make_shared< Btree< K, V > >(this->m_cfg, uuid, parent_uuid, 0), + false /* load */, m_multi_threaded); + hs()->index_service().add_index_table(this->m_bt); + LOGINFO("Added index table to index service"); + } + + void TearDown() override { + destroy_btree(); + BtreeTestHelper< TestType >::TearDown(); + m_helper.shutdown_homestore(false); + log_obj_life_counter(); + } + + void restart_homestore() { + m_helper.params(HS_SERVICE::INDEX).index_svc_cbs = new TestIndexServiceCallbacks(this); + this->m_bt.reset(); + m_helper.restart_homestore(); + } + + void destroy_btree() { + hs()->index_service().destroy_index_table(this->m_bt); + this->m_bt.reset(); + } + + test_common::HSTestHelper m_helper; + bool m_multi_threaded{false}; +}; + +using BtreeTypes = + testing::Types< FixedLenBtree< IndexStore::Type::MEM_BTREE >, // In memory fixed key/value sized btree + VarKeySizeBtree< IndexStore::Type::MEM_BTREE >, // In memory var key, but fixed value sized btree + VarValueSizeBtree< IndexStore::Type::MEM_BTREE >, // In memory fixed key, var value sized btree + VarObjSizeBtree< IndexStore::Type::MEM_BTREE >, // In memory var sized key/value btree + PrefixIntervalBtree< IndexStore::Type::MEM_BTREE >, // In memory interval key/value btree + FixedLenBtree< IndexStore::Type::COPY_ON_WRITE_BTREE >, // COW fixed key/value sized btree + VarKeySizeBtree< IndexStore::Type::COPY_ON_WRITE_BTREE >, // COW var key, fixed value sized btree + VarValueSizeBtree< IndexStore::Type::COPY_ON_WRITE_BTREE >, // COW fixed key, var value sized btree + VarObjSizeBtree< IndexStore::Type::COPY_ON_WRITE_BTREE >, // COW var sized key/value btree + PrefixIntervalBtree< IndexStore::Type::COPY_ON_WRITE_BTREE > // COW interval key/value btree + >; + +TYPED_TEST_SUITE(BtreeTest, BtreeTypes); + +TYPED_TEST(BtreeTest, SequentialInsert) { + LOGINFO("SequentialInsert test start"); + // Forward sequential insert + const auto entries_iter1 = g_opts.num_entries / 2; + LOGINFO("Step 1: Do Forward sequential insert for {} entries", entries_iter1); + for (uint32_t i{0}; i < entries_iter1; ++i) { + this->put(i, btree_put_type::INSERT); + // this->print(); + } + LOGINFO("Step 2: Query {} entries and validate with pagination of 75 entries", entries_iter1); + this->do_query(0, entries_iter1 - 1, 75); + + // Reverse sequential insert + const auto entries_iter2 = g_opts.num_entries - entries_iter1; + LOGINFO("Step 3: Do Reverse sequential insert of remaining {} entries", entries_iter2); + for (uint32_t i{g_opts.num_entries - 1}; i >= entries_iter1; --i) { + this->put(i, btree_put_type::INSERT); + } + LOGINFO("Step 4: Query {} entries and validate with pagination of 90 entries", entries_iter2); + this->do_query(entries_iter1, g_opts.num_entries - 1, 90); + + // Do validate all of them + LOGINFO("Step 5: Query all entries and validate with no pagination"); + this->query_all(); + + LOGINFO("Step 6: Query all entries and validate with pagination of 80 entries"); + this->query_all_paginate(80); + + LOGINFO("Step 7: Get all entries 1-by-1 and validate them"); + this->get_all(); + this->get_any(g_opts.num_entries - 3, g_opts.num_entries + 1); + + // Negative cases + LOGINFO("Step 8: Do incorrect input and validate errors"); + this->do_query(g_opts.num_entries + 100, g_opts.num_entries + 500, 5); + this->get_any(g_opts.num_entries + 1, g_opts.num_entries + 2); + + LOGINFO("SequentialInsert test 
end"); +} + +TYPED_TEST(BtreeTest, RandomInsert) { + // Forward sequential insert + std::vector< uint32_t > vec(g_opts.num_entries); + // make keys [0, num_entries) + iota(vec.begin(), vec.end(), 0); + // shuffle keys + std::random_shuffle(vec.begin(), vec.end()); + LOGINFO("Step 1: Do forward random insert for {} entries", g_opts.num_entries); + for (uint32_t i{0}; i < g_opts.num_entries; ++i) { + this->put(vec[i], btree_put_type::INSERT); + } + this->get_all(); +} + +TYPED_TEST(BtreeTest, SequentialRemove) { + LOGINFO("SequentialRemove test start"); + // Forward sequential insert + LOGINFO("Step 1: Do Forward sequential insert for {} entries", g_opts.num_entries); + for (uint32_t i{0}; i < g_opts.num_entries; ++i) { + this->put(i, btree_put_type::INSERT); + } + LOGINFO("Step 2: Query {} entries and validate with pagination of 75 entries", g_opts.num_entries); + this->do_query(0, g_opts.num_entries - 1, 75); + + const auto entries_iter1 = g_opts.num_entries / 2; + try { + LOGINFO("Step 3: Do Forward sequential remove for {} entries", entries_iter1); + for (uint32_t i{0}; i < entries_iter1; ++i) { + this->remove_one(i); + } + } catch (std::exception& e) { assert(false); } + LOGINFO("Step 4: Query {} entries and validate with pagination of 75 entries", entries_iter1); + this->do_query(0, entries_iter1 - 1, 75); + this->do_query(entries_iter1, g_opts.num_entries - 1, 75); + + const auto entries_iter2 = g_opts.num_entries - entries_iter1; + LOGINFO("Step 5: Do Reverse sequential remove of remaining {} entries", entries_iter2); + for (uint32_t i{g_opts.num_entries - 1}; i >= entries_iter1; --i) { + this->remove_one(i); + } + + LOGINFO("Step 6: Query the empty tree"); + this->do_query(0, g_opts.num_entries - 1, 75); + this->get_any(0, 1); + this->get_specific(0); + LOGINFO("SequentialRemove test end"); +} + +TYPED_TEST(BtreeTest, SimpleRemoveRange) { + // Forward sequential insert + const auto num_entries = 20; + LOGINFO("Step 1: Do forward sequential insert for {} entries", num_entries); + for (uint32_t i{0}; i < num_entries; ++i) { + this->put(i, btree_put_type::INSERT); + } + LOGINFO("Step 2: Do range remove for {} entries", num_entries); + // this->print_keys(); // EXPECT size = 20 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 + this->range_remove_any(5, 10); + // this->print_keys(); // EXPECT size = 14 : 0 1 2 3 4 [5 6 7 8 9 10] 11 12 13 14 15 16 17 18 19 + this->range_remove_any(0, 2); + // this->print_keys(); // EXPECT size = 11 : [0 1 2] 3 4 11 12 13 14 15 16 17 18 19 + this->range_remove_any(18, 19); + // this->print_keys(); // EXPECT size = 9 : 3 4 11 12 13 14 15 16 17 [18 19] + this->range_remove_any(17, 17); + // this->print_keys(); // EXPECT size = 8 : 3 4 11 12 13 14 15 16 [17] + this->range_remove_any(1, 5); + // this->print_keys(); // EXPECT size = 6 : [3 4] 11 12 13 14 15 16 + this->range_remove_any(1, 20); + // this->print_keys(); // EXPECT size = 0 : [11 12 13 14 15 16] + + this->query_all(); + // this->query_validate(0, num_entries , 75); +} + +TYPED_TEST(BtreeTest, RandomRemove) { + // Forward sequential insert + LOGINFO("Step 1: Do forward sequential insert for {} entries", g_opts.num_entries); + for (uint32_t i{0}; i < g_opts.num_entries; ++i) { + this->put(i, btree_put_type::INSERT); + } + + std::vector< uint32_t > vec(g_opts.num_entries); + iota(vec.begin(), vec.end(), 0); + + // shuffle keys in [0, num_entries) + std::random_shuffle(vec.begin(), vec.end()); + LOGINFO("Step 2: Do remove one by one for {} entries", g_opts.num_entries); + for (uint32_t i{0}; i < 
g_opts.num_entries; ++i) { + this->remove_one(vec[i]); + } + this->get_all(); +} + +TYPED_TEST(BtreeTest, RandomRemoveRange) { + // Forward sequential insert + LOGINFO("Step 1: Do forward sequential insert for {} entries", g_opts.num_entries); + for (uint32_t i{0}; i < g_opts.num_entries; ++i) { + this->put(i, btree_put_type::INSERT); + } + // generate keys including out of bound + static thread_local std::uniform_int_distribution< uint32_t > s_rand_key_generator{0, g_opts.num_entries}; + // this->print_keys(); + LOGINFO("Step 2: Do range remove for maximum of {} iterations", g_opts.num_ios); + for (uint32_t i{0}; i < g_opts.num_ios; ++i) { + uint32_t key1 = s_rand_key_generator(g_re); + uint32_t key2 = s_rand_key_generator(g_re); + + // LOGINFO("Step 2 - {}: Do Range Remove of maximum [{},{}] keys ", i, start_key, end_key); + this->range_remove_any(std::min(key1, key2), std::max(key1, key2)); + // this->print_keys(); + } + + this->query_all(); +} + +TYPED_TEST(BtreeTest, RangeUpdate) { + LOGINFO("RangeUpdate test start"); + // Forward sequential insert + LOGINFO("Step 1: Do Forward sequential insert for {} entries", g_opts.num_entries); + for (uint32_t i{0}; i < g_opts.num_entries; ++i) { + this->put(i, btree_put_type::INSERT); + } + + LOGINFO("Step 2: Do Range Update of random intervals between [1-50] for 100 times with random key ranges"); + for (uint32_t i{0}; i < 100; ++i) { + this->range_put_random(); + } + + LOGINFO("Step 2: Query {} entries and validate with pagination of 75 entries", g_opts.num_entries); + this->do_query(0, g_opts.num_entries - 1, 75); + LOGINFO("RangeUpdate test end"); +} + +TYPED_TEST(BtreeTest, CpFlush) { + using TestT = typename TestFixture::T; + if (TestT::store_type == IndexStore::Type::MEM_BTREE) { GTEST_SKIP(); } + + LOGINFO("CpFlush test start"); + LOGINFO("Do Forward sequential insert for {} entries", g_opts.num_entries / 2); + for (uint32_t i = 0; i < g_opts.num_entries; ++i) { + this->put(i, btree_put_type::INSERT); + } + + // Remove some of the entries. + for (uint32_t i = 0; i < g_opts.num_entries; i += 10) { + this->remove_one(i); + } + + LOGINFO("Query {} entries and validate with pagination of 75 entries", g_opts.num_entries / 2); + this->do_query(0, g_opts.num_entries / 2 - 1, 75); + + LOGINFO("Trigger checkpoint flush."); + test_common::HSTestHelper::trigger_cp(true /* wait */); + + LOGINFO("Query {} entries and validate with pagination of 75 entries", g_opts.num_entries); + this->do_query(0, g_opts.num_entries - 1, 75); + + this->dump_to_file(std::string("before.txt")); + + // Restart homestore. m_bt is updated by the TestIndexServiceCallback. 
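+ // (The dump-before / restart / dump-after / compare_files sequence is the actual persistence check: if the checkpoint flushed every dirty node, the recovered tree must serialize identically to the pre-restart dump.)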
+ this->restart_homestore(); + + std::this_thread::sleep_for(std::chrono::seconds{1}); + LOGINFO("Restarted homestore with index recovered"); + + this->dump_to_file(std::string("after.txt")); + + LOGINFO("Query {} entries", g_opts.num_entries); + this->do_query(0, g_opts.num_entries - 1, 1000); + + this->compare_files("before.txt", "after.txt"); + LOGINFO("CpFlush test end"); +} + +TYPED_TEST(BtreeTest, MultipleCpFlush) { + using TestT = typename TestFixture::T; + if (TestT::store_type == IndexStore::Type::MEM_BTREE) { GTEST_SKIP(); } + + LOGINFO("MultipleCpFlush test start"); + + LOGINFO("Do Forward sequential insert for {} entries", g_opts.num_entries / 2); + for (uint32_t i = 0; i < g_opts.num_entries / 2; ++i) { + this->put(i, btree_put_type::INSERT); + if (i % 500 == 0) { + LOGINFO("Trigger checkpoint flush wait=false."); + test_common::HSTestHelper::trigger_cp(false /* wait */); + } + } + + LOGINFO("Trigger checkpoint flush wait=false."); + test_common::HSTestHelper::trigger_cp(false /* wait */); + + for (uint32_t i = g_opts.num_entries / 2; i < g_opts.num_entries; ++i) { + this->put(i, btree_put_type::INSERT); + } + + LOGINFO("Trigger checkpoint flush wait=false."); + test_common::HSTestHelper::trigger_cp(false /* wait */); + + LOGINFO("Trigger checkpoint flush wait=true."); + test_common::HSTestHelper::trigger_cp(true /* wait */); + + LOGINFO("Query {} entries and validate with pagination of 75 entries", g_opts.num_entries); + this->do_query(0, g_opts.num_entries - 1, 75); + + this->dump_to_file(std::string("before.txt")); + + // Restart homestore. m_bt is updated by the TestIndexServiceCallback. + this->restart_homestore(); + + std::this_thread::sleep_for(std::chrono::seconds{1}); + LOGINFO(" Restarted homestore with index recovered"); + this->dump_to_file(std::string("after.txt")); + + this->compare_files("before.txt", "after.txt"); + + LOGINFO("Query {} entries and validate with pagination of 1000 entries", g_opts.num_entries); + this->do_query(0, g_opts.num_entries - 1, 1000); + LOGINFO("MultipleCpFlush test end"); +} + +TYPED_TEST(BtreeTest, ThreadedCpFlush) { + using TestT = typename TestFixture::T; + if (TestT::store_type == IndexStore::Type::MEM_BTREE) { GTEST_SKIP(); } + + LOGINFO("ThreadedCpFlush test start"); + + bool stop = false; + std::atomic< uint32_t > last_index{0}; + auto insert_io_thread = std::thread([this, &last_index] { + LOGINFO("Do Forward sequential insert for {} entries", g_opts.num_entries); + uint32_t j = 0; + for (uint32_t i = 0; i < g_opts.num_entries; ++i) { + this->put(i, btree_put_type::INSERT); + last_index = i; + } + }); + + auto remove_io_thread = std::thread([this, &stop, &last_index] { + LOGINFO("Do random removes for {} entries", g_opts.num_entries); + while (!stop) { + std::this_thread::sleep_for(std::chrono::milliseconds{10}); + // Remove a random entry. 
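+ // (last_index is a std::atomic published by the inserter thread, so loading it here without extra locking is safe; it bounds the random pick to keys that have already been inserted.)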
+ std::uniform_int_distribution< uint32_t > rand{0, last_index.load()}; + auto rm_idx = rand(g_re); + this->remove_one(rm_idx); + } + }); + + auto cp_flush_thread = std::thread([this, &stop] { + while (!stop) { + std::this_thread::sleep_for(std::chrono::seconds{1}); + LOGINFO("Trigger checkpoint flush wait=false."); + test_common::HSTestHelper::trigger_cp(false /* wait */); + LOGINFO("Trigger checkpoint flush wait=false done."); + } + }); + + insert_io_thread.join(); + stop = true; + remove_io_thread.join(); + cp_flush_thread.join(); + + LOGINFO("Trigger checkpoint flush wait=true."); + test_common::HSTestHelper::trigger_cp(true /* wait */); + + LOGINFO("Query {} entries and validate with pagination of 75 entries", g_opts.num_entries); + this->do_query(0, g_opts.num_entries - 1, 75); + + this->dump_to_file(std::string("before.txt")); + + // Restart homestore. m_bt is updated by the TestIndexServiceCallback. + this->restart_homestore(); + + std::this_thread::sleep_for(std::chrono::seconds{1}); + LOGINFO("Restarted homestore with index recovered"); + this->dump_to_file(std::string("after.txt")); + + this->compare_files("before.txt", "after.txt"); + + LOGINFO("Query {} entries and validate with pagination of 1000 entries", g_opts.num_entries); + this->do_query(0, g_opts.num_entries - 1, 1000); + LOGINFO("ThreadedCpFlush test end"); +} + +TYPED_TEST(BtreeTest, ConcurrentMultiOps) { + // range put is not supported for non-extent keys + std::vector< std::string > input_ops = {"put:18", "remove:14", "range_put:20", "range_remove:2", "query:10"}; + if (SISL_OPTIONS.count("operation_list")) { + input_ops = SISL_OPTIONS["operation_list"].as< std::vector< std::string > >(); + } + auto ops = this->build_op_list(input_ops); + + this->multi_op_execute(ops); +} + +int main(int argc, char* argv[]) { + int parsed_argc{argc}; + ::testing::InitGoogleTest(&parsed_argc, argv); + SISL_OPTIONS_LOAD(parsed_argc, argv, logging, test_btree, iomgr, test_common_setup); + sisl::logging::SetLogger("test_btree"); + spdlog::set_pattern("[%D %T%z] [%^%L%$] [%t] %v"); + + set_options(); + auto ret = RUN_ALL_TESTS(); + return ret; +} diff --git a/src/tests/test_index_btree.cpp b/src/tests/test_btree_long_running similarity index 72% rename from src/tests/test_index_btree.cpp rename to src/tests/test_btree_long_running index f02537281..3c9ff5ffa 100644 --- a/src/tests/test_index_btree.cpp +++ b/src/tests/test_btree_long_running @@ -27,9 +27,9 @@ using namespace homestore; -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) + SISL_OPTIONS_ENABLE(logging, test_index_btree, iomgr, test_common_setup) -SISL_LOGGING_DECL(test_index_btree) + // TODO Add tests to do write,remove after recovery. // TODO Test with var len key with io mgr page size is 512. 
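For reference on how the op mix in ConcurrentMultiOps above is consumed: build_op_list() turns each "name:percentage" string into a (name, weight) pair, and run_in_parallel() (see the btree_test_helper changes earlier in this patch) feeds the weights into a std::discrete_distribution so every fiber draws its next operation with the requested frequency. A small standalone sketch of that selection mechanism, with placeholder op names and weights:

    #include <iostream>
    #include <random>
    #include <string>
    #include <utility>
    #include <vector>

    int main() {
        // (name, weight) pairs, as build_op_list() would produce from "put:18", "remove:14", "query:10"
        std::vector< std::pair< std::string, int > > op_list{{"put", 18}, {"remove", 14}, {"query", 10}};

        std::vector< uint32_t > weights;
        for (auto const& op : op_list) {
            weights.push_back(op.second);
        }

        // Draws index i with probability weights[i] / sum(weights), the same way run_in_parallel() does.
        std::random_device rd{};
        std::default_random_engine re{rd()};
        std::discrete_distribution< uint32_t > rand_op(weights.begin(), weights.end());

        std::vector< uint32_t > counts(op_list.size(), 0);
        for (int i = 0; i < 1000; ++i) { ++counts[rand_op(re)]; }
        for (size_t i = 0; i < op_list.size(); ++i) {
            std::cout << op_list[i].first << " drawn " << counts[i] << " times\n";
        }
        return 0;
    }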
@@ -44,13 +44,6 @@ SISL_OPTION_GROUP( (disable_merge, "", "disable_merge", "disable_merge", ::cxxopts::value< bool >()->default_value("0"), ""), (operation_list, "", "operation_list", "operation list instead of default created following by percentage", ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), - (preload_size, "", "preload_size", "number of entries to preload tree with", - ::cxxopts::value< uint32_t >()->default_value("1000"), "number"), - (init_device, "", "init_device", "init device", ::cxxopts::value< bool >()->default_value("1"), ""), - (cleanup_after_shutdown, "", "cleanup_after_shutdown", "cleanup after shutdown", - ::cxxopts::value< bool >()->default_value("1"), ""), - (max_merge_level, "", "max_merge_level", "max merge level", ::cxxopts::value< uint8_t >()->default_value("127"), - ""), (seed, "", "seed", "random engine seed, use random if not defined", ::cxxopts::value< uint64_t >()->default_value("0"), "number")) @@ -71,10 +64,9 @@ struct BtreeTest : public BtreeTestHelper< TestType >, public ::testing::Test { class TestIndexServiceCallbacks : public IndexServiceCallbacks { public: TestIndexServiceCallbacks(BtreeTest* test) : m_test(test) {} - std::shared_ptr< IndexTableBase > on_index_table_found(superblk< index_table_sb >&& sb) override { + std::shared_ptr< Index > on_index_table_found(superblk< IndexSuperBlock >&& sb) override { LOGINFO("Index table recovered"); - LOGINFO("Root bnode_id {} version {}", sb->root_node, sb->root_link_version); - m_test->m_bt = std::make_shared< typename T::BtreeType >(std::move(sb), m_test->m_cfg); + m_test->m_bt = std::make_shared< Btree< K, V > >(m_test->m_cfg, std::move(sb)); return m_test->m_bt; } @@ -82,16 +74,59 @@ struct BtreeTest : public BtreeTestHelper< TestType >, public ::testing::Test { BtreeTest* m_test; }; - BtreeTest() : testing::Test() {} + BtreeTest() : testing::Test() { + std::string shadow_results = SISL_OPTIONS["shadow_results"].as< std::string >(); + m_load_shadow_file = m_gen_shadow_file = m_clean_shadow_file = false; + if (shadow_results == "save") { + m_gen_shadow_file = true; + } else if (shadow_results == "load_and_save") { + m_load_shadow_file = true; + m_gen_shadow_file = true; + } else if (shadow_results == "load_and_clean") { + m_load_shadow_file = true; + m_clean_shadow_file = true; + } + } + + void SetUp() override { + if (TestType::store_type == IndexStore::Type::MEM_BTREE) { + m_helper.start_homestore( + "test_btree", + {{ServiceType::META, {.size_pct = 100.0}}, + {ServiceType::INDEX, {.size_pct = 0.0, .index_svc_cbs = new TestIndexServiceCallbacks(this)}}}, + nullptr, + 1 * 1024 * 1024 // For mem btree, let us create small size device + ); + } else { + m_helper.start_homestore( + "test_btree", + {{ServiceType::META, {.size_pct = 10.0}}, + {ServiceType::INDEX, {.size_pct = 70.0, .index_svc_cbs = new TestIndexServiceCallbacks(this)}}}); + } + + auto uuid = boost::uuids::random_generator()(); + auto parent_uuid = boost::uuids::random_generator()(); + + // Test cp flush of write back. + HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { + s.generic.cache_max_throttle_cnt = 10000; + HS_SETTINGS_FACTORY().save(); + }); + homestore::hs()->resource_mgr().reset_dirty_buf_qd(); + + // Create index table and attach to index service. 
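+ // (shadow_results, parsed in the constructor above, is meant to chain runs: "save" persists the shadow map at teardown, "load_and_save" seeds this run from the saved map and re-saves it, and "load_and_clean" loads it for a final verification and then deletes it; in the reload case populate_shadow_map() below cross-checks the loaded map against the recovered tree.)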
+ BtreeTestHelper< TestType >::SetUp(); + this->m_bt = std::make_shared< Btree< K, V > >(this->m_cfg, uuid, parent_uuid, 0); + hs()->index_service().add_index_table(this->m_bt); + LOGINFO("Added index table to index service"); + } void SetUp() override { m_helper.start_homestore( "test_index_btree", {{HS_SERVICE::META, {.size_pct = 10.0}}, - {HS_SERVICE::INDEX, {.size_pct = 70.0, .index_svc_cbs = new TestIndexServiceCallbacks(this)}}}); - - LOGINFO("Node size {} ", hs()->index_service().node_size()); - this->m_cfg = BtreeConfig(hs()->index_service().node_size()); + {HS_SERVICE::INDEX, {.size_pct = 70.0, .index_svc_cbs = new TestIndexServiceCallbacks(this)}}}, + nullptr, {}, SISL_OPTIONS["init_device"].as< bool >()); auto uuid = boost::uuids::random_generator()(); auto parent_uuid = boost::uuids::random_generator()(); @@ -105,18 +140,46 @@ struct BtreeTest : public BtreeTestHelper< TestType >, public ::testing::Test { // Create index table and attach to index service. BtreeTestHelper< TestType >::SetUp(); - this->m_bt = std::make_shared< typename T::BtreeType >(uuid, parent_uuid, 0, this->m_cfg); + if (this->m_bt == nullptr || SISL_OPTIONS["init_device"].as< bool >()) { + this->m_bt = std::make_shared< Btree< K, V > >(this->m_cfg, uuid, parent_uuid, 0); + } else { + populate_shadow_map(); + } + hs()->index_service().add_index_table(this->m_bt); LOGINFO("Added index table to index service"); } + void populate_shadow_map() { + this->m_shadow_map.load(m_shadow_filename); + ASSERT_EQ(this->m_shadow_map.size(), this->m_bt->count_keys(this->m_bt->root_node_id())) + << "shadow map size and tree size mismatch"; + this->get_all(); + } + void TearDown() override { + bool cleanup = SISL_OPTIONS["cleanup_after_shutdown"].as< bool >(); + LOGINFO("cleanup the dump map and index data? 
{}", cleanup); + if (!cleanup) { + this->m_shadow_map.save(m_shadow_filename); + } else { + if (std::filesystem::remove(m_shadow_filename)) { + LOGINFO("File {} removed successfully", m_shadow_filename); + } else { + LOGINFO("Error: failed to remove {}", m_shadow_filename); + } + } + LOGINFO("Teardown with Root bnode_id {} tree size: {}", this->m_bt->root_node_id(), + this->m_bt->count_keys(this->m_bt->root_node_id())); + BtreeTestHelper< TestType >::TearDown(); + m_helper.shutdown_homestore(false); + } + + void TearDown() override { + if (SISL_OPTIONS["shadow_results"] == "load_and_save") {} + destroy_btree(); BtreeTestHelper< TestType >::TearDown(); - auto [interior, leaf] = this->m_bt->compute_node_count(); - LOGINFO("Teardown with Root bnode_id {} tree size: {} btree node count (interior = {} leaf= {})", - this->m_bt->root_node_id(), this->m_bt->count_keys(this->m_bt->root_node_id()), interior, leaf); m_helper.shutdown_homestore(false); - this->m_bt.reset(); log_obj_life_counter(); } @@ -126,18 +189,26 @@ struct BtreeTest : public BtreeTestHelper< TestType >, public ::testing::Test { } void destroy_btree() { - auto cpg = hs()->cp_mgr().cp_guard(); - auto op_context = (void*)cpg.context(cp_consumer_t::INDEX_SVC); - const auto [ret, free_node_cnt] = this->m_bt->destroy_btree(op_context); - ASSERT_EQ(ret, btree_status_t::success) << "btree destroy failed"; + hs()->index_service().remove_index_table(this->m_bt); this->m_bt.reset(); } test_common::HSTestHelper m_helper; + +private: + std::string m_shadow_file_name; + bool m_load_shadow_file{false}; + bool m_gen_shadow_file{false}; + bool m_clean_shadow_file{true}; }; using BtreeTypes = - testing::Types< FixedLenBtree, PrefixIntervalBtree, VarKeySizeBtree, VarValueSizeBtree, VarObjSizeBtree >; + testing::Types< FixedLenBtree< IndexStore::Type::MEM_BTREE >, // In memory fixed key/value sized btree + VarKeySizeBtree< IndexStore::Type::MEM_BTREE >, // In memory var key, but fixed value sized btree + VarValueSizeBtree< IndexStore::Type::MEM_BTREE >, // In memory fixed key, var value sizeds btree + VarObjSizeBtree< IndexStore::Type::MEM_BTREE >, // In memory var sized key/value btree + PrefixIntervalBtree< IndexStore::Type::MEM_BTREE > // In memory interval key/value btree + >; TYPED_TEST_SUITE(BtreeTest, BtreeTypes); @@ -259,6 +330,32 @@ TYPED_TEST(BtreeTest, SequentialRemove) { LOGINFO("SequentialRemove test end"); } +TYPED_TEST(BtreeTest, SimpleRemoveRange) { + // Forward sequential insert + const auto num_entries = 20; + LOGINFO("Step 1: Do forward sequential insert for {} entries", num_entries); + for (uint32_t i{0}; i < num_entries; ++i) { + this->put(i, btree_put_type::INSERT); + } + LOGINFO("Step 2: Do range remove for {} entries", num_entries); + // this->print_keys(); // EXPECT size = 20 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 + this->range_remove_any(5, 10); + // this->print_keys(); // EXPECT size = 14 : 0 1 2 3 4 [5 6 7 8 9 10] 11 12 13 14 15 16 17 18 19 + this->range_remove_any(0, 2); + // this->print_keys(); // EXPECT size = 11 : [0 1 2] 3 4 11 12 13 14 15 16 17 18 19 + this->range_remove_any(18, 19); + // this->print_keys(); // EXPECT size = 9 : 3 4 11 12 13 14 15 16 17 [18 19] + this->range_remove_any(17, 17); + // this->print_keys(); // EXPECT size = 8 : 3 4 11 12 13 14 15 16 [17] + this->range_remove_any(1, 5); + // this->print_keys(); // EXPECT size = 6 : [3 4] 11 12 13 14 15 16 + this->range_remove_any(1, 20); + // this->print_keys(); // EXPECT size = 0 : [11 12 13 14 15 16] + + this->query_all(); + // 
this->query_validate(0, num_entries , 75); +} + TYPED_TEST(BtreeTest, RandomRemove) { // Forward sequential insert const auto num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); @@ -280,6 +377,31 @@ TYPED_TEST(BtreeTest, RandomRemove) { this->get_all(); } +TYPED_TEST(BtreeTest, RandomRemoveRange) { + // Forward sequential insert + const auto num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); + const auto num_iters = SISL_OPTIONS["num_iters"].as< uint32_t >(); + + LOGINFO("Step 1: Do forward sequential insert for {} entries", num_entries); + for (uint32_t i{0}; i < num_entries; ++i) { + this->put(i, btree_put_type::INSERT); + } + // generate keys including out of bound + static thread_local std::uniform_int_distribution< uint32_t > s_rand_key_generator{0, num_entries}; + // this->print_keys(); + LOGINFO("Step 2: Do range remove for maximum of {} iterations", num_iters); + for (uint32_t i{0}; (i < num_iters) && this->m_shadow_map.size(); ++i) { + uint32_t key1 = s_rand_key_generator(g_re); + uint32_t key2 = s_rand_key_generator(g_re); + + // LOGINFO("Step 2 - {}: Do Range Remove of maximum [{},{}] keys ", i, start_key, end_key); + this->range_remove_any(std::min(key1, key2), std::max(key1, key2)); + // this->print_keys(); + } + + this->query_all(); +} + TYPED_TEST(BtreeTest, RangeUpdate) { LOGINFO("RangeUpdate test start"); // Forward sequential insert @@ -300,8 +422,10 @@ TYPED_TEST(BtreeTest, RangeUpdate) { } TYPED_TEST(BtreeTest, CpFlush) { - LOGINFO("CpFlush test start"); + using TestT = typename TestFixture::T; + if (TestT::store_type == IndexStore::Type::MEM_BTREE) { GTEST_SKIP(); } + LOGINFO("CpFlush test start"); const auto num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); LOGINFO("Do Forward sequential insert for {} entries", num_entries / 2); for (uint32_t i = 0; i < num_entries; ++i) { @@ -342,6 +466,9 @@ TYPED_TEST(BtreeTest, CpFlush) { } TYPED_TEST(BtreeTest, MultipleCpFlush) { + using TestT = typename TestFixture::T; + if (TestT::store_type == IndexStore::Type::MEM_BTREE) { GTEST_SKIP(); } + LOGINFO("MultipleCpFlush test start"); const auto num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); @@ -389,6 +516,9 @@ TYPED_TEST(BtreeTest, MultipleCpFlush) { } TYPED_TEST(BtreeTest, ThreadedCpFlush) { + using TestT = typename TestFixture::T; + if (TestT::store_type == IndexStore::Type::MEM_BTREE) { GTEST_SKIP(); } + LOGINFO("ThreadedCpFlush test start"); const auto num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); @@ -460,13 +590,11 @@ struct BtreeConcurrentTest : public BtreeTestHelper< TestType >, public ::testin public: TestIndexServiceCallbacks(BtreeConcurrentTest* test) : m_test(test) {} - std::shared_ptr< IndexTableBase > on_index_table_found(superblk< index_table_sb >&& sb) override { + std::shared_ptr< Index > on_index_table_found(superblk< IndexSuperBlock >&& sb) override { LOGINFO("Index table recovered"); - LOGINFO("Root bnode_id {} version {}", sb->root_node, sb->root_link_version); - m_test->m_cfg = BtreeConfig(hs()->index_service().node_size()); m_test->m_cfg.m_leaf_node_type = T::leaf_node_type; m_test->m_cfg.m_int_node_type = T::interior_node_type; - m_test->m_bt = std::make_shared< typename T::BtreeType >(std::move(sb), m_test->m_cfg); + m_test->m_bt = std::make_shared< Btree< K, V > >(m_test->m_cfg, std::move(sb)); return m_test->m_bt; } @@ -488,9 +616,6 @@ struct BtreeConcurrentTest : public BtreeTestHelper< TestType >, public ::testin {HS_SERVICE::INDEX, {.size_pct = 70.0, .index_svc_cbs = new 
TestIndexServiceCallbacks(this)}}}, nullptr, {}, SISL_OPTIONS["init_device"].as< bool >()); - LOGINFO("Node size {} ", hs()->index_service().node_size()); - this->m_cfg = BtreeConfig(hs()->index_service().node_size()); - auto uuid = boost::uuids::random_generator()(); auto parent_uuid = boost::uuids::random_generator()(); @@ -504,7 +629,7 @@ struct BtreeConcurrentTest : public BtreeTestHelper< TestType >, public ::testin // Create index table and attach to index service. BtreeTestHelper< TestType >::SetUp(); if (this->m_bt == nullptr || SISL_OPTIONS["init_device"].as< bool >()) { - this->m_bt = std::make_shared< typename T::BtreeType >(uuid, parent_uuid, 0, this->m_cfg); + this->m_bt = std::make_shared< Btree< K, V > >(this->m_cfg, uuid, parent_uuid, 0); } else { populate_shadow_map(); } @@ -532,9 +657,8 @@ struct BtreeConcurrentTest : public BtreeTestHelper< TestType >, public ::testin LOGINFO("Error: failed to remove {}", m_shadow_filename); } } - auto [interior, leaf] = this->m_bt->compute_node_count(); - LOGINFO("Teardown with Root bnode_id {} tree size: {} btree node count (interior = {} leaf= {})", - this->m_bt->root_node_id(), this->m_bt->count_keys(this->m_bt->root_node_id()), interior, leaf); + LOGINFO("Teardown with Root bnode_id {} tree size: {}", this->m_bt->root_node_id(), + this->m_bt->count_keys(this->m_bt->root_node_id())); BtreeTestHelper< TestType >::TearDown(); m_helper.shutdown_homestore(false); this->m_bt.reset(); diff --git a/src/tests/test_btree_node.cpp b/src/tests/test_btree_node.cpp index 0db9416e4..8698f5100 100644 --- a/src/tests/test_btree_node.cpp +++ b/src/tests/test_btree_node.cpp @@ -15,23 +15,20 @@ *********************************************************************************/ #include -#define StoreSpecificBtreeNode homestore::BtreeNode - #include #include #include -#include -#include -#include +#include +#include +#include #include "btree_helpers/btree_test_kvs.hpp" static constexpr uint32_t g_node_size{4096}; static constexpr uint32_t g_max_keys{6000}; static std::uniform_int_distribution< uint32_t > g_randkey_generator{0, g_max_keys - 1}; +static BtreeNode::Allocator::Token g_token{0}; using namespace homestore; -SISL_LOGGING_DEF(btree) -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) struct FixedLenNodeTest { using NodeType = SimpleNode< TestFixedKey, TestFixedValue >; @@ -69,35 +66,35 @@ struct NodeTest : public testing::Test { using K = typename TestType::KeyType; using V = typename TestType::ValueType; - std::unique_ptr< uint8_t[] > m_node1_buf; - std::unique_ptr< uint8_t[] > m_node2_buf; std::unique_ptr< typename T::NodeType > m_node1; std::unique_ptr< typename T::NodeType > m_node2; std::map< K, V > m_shadow_map; - BtreeConfig m_cfg{g_node_size}; + BtreeConfig m_cfg; void SetUp() override { - m_node1_buf = std::unique_ptr< uint8_t[] >(new uint8_t[g_node_size]); - m_node2_buf = std::unique_ptr< uint8_t[] >(new uint8_t[g_node_size]); - - m_node1 = std::make_unique< typename T::NodeType >(m_node1_buf.get(), 1ul, true, true, m_cfg); - m_node2 = std::make_unique< typename T::NodeType >(m_node2_buf.get(), 2ul, true, true, m_cfg); + g_token = BtreeNode::Allocator::add(BtreeNode::Allocator{ + [](uint32_t size) { return new uint8_t[size]; }, // alloc_btree_node + [](BtreeNode*) {}, // free_btree_node + [](uint32_t size) { return new uint8_t[size]; }, // alloc_node_buf + [](uint8_t* buf) { delete[] buf; } // free_node_buf + }); + + m_cfg.m_node_size = g_node_size; + m_node1 = std::make_unique< typename T::NodeType >(1ul, true, g_node_size, g_token); + m_node2 = 
std::make_unique< typename T::NodeType >(2ul, true, g_node_size, g_token); } void put(uint32_t k, btree_put_type put_type) { K key{k}; V value{V::generate_rand()}; V existing_v; - btree_status_t status = m_node1->put(key, value, put_type, &existing_v); + bool done = m_node1->put(key, value, put_type, &existing_v); - auto expected_status = btree_status_t::success; - if (m_shadow_map.contains(key)) { - expected_status = - put_type != btree_put_type::INSERT ? btree_status_t::success : btree_status_t::already_exists; - } - ASSERT_EQ(status, expected_status) - << "Expected put of key " << k << " of put_type " << enum_name(put_type) << " to be " << expected_status; - if (expected_status == btree_status_t::success) { + bool expected_done{true}; + if (m_shadow_map.find(key) != m_shadow_map.end()) { expected_done = (put_type != btree_put_type::INSERT); } + ASSERT_EQ(done, expected_done) << "Expected put of key " << k << " of put_type " << enum_name(put_type) + << " to be " << expected_done; + if (expected_done) { m_shadow_map.insert(std::make_pair(key, value)); } else { const auto r = m_shadow_map.find(key); @@ -130,7 +127,7 @@ struct NodeTest : public testing::Test { for (uint32_t i{0}; i < count; ++i) { K key{k + i}; V range_value{value}; - if constexpr (std::is_same_v< V, TestIntervalValue >) { range_value.shift(i, nullptr); } + if constexpr (std::is_same_v< V, TestIntervalValue >) { range_value.shift(i); } if (m_shadow_map.find(key) != m_shadow_map.end()) { if (put_type != btree_put_type::INSERT) { m_shadow_map.insert_or_assign(key, range_value); } @@ -373,14 +370,10 @@ TYPED_TEST(NodeTest, SimpleInsert) { for (uint32_t i = 10; i <= 20; ++i) { this->remove(i); } - this->m_node1->move_out_to_right_by_entries(this->m_cfg, *this->m_node2, 20); - this->m_node1->copy_by_entries(this->m_cfg, *this->m_node2, 0, std::numeric_limits< uint32_t >::max()); -} - -TYPED_TEST(NodeTest, RangeChangeInsert) { - if (this->m_node1->get_node_type() != btree_node_type::PREFIX) { return; } - this->put_range(0xFFFFFFFF - 10, 20); - this->print(); + this->m_node1->move_out_to_right_by_entries(*this->m_node2, 20); + uint32_t copy_idx{0u}; + this->m_node1->append_copy_in_upto_size(*this->m_node2, copy_idx, std::numeric_limits< uint32_t >::max(), + /*copy_only_if_fits=*/false); } TYPED_TEST(NodeTest, ReverseInsert) { @@ -472,31 +465,56 @@ TYPED_TEST(NodeTest, RandomInsertRemoveUpdate) { } TYPED_TEST(NodeTest, Move) { - std::vector< uint32_t > list{0, 1, 2, g_max_keys / 2 - 1}; + std::vector< uint32_t > list{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, g_max_keys / 2 - 1}; this->put_list(list); this->print(); - this->m_node1->move_out_to_right_by_entries(this->m_cfg, *this->m_node2, list.size()); - this->m_node1->move_out_to_right_by_entries(this->m_cfg, *this->m_node2, list.size()); // Empty move + // Full node move to right and validate its correctness. 
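+ // (API note: move_out_to_right_by_entries() transfers whole entries from this node's right end, append_copy_in_upto_size() copies from the source node advancing the caller-owned cursor until the byte budget or copy_only_if_fits stops it, and move_out_to_right_by_size() splits by occupied bytes instead of entry count; the assertions below exercise all three.)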
+ this->m_node1->move_out_to_right_by_entries(*this->m_node2, list.size()); // Full move + this->m_node1->move_out_to_right_by_entries(*this->m_node2, list.size()); // Empty move ASSERT_EQ(this->m_node1->total_entries(), 0u) << "Move out to right has failed"; ASSERT_EQ(this->m_node2->total_entries(), list.size()) << "Move out to right has failed"; this->validate_get_all(); + auto filled_size = this->m_node2->occupied_size(); + + // Full copy in and validate its correctness + uint32_t cursor{0}; + auto has_copied = this->m_node1->append_copy_in_upto_size(*this->m_node2, cursor, filled_size, + /*copy_only_if_fits=*/true); // Full copy in + ASSERT_EQ(has_copied, true) << "Append copy in has failed"; + ASSERT_EQ(cursor, this->m_node2->total_entries()) << "Append copy cursor not updated"; + ASSERT_EQ(this->m_node1->total_entries(), list.size()) << "Move out to right has failed"; + ASSERT_EQ(this->m_node2->total_entries(), list.size()) << "Move out to right has failed"; - auto first_half = list.size() / 2; - auto second_half = list.size() - first_half; - this->m_node1->copy_by_entries(this->m_cfg, *this->m_node2, 0, first_half); // Copy half entries - this->m_node1->copy_by_entries(this->m_cfg, *this->m_node2, first_half, second_half); // Copy half entries - this->m_node2->remove_all(this->m_cfg); - ASSERT_EQ(this->m_node2->total_entries(), 0u) << "Remove all on right has failed"; - ASSERT_EQ(this->m_node1->total_entries(), list.size()) << "Move in from right has failed"; + // Make the node2 clean slate + this->m_node2->remove_all(); + ASSERT_EQ(this->m_node2->total_entries(), 0) << "Remove all failed"; this->validate_get_all(); - this->m_node1->move_out_to_right_by_entries(this->m_cfg, *this->m_node2, list.size() / 2); - ASSERT_EQ(this->m_node1->total_entries(), list.size() / 2) << "Move out half entries to right has failed"; - ASSERT_EQ(this->m_node2->total_entries(), list.size() - list.size() / 2) - << "Move out half entries to right has failed"; + + // Move roughly half of the size to the right node (node2) + filled_size = this->m_node1->occupied_size(); + auto const nmoved = this->m_node1->move_out_to_right_by_size(*this->m_node2, filled_size / 2); + ASSERT_NE(nmoved, 0u) << "Move out didn't move any keys"; + ASSERT_EQ(this->m_node1->total_entries(), list.size() - nmoved) + << "Move out roughly half size to right has failed on left node"; + ASSERT_EQ(this->m_node2->total_entries(), nmoved) << "Move out half entries to right has failed on right node"; this->validate_get_all(); this->print(); this->validate_key_order(); + + // Full copy back in to node1 and now it should be back to original full node + cursor = 0; + has_copied = this->m_node1->append_copy_in_upto_size(*this->m_node2, cursor, this->m_node1->node_data_size(), + /*copy_only_if_fits=*/true); // Full copy in + ASSERT_EQ(has_copied, true) << "Append copy in has failed"; + ASSERT_EQ(cursor, this->m_node2->total_entries()) << "Append copy cursor not updated"; + ASSERT_EQ(this->m_node1->total_entries(), list.size()) << "Move out to right has failed"; + this->print(); + + // Overwrite the node to right and check its equality. 
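+ // (overwrite() replaces node2's contents with a copy of node1's, so both nodes report the full entry count in the assertions below.)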
+ this->m_node2->overwrite(*this->m_node1); + ASSERT_EQ(this->m_node1->total_entries(), list.size()) << "Move out to right has failed"; + ASSERT_EQ(this->m_node2->total_entries(), list.size()) << "Move out to right has failed"; } SISL_OPTIONS_ENABLE(logging, test_btree_node) diff --git a/src/tests/test_common/homestore_test_common.hpp b/src/tests/test_common/homestore_test_common.hpp index 4e18b70d1..404ba8247 100644 --- a/src/tests/test_common/homestore_test_common.hpp +++ b/src/tests/test_common/homestore_test_common.hpp @@ -58,11 +58,7 @@ SISL_OPTION_GROUP( ::cxxopts::value< int >()->default_value("-1"), "number"), (num_io, "", "num_io", "number of IO operations", ::cxxopts::value< uint64_t >()->default_value("300"), "number"), (qdepth, "", "qdepth", "Max outstanding operations", ::cxxopts::value< uint32_t >()->default_value("8"), "number"), - (spdk, "", "spdk", "spdk", ::cxxopts::value< bool >()->default_value("false"), "true or false"), - (flip_list, "", "flip_list", "btree flip list", ::cxxopts::value< std::vector< std::string > >(), "flips [...]"), - (use_file, "", "use_file", "use file instead of real drive", ::cxxopts::value< bool >()->default_value("false"), - "true or false"), - (enable_crash, "", "enable_crash", "enable crash", ::cxxopts::value< bool >()->default_value("0"), "")); + (spdk, "", "spdk", "spdk", ::cxxopts::value< bool >()->default_value("false"), "true or false")); SETTINGS_INIT(iomgrcfg::IomgrSettings, iomgr_config); @@ -161,7 +157,6 @@ class HSTestHelper { blk_allocator_type_t blkalloc_type{blk_allocator_type_t::varsize}; uint32_t blk_size{0}; shared< ChunkSelector > custom_chunk_selector{nullptr}; - shared< ChunkSelector > index_chunk_selector{nullptr}; IndexServiceCallbacks* index_svc_cbs{nullptr}; shared< ReplApplication > repl_app{nullptr}; chunk_num_t num_chunks{1}; @@ -172,28 +167,28 @@ class HSTestHelper { struct test_token { std::string name_; - std::map< uint32_t, test_params > svc_params_; + std::map< ServiceType, test_params > svc_params_; hs_before_services_starting_cb_t cb_{nullptr}; std::vector< homestore::dev_info > devs_; - test_params& params(uint32_t svc) { return svc_params_[svc]; } + test_params& params(ServiceType svc) { return svc_params_[svc]; } hs_before_services_starting_cb_t& cb() { return cb_; } }; - virtual void start_homestore(const std::string& test_name, std::map< uint32_t, test_params >&& svc_params, + virtual void start_homestore(const std::string& test_name, std::map< ServiceType, test_params >&& svc_params, hs_before_services_starting_cb_t cb = nullptr, - std::vector< homestore::dev_info > devs = {}, bool init_device = true) { + std::vector< homestore::dev_info > devs = {}, bool create_device = true) { m_token = test_token{.name_ = test_name, .svc_params_ = std::move(svc_params), .cb_ = cb, .devs_ = std::move(devs)}; - do_start_homestore(false /* fake_restart */, init_device); + do_start_homestore(false /* fake_restart */, create_device, 5 /* shutdown_delay_sec */); } virtual void restart_homestore(uint32_t shutdown_delay_sec = 5) { - do_start_homestore(true /* fake_restart*/, false /* init_device */, shutdown_delay_sec); + do_start_homestore(true /* fake_restart*/, false /* create_device */, shutdown_delay_sec); } virtual void start_homestore() { - do_start_homestore(true /* fake_restart*/, false /* init_device */, 1 /* shutdown_delay_sec */); + do_start_homestore(true /* fake_restart*/, false /* create_device */, 1 /* shutdown_delay_sec */); } virtual void shutdown_homestore(bool cleanup = true) { @@ -214,11 +209,13 @@ class 
HSTestHelper { void change_start_cb(hs_before_services_starting_cb_t cb) { m_token.cb() = cb; } void change_device_list(std::vector< homestore::dev_info > devs) { m_token.devs_ = std::move(devs); } - test_params& params(uint32_t svc) { return m_token.svc_params_[svc]; } + test_params& params(ServiceType svc) { return m_token.svc_params_[svc]; } #ifdef _PRERELEASE void wait_for_crash_recovery(bool check_will_crash = false) { - if (check_will_crash && !homestore::HomeStore::instance()->crash_simulator().will_crash()) { return; } + if(check_will_crash && !homestore::HomeStore::instance()->crash_simulator().will_crash()) { + return; + } LOGDEBUG("Waiting for m_crash_recovered future"); m_crash_recovered.getFuture().get(); m_crash_recovered = folly::Promise< folly::Unit >(); @@ -363,57 +360,79 @@ class HSTestHelper { } private: - void do_start_homestore(bool fake_restart = false, bool init_device = true, uint32_t shutdown_delay_sec = 5) { - auto const ndevices = SISL_OPTIONS["num_devs"].as< uint32_t >(); - auto const dev_size = SISL_OPTIONS["dev_size_mb"].as< uint64_t >() * 1024 * 1024; - auto num_threads = SISL_OPTIONS["num_threads"].as< uint32_t >(); - auto num_fibers = SISL_OPTIONS["num_fibers"].as< uint32_t >(); - auto is_spdk = SISL_OPTIONS["spdk"].as< bool >(); - - auto use_file = SISL_OPTIONS["use_file"].as< bool >(); - - if (use_file && SISL_OPTIONS.count("device_list")) { - LOGWARN("Ignoring device_list as use_file is set to true"); - } - + void do_start_homestore(bool fake_restart = false, bool create_device = true, uint32_t shutdown_delay_sec = 5) { if (fake_restart) { // Fake restart, device list is unchanged. shutdown_homestore(false); std::this_thread::sleep_for(std::chrono::seconds{shutdown_delay_sec}); - } else if (SISL_OPTIONS.count("device_list") && !use_file) { - // User has provided explicit device list, use that and initialize them - auto const devs = SISL_OPTIONS["device_list"].as< std::vector< std::string > >(); - for (const auto& name : devs) { - // iomgr::DriveInterface::emulate_drive_type(name, iomgr::drive_type::block_hdd); - m_token.devs_.emplace_back(name, - m_token.devs_.empty() - ? homestore::HSDevType::Fast - : homestore::HSDevType::Data); // First device is fast device - } - - LOGINFO("Taking input dev_list: {}", - std::accumulate(m_token.devs_.begin(), m_token.devs_.end(), std::string(""), - [](const std::string& s, const homestore::dev_info& dinfo) { - return s.empty() ? dinfo.dev_name : s + "," + dinfo.dev_name; - })); - - if (init_device) { init_raw_devices(m_token.devs_); } } else { - for (uint32_t i{0}; i < ndevices; ++i) { - m_generated_devs.emplace_back(std::string{"/tmp/" + m_token.name_ + "_" + std::to_string(i + 1)}); + // Here is the order of how devices/sizes are considered to format homestore + // 1. Look if the test itself has some requirements for the devices to create and its size. If so use them. + // 2. If not provided by test, look for any input devices given as command line by the user + // 3. If both are empty, then use the default values for size and generate devices. + if (!m_token.devs_.empty()) { + for (uint32_t i{0}; i < m_token.devs_.size(); ++i) { + auto& dinfo = m_token.devs_[i]; + uint64_t gen_dev_size = dinfo.dev_size + ? 
dinfo.dev_size + : SISL_OPTIONS["dev_size_mb"].as< uint64_t >() * 1024ul * 1024ul; + + // User could have given an empty device name, which means we have to generate on a requested + // size + if (dinfo.dev_name.empty()) { + std::string fname = std::string{"/tmp/" + m_token.name_ + "_" + std::to_string(i + 1)}; + m_generated_devs.emplace_back(fname); + init_file(fname, gen_dev_size); + dinfo.dev_name = std::filesystem::canonical(fname).string(); + dinfo.dev_size = gen_dev_size; + } + } + } else if (SISL_OPTIONS.count("device_list")) { + // Test didn't provide any devices. + // Command line has device list, use that + auto const devs = SISL_OPTIONS["device_list"].as< std::vector< std::string > >(); + for (uint32_t i{0}; i < devs.size(); ++i) { + // iomgr::DriveInterface::emulate_drive_type(name, iomgr::drive_type::block_hdd); + // First device is fast device + m_token.devs_.emplace_back(devs[i], + (i == 0) ? homestore::HSDevType::Fast : homestore::HSDevType::Data); + if (create_device) { init_raw_device(m_token.devs_[i]); } + } + } else { + // Neither test nor command line provide devices, generate one + for (uint32_t i{0}; i < SISL_OPTIONS["num_devs"].as< uint32_t >(); ++i) { + uint64_t gen_dev_size = SISL_OPTIONS["dev_size_mb"].as< uint64_t >() * 1024ul * 1024ul; + auto fname = std::string{"/tmp/" + m_token.name_ + "_" + std::to_string(i + 1)}; + m_generated_devs.emplace_back(fname); + init_file(fname, gen_dev_size); + // First device is fast device + m_token.devs_.emplace_back(std::filesystem::canonical(fname).string(), + (i == 0) ? homestore::HSDevType::Fast : homestore::HSDevType::Data, + gen_dev_size); + } } - if (init_device) { - LOGINFO("creating {} device files with each of size {} ", ndevices, homestore::in_bytes(dev_size)); - init_files(m_generated_devs, dev_size); - } - for (auto const& fname : m_generated_devs) { - m_token.devs_.emplace_back(std::filesystem::canonical(fname).string(), - m_token.devs_.empty() - ? homestore::HSDevType::Fast - : homestore::HSDevType::Data); // First device is fast device + + // At this point all m_token.devs_ has required device name and its size. + if (m_generated_devs.empty()) { + // We are using raw dev list + LOGINFO("Using raw dev_list for testing: {}", + std::accumulate(m_token.devs_.begin(), m_token.devs_.end(), std::string(""), + [](const std::string& s, const homestore::dev_info& dinfo) { + return s.empty() ? dinfo.dev_name : s + "," + dinfo.dev_name; + })); + } else { + // We are using generated device list. + LOGINFO("Generated dev list: {}", + std::accumulate(m_generated_devs.begin(), m_generated_devs.end(), std::string(""), + [](const std::string& s, const std::string& fname) { + return s.empty() ? 
fname : s + "," + fname;
+                                    }));
+            }
+        }
+        auto num_threads = SISL_OPTIONS["num_threads"].as< uint32_t >();
+        auto num_fibers = SISL_OPTIONS["num_fibers"].as< uint32_t >();
+        auto is_spdk = SISL_OPTIONS["spdk"].as< bool >();
         if (is_spdk) {
             LOGINFO("Spdk with more than 2 threads will cause overburden test systems, changing nthreads to 2");
             num_threads = 2;
@@ -429,26 +448,38 @@ class HSTestHelper {
             ioenvironment.with_http_server();
         }
 
-        const uint64_t app_mem_size = ((ndevices * dev_size) * 15) / 100;
+        uint64_t total_dev_size{0};
+        for (auto const& dinfo : m_token.devs_) {
+            if (std::filesystem::is_regular_file(dinfo.dev_name)) {
+                total_dev_size += dinfo.dev_size;
+            } else if (std::filesystem::is_block_file(dinfo.dev_name)) {
+                total_dev_size += std::filesystem::space(dinfo.dev_name).capacity;
+            }
+        }
+        // std::clamp returns the clamped value (it does not modify its argument), so assign the result.
+        // Keep app_mem_size between 16 MB and 64 GB.
+        const uint64_t app_mem_size = std::clamp((total_dev_size * 15) / 100, 16ul * 1024ul * 1024ul,
+                                                 64ul * 1024ul * 1024ul * 1024ul);
         LOGINFO("Initialize and start HomeStore with app_mem_size = {}", homestore::in_bytes(app_mem_size));
 
         using namespace homestore;
         auto hsi = HomeStore::instance();
         for (auto& [svc, tp] : m_token.svc_params_) {
-            if (svc == HS_SERVICE::DATA) {
+            if (svc == ServiceType::DATA) {
                 hsi->with_data_service(tp.custom_chunk_selector);
-            } else if (svc == HS_SERVICE::INDEX) {
-                hsi->with_index_service(std::unique_ptr< IndexServiceCallbacks >(tp.index_svc_cbs),
-                                        tp.index_chunk_selector);
-            } else if ((svc == HS_SERVICE::LOG)) {
+            } else if (svc == ServiceType::INDEX) {
+                hsi->with_index_service(
+                    std::unique_ptr< IndexServiceCallbacks >(tp.index_svc_cbs),
+                    {ServiceSubType::INDEX_BTREE_COPY_ON_WRITE, ServiceSubType::INDEX_BTREE_MEMORY});
+            } else if (svc == ServiceType::LOG) {
                 hsi->with_log_service();
-            } else if (svc == HS_SERVICE::REPLICATION) {
+            } else if (svc == ServiceType::REPLICATION) {
+#ifdef REPLICATION_SUPPORT
                 hsi->with_repl_data_service(tp.repl_app, tp.custom_chunk_selector);
+#endif
             }
         }
 #ifdef _PRERELEASE
         hsi->with_crash_simulator([this](void) mutable {
-            LOGWARN("CrashSimulator::crash() is called - restarting homestore");
+            LOGINFO("CrashSimulator::crash() is called - restarting homestore");
             this->restart_homestore();
             m_crash_recovered.setValue();
         });
@@ -458,37 +489,34 @@ class HSTestHelper {
         hsi->start(hs_input_params{.devices = m_token.devs_, .app_mem_size = app_mem_size}, m_token.cb_);
 
         // We need to set the min chunk size before homestore format
-        if (m_token.svc_params_.contains(HS_SERVICE::LOG) && m_token.svc_params_[HS_SERVICE::LOG].min_chunk_size != 0) {
-            set_min_chunk_size(m_token.svc_params_[HS_SERVICE::LOG].min_chunk_size);
+        if (m_token.svc_params_.contains(ServiceType::LOG) &&
+            m_token.svc_params_[ServiceType::LOG].min_chunk_size != 0) {
+            set_min_chunk_size(m_token.svc_params_[ServiceType::LOG].min_chunk_size);
         }
 
         if (need_format) {
             auto svc_params = m_token.svc_params_;
             hsi->format_and_start(
-                {{HS_SERVICE::META,
-                  {.dev_type = homestore::HSDevType::Fast, .size_pct = svc_params[HS_SERVICE::META].size_pct}},
-                 {HS_SERVICE::LOG,
-                  {.dev_type = homestore::HSDevType::Fast,
-                   .size_pct = svc_params[HS_SERVICE::LOG].size_pct,
-                   .chunk_size = svc_params[HS_SERVICE::LOG].chunk_size,
-                   .vdev_size_type = svc_params[HS_SERVICE::LOG].vdev_size_type}},
-                 {HS_SERVICE::DATA,
-                  {.size_pct = svc_params[HS_SERVICE::DATA].size_pct,
-                   .num_chunks = svc_params[HS_SERVICE::DATA].num_chunks,
-                   .alloc_type = svc_params[HS_SERVICE::DATA].blkalloc_type,
-                   .chunk_sel_type = svc_params[HS_SERVICE::DATA].custom_chunk_selector
-                       ?
chunk_selector_type_t::CUSTOM - : chunk_selector_type_t::ROUND_ROBIN}}, - {HS_SERVICE::INDEX, + {{{ServiceType::META}, + {.dev_type = homestore::HSDevType::Fast, .size_pct = svc_params[ServiceType::META].size_pct}}, + {{ServiceType::LOG}, {.dev_type = homestore::HSDevType::Fast, - .size_pct = svc_params[HS_SERVICE::INDEX].size_pct, - .chunk_sel_type = svc_params[HS_SERVICE::INDEX].custom_chunk_selector + .size_pct = svc_params[ServiceType::LOG].size_pct, + .chunk_size = svc_params[ServiceType::LOG].chunk_size, + .vdev_size_type = svc_params[ServiceType::LOG].vdev_size_type}}, + {{ServiceType::DATA}, + {.size_pct = svc_params[ServiceType::DATA].size_pct, + .num_chunks = svc_params[ServiceType::DATA].num_chunks, + .alloc_type = svc_params[ServiceType::DATA].blkalloc_type, + .chunk_sel_type = svc_params[ServiceType::DATA].custom_chunk_selector ? chunk_selector_type_t::CUSTOM : chunk_selector_type_t::ROUND_ROBIN}}, - {HS_SERVICE::REPLICATION, - {.size_pct = svc_params[HS_SERVICE::REPLICATION].size_pct, - .alloc_type = svc_params[HS_SERVICE::REPLICATION].blkalloc_type, - .chunk_sel_type = svc_params[HS_SERVICE::REPLICATION].custom_chunk_selector + {{ServiceType::INDEX, ServiceSubType::INDEX_BTREE_COPY_ON_WRITE}, + {.dev_type = homestore::HSDevType::Fast, .size_pct = svc_params[ServiceType::INDEX].size_pct}}, + {{ServiceType::REPLICATION}, + {.size_pct = svc_params[ServiceType::REPLICATION].size_pct, + .alloc_type = svc_params[ServiceType::REPLICATION].blkalloc_type, + .chunk_sel_type = svc_params[ServiceType::REPLICATION].custom_chunk_selector ? chunk_selector_type_t::CUSTOM : chunk_selector_type_t::ROUND_ROBIN}}}); } @@ -500,30 +528,41 @@ class HSTestHelper { } } + void init_file(std::string const& fpath, uint64_t dev_size) { + if (std::filesystem::exists(fpath)) { std::filesystem::remove(fpath); } + + LOGINFO("Creating {} and initializing device file with size of {} ", fpath, homestore::in_bytes(dev_size)); + std::ofstream ofs{fpath, std::ios::binary | std::ios::out | std::ios::trunc}; + std::filesystem::resize_file(fpath, dev_size); + } + void init_files(const std::vector< std::string >& file_paths, uint64_t dev_size) { - remove_files(file_paths); for (const auto& fpath : file_paths) { - std::ofstream ofs{fpath, std::ios::binary | std::ios::out | std::ios::trunc}; - std::filesystem::resize_file(fpath, dev_size); + init_file(fpath, dev_size); } } - void init_raw_devices(const std::vector< homestore::dev_info >& devs) { - auto const zero_size = hs_super_blk::first_block_size() * 1024; - std::vector< int > zeros(zero_size, 0); - for (auto const& dinfo : devs) { - if (!std::filesystem::exists(dinfo.dev_name)) { - HS_REL_ASSERT(false, "Device {} does not exist", dinfo.dev_name); - } + void init_raw_device(homestore::dev_info const& dinfo) { + static auto zero_size = hs_super_blk::first_block_size() * 1024; + static std::vector< int > zeros(zero_size, 0); + + if (!std::filesystem::exists(dinfo.dev_name)) { + HS_REL_ASSERT(false, "Device {} does not exist", dinfo.dev_name); + } - auto fd = ::open(dinfo.dev_name.c_str(), O_RDWR, 0640); - HS_REL_ASSERT(fd != -1, "Failed to open device"); + auto fd = ::open(dinfo.dev_name.c_str(), O_RDWR, 0640); + HS_REL_ASSERT(fd != -1, "Failed to open device"); - auto const write_sz = - pwrite(fd, zeros.data(), zero_size /* size */, hs_super_blk::first_block_offset() /* offset */); - HS_REL_ASSERT(write_sz == zero_size, "Failed to write to device"); - LOGINFO("Successfully zeroed the 1st {} bytes of device {}", zero_size, dinfo.dev_name); - ::close(fd); + auto const 
write_sz = + pwrite(fd, zeros.data(), zero_size /* size */, hs_super_blk::first_block_offset() /* offset */); + HS_REL_ASSERT(write_sz == zero_size, "Failed to write to device"); + LOGINFO("Successfully zeroed the 1st {} bytes of device {}", zero_size, dinfo.dev_name); + ::close(fd); + } + + void init_raw_devices(const std::vector< homestore::dev_info >& devs) { + for (auto const& dinfo : devs) { + init_raw_device(dinfo); } } diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp index 136aceb7f..80eeb1573 100644 --- a/src/tests/test_common/raft_repl_test_base.hpp +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -183,8 +183,9 @@ class TestReplicatedDB : public homestore::ReplDevListener { void on_config_rollback(int64_t lsn) override { LOGINFOMOD(replication, "[Replica={}] Received config rollback at lsn={}", g_helper->replica_num(), lsn); } - void on_no_space_left(repl_lsn_t lsn, sisl::blob const& header) override { - LOGINFOMOD(replication, "[Replica={}] Received no_space_left at lsn={}", g_helper->replica_num(), lsn); + void on_no_space_left(repl_lsn_t lsn, chunk_num_t chunk_id) override { + LOGINFOMOD(replication, "[Replica={}] Received no_space_left at lsn={}, chunk_id={}", g_helper->replica_num(), + lsn, chunk_id); } AsyncReplResult<> create_snapshot(shared< snapshot_context > context) override { @@ -335,22 +336,20 @@ class TestReplicatedDB : public homestore::ReplDevListener { auto jheader = r_cast< test_req::journal_header const* >(header.cbytes()); Key k{.id_ = jheader->key_id}; auto iter = inmem_db_.find(k); - auto hints = blk_alloc_hints{}; if (iter != inmem_db_.end()) { LOGDEBUG("data already exists in mem db, key={}", k.id_); + auto hints = blk_alloc_hints{}; hints.committed_blk_id = iter->second.blkid_; + return hints; } - return hints; + return blk_alloc_hints{}; } - - void on_start_replace_member(const std::string& task_id, const replica_member_info& member_out, - const replica_member_info& member_in, trace_id_t tid) override { + void on_start_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) override { LOGINFO("[Replica={}] start replace member out {} in {}", g_helper->replica_num(), boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); } - void on_complete_replace_member(const std::string& task_id, const replica_member_info& member_out, - const replica_member_info& member_in, trace_id_t tid) override { + void on_complete_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) override { LOGINFO("[Replica={}] complete replace member out {} in {}", g_helper->replica_num(), boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); } @@ -444,13 +443,6 @@ class TestReplicatedDB : public homestore::ReplDevListener { LOGINFO("Manually truncated"); } - repl_lsn_t get_truncation_upper_limit() { - auto raft_repl_dev = std::dynamic_pointer_cast< RaftReplDev >(repl_dev()); - auto limit = raft_repl_dev->get_truncation_upper_limit(); - LOGINFO("Truncation upper limit is {}", limit); - return limit; - } - void set_zombie() { zombie_ = true; } bool is_zombie() { // Wether a group is zombie(non recoverable) @@ -749,19 +741,16 @@ class RaftReplDevTestBase : public testing::Test { void create_snapshot() { dbs_[0]->create_snapshot(); } void truncate(int num_reserved_entries) { dbs_[0]->truncate(num_reserved_entries); } - repl_lsn_t get_truncation_upper_limit() { return 
dbs_[0]->get_truncation_upper_limit(); } - void replace_member(std::shared_ptr< TestReplicatedDB > db, std::string& task_id, replica_id_t member_out, - replica_id_t member_in, uint32_t commit_quorum = 0, - ReplServiceError error = ReplServiceError::OK) { - this->run_on_leader(db, [this, error, db, &task_id, member_out, member_in, commit_quorum]() { - LOGINFO("Start replace member task_id={}, out={}, in={}", task_id, boost::uuids::to_string(member_out), + void replace_member(std::shared_ptr< TestReplicatedDB > db, replica_id_t member_out, replica_id_t member_in, + uint32_t commit_quorum = 0, ReplServiceError error = ReplServiceError::OK) { + this->run_on_leader(db, [this, error, db, member_out, member_in, commit_quorum]() { + LOGINFO("Start replace member out={} in={}", boost::uuids::to_string(member_out), boost::uuids::to_string(member_in)); replica_member_info out{member_out, ""}; replica_member_info in{member_in, ""}; - auto result = - hs()->repl_service().replace_member(db->repl_dev()->group_id(), task_id, out, in, commit_quorum).get(); + auto result = hs()->repl_service().replace_member(db->repl_dev()->group_id(), out, in, commit_quorum).get(); if (error == ReplServiceError::OK) { ASSERT_EQ(result.hasError(), false) << "Error in replacing member, err=" << result.error(); } else { @@ -771,22 +760,6 @@ class RaftReplDevTestBase : public testing::Test { }); } - ReplaceMemberStatus check_replace_member_status(std::shared_ptr< TestReplicatedDB > db, std::string& task_id, - replica_id_t member_out, replica_id_t member_in) { - LOGINFO("check replace member status, task_id={}, out={} in={}", task_id, boost::uuids::to_string(member_out), - boost::uuids::to_string(member_in)); - - replica_member_info out{member_out, ""}; - replica_member_info in{member_in, ""}; - std::vector< replica_member_info > others; - for (auto m : g_helper->members_) { - if (m.first != member_out && m.first != member_in) { - others.emplace_back(replica_member_info{.id = m.first, .name = ""}); - } - } - return hs()->repl_service().get_replace_member_status(db->repl_dev()->group_id(), task_id, out, in, others); - } - protected: std::vector< std::shared_ptr< TestReplicatedDB > > dbs_; uint32_t written_entries_{0}; diff --git a/src/tests/test_cow_btree_recovery.cpp b/src/tests/test_cow_btree_recovery.cpp new file mode 100644 index 000000000..0a637c9c3 --- /dev/null +++ b/src/tests/test_cow_btree_recovery.cpp @@ -0,0 +1,574 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. 
+ *
+ *********************************************************************************/
+#include 
+#include 
+
+#include 
+#include "common/homestore_config.hpp"
+#include "common/resource_mgr.hpp"
+#include "test_common/homestore_test_common.hpp"
+#include "test_common/range_scheduler.hpp"
+#include "btree_helpers/btree_test_helper.hpp"
+#include "btree_helpers/btree_test_kvs.hpp"
+#include "btree_helpers/btree_decls.h"
+
+using namespace homestore;
+
+SISL_OPTIONS_ENABLE(logging, test_cow_btree_recovery, iomgr, test_common_setup)
+
+// TODO Add tests to do write/remove after recovery.
+// TODO Test with var len key when the iomgr page size is 512.
+
+SISL_OPTION_GROUP(
+    test_cow_btree_recovery,
+    (test_type, "", "test_type", "What type of test, [unit | functional | stress]",
+     ::cxxopts::value< std::string >()->default_value("unit"), "string"),
+    (num_ios, "", "num_ios", "[override] number of io operations to test", ::cxxopts::value< uint32_t >(), "number"),
+    (num_btrees, "", "num_btrees", "[override] number of btrees to test", ::cxxopts::value< uint32_t >(), "number"),
+    (num_cps, "", "num_cps", "[override] number of cps to test", ::cxxopts::value< uint32_t >(), "number"),
+    (num_entries, "", "num_entries", "[override] number of entries per btree", ::cxxopts::value< uint32_t >(),
+     "number"),
+    (run_time, "", "run_time", "[override] run time for io", ::cxxopts::value< uint32_t >(), "seconds"),
+    (disable_merge, "", "disable_merge", "disable_merge", ::cxxopts::value< bool >()->default_value("0"), ""),
+    (preload_size, "", "preload_size", "[override] number of entries to preload tree with",
+     ::cxxopts::value< uint32_t >(), "number"),
+    (seed, "", "seed", "random engine seed, use random if not defined",
+     ::cxxopts::value< uint64_t >()->default_value("0"), "number"))
+
+struct COWBtreeTestOptions : public BtreeTestOptions {
+    uint32_t num_cps;
+    uint32_t num_btrees;
+};
+COWBtreeTestOptions g_opts;
+
+void log_obj_life_counter() {
+    std::string str;
+    sisl::ObjCounterRegistry::foreach ([&str](const std::string& name, int64_t created, int64_t alive) {
+        fmt::format_to(std::back_inserter(str), "{}: created={} alive={}\n", name, created, alive);
+    });
+    LOGINFO("Object Life Counter\n:{}", str);
+}
+
+static void set_options() {
+    if (SISL_OPTIONS["test_type"].as< std::string >() == "unit") {
+        g_opts.num_entries = 5000;
+        g_opts.preload_size = 2500;
+        g_opts.num_ios = 500;
+        g_opts.run_time_secs = 36000; // Limit is on ios rather than time
+        g_opts.num_btrees = 2;
+        g_opts.num_cps = 0;
+    } else if (SISL_OPTIONS["test_type"].as< std::string >() == "functional") {
+        g_opts.num_entries = 50000;
+        g_opts.preload_size = 25000;
+        g_opts.num_ios = 50000;
+        g_opts.run_time_secs = 36000; // Limit is on ios rather than time
+        g_opts.num_btrees = 2;
+        g_opts.num_cps = 25;
+    }
+
+    if (SISL_OPTIONS.count("num_entries")) { g_opts.num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); }
+    if (SISL_OPTIONS.count("preload_size")) { g_opts.preload_size = SISL_OPTIONS["preload_size"].as< uint32_t >(); }
+    if (SISL_OPTIONS.count("num_ios")) { g_opts.num_ios = SISL_OPTIONS["num_ios"].as< uint32_t >(); }
+    if (SISL_OPTIONS.count("run_time")) { g_opts.run_time_secs = SISL_OPTIONS["run_time"].as< uint32_t >(); }
+    if (SISL_OPTIONS.count("num_cps")) { g_opts.num_cps = SISL_OPTIONS["num_cps"].as< uint32_t >(); }
+    if (SISL_OPTIONS.count("disable_merge")) { g_opts.disable_merge = SISL_OPTIONS["disable_merge"].as< bool >(); }
+
+    if (SISL_OPTIONS.count("seed")) {
+        LOGINFO("Using seed {} to sow the random generation", SISL_OPTIONS["seed"].as< uint64_t >());
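+        // (Note: with cxxopts, count() reflects only options passed explicitly on the command line, so the
+        // engine below is reseeded only when --seed is given; otherwise g_re keeps its default seed.)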
+        g_re.seed(SISL_OPTIONS["seed"].as< uint64_t >());
+    }
+}
+
+struct BtreeTest : public test_common::HSTestHelper, public ::testing::Test {
+    using T = VarObjSizeBtree< IndexStore::Type::COPY_ON_WRITE_BTREE >;
+    using K = typename T::KeyType;
+    using V = typename T::ValueType;
+
+    class TestIndexServiceCallbacks : public IndexServiceCallbacks {
+    public:
+        TestIndexServiceCallbacks(BtreeTest* test) : m_test(test) {}
+        std::shared_ptr< Index > on_index_table_found(superblk< IndexSuperBlock >&& sb) override {
+            // Locate the helper corresponding to this btree ordinal
+            auto it1 = m_test->m_bt_helpers.find(sb->ordinal);
+            if (it1 == m_test->m_bt_helpers.end()) {
+                auto it2 = m_test->m_destroyed_bt_helpers.find(sb->ordinal);
+                RELEASE_ASSERT((it2 != m_test->m_destroyed_bt_helpers.end()),
+                               "BT Helper for ordinal={} is not found, some issue in destroying btree?", sb->ordinal);
+                LOGINFO("Prior to restart, an attempt was made to destroy btree_ordinal={}, but no CP was taken, so "
+                        "we recovered it as well",
+                        sb->ordinal);
+                bool happened;
+                std::tie(it1, happened) = m_test->m_bt_helpers.insert(*it2);
+                m_test->m_destroyed_bt_helpers.erase(it2);
+            }
+
+            ++m_test->m_recovered;
+            auto bt_helper = it1->second.get();
+            bt_helper->SetUp(std::make_shared< Btree< K, V > >(bt_helper->m_cfg, std::move(sb)), true /* load */,
+                             true /* multi_threaded */);
+            return bt_helper->m_bt;
+        }
+
+    private:
+        BtreeTest* m_test;
+    };
+    friend class TestIndexServiceCallbacks;
+
+    BtreeTest() : testing::Test() {}
+
+    void SetUp() override {
+        start_homestore(
+            "test_btree",
+            {{ServiceType::META, {.size_pct = 10.0}},
+             {ServiceType::INDEX, {.size_pct = 70.0, .index_svc_cbs = new TestIndexServiceCallbacks(this)}}},
+            nullptr, {homestore::dev_info{"", homestore::HSDevType::Fast, 0}});
+        // For the persistent btree we create a device of default size, but explicitly only 1 device, since this
+        // test restarts homestore several times and it is better to always use a single disk.
+
+        // Test cp flush of write back.
+        HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) {
+            s.generic.cache_max_throttle_cnt = 10000;
+            HS_SETTINGS_FACTORY().save();
+        });
+        homestore::hs()->resource_mgr().reset_dirty_buf_qd();
+
+        // Create index table and attach to index service.
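+        // (The name of the currently running gtest case is available via current_test_info(); it is compared
+        // below to detect the ConcurrentMultiOps case.)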
+        auto const multi_threaded =
+            (testing::UnitTest::GetInstance()->current_test_info()->name() == std::string("ConcurrentMultiOps"));
+
+        for (uint32_t i{0}; i < g_opts.num_btrees; ++i) {
+            create_new_btree();
+        }
+    }
+
+    uint32_t create_new_btree() {
+        auto uuid = boost::uuids::random_generator()();
+        auto parent_uuid = boost::uuids::random_generator()();
+
+        auto bt_helper = std::make_shared< BtreeTestHelper< T > >(g_opts);
+        bt_helper->SetUp(std::make_shared< Btree< K, V > >(bt_helper->m_cfg, uuid, parent_uuid, 0), false /* load */,
+                         true /* multi_threaded */);
+        hs()->index_service().add_index_table(bt_helper->m_bt);
+        auto ordinal = bt_helper->m_bt->ordinal();
+        m_bt_helpers.insert(std::make_pair(bt_helper->m_bt->ordinal(), std::move(bt_helper)));
+        return ordinal;
+    }
+
+    void destroy_a_btree() {
+        if (m_bt_helpers.empty()) { return; }
+
+        auto it = m_bt_helpers.begin();
+        auto [ordinal, bt_helper] = *it;
+
+        hs()->index_service().destroy_index_table(bt_helper->m_bt).thenValue([this, bt_helper, ordinal](auto&&) {
+            m_destroyed_bt_helpers.insert(std::make_pair(ordinal, bt_helper));
+        });
+        bt_helper->m_bt.reset();
+        m_bt_helpers.erase(it);
+    }
+
+    void io_on_btrees() {
+        std::vector< std::string > input_ops = {"put:70", "remove:30"};
+        for (auto& [_, bt_helper] : m_bt_helpers) {
+            bt_helper->multi_op_execute(bt_helper->build_op_list(input_ops));
+        }
+    }
+
+    void validate_btrees() {
+        for (auto& [_, bt_helper] : m_bt_helpers) {
+            bt_helper->query_all_paginate(500);
+        }
+    }
+
+    void TearDown() override {
+        for (auto& [_, bt_helper] : m_bt_helpers) {
+            hs()->index_service().destroy_index_table(bt_helper->m_bt);
+            bt_helper->m_bt.reset();
+            bt_helper->TearDown();
+        }
+        shutdown_homestore(false);
+        log_obj_life_counter();
+    }
+
+    void restart_homestore(uint32_t shutdown_delay_sec = 5) override {
+        m_recovered = 0;
+        this->params(ServiceType::INDEX).index_svc_cbs = new TestIndexServiceCallbacks(this);
+        for (auto& [_, bt_helper] : this->m_bt_helpers) {
+            bt_helper->m_bt.reset();
+        }
+
+        test_common::HSTestHelper::restart_homestore(shutdown_delay_sec);
+    }
+
+    void restart_and_validate() {
+        LOGINFO("Restart homestore and validate if before and after states of btrees are identical");
+        for (auto& [_, bt_helper] : this->m_bt_helpers) {
+            std::string fname = fmt::format("/tmp/btree_{}_before.txt", bt_helper->m_bt->ordinal());
+            bt_helper->dump_to_file(fname);
+        }
+        restart_homestore();
+        LOGINFO("Restarted homestore with {} indexes recovered", m_recovered);
+
+        ASSERT_EQ(m_recovered, this->m_bt_helpers.size()) << "Number of btrees before and after restart mismatch";
+        for (auto& [_, bt_helper] : this->m_bt_helpers) {
+            std::string before_fname = fmt::format("/tmp/btree_{}_before.txt", bt_helper->m_bt->ordinal());
+            std::string after_fname = fmt::format("/tmp/btree_{}_after.txt", bt_helper->m_bt->ordinal());
+            bt_helper->dump_to_file(after_fname);
+            bt_helper->compare_files(before_fname, after_fname); // Validate with dumping
+            bt_helper->query_all_paginate(500);                  // Validate with query as well.
+        }
+    }
+
+    void trigger_incremental_map_cp() { do_trigger_cp(false /* full_map_cp */, false /* crash */); }
+
+    void trigger_full_map_cp() { do_trigger_cp(true /* full_map_cp */, false /* crash */); }
+
+    void post_crash_validate() {
+        // Post crash reapply on all btrees
+        for (auto& [_, bt_helper] : this->m_bt_helpers) {
+            bt_helper->reapply_after_crash();
+            bt_helper->query_all_paginate(500); // Validate with query as well.
+ } + } + + struct CPParams { + enum class RestartType : uint8_t { none, clean, crash }; + + uint32_t num_new_btrees{0}; // # of new btrees to create before cp + uint32_t num_destroy_btrees{0}; // # of btrees to destroy before cp + uint32_t num_io_btrees{std::numeric_limits< uint32_t >::max()}; // # of btrees to do IO + bool is_full_map_flush_cp{false}; // Is it a full map flush cp or incremental + RestartType restart_post_cp{RestartType::none}; // Should we restart the homestore after cp + + std::string restart_type() const { + switch (restart_post_cp) { + case RestartType::none: + return "none"; + case RestartType::clean: + return "clean"; + case RestartType::crash: + return "crash"; + default: + return "unknown"; + } + } + }; + + void action_with_cp(CPParams p) { + std::vector< uint32_t > created; + created.reserve(p.num_new_btrees); + + for (uint32_t i{0}; i < p.num_new_btrees; ++i) { + created.push_back(create_new_btree()); + } + + auto created_list = [](std::vector< uint32_t > const& v) -> std::string { + std::string str = v.empty() ? "" : std::to_string(v[0]); + for (size_t i{1}; i < v.size(); ++i) { + str += std::string(",") + std::to_string(v[i]); + } + return str; + }; + + auto first_n = [](std::map< uint32_t, std::shared_ptr< BtreeTestHelper< T > > > const& m, + size_t n) -> std::string { + auto it = m.begin(); + std::string str = (it == m.end() || n == 0) ? "" : std::to_string(it->first); + size_t i{1}; + for (++it; (it != m.end()) && (i < n); ++it, ++i) { + str += std::string(",") + std::to_string(it->first); + } + return str; + }; + + if (p.num_io_btrees > this->m_bt_helpers.size()) { p.num_io_btrees = this->m_bt_helpers.size(); } + LOGINFO("CPSpec: Create btrees=[{}] -> IO on btrees=[{}] -> Destroy btrees=[{}] -> CP_type={} -> Restart?={}", + created_list(created), first_n(m_bt_helpers, p.num_io_btrees), + first_n(m_bt_helpers, p.num_destroy_btrees), p.is_full_map_flush_cp ? "FullFlush" : "IncrementalFlush", + p.restart_type()); + + std::vector< std::string > input_ops = {"put:70", "remove:30"}; + uint32_t b{0}; + for (auto& [_, bt_helper] : this->m_bt_helpers) { + if (b++ == p.num_io_btrees) { break; } + bt_helper->multi_op_execute(bt_helper->build_op_list(input_ops)); + } + + for (uint32_t i{0}; i < p.num_destroy_btrees; ++i) { + destroy_a_btree(); + } + + if (p.restart_post_cp == CPParams::RestartType::crash) { + do_trigger_cp(p.is_full_map_flush_cp /* full_map_cp */, true /* crash */); + post_crash_validate(); + } else if (p.restart_post_cp == CPParams::RestartType::clean) { + do_trigger_cp(p.is_full_map_flush_cp /* full_map_cp */, false /* crash */); + restart_and_validate(); + } else { + do_trigger_cp(p.is_full_map_flush_cp /* full_map_cp */, false /* crash */); + } + } + +#ifdef _PRERELEASE + void set_btree_flip(std::string const& flip_name, std::optional< uint32_t > bt_ordinal = std::nullopt, + uint32_t count = 1, uint32_t percent = 100) { + flip::FlipCondition cond; + auto fc = iomgr_flip::client_instance(); + if (bt_ordinal) { + fc->create_condition("btree_ordinal", flip::Operator::EQUAL, (int)*bt_ordinal, &cond); + } else { + fc->create_condition("", flip::Operator::DONT_CARE, (int)1, &cond); + } + flip::FlipFrequency freq; + freq.set_count(count); + freq.set_percent(percent); + fc->inject_noreturn_flip(flip_name, {cond}, freq); + } +#endif + +private: + void do_trigger_cp(bool full_map_cp, bool crash) { + LOGINFO("Trigger {} Map Flush CP {}", full_map_cp ? "Full" : "Incremental", crash ? 
" to simulate crash" : ""); + + // Modify the settings to take incremental map flushes only once + HS_SETTINGS_FACTORY().modifiable_settings([full_map_cp](auto& s) { + s.btree.cow_max_incremental_map_flushes = full_map_cp ? 0 : 100000; + HS_SETTINGS_FACTORY().save(); + }); + if (crash) { + test_common::HSTestHelper::trigger_cp(false /* wait */); +#ifdef _PRERELEASE + this->wait_for_crash_recovery(); +#endif + } else { + test_common::HSTestHelper::trigger_cp(true /* wait */); + for (auto& [_, bt_helper] : this->m_bt_helpers) { + bt_helper->save_snapshot(); // Save every btree shadow as snapshot + } + } + } + +protected: + std::map< uint32_t, std::shared_ptr< BtreeTestHelper< T > > > m_bt_helpers; + std::map< uint32_t, std::shared_ptr< BtreeTestHelper< T > > > m_destroyed_bt_helpers; + uint32_t m_recovered{0}; +}; + +TEST_F(BtreeTest, DeleteCheckSizeReduction) { + std::vector< std::string > input_ops = {"put:70", "remove:30"}; + for (auto& [_, bt_helper] : this->m_bt_helpers) { + bt_helper->multi_op_execute(bt_helper->build_op_list(input_ops)); + } + + auto const before_space = hs()->index_service().space_occupied(); + for (auto& [_, bt_helper] : this->m_bt_helpers) { + destroy_a_btree(); + } + auto const after_space = hs()->index_service().space_occupied(); + ASSERT_LT(after_space, before_space) << "Destroy of btree didn't recapture space"; +} + +TEST_F(BtreeTest, IOThenFullMapFlushThenRestart) { + action_with_cp({.num_new_btrees = 0, + .num_destroy_btrees = 0, + .num_io_btrees = std::numeric_limits< uint32_t >::max(), + .is_full_map_flush_cp = true, + .restart_post_cp = CPParams::RestartType::clean}); + + LOGINFO("Post Restart we do IO on all recovered btrees"); + this->io_on_btrees(); +} + +TEST_F(BtreeTest, IOThenIncrementalMapFlushThenRestart) { + action_with_cp({.num_new_btrees = 0, + .num_destroy_btrees = 0, + .num_io_btrees = std::numeric_limits< uint32_t >::max(), + .is_full_map_flush_cp = false, + .restart_post_cp = CPParams::RestartType::clean}); + + LOGINFO("Post Restart we do IO on all recovered btrees"); + this->io_on_btrees(); +} + +TEST_F(BtreeTest, CreateThenFullMapFlushThenRestart) { + action_with_cp({.num_new_btrees = 1, + .num_destroy_btrees = 0, + .num_io_btrees = std::numeric_limits< uint32_t >::max(), + .is_full_map_flush_cp = true, + .restart_post_cp = CPParams::RestartType::clean}); + + LOGINFO("Post Restart we do IO on all recovered btrees"); + this->io_on_btrees(); +} + +TEST_F(BtreeTest, CreateThenIncrementalMapFlushThenRestart) { + action_with_cp({.num_new_btrees = 1, + .num_destroy_btrees = 0, + .num_io_btrees = std::numeric_limits< uint32_t >::max(), + .is_full_map_flush_cp = false, + .restart_post_cp = CPParams::RestartType::clean}); + + LOGINFO("Post Restart we do IO on all recovered btrees"); + this->io_on_btrees(); +} + +TEST_F(BtreeTest, DestroyThenFullMapFlushThenRestart) { + action_with_cp({.num_new_btrees = 0, + .num_destroy_btrees = 1, + .num_io_btrees = std::numeric_limits< uint32_t >::max(), + .is_full_map_flush_cp = true, + .restart_post_cp = CPParams::RestartType::clean}); + + LOGINFO("Post Restart we do IO on all recovered btrees"); + this->io_on_btrees(); +} + +TEST_F(BtreeTest, DestroyThenIncrementalMapFlushThenRestart) { + action_with_cp({.num_new_btrees = 0, + .num_destroy_btrees = 1, + .num_io_btrees = std::numeric_limits< uint32_t >::max(), + .is_full_map_flush_cp = false, + .restart_post_cp = CPParams::RestartType::clean}); + + LOGINFO("Post Restart we do IO on all recovered btrees"); + this->io_on_btrees(); +} + +TEST_F(BtreeTest, 
+TEST_F(BtreeTest, RandomMultiOps) {
+    if (SISL_OPTIONS["test_type"].as< std::string >() == "unit") { GTEST_SKIP(); }
+
+    static std::uniform_int_distribution< uint32_t > new_rand_count{0, 3};
+    static std::uniform_int_distribution< uint32_t > destroy_rand_count{0, 2};
+    static std::normal_distribution<> io_rand_count{(double)(g_opts.num_btrees), 4.0};
+    static std::uniform_int_distribution< uint32_t > rand_cp_type{0, 3}; // 25% of the time do a full map cp
+    static std::uniform_int_distribution< uint32_t > rand_restart{0, 3}; // 25% of the time restart
+
+    for (uint32_t i{0}; i < g_opts.num_cps; ++i) {
+        action_with_cp({.num_new_btrees = new_rand_count(g_re),
+                        .num_destroy_btrees = destroy_rand_count(g_re),
+                        .num_io_btrees = (uint32_t)std::lround(io_rand_count(g_re)),
+                        .is_full_map_flush_cp = (rand_cp_type(g_re) == 0),
+                        .restart_post_cp =
+                            (rand_restart(g_re) == 0) ? CPParams::RestartType::clean : CPParams::RestartType::none});
+    }
+
+    LOGINFO("Post Restart we do IO on all recovered btrees");
+    this->io_on_btrees();
+}
+
+#ifdef _PRERELEASE
+TEST_F(BtreeTest, CrashBeforeFirstCp) {
+    // Simulate the crash even before the first cp. Here we trigger a crash CP, so no actual CP is taken in this test
+    this->set_btree_flip("crash_on_flush_cow_btree_nodes");
+    action_with_cp({.num_new_btrees = 0,
+                    .num_destroy_btrees = 0,
+                    .num_io_btrees = std::numeric_limits< uint32_t >::max(),
+                    .is_full_map_flush_cp = false,
+                    .restart_post_cp = CPParams::RestartType::crash});
+
+    LOGINFO("Post Restart we do IO on all recovered btrees");
+    this->io_on_btrees();
+}
+
+TEST_F(BtreeTest, CrashDuringFlushNodes) {
+    // Take a couple of CPs, one full map and then one incremental
+    action_with_cp({.num_new_btrees = 0,
+                    .num_destroy_btrees = 0,
+                    .num_io_btrees = std::numeric_limits< uint32_t >::max(),
+                    .is_full_map_flush_cp = true,
+                    .restart_post_cp = CPParams::RestartType::none});
+    action_with_cp({.num_new_btrees = 0,
+                    .num_destroy_btrees = 0,
+                    .num_io_btrees = std::numeric_limits< uint32_t >::max(),
+                    .is_full_map_flush_cp = false,
+                    .restart_post_cp = CPParams::RestartType::none});
+
+    // Simulate the crash after a couple of cps by triggering an incremental cp.
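+    // (The second argument is the optional bt_ordinal the flip is conditioned on; assuming a btree with ordinal 1
+    // exists, only that btree crashes during its node flush while the others flush normally.)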
+    this->set_btree_flip("crash_on_flush_cow_btree_nodes", (uint32_t)1);
+    action_with_cp({.num_new_btrees = 0,
+                    .num_destroy_btrees = 0,
+                    .num_io_btrees = std::numeric_limits< uint32_t >::max(),
+                    .is_full_map_flush_cp = false,
+                    .restart_post_cp = CPParams::RestartType::crash});
+
+    LOGINFO("Post Restart we do IO on all recovered btrees");
+    this->io_on_btrees();
+}
+
+TEST_F(BtreeTest, CrashBeforeIncrementalCpCommit) {
+    // Take a couple of CPs, one full map and then one incremental
+    action_with_cp({.num_new_btrees = 0,
+                    .num_destroy_btrees = 0,
+                    .num_io_btrees = std::numeric_limits< uint32_t >::max(),
+                    .is_full_map_flush_cp = true,
+                    .restart_post_cp = CPParams::RestartType::none});
+    action_with_cp({.num_new_btrees = 0,
+                    .num_destroy_btrees = 0,
+                    .num_io_btrees = std::numeric_limits< uint32_t >::max(),
+                    .is_full_map_flush_cp = false,
+                    .restart_post_cp = CPParams::RestartType::none});
+
+    // Simulate the crash on the next cp
+    this->set_btree_flip("crash_before_incr_map_flush_commit");
+    action_with_cp({.num_new_btrees = 0,
+                    .num_destroy_btrees = 0,
+                    .num_io_btrees = std::numeric_limits< uint32_t >::max(),
+                    .is_full_map_flush_cp = false,
+                    .restart_post_cp = CPParams::RestartType::crash});
+
+    LOGINFO("Post Restart we do IO on all recovered btrees");
+    this->io_on_btrees();
+}
+
+TEST_F(BtreeTest, CrashBeforeLastFullMapCpCommit) {
+    // Take a couple of CPs, one full map and then one incremental
+    action_with_cp({.num_new_btrees = 0,
+                    .num_destroy_btrees = 0,
+                    .num_io_btrees = std::numeric_limits< uint32_t >::max(),
+                    .is_full_map_flush_cp = true,
+                    .restart_post_cp = CPParams::RestartType::none});
+    action_with_cp({.num_new_btrees = 0,
+                    .num_destroy_btrees = 0,
+                    .num_io_btrees = std::numeric_limits< uint32_t >::max(),
+                    .is_full_map_flush_cp = false,
+                    .restart_post_cp = CPParams::RestartType::none});
+
+    // Set the flip to crash while the full map cp flush is ongoing on the last btree, which means the other btrees
+    // have successfully completed the full map flush cp while the last one hasn't. This should test replay of map
+    // updates both for btrees that already committed and for the one that has not.
+ this->set_btree_flip("crash_during_full_map_flush", (uint32_t)1); + action_with_cp({.num_new_btrees = 0, + .num_destroy_btrees = 0, + .num_io_btrees = std::numeric_limits< uint32_t >::max(), + .is_full_map_flush_cp = true, + .restart_post_cp = CPParams::RestartType::crash}); + + LOGINFO("Post Restart we do IO on all recovered btrees"); + this->io_on_btrees(); +} +#endif + +int main(int argc, char* argv[]) { + int parsed_argc{argc}; + ::testing::InitGoogleTest(&parsed_argc, argv); + SISL_OPTIONS_LOAD(parsed_argc, argv, logging, test_cow_btree_recovery, iomgr, test_common_setup); + sisl::logging::SetLogger("test_cow_btree_recovery"); + spdlog::set_pattern("[%D %T%z] [%^%L%$] [%t] %v"); + + set_options(); + auto ret = RUN_ALL_TESTS(); + return ret; +} diff --git a/src/tests/test_cp_mgr.cpp b/src/tests/test_cp_mgr.cpp index 5413a1a3b..ace629f24 100644 --- a/src/tests/test_cp_mgr.cpp +++ b/src/tests/test_cp_mgr.cpp @@ -27,10 +27,9 @@ using namespace homestore; -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) + SISL_OPTIONS_ENABLE(logging, test_cp_mgr, iomgr, test_common_setup) -SISL_LOGGING_DECL(test_cp_mgr) SISL_OPTION_GROUP(test_cp_mgr, (num_records, "", "num_records", "number of record to test", diff --git a/src/tests/test_data_service.cpp b/src/tests/test_data_service.cpp index dc72fa2fe..e6c47e211 100644 --- a/src/tests/test_data_service.cpp +++ b/src/tests/test_data_service.cpp @@ -53,9 +53,8 @@ using namespace homestore; RCU_REGISTER_INIT -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) + SISL_OPTIONS_ENABLE(logging, test_data_service, iomgr, test_common_setup) -SISL_LOGGING_DECL(test_data_service) constexpr uint64_t Ki{1024}; constexpr uint64_t Mi{Ki * Ki}; @@ -456,7 +455,7 @@ class BlkDataServiceTest : public testing::Test { // every piece in bid is a single block, e.g. 
nblks = 1 auto const nbids = bid.num_pieces(); auto sub_io_size = nbids * inst().get_blk_size(); - HS_REL_ASSERT_LE(sub_io_size, remaining_io_size, "not expecting sub_io_size to exceed remaining_io_size"); + HS_REL_ASSERT_LE(sub_io_size, remaining_io_size, "not expecting sub_io_size to exceed remaining_io_size"); // we pass crc from lambda becaues if there is any async_free_blk, the written blks in the blkcrc map will // be removed by the time read thenVlue is called; diff --git a/src/tests/test_device_manager.cpp b/src/tests/test_device_manager.cpp index 6a53d7773..d15e57f4e 100644 --- a/src/tests/test_device_manager.cpp +++ b/src/tests/test_device_manager.cpp @@ -35,7 +35,7 @@ #include "device/virtual_dev.hpp" using namespace homestore; -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) + SISL_OPTIONS_ENABLE(logging, test_device_manager, iomgr) SISL_OPTION_GROUP(test_device_manager, @@ -79,11 +79,9 @@ class DeviceMgrTest : public ::testing::Test { return std::make_shared< homestore::VirtualDev >(*m_dmgr, vinfo_tmp, nullptr /* event_cb */, false); }); if (m_dmgr->is_first_time_boot()) { - LOGINFO("First time boot, formatting devices"); m_dmgr->format_devices(); m_dmgr->commit_formatting(); } else { - LOGINFO("Not first time boot, loading devices"); m_dmgr->load_devices(); } m_pdevs = m_dmgr->get_pdevs_by_dev_type(homestore::HSDevType::Data); @@ -97,12 +95,6 @@ class DeviceMgrTest : public ::testing::Test { setup_device_manager(); } - void add_data_file(std::string fname, uint64_t data_dev_size) { - init_file(fname, data_dev_size); - m_data_dev_names.emplace_back(fname); - m_dev_infos.emplace_back(std::filesystem::canonical(fname).string(), homestore::HSDevType::Data); - } - virtual void SetUp() override { auto const data_ndevices = SISL_OPTIONS["num_data_devs"].as< uint32_t >(); auto const data_dev_size = SISL_OPTIONS["data_dev_size_mb"].as< uint64_t >() * 1024 * 1024; @@ -111,7 +103,9 @@ class DeviceMgrTest : public ::testing::Test { homestore::in_bytes(data_dev_size)); for (uint32_t i{0}; i < data_ndevices; ++i) { auto fname = std::string{"/tmp/test_devmgr_data_" + std::to_string(i + 1)}; - add_data_file(fname, data_dev_size); + init_file(fname, data_dev_size); + m_data_dev_names.emplace_back(fname); + m_dev_infos.emplace_back(std::filesystem::canonical(fname).string(), homestore::HSDevType::Data); } setup_device_manager(); @@ -124,7 +118,7 @@ class DeviceMgrTest : public ::testing::Test { remove_files(m_data_dev_names); } - void validate_striped_vdevs(uint32_t expected_pdev_num = 3) { + void validate_striped_vdevs() { for (auto& vdev : m_vdevs) { auto chunks = vdev->get_chunks(); ASSERT_EQ(vdev->get_total_chunk_num(), m_pdevs.size() * 2) @@ -140,8 +134,6 @@ class DeviceMgrTest : public ::testing::Test { if (!inserted) { ++(it->second); } } - ASSERT_TRUE(chunks_in_pdev_count.size() == expected_pdev_num) - << "pdev num mismatch, expected " << expected_pdev_num << " but found " << chunks_in_pdev_count.size(); for (const auto& [pdev, count] : chunks_in_pdev_count) { ASSERT_EQ(count, 2) << "Every pdev should have exactly 2 chunks, that has not happened here"; } @@ -190,243 +182,6 @@ TEST_F(DeviceMgrTest, StripedVDevCreation) { this->validate_striped_vdevs(); } -TEST_F(DeviceMgrTest, ReplaceDeviceWithEmptyDevice) { - static constexpr uint32_t num_test_vdevs = 5; - uint64_t avail_size{0}; - for (auto& pdev : m_pdevs) { - avail_size += pdev->data_size(); - } - - uint32_t size_pct = 4; - uint64_t remain_size = avail_size; - - LOGINFO("Step 1: Creating {} vdevs with combined size as {}", num_test_vdevs, 
in_bytes(avail_size)); - for (uint32_t i = 0; i < num_test_vdevs; ++i) { - std::string name = "test_vdev_" + std::to_string(i + 1); - uint64_t size = std::min(remain_size, (avail_size * size_pct) / 100); - remain_size -= size; - size_pct *= 2; // Double the next vdev size - - LOGINFO("Step 1a: Creating vdev of name={} with size={}", name, in_bytes(size)); - auto vdev = - m_dmgr->create_vdev(homestore::vdev_parameters{.vdev_name = name, - .vdev_size = size, - .num_chunks = uint32_cast(m_pdevs.size() * 2), - .blk_size = 4096, - .dev_type = HSDevType::Data, - .alloc_type = blk_allocator_type_t::none, - .chunk_sel_type = chunk_selector_type_t::NONE, - .multi_pdev_opts = vdev_multi_pdev_opts_t::ALL_PDEV_STRIPED, - .context_data = sisl::blob{}}); - m_vdevs.push_back(std::move(vdev)); - } - - LOGINFO("Step 2: Validating all vdevs if they have created with correct number of chunks"); - this->validate_striped_vdevs(); - - std::set< uint32_t > pdev_ids; - std::vector< PhysicalDev* > pdevs = m_dmgr->get_pdevs_by_dev_type(HSDevType::Data); - for (auto d : pdevs) { - pdev_ids.insert(d->pdev_id()); - } - - auto fpath = m_data_dev_names[0]; - m_data_dev_names.erase(m_data_dev_names.begin()); - auto dinfo = m_dev_infos[0]; - m_dev_infos.erase(m_dev_infos.begin()); - LOGINFO("Step 3a: Remove device to simulate device failure, file={}", fpath); - if (std::filesystem::exists(fpath)) { std::filesystem::remove(fpath); } - LOGINFO("Step 3b: Restart dmgr", fpath); - this->restart(); - - LOGINFO("Step 4: Validate after one device is removed"); - this->validate_striped_vdevs(2); - - LOGINFO("Step 5: Recreate file to simulate a new device", fpath); - auto const data_dev_size = SISL_OPTIONS["data_dev_size_mb"].as< uint64_t >() * 1024 * 1024; - this->add_data_file(fpath, data_dev_size); - - LOGINFO("Step 6: Restart and validate if new device can be added to vdevs"); - this->restart(); - this->validate_striped_vdevs(); - - LOGINFO("Step 7: Restart and validate again"); - this->restart(); - this->validate_striped_vdevs(); - - pdevs = m_dmgr->get_pdevs_by_dev_type(HSDevType::Data); - for (auto d : pdevs) { - pdev_ids.insert(d->pdev_id()); - } - ASSERT_EQ(pdev_ids.size(), m_pdevs.size() + 1) - << "Pdev ids size mismatch after replacing two devices, duplicate pdev ids found or missing pdev ids"; -} - -TEST_F(DeviceMgrTest, ReplaceTwoDevicesAtOnce) { - static constexpr uint32_t num_test_vdevs = 5; - uint64_t avail_size{0}; - for (auto& pdev : m_pdevs) { - avail_size += pdev->data_size(); - } - - uint32_t size_pct = 4; - uint64_t remain_size = avail_size; - - LOGINFO("Step 1: Creating {} vdevs with combined size as {}", num_test_vdevs, in_bytes(avail_size)); - for (uint32_t i = 0; i < num_test_vdevs; ++i) { - std::string name = "test_vdev_" + std::to_string(i + 1); - uint64_t size = std::min(remain_size, (avail_size * size_pct) / 100); - remain_size -= size; - size_pct *= 2; // Double the next vdev size - - LOGINFO("Step 1a: Creating vdev of name={} with size={}", name, in_bytes(size)); - auto vdev = - m_dmgr->create_vdev(homestore::vdev_parameters{.vdev_name = name, - .vdev_size = size, - .num_chunks = uint32_cast(m_pdevs.size() * 2), - .blk_size = 4096, - .dev_type = HSDevType::Data, - .alloc_type = blk_allocator_type_t::none, - .chunk_sel_type = chunk_selector_type_t::NONE, - .multi_pdev_opts = vdev_multi_pdev_opts_t::ALL_PDEV_STRIPED, - .context_data = sisl::blob{}}); - m_vdevs.push_back(std::move(vdev)); - } - - LOGINFO("Step 2: Validating all vdevs if they have created with correct number of chunks"); - 
this->validate_striped_vdevs(); - - std::set< uint32_t > pdev_ids; - std::vector< PhysicalDev* > pdevs = m_dmgr->get_pdevs_by_dev_type(HSDevType::Data); - for (auto d : pdevs) { - pdev_ids.insert(d->pdev_id()); - } - - auto fpath1 = m_data_dev_names[0]; - m_data_dev_names.erase(m_data_dev_names.begin()); - auto dinfo = m_dev_infos[0]; - m_dev_infos.erase(m_dev_infos.begin()); - LOGINFO("Step 3a: Remove device to simulate device failure, file={}", fpath1); - if (std::filesystem::exists(fpath1)) { std::filesystem::remove(fpath1); } - - auto fpath2 = m_data_dev_names[1]; - m_data_dev_names.erase(m_data_dev_names.end()); - auto dinfo2 = m_dev_infos[1]; - m_dev_infos.erase(m_dev_infos.end()); - LOGINFO("Step 3a: Remove device to simulate device failure, file={}", fpath2); - if (std::filesystem::exists(fpath2)) { std::filesystem::remove(fpath2); } - - LOGINFO("Step 3b: Restart dmgr"); - this->restart(); - - LOGINFO("Step 4: Validate after one device is removed"); - this->validate_striped_vdevs(1); - - LOGINFO("Step 5: Recreate files to simulate new devices"); - auto const data_dev_size = SISL_OPTIONS["data_dev_size_mb"].as< uint64_t >() * 1024 * 1024; - this->add_data_file(fpath1, data_dev_size); - this->add_data_file(fpath2, data_dev_size); - - LOGINFO("Step 6: Restart and validate if new device can be added to vdevs"); - this->restart(); - this->validate_striped_vdevs(); - - LOGINFO("Step 7: Restart and validate again"); - this->restart(); - this->validate_striped_vdevs(); - - pdevs = m_dmgr->get_pdevs_by_dev_type(HSDevType::Data); - for (auto d : pdevs) { - pdev_ids.insert(d->pdev_id()); - } - ASSERT_EQ(pdev_ids.size(), m_pdevs.size() + 2) - << "Pdev ids size mismatch after replacing two devices, duplicate pdev ids found or missing pdev ids"; -} - -TEST_F(DeviceMgrTest, ReplaceTwoDevicesOneByOne) { - static constexpr uint32_t num_test_vdevs = 5; - uint64_t avail_size{0}; - for (auto& pdev : m_pdevs) { - avail_size += pdev->data_size(); - } - - uint32_t size_pct = 4; - uint64_t remain_size = avail_size; - - LOGINFO("Step 1: Creating {} vdevs with combined size as {}", num_test_vdevs, in_bytes(avail_size)); - for (uint32_t i = 0; i < num_test_vdevs; ++i) { - std::string name = "test_vdev_" + std::to_string(i + 1); - uint64_t size = std::min(remain_size, (avail_size * size_pct) / 100); - remain_size -= size; - size_pct *= 2; // Double the next vdev size - - LOGINFO("Step 1a: Creating vdev of name={} with size={}", name, in_bytes(size)); - auto vdev = - m_dmgr->create_vdev(homestore::vdev_parameters{.vdev_name = name, - .vdev_size = size, - .num_chunks = uint32_cast(m_pdevs.size() * 2), - .blk_size = 4096, - .dev_type = HSDevType::Data, - .alloc_type = blk_allocator_type_t::none, - .chunk_sel_type = chunk_selector_type_t::NONE, - .multi_pdev_opts = vdev_multi_pdev_opts_t::ALL_PDEV_STRIPED, - .context_data = sisl::blob{}}); - m_vdevs.push_back(std::move(vdev)); - } - - LOGINFO("Step 2: Validating all vdevs if they have created with correct number of chunks"); - this->validate_striped_vdevs(); - - std::set< uint32_t > pdev_ids; - std::vector< PhysicalDev* > pdevs = m_dmgr->get_pdevs_by_dev_type(HSDevType::Data); - for (auto d : pdevs) { - pdev_ids.insert(d->pdev_id()); - } - - auto fpath1 = m_data_dev_names[0]; - m_data_dev_names.erase(m_data_dev_names.begin()); - auto dinfo = m_dev_infos[0]; - m_dev_infos.erase(m_dev_infos.begin()); - LOGINFO("Step 3a: Remove device to simulate device failure, file={}", fpath1); - if (std::filesystem::exists(fpath1)) { std::filesystem::remove(fpath1); } - - auto 
fpath2 = m_data_dev_names[1]; - m_data_dev_names.erase(m_data_dev_names.end()); - auto dinfo2 = m_dev_infos[1]; - m_dev_infos.erase(m_dev_infos.end()); - LOGINFO("Step 3a: Remove device to simulate device failure, file={}", fpath2); - if (std::filesystem::exists(fpath2)) { std::filesystem::remove(fpath2); } - - LOGINFO("Step 3b: Restart dmgr after removing devices"); - this->restart(); - - LOGINFO("Step 4: Validate after devices is removed"); - this->validate_striped_vdevs(1); - - LOGINFO("Step 5: Recreate file to simulate replacement with a new device, file={}", fpath1); - auto const data_dev_size = SISL_OPTIONS["data_dev_size_mb"].as< uint64_t >() * 1024 * 1024; - this->add_data_file(fpath1, data_dev_size); - - this->restart(); - this->validate_striped_vdevs(2); - - LOGINFO("Step 6: Recreate file to simulate replacement with a new device, file={}", fpath2); - this->add_data_file(fpath2, data_dev_size); - this->restart(); - this->validate_striped_vdevs(); - - LOGINFO("Step 7: Restart and validate again"); - this->restart(); - this->validate_striped_vdevs(); - - pdevs = m_dmgr->get_pdevs_by_dev_type(HSDevType::Data); - for (auto d : pdevs) { - pdev_ids.insert(d->pdev_id()); - } - ASSERT_EQ(pdev_ids.size(), m_pdevs.size() + 2) - << "Pdev ids size mismatch after replacing two devices, duplicate pdev ids found or missing pdev ids"; -} - TEST_F(DeviceMgrTest, SmallStripedVDevCreation) { std::string name = "test_vdev_small"; diff --git a/src/tests/test_home_raft_logstore.cpp b/src/tests/test_home_raft_logstore.cpp index b4349e6b8..882ea2bfc 100644 --- a/src/tests/test_home_raft_logstore.cpp +++ b/src/tests/test_home_raft_logstore.cpp @@ -10,7 +10,7 @@ using namespace homestore; -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) + static constexpr uint32_t g_max_logsize{512}; static std::random_device g_rd{}; diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp index a19dc15b7..35b44eeaf 100644 --- a/src/tests/test_index_crash_recovery.cpp +++ b/src/tests/test_index_crash_recovery.cpp @@ -27,9 +27,8 @@ using namespace homestore; -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) + SISL_OPTIONS_ENABLE(logging, test_index_crash_recovery, iomgr, test_common_setup) -SISL_LOGGING_DECL(test_index_crash_recovery) // TODO Add tests to do write,remove after recovery. // TODO Test with var len key with io mgr page size is 512. 
@@ -49,8 +48,6 @@ SISL_OPTION_GROUP( ""), (min_keys_in_node, "", "min_keys_in_node", "min_keys_in_node", ::cxxopts::value< uint32_t >()->default_value("6"), ""), - (max_merge_level, "", "max_merge_level", "max merge level", ::cxxopts::value< uint8_t >()->default_value("1"), ""), - (disable_merge, "", "disable_merge", "disable_merge", ::cxxopts::value< bool >()->default_value("0"), ""), (operation_list, "", "operation_list", "operation list instead of default created following by percentage", ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), (preload_size, "", "preload_size", "number of entries to preload tree with", @@ -60,8 +57,6 @@ SISL_OPTION_GROUP( (save_to_file, "", "save_to_file", "save to file", ::cxxopts::value< bool >()->default_value("0"), ""), (cleanup_after_shutdown, "", "cleanup_after_shutdown", "cleanup after shutdown", ::cxxopts::value< bool >()->default_value("1"), ""), - (print_keys_verbose_logging, "", "print_keys_verbose_logging", "print_keys_verbose_logging", - ::cxxopts::value< bool >()->default_value("0"), ""), (seed, "", "seed", "random engine seed, use random if not defined", ::cxxopts::value< uint64_t >()->default_value("0"), "number")) @@ -73,9 +68,6 @@ void log_obj_life_counter() { LOGINFO("Object Life Counter\n:{}", str); } -#define print_keys_logging(msg) \ - if (SISL_OPTIONS.count("print_keys_verbose_logging")) { this->print_keys(msg); } - enum class OperationType { Put, Remove, @@ -351,32 +343,19 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT } void reset_btree() { - LOGINFO("Destroying index btree with uuid {} root id {}", boost::uuids::to_string(this->m_bt->uuid()), - this->m_bt->root_node_id()); hs()->index_service().remove_index_table(this->m_bt); this->m_bt->destroy(); this->trigger_cp(true); - ASSERT_EQ(hs()->index_service().num_tables(), 0) << "After destroying the index table, some table still exists"; auto uuid = boost::uuids::random_generator()(); auto parent_uuid = boost::uuids::random_generator()(); this->m_bt = std::make_shared< typename T::BtreeType >(uuid, parent_uuid, 0, this->m_cfg); hs()->index_service().add_index_table(this->m_bt); - auto num_keys = this->m_bt->count_keys(this->m_bt->root_node_id()); this->m_shadow_map.range_erase(0, SISL_OPTIONS["num_entries"].as< uint32_t >() - 1); this->m_shadow_map.save(m_shadow_filename); LOGINFO("Reset btree with uuid {} - erase shadow map {}", boost::uuids::to_string(uuid), m_shadow_filename); } - void destroy_btree() { - hs()->index_service().remove_index_table(this->m_bt); - this->m_bt->destroy(); - this->trigger_cp(true); - this->m_shadow_map.range_erase(0, SISL_OPTIONS["num_entries"].as< uint32_t >() - 1); - this->m_shadow_map.save(m_shadow_filename); - LOGINFO("destroy btree - erase shadow map {}", m_shadow_filename); - } - void restart_homestore(uint32_t shutdown_delay_sec = 3) override { this->params(HS_SERVICE::INDEX).index_svc_cbs = new TestIndexServiceCallbacks(this); LOGINFO("\n\n\n\n\n\n shutdown homestore for index service Test\n\n\n\n\n"); @@ -490,7 +469,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT } void crash_and_recover_common(OperationList& operations, std::string filename = "") { - print_keys_logging("Btree prior to CP and susbsequent simulated crash: "); + // this->print_keys("Btree prior to CP and susbsequent simulated crash: "); LOGINFO("Before Crash: {} keys in shadow map and it is actually {} keys in tree - operations size {}", this->m_shadow_map.size(), tree_key_count(), 
operations.size()); @@ -500,7 +479,6 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT this->visualize_keys(b_filename); } - print_keys_logging("Before crash"); trigger_cp(false); LOGINFO("waiting for crash to recover"); this->wait_for_crash_recovery(true); @@ -510,7 +488,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT LOGINFO("Visualize the tree file after recovery : {}", rec_filename); this->visualize_keys(rec_filename); } - print_keys_logging("Post crash and recovery, btree structure: "); + // this->print_keys("Post crash and recovery, btree structure: "); sanity_check(operations); // Added to the index service right after recovery. Not needed here // test_common::HSTestHelper::trigger_cp(true); @@ -522,7 +500,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT LOGINFO("Visualize the tree after reapply {}", re_filename); this->visualize_keys(re_filename); } - print_keys_logging("Post reapply, btree structure: "); + // this->print_keys("Post reapply, btree structure: "); this->get_all(); LOGINFO("After reapply: {} keys in shadow map and actually {} in tress", this->m_shadow_map.size(), @@ -594,14 +572,13 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT test_common::HSTestHelper::trigger_cp(true); this->get_all(); this->m_shadow_map.save(this->m_shadow_filename); - print_keys_logging("reapply: after preload"); + // this->print_keys("reapply: after preload"); this->visualize_keys("tree_after_preload.dot"); for (uint32_t round = 1; round <= crash_test_options.rounds && !time_to_stop(); round++) { LOGINFO("\n\n\n\n\n\nRound {} of {}\n\n\n\n\n\n", round, crash_test_options.rounds); bool print_time = false; elapsed_time = get_elapsed_time_sec(m_start_time); - print_keys_logging(fmt::format("Round {}: before crash", round)); if (crash_test_options.load_mode) { operations = SequenceGenerator::load_from_file(fmt::format("/tmp/operations_{}.txt", round)); @@ -743,7 +720,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT this->m_run_time, elapsed_time * 100.0 / this->m_run_time, this->tree_key_count(), crash_test_options.num_entries, this->tree_key_count() * 100.0 / crash_test_options.num_entries); } - print_keys_logging(fmt::format("reapply: after round {}", round)); + // this->print_keys(fmt::format("reapply: after round {}", round)); if (renew_btree_after_crash) { this->reset_btree(); }; } this->destroy_btree(); @@ -884,9 +861,8 @@ TYPED_TEST(IndexCrashTest, long_running_put_remove_crash) { // Basic reverse and forward order remove with different flip points TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { vector< std::string > flip_points = { - "crash_flush_on_merge_at_parent", - "crash_flush_on_merge_at_left_child", - "crash_flush_on_freed_child", + "crash_flush_on_merge_at_parent", "crash_flush_on_merge_at_left_child", + // "crash_flush_on_freed_child", }; for (size_t i = 0; i < flip_points.size(); ++i) { @@ -896,7 +872,7 @@ TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { LOGINFO("=== Testing flip point: {} - {} ===", i + 1, flip_point); // Populate some keys [1,num_entries) and trigger cp to persist - LOGINFO("Step {}-0: Populate some keys and flush", i + 1); + LOGINFO("Step {}-1: Populate some keys and flush", i + 1); auto const num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); for (auto k = 0u; k < num_entries; ++k) { this->put(k, btree_put_type::INSERT, true /* expect_success */); @@ -904,8 +880,10 @@ 
TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { test_common::HSTestHelper::trigger_cp(true); this->m_shadow_map.save(this->m_shadow_filename); + this->visualize_keys("tree_merge_full.dot"); + // Split keys into batches and remove the last one in reverse order - LOGINFO("\n\n\n\n\n\n\n\n\n\n\n\n\n\nStep {}-1: Set crash flag {}", i + 1, flip_point); + LOGINFO("Step {}-2: Set crash flag, remove some keys in reverse order", i + 1); int batch_num = 4; { int n = batch_num; @@ -915,21 +893,20 @@ TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { for (auto k = r; k >= l; --k) { ops.emplace_back(k, OperationType::Remove); } - LOGINFO("Step {}-1-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, r, l); - this->print_keys(fmt::format("Print before Step {}-1-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, - batch_num, r, l)); + LOGINFO("Step {}-2-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, r, l); + this->set_basic_flip(flip_point); for (auto [k, _] : ops) { this->remove_one(k, true); } - LOGINFO("Step {}-1-2: Trigger cp to crash", i + 1); - this->crash_and_recover(flip_point, ops); + this->visualize_keys("tree_merge_before_first_crash.dot"); + + LOGINFO("Step {}-2-2: Trigger cp to crash", i + 1); + this->crash_and_recover(ops); } - this->print_keys(fmt::format("Print after recover Step {}1--3: flip {}", i + 1, flip_point)); // Remove the next batch of keys in forward order - LOGINFO("\n\n\n\n\n\n\n\n\n\n\n\n\n\nStep {}-2: Set crash flag {}", i + 1, flip_point); - { + LOGINFO("Step {}-3: Remove another batch in ascending order", i + 1) { int n = batch_num - 1; auto r = num_entries * n / batch_num - 1; auto l = num_entries * (n - 1) / batch_num; @@ -937,47 +914,21 @@ TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { for (auto k = l; k <= r; ++k) { ops.emplace_back(k, OperationType::Remove); } - LOGINFO("Step {}-2-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, l, r); - this->print_keys(fmt::format("Print before Step {}-2-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, - batch_num, l, r)); - this->set_basic_flip(flip_point); - for (auto [k, _] : ops) { - this->remove_one(k, true); - } - LOGINFO("Step {}-2-2: Trigger cp to crash", i + 1); - this->crash_and_recover(flip_point, ops); - } - this->print_keys(fmt::format("Print after recover Step {}-2-3: flip {}", i + 1, flip_point)); - - // Remove the next batch of keys in random order - LOGINFO("\n\n\n\n\n\n\n\n\n\n\n\n\n\nStep {}-3: Set crash flag {}", i + 1, flip_point); - { - int n = batch_num - 2; - auto r = num_entries * n / batch_num - 1; - auto l = num_entries * (n - 1) / batch_num; - SequenceGenerator generator(0, 100, l, r); - generator.fillRange(l, r); - OperationList ops = generator.generateOperations(r - l + 1, false); - LOGINFO("Step {}-3-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, l, r); this->set_basic_flip(flip_point); for (auto [k, _] : ops) { this->remove_one(k, true); } - this->print_keys(fmt::format("Print before Step {}-3: Remove keys in batch {}/{} ({} to {})", i + 1, n, - batch_num, l, r)); + this->visualize_keys("tree_merge_before_second_crash.dot"); LOGINFO("Step {}-3-2: Trigger cp to crash", i + 1); - this->crash_and_recover(flip_point, ops); + this->crash_and_recover(ops); } - this->print_keys(fmt::format("Print after recover Step {}-3-3: flip {}", i + 1, flip_point)); // Remove the next batch of keys in random order - LOGINFO("\n\n\n\n\n\n\n\n\n\n\n\n\n\nStep {}-4: Set crash flag {} Remove another batch in ascending order", - i + 1, flip_point); - { - int n = 
batch_num - 3; + LOGINFO("Step {}-4: Remove another batch in random order", i + 1) { + int n = batch_num - 2; auto r = num_entries * n / batch_num - 1; auto l = num_entries * (n - 1) / batch_num; SequenceGenerator generator(0, 100, l, r); @@ -985,72 +936,26 @@ TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { OperationList ops = generator.generateOperations(r - l + 1, false); LOGINFO("Step {}-4-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, l, r); - this->print_keys(fmt::format("Print before Step {}-4-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, - batch_num, l, r)); + this->set_basic_flip(flip_point); for (auto [k, _] : ops) { this->remove_one(k, true); } + this->visualize_keys("tree_merge_before_third_crash.dot"); + LOGINFO("Step {}-4-2: Trigger cp to crash", i + 1); - this->crash_and_recover(flip_point, ops); + this->crash_and_recover(ops); } - this->print_keys(fmt::format("Print after recover Step {}-4-3: flip {}", i + 1, flip_point)); + LOGINFO("Step {}-5: Cleanup the tree", i + 1); + for (auto k = 0u; k < num_entries; ++k) { + this->remove_one(k, false); + } test_common::HSTestHelper::trigger_cp(true); this->get_all(); } } -TYPED_TEST(IndexCrashTest, MetricsTest) { - const auto num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); - std::vector< uint32_t > vec(num_entries); - iota(vec.begin(), vec.end(), 0); - std::random_shuffle(vec.begin(), vec.end()); - for (auto key : vec) { - this->put(key, btree_put_type::INSERT, true /* expect_success */); - } - print_keys_logging("After populating"); - - auto log_btree_metrics = [this](std::string prompt) { - auto metrics = this->m_bt->get_metrics_in_json().dump(1, '\t'); - LOGDEBUG("metrics: \n{}", metrics); - auto metrics_json = this->m_bt->get_metrics_in_json(); - auto bt_cnts = this->m_bt->get_num_nodes(); - auto bt_d = this->m_bt->get_btree_depth(); - auto com_cnts = this->m_bt->compute_node_count(); - auto com_d = this->m_bt->compute_btree_depth(); - auto [int_cnt, leaf_cnt, depth] = this->get_btree_metrics(metrics_json); - - LOGDEBUG("\n{}:\nmetrics (interior, leaf, height):\ncompute ({}, {}, {})\nbtree ({}, {}, {})\nmetrics ({}, " - "{}, {})", - prompt, com_cnts.first, com_cnts.second, com_d, bt_cnts.first, bt_cnts.second, bt_d, int_cnt, leaf_cnt, - depth); - ASSERT_EQ(bt_cnts.first, com_cnts.first) << "btree interior count doesn't match the actual node counts"; - ASSERT_EQ(bt_cnts.first, int_cnt) << "btree interior count doesn't match the metrics node counts"; - ASSERT_EQ(bt_cnts.second, com_cnts.second) << "btree leaf count doesn't match the actual node counts"; - ASSERT_EQ(bt_cnts.second, leaf_cnt) << "btree leaf count doesn't match the metrics node counts"; - ASSERT_EQ(bt_d, com_d) << "btree depth doesn't match the actual btee depth"; - ASSERT_EQ(bt_d, depth) << "btree depth doesn't match the metrics depth report"; - }; - log_btree_metrics("node count before CP"); - - test_common::HSTestHelper::trigger_cp(true); - log_btree_metrics("node count after CP"); - - this->m_shadow_map.save(this->m_shadow_filename); - this->restart_homestore(); - print_keys_logging("After restart"); - log_btree_metrics("node count after restart"); - std::string flip = "crash_flush_on_merge_at_parent"; - for (auto key : vec) { - this->remove_one(key, true); - } - this->trigger_cp(false); - this->wait_for_crash_recovery(true); - log_btree_metrics("node count after crash recovery"); - print_keys_logging("after removing all keys"); -} - // // TYPED_TEST(IndexCrashTest, MergeCrash1) { // auto const num_entries = 
SISL_OPTIONS["num_entries"].as<uint32_t>(); @@ -1187,14 +1092,11 @@ int main(int argc, char* argv[]) { SISL_OPTIONS_LOAD(parsed_argc, argv, logging, test_index_crash_recovery, iomgr, test_common_setup); sisl::logging::SetLogger("test_index_crash_recovery"); spdlog::set_pattern("[%D %T%z] [%^%L%$] [%t] %v"); + if (SISL_OPTIONS.count("seed")) { auto seed = SISL_OPTIONS["seed"].as< uint64_t >(); LOGINFO("Using seed {} to sow the random generation", seed); g_re.seed(seed); - } else { - auto seed = std::chrono::system_clock::now().time_since_epoch().count(); - LOGINFO("No seed provided. Using randomly generated seed: {}", seed); - g_re.seed(seed); } #ifdef _PRERELEASE diff --git a/src/tests/test_journal_vdev.cpp b/src/tests/test_journal_vdev.cpp index 8a06911cf..3d4b2a8ec 100644 --- a/src/tests/test_journal_vdev.cpp +++ b/src/tests/test_journal_vdev.cpp @@ -41,7 +41,7 @@ using namespace homestore; RCU_REGISTER_INIT -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) + SISL_OPTIONS_ENABLE(logging, test_journal_vdev, iomgr, test_common_setup) @@ -57,7 +57,7 @@ struct Param { uint32_t max_wrt_sz; uint32_t truncate_watermark_percentage; }; -SISL_LOGGING_DECL(test_journal_vdev) + static Param gp; // trigger truncate when used space ratio reaches more than 80% diff --git a/src/tests/test_log_dev.cpp b/src/tests/test_log_dev.cpp index 45ecee96f..871eafdaf 100644 --- a/src/tests/test_log_dev.cpp +++ b/src/tests/test_log_dev.cpp @@ -41,9 +41,8 @@ using namespace homestore; RCU_REGISTER_INIT -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) + SISL_OPTIONS_ENABLE(logging, test_log_dev, iomgr, test_common_setup) -SISL_LOGGING_DECL(test_log_dev) struct test_log_data { test_log_data() = default; diff --git a/src/tests/test_log_store.cpp b/src/tests/test_log_store.cpp index 8f18d71f2..1aa580bba 100644 --- a/src/tests/test_log_store.cpp +++ b/src/tests/test_log_store.cpp @@ -53,7 +53,7 @@ using namespace homestore; RCU_REGISTER_INIT -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) + struct test_log_data { test_log_data() = default; diff --git a/src/tests/test_log_store_long_run.cpp b/src/tests/test_log_store_long_run.cpp index 5a7437754..507e51633 100644 --- a/src/tests/test_log_store_long_run.cpp +++ b/src/tests/test_log_store_long_run.cpp @@ -53,7 +53,7 @@ using namespace homestore; RCU_REGISTER_INIT -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) + struct test_log_data { test_log_data() = default; diff --git a/src/tests/test_mem_btree.cpp b/src/tests/test_mem_btree.cpp index 680acb3bd..83330422d 100644 --- a/src/tests/test_mem_btree.cpp +++ b/src/tests/test_mem_btree.cpp @@ -22,17 +22,15 @@ #include #include -#include #include "test_common/range_scheduler.hpp" -#include -#include -#include -#include +#include +#include +#include +#include #include "btree_helpers/btree_test_helper.hpp" using namespace homestore; -SISL_LOGGING_DEF(btree) -SISL_LOGGING_INIT(btree) + SISL_OPTIONS_ENABLE(logging, test_mem_btree) SISL_OPTION_GROUP( @@ -42,8 +40,6 @@ SISL_OPTION_GROUP( (num_entries, "", "num_entries", "number of entries to test with", ::cxxopts::value< uint32_t >()->default_value("10000"), "number"), (disable_merge, "", "disable_merge", "disable_merge", ::cxxopts::value< bool >()->default_value("0"), ""), - (max_merge_level, "", "max_merge_level", "max merge level", ::cxxopts::value< uint8_t >()->default_value("127"), - ""), (num_threads, "", "num_threads", "number of threads", ::cxxopts::value< uint32_t >()->default_value("2"), "number"), (num_fibers, "", "num_fibers", "number of fibers", ::cxxopts::value< uint32_t >()->default_value("10"), "number"),
(operation_list, "", "operation_list", "operation list instead of default created following by percentage", @@ -57,43 +53,43 @@ SISL_OPTION_GROUP( (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), "seconds")) struct FixedLenBtreeTest { - using BtreeType = MemBtree< TestFixedKey, TestFixedValue >; using KeyType = TestFixedKey; using ValueType = TestFixedValue; static constexpr btree_node_type leaf_node_type = btree_node_type::FIXED; static constexpr btree_node_type interior_node_type = btree_node_type::FIXED; + static constexpr IndexStore::Type store_type = IndexStore::Type::MEM_BTREE; }; struct VarKeySizeBtreeTest { - using BtreeType = MemBtree< TestVarLenKey, TestFixedValue >; using KeyType = TestVarLenKey; using ValueType = TestFixedValue; static constexpr btree_node_type leaf_node_type = btree_node_type::VAR_KEY; static constexpr btree_node_type interior_node_type = btree_node_type::VAR_KEY; + static constexpr IndexStore::Type store_type = IndexStore::Type::MEM_BTREE; }; struct VarValueSizeBtreeTest { - using BtreeType = MemBtree< TestFixedKey, TestVarLenValue >; using KeyType = TestFixedKey; using ValueType = TestVarLenValue; static constexpr btree_node_type leaf_node_type = btree_node_type::VAR_VALUE; static constexpr btree_node_type interior_node_type = btree_node_type::FIXED; + static constexpr IndexStore::Type store_type = IndexStore::Type::MEM_BTREE; }; struct VarObjSizeBtreeTest { - using BtreeType = MemBtree< TestVarLenKey, TestVarLenValue >; using KeyType = TestVarLenKey; using ValueType = TestVarLenValue; static constexpr btree_node_type leaf_node_type = btree_node_type::VAR_OBJECT; static constexpr btree_node_type interior_node_type = btree_node_type::VAR_OBJECT; + static constexpr IndexStore::Type store_type = IndexStore::Type::MEM_BTREE; }; struct PrefixIntervalBtreeTest { - using BtreeType = MemBtree< TestIntervalKey, TestIntervalValue >; using KeyType = TestIntervalKey; using ValueType = TestIntervalValue; - static constexpr btree_node_type leaf_node_type = btree_node_type::PREFIX; + static constexpr btree_node_type leaf_node_type = btree_node_type::FIXED_PREFIX; static constexpr btree_node_type interior_node_type = btree_node_type::FIXED; + static constexpr IndexStore::Type store_type = IndexStore::Type::MEM_BTREE; }; template < typename TestType > @@ -106,16 +102,12 @@ struct BtreeTest : public BtreeTestHelper< TestType >, public ::testing::Test { void SetUp() override { BtreeTestHelper< TestType >::SetUp(); -#ifdef _PRERELEASE - this->m_cfg.m_max_keys_in_node = SISL_OPTIONS["max_keys_in_node"].as< uint32_t >(); -#endif - this->m_cfg.m_max_merge_level = SISL_OPTIONS["max_merge_level"].as< uint8_t >(); - this->m_cfg.m_merge_turned_on = !SISL_OPTIONS["disable_merge"].as< bool >(); - this->m_bt = std::make_shared< typename T::BtreeType >(this->m_cfg); + this->m_bt = std::make_shared< Btree< K, V > >(this->m_cfg); } }; -using BtreeTypes = testing::Types< FixedLenBtreeTest, PrefixIntervalBtreeTest, VarKeySizeBtreeTest, +// TODO Enable PrefixIntervalBtreeTest later +using BtreeTypes = testing::Types< /* PrefixIntervalBtreeTest, */ FixedLenBtreeTest, VarKeySizeBtreeTest, VarValueSizeBtreeTest, VarObjSizeBtreeTest >; TYPED_TEST_SUITE(BtreeTest, BtreeTypes); @@ -291,131 +283,6 @@ TYPED_TEST(BtreeTest, RandomRemoveRange) { this->query_all(); } -TYPED_TEST(BtreeTest, SimpleTombstone) { - const auto num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); - LOGINFO("Step 1: Do forward sequential insert for {} entries", 
num_entries); - for (uint32_t i{0}; i < 20; ++i) { - this->put(i, btree_put_type::INSERT); - } - this->move_to_tombstone(10, btree_status_t::success); - this->move_to_tombstone(10, btree_status_t::filtered_out); - this->move_to_tombstone(40, btree_status_t::not_found); -} - -TYPED_TEST(BtreeTest, SimpleMultiTombstone) { - if constexpr (std::is_same_v< TypeParam, PrefixIntervalBtreeTest >) { return; } - uint32_t start_key = 500; - uint32_t end_key = 1000; - LOGDEBUG("Step 1: Do forward sequential insert for [{},{}] entries", start_key, end_key); - for (uint32_t i{start_key}; i <= end_key; ++i) { - this->put(i, btree_put_type::INSERT); - } - std::vector< std::pair< typename TypeParam::KeyType, typename TypeParam::ValueType > > out; - auto format_tombstoned = [](const auto& out) { - std::stringstream ss; - for (const auto& [k, v] : out) { - ss << "[" << k.to_string() << "] =" << v.to_string() << std::endl; - } - return ss.str(); - }; - auto run_and_validate_tombstone = [&](auto s, auto e, auto expect_status, auto expected_size) { - this->move_to_tombstone(s, e, out, expect_status); - LOGDEBUG("Tombstoned {} keys:\n{}", out.size(), format_tombstoned(out)); - ASSERT_EQ(out.size(), expected_size) << "Tombstoned keys should be " << expected_size << ", but got " - << out.size() << " keys in range [" << s << ", " << e << "]"; - }; - auto sum_tombstoned = 0; - { - run_and_validate_tombstone(0, start_key - 100, btree_status_t::not_found, 0); - run_and_validate_tombstone(end_key + 100, end_key + 2000, btree_status_t::not_found, 0); - } - { - run_and_validate_tombstone(start_key - 100, start_key, btree_status_t::success, 1); - run_and_validate_tombstone(start_key - 100, start_key, btree_status_t::success, 0); - sum_tombstoned += 1; - } - { - run_and_validate_tombstone(start_key + 20, start_key + 40, btree_status_t::success, 21); - run_and_validate_tombstone(start_key + 20, start_key + 40, btree_status_t::success, 0); - run_and_validate_tombstone(start_key + 20, start_key + 41, btree_status_t::success, 1); - run_and_validate_tombstone(start_key + 45, start_key + 50, btree_status_t::success, 6); - run_and_validate_tombstone(start_key + 20, start_key + 60, btree_status_t::success, 41 - 28); - sum_tombstoned += 21 + 1 + 6 + (41 - 28); - } - - { - run_and_validate_tombstone(end_key, end_key + 1000, btree_status_t::success, 1); - run_and_validate_tombstone(end_key, end_key + 1000, btree_status_t::success, 0); - sum_tombstoned += 1; - } - { - run_and_validate_tombstone(0, end_key + 1000, btree_status_t::success, - end_key - start_key - sum_tombstoned + 1); - run_and_validate_tombstone(0, end_key + 1000, btree_status_t::success, 0); - } - this->range_remove_existing(start_key, end_key - start_key + 1); - ASSERT_EQ(this->m_bt->count_keys(), 0); - // creating two intervals - uint32_t start_key1 = 1000; - uint32_t end_key1 = 1999; - uint32_t start_key2 = 3000; - uint32_t end_key2 = 3999; - sum_tombstoned = 0; - for (uint32_t i{start_key1}; i <= end_key1; ++i) { - this->put(i, btree_put_type::INSERT); - } - for (uint32_t i{start_key2}; i <= end_key2; ++i) { - this->put(i, btree_put_type::INSERT); - } - { - run_and_validate_tombstone(start_key1 + 100, end_key2 + 100, btree_status_t::success, 1900); - run_and_validate_tombstone(start_key1 + 100, end_key2 + 100, btree_status_t::success, 0); - } -} - -TYPED_TEST(BtreeTest, SimpleGC) { - if constexpr (std::is_same_v< TypeParam, PrefixIntervalBtreeTest >) { return; } - uint32_t start_key1 = 1000; - uint32_t end_key1 = 1999; - uint32_t start_key2 = 3000; - uint32_t 
end_key2 = 3999; - std::vector< std::pair< typename TypeParam::KeyType, typename TypeParam::ValueType > > out; - for (uint32_t i{start_key1}; i <= end_key1; ++i) { - this->put(i, btree_put_type::INSERT); - } - for (uint32_t i{start_key2}; i <= end_key2; ++i) { - this->put(i, btree_put_type::INSERT); - } - this->print_keys(" Before tombstone "); - auto start_tombstone = start_key1 + 100; - auto end_tombstone = end_key1 - 100; - auto expected_size = end_key1 - 200 - start_key1 + 1; - this->move_to_tombstone(start_tombstone, end_tombstone, out, btree_status_t::success); - ASSERT_EQ(out.size(), expected_size) << "Tombstoned keys should be " << expected_size << ", but got " << out.size() - << " keys in range [" << start_tombstone << ", " << end_tombstone << "]"; - - this->print_keys(fmt::format(" After tombstone [{},{}] ", start_tombstone, end_tombstone)); - LOGINFO("Step 2: Do GC on the tree for keys in range [{}, {}]", start_key1, end_key2); - this->remove_tombstone(start_key1, end_key2, out, btree_status_t::success); - expected_size = end_key2 - start_key1 + 1 - 1000 - expected_size; - ASSERT_EQ(out.size(), expected_size) << "# of keys after GCs hould be " << expected_size << ", but got " - << out.size() << " keys in range [" << start_key1 << ", " << end_key2 << "]"; - auto format_tombstoned = [](const auto& out) { - std::stringstream ss; - for (const auto& [k, v] : out) { - ss << "[" << k.to_string() << "] =" << v.to_string() << std::endl; - } - return ss.str(); - }; - - this->print_keys(fmt::format(" After GC {} entries are still in range [{},{}] ", out.size(), start_key1, end_key2)); - LOGDEBUG("GC {} keys:\n{}", out.size(), format_tombstoned(out)); - this->remove_tombstone(start_key1, end_key2, out, btree_status_t::not_found); - ASSERT_EQ(out.size(), expected_size) << "After GC, no keys should be left in range [" << start_key1 << ", " - << end_key2 << "] but got " << out.size(); - LOGDEBUG("GC {} keys:\n{}", out.size(), format_tombstoned(out)); -} - template < typename TestType > struct BtreeConcurrentTest : public BtreeTestHelper< TestType >, public ::testing::Test { using T = TestType; @@ -433,12 +300,7 @@ struct BtreeConcurrentTest : public BtreeTestHelper< TestType >, public ::testin .hugepage_size_mb = 0}); BtreeTestHelper< TestType >::SetUp(); -#ifdef _PRERELEASE - this->m_cfg.m_max_keys_in_node = SISL_OPTIONS["max_keys_in_node"].as< uint32_t >(); -#endif - this->m_cfg.m_max_merge_level = SISL_OPTIONS["max_merge_level"].as< uint8_t >(); - this->m_cfg.m_merge_turned_on = !SISL_OPTIONS["disable_merge"].as< bool >(); - this->m_bt = std::make_shared< typename T::BtreeType >(this->m_cfg); + this->m_bt = std::make_shared< Btree< K, V > >(this->m_cfg); } void TearDown() override { diff --git a/src/tests/test_meta_blk_mgr.cpp b/src/tests/test_meta_blk_mgr.cpp index f4bbe2386..d3c5401e9 100644 --- a/src/tests/test_meta_blk_mgr.cpp +++ b/src/tests/test_meta_blk_mgr.cpp @@ -49,12 +49,10 @@ extern "C" { using namespace homestore; RCU_REGISTER_INIT -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) + SISL_OPTIONS_ENABLE(logging, test_meta_blk_mgr, iomgr, test_common_setup) -SISL_LOGGING_DECL(test_meta_blk_mgr) - struct Param { uint64_t num_io; uint64_t run_time; @@ -125,7 +123,7 @@ class VMetaBlkMgrTest : public ::testing::Test { protected: void SetUp() override { m_helper.start_homestore("test_meta_blk_mgr", {{HS_SERVICE::META, {.size_pct = 85.0}}}); } - void TearDown() override {}; + void TearDown() override{}; public: [[nodiscard]] uint64_t get_elapsed_time(const Clock::time_point& start) { @@ -402,7 
+400,7 @@ class VMetaBlkMgrTest : public ::testing::Test { iomanager.iobuf_free(buf); } else { if (unaligned_addr) { - delete[] (buf - unaligned_shift); + delete[](buf - unaligned_shift); } else { delete[] buf; } @@ -840,7 +838,7 @@ TEST_F(VMetaBlkMgrTest, recovery_test) { // write 1/2 of the available blks; for (uint64_t i = 0; i < max_write_times / 2; i++) { EXPECT_GT(this->do_sb_write(true, uint64_cast(64 * Ki)), uint64_cast(0)); - LOGINFO("iter {}, available_blks {}", i, m_mbm->available_blks()); + LOGDEBUG("iter {}, available_blks {}", i, m_mbm->available_blks()); } // restart homestore @@ -851,7 +849,7 @@ TEST_F(VMetaBlkMgrTest, recovery_test) { this->register_client(); for (uint64_t i = 0; i < (max_write_times / 2); i++) { EXPECT_GT(this->do_sb_write(true, uint64_cast(64 * Ki)), uint64_cast(0)); - LOGINFO("iter {}, available_blks {}", i, m_mbm->available_blks()); + LOGDEBUG("iter {}, available_blks {}", i, m_mbm->available_blks()); } this->shutdown(); } diff --git a/src/tests/test_pdev.cpp b/src/tests/test_pdev.cpp index 4447c500b..e318c7a66 100644 --- a/src/tests/test_pdev.cpp +++ b/src/tests/test_pdev.cpp @@ -34,7 +34,7 @@ #include "device/physical_dev.hpp" using namespace homestore; -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) + SISL_OPTIONS_ENABLE(logging, test_pdev, iomgr) SISL_OPTION_GROUP(test_pdev, diff --git a/src/tests/test_raft_repl_dev.cpp b/src/tests/test_raft_repl_dev.cpp index d705ef130..f6d458943 100644 --- a/src/tests/test_raft_repl_dev.cpp +++ b/src/tests/test_raft_repl_dev.cpp @@ -558,137 +558,6 @@ TEST_F(RaftReplDevTest, ComputePriority) { g_helper->sync_for_cleanup_start(); } -TEST_F(RaftReplDevTest, RaftLogTruncationTest) { - LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); - g_helper->sync_for_test_start(); - - auto pre_raft_logstore_reserve_threshold = 0; - HS_SETTINGS_FACTORY().modifiable_settings([&pre_raft_logstore_reserve_threshold](auto& s) { - pre_raft_logstore_reserve_threshold = s.resource_limits.raft_logstore_reserve_threshold; - s.resource_limits.raft_logstore_reserve_threshold = 200; - }); - HS_SETTINGS_FACTORY().save(); - - uint64_t entries_per_attempt = 100; - uint64_t total_entires = 0; - - LOGINFO("Write on leader num_entries={}", entries_per_attempt); - this->write_on_leader(entries_per_attempt, true /* wait_for_commit */); - total_entires += entries_per_attempt; - // wait for commmit on all members - this->wait_for_commits(total_entires); - test_common::HSTestHelper::trigger_cp(true /* wait */); - g_helper->sync_for_verify_start(); - - // trigger snapshot to update log truncation upper limit - // sleep 1s to ensure the new truncation upper limit is updated - this->create_snapshot(); - std::this_thread::sleep_for(std::chrono::seconds{1}); - ASSERT_GT(this->get_truncation_upper_limit(), 0); - LOGINFO("After 100 entries written, truncation upper limit became {}", this->get_truncation_upper_limit()); - - // shutdown replica 1. - LOGINFO("Shutdown replica 1"); - this->shutdown_replica(1); - - // write another 100 entries on leader. 
- LOGINFO("Write on leader num_entries={}", entries_per_attempt); - if (g_helper->replica_num() == 0 || g_helper->replica_num() == 2) { - this->write_on_leader(entries_per_attempt, true /* wait_for_commit */); - // Wait for commmit on leader and follower 2 - this->wait_for_all_commits(); - LOGINFO("Got all commits for replica 0 and 2"); - test_common::HSTestHelper::trigger_cp(true /* wait */); - LOGINFO("Trigger cp after writing 100 entries for replica 0 and 2"); - } - total_entires += entries_per_attempt; - - // trigger snapshot and check the truncation upper limit on leader - // it should not larger than 200 because replica 1 is shutdown - if (g_helper->replica_num() == 0) { - this->create_snapshot(); - std::this_thread::sleep_for(std::chrono::seconds{1}); - ASSERT_LT(this->get_truncation_upper_limit(), 200); - LOGINFO("After another 100 entries written, truncation upper limit {}", this->get_truncation_upper_limit()); - } - - g_helper->sync_for_test_start(); - - // start replica 1 after this. - LOGINFO("Start replica 1"); - this->start_replica(1); - - // write on leader to have some entries saved in raft log store. - entries_per_attempt = 50; - LOGINFO("Write on leader num_entries={}", entries_per_attempt); - this->write_on_leader(entries_per_attempt, true /* wait_for_commit */); - total_entires += entries_per_attempt; - - // wait till all writes are down. - this->wait_for_commits(total_entires); - test_common::HSTestHelper::trigger_cp(true /* wait */); - g_helper->sync_for_verify_start(); - - // trigger snapshot and check the truncation upper limit - // it should no less than 250 on because all replicas has committed upto 250 - this->create_snapshot(); - std::this_thread::sleep_for(std::chrono::seconds{1}); - ASSERT_GE(this->get_truncation_upper_limit(), 250); - LOGINFO("After another 50 entries written, truncation upper limit became {}", this->get_truncation_upper_limit()); - - // wait all members sync and test raft_logstore_reserve_threshold limitation - g_helper->sync_for_test_start(); - - // shutdown replica1 again - LOGINFO("Shutdown replica 1 again"); - this->shutdown_replica(1); - - // write another 300 entries on leader to test one member lagged too much - entries_per_attempt = 300; - LOGINFO("Write on leader num_entries={}", entries_per_attempt); - if (g_helper->replica_num() == 0 || g_helper->replica_num() == 2) { - this->write_on_leader(entries_per_attempt, true /* wait_for_commit */); - // Wait for commmit on leader and follower 2 - this->wait_for_all_commits(); - LOGINFO("Got all commits for replica 0 and 2"); - test_common::HSTestHelper::trigger_cp(true /* wait */); - LOGINFO("Trigger cp after writing 300 entries for replica 0 and 2"); - } - total_entires += entries_per_attempt; - - // trigger snapshot and check the truncation upper limit on leader - // this time leader will use its commit_idx - resource_limits.raft_logstore_reserve_threshold >= 550 - 200 = 350 - if (g_helper->replica_num() == 0) { - this->create_snapshot(); - std::this_thread::sleep_for(std::chrono::seconds{1}); - ASSERT_GE(this->get_truncation_upper_limit(), 350); - ASSERT_LT(this->get_truncation_upper_limit(), 550); - LOGINFO("After another 300 entries written, truncation upper limit {}", this->get_truncation_upper_limit()); - } - g_helper->sync_for_verify_start(); - - // start replica1 again, wait for replica1 catch up - LOGINFO("Start replica 1 again"); - this->start_replica(1); - g_helper->sync_for_test_start(); - this->wait_for_commits(total_entires); - g_helper->sync_for_verify_start(); - - // 
validate all data written so far by reading them - LOGINFO("Validate all data written so far by reading them"); - this->validate_data(); - - // set the settings back and save. - LOGINFO("Set the raft_logstore_reserve_threshold back to previous value={}", pre_raft_logstore_reserve_threshold); - HS_SETTINGS_FACTORY().modifiable_settings([pre_raft_logstore_reserve_threshold](auto& s) { - s.resource_limits.raft_logstore_reserve_threshold = pre_raft_logstore_reserve_threshold; - }); - HS_SETTINGS_FACTORY().save(); - - g_helper->sync_for_cleanup_start(); - LOGINFO("RaftLogTruncationTest done"); -} - int main(int argc, char* argv[]) { int parsed_argc = argc; char** orig_argv = argv; diff --git a/src/tests/test_raft_repl_dev_dynamic.cpp b/src/tests/test_raft_repl_dev_dynamic.cpp index 71e9d0821..4ae56a9c3 100644 --- a/src/tests/test_raft_repl_dev_dynamic.cpp +++ b/src/tests/test_raft_repl_dev_dynamic.cpp @@ -39,17 +39,12 @@ TEST_F(ReplDevDynamicTest, ReplaceMember) { uint32_t member_in = num_replicas; g_helper->sync_for_test_start(num_members); - std::string task_id = "task_id"; - this->run_on_leader(db, [this, db, &task_id, member_out, member_in] { - ASSERT_EQ( - check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)), - ReplaceMemberStatus::TASK_NOT_FOUND); - }); if (g_helper->replica_num() < num_replicas) { // With existing raft repl dev group, write IO's, validate and call replace_member on leader. LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); this->write_on_leader(num_io_entries, true /* wait_for_commit */); - replace_member(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); std::this_thread::sleep_for(std::chrono::seconds(3)); } else if (g_helper->replica_num() == member_in) { LOGINFO("Wait for commits replica={}", g_helper->replica_num()); @@ -58,14 +53,6 @@ TEST_F(ReplDevDynamicTest, ReplaceMember) { g_helper->sync_for_verify_start(num_members); LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num()); - this->run_on_leader(db, [this, db, &task_id, member_out, member_in] { - ASSERT_EQ( - check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)), - ReplaceMemberStatus::IN_PROGRESS); - std::string new_task_id = "mismatched_task_id"; - replace_member(db, new_task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 0, - ReplServiceError::REPLACE_MEMBER_TASK_MISMATCH); - }); if (is_replica_num_in({0, 1, member_in})) { // Skip the member which is going to be replaced. Validate data on all other replica's. LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num()); @@ -74,7 +61,7 @@ TEST_F(ReplDevDynamicTest, ReplaceMember) { g_helper->sync_for_verify_start(num_members); LOGINFO("data synced, sync_for_verify_state replica={} ", g_helper->replica_num()); - // wait for background reaper thread to trigger complete_replace_member + //wait for background reaper thread to trigger complete_replace_member if (g_helper->replica_num() == member_out) { // The out member will have the repl dev destroyed. 
auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); @@ -88,11 +75,6 @@ TEST_F(ReplDevDynamicTest, ReplaceMember) { } g_helper->sync_for_cleanup_start(num_members); - this->run_on_leader(db, [this, db, &task_id, member_out, member_in] { - ASSERT_EQ( - check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)), - ReplaceMemberStatus::COMPLETED); - }); LOGINFO("ReplaceMember test done replica={}", g_helper->replica_num()); } @@ -127,13 +109,11 @@ TEST_F(ReplDevDynamicTest, TwoMemberDown) { LOGINFO("Shutdown replica 2"); } - std::string task_id = "task_id"; if (g_helper->replica_num() == 0) { // Replace down replica 2 with spare replica 3 with commit quorum 1 // so that leader can go ahead with replacing member. - LOGINFO("Replace member started, task_id={}", task_id); - replace_member(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in), - 1 /* commit quorum*/); + LOGINFO("Replace member started"); + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 1 /* commit quorum*/); this->write_on_leader(num_io_entries, true /* wait_for_commit */); LOGINFO("Leader completed num_io={}", num_io_entries); } @@ -148,12 +128,6 @@ TEST_F(ReplDevDynamicTest, TwoMemberDown) { LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num()); this->validate_data(); } - g_helper->sync_for_verify_start(num_members); - this->run_on_leader(db, [this, db, &task_id, member_out, member_in] { - ASSERT_EQ( - check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)), - ReplaceMemberStatus::IN_PROGRESS); - }); if (g_helper->replica_num() == 1) { LOGINFO("Start replica 1"); @@ -193,13 +167,12 @@ TEST_F(ReplDevDynamicTest, OutMemberDown) { LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); this->write_on_leader(num_io_entries, true /* wait_for_commit */); } - // shut down before replace member + //shut down before replace member this->shutdown_replica(2); LOGINFO("Shutdown replica 2"); - std::string task_id = "task_id"; if (g_helper->replica_num() == 0) { - replace_member(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); std::this_thread::sleep_for(std::chrono::seconds(3)); } else if (g_helper->replica_num() == member_in) { LOGINFO("Wait for commits replica={}", g_helper->replica_num()); @@ -222,11 +195,6 @@ TEST_F(ReplDevDynamicTest, OutMemberDown) { // data synced, waiting for removing learner LOGINFO("data synced, sync for completing replace member, replica={}", g_helper->replica_num()); g_helper->sync_for_verify_start(num_members); - this->run_on_leader(db, [this, db, &task_id, member_out, member_in] { - ASSERT_EQ( - check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)), - ReplaceMemberStatus::IN_PROGRESS); - }); // Since the out_member stopped, it cannot response to remove_srv req, as a result the first time will get CANCELLED // error, so waiting time is longer than other tests. 
if (g_helper->replica_num() == 2) { @@ -243,26 +211,9 @@ TEST_F(ReplDevDynamicTest, OutMemberDown) { LOGINFO("Repl dev destroyed on out member replica={}", g_helper->replica_num()); db->set_zombie(); } - g_helper->sync_for_test_start(num_members); - if (g_helper->replica_num() != 2) { - this->run_on_leader(db, [this, db, &task_id, member_out, member_in] { - auto status = check_replace_member_status(db, task_id, g_helper->replica_id(member_out), - g_helper->replica_id(member_in)); - // out_member is down, so it can not response to remove req. Based on nuraft logic, leader will wait for - // timeout and remove it automatically. Simulate next complete_replace_member retry. - if (status == ReplaceMemberStatus::IN_PROGRESS) { - auto& raft_repl_svc = dynamic_cast< RaftReplService& >(hs()->repl_service()); - raft_repl_svc.monitor_replace_member_replication_status(); - LOGINFO("Simulate reaper thread to complete_replace_member"); - std::this_thread::sleep_for(std::chrono::seconds(1)); - } - ASSERT_EQ(check_replace_member_status(db, task_id, g_helper->replica_id(member_out), - g_helper->replica_id(member_in)), - ReplaceMemberStatus::COMPLETED); - }); - } + g_helper->sync_for_cleanup_start(num_members); - LOGINFO("OutMemberDown test done replica={}", g_helper->replica_num()); + LOGINFO("OneMemberDown test done replica={}", g_helper->replica_num()); } TEST_F(ReplDevDynamicTest, LeaderReplace) { @@ -282,7 +233,7 @@ TEST_F(ReplDevDynamicTest, LeaderReplace) { uint32_t member_in = num_replicas; g_helper->sync_for_test_start(num_members); - std::string task_id = "task_id"; + if (g_helper->replica_num() == member_out) { LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); // With existing raft repl dev group, write IO's, validate and call replace_member on leader. @@ -291,13 +242,13 @@ TEST_F(ReplDevDynamicTest, LeaderReplace) { // Leader will return error NOT_LEADER and yield leadership, sleep and connect again // to the new leader. LOGINFO("Replace old leader"); - replace_member(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 0, + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 0, ReplServiceError::NOT_LEADER); LOGINFO("Replace member leader yield done"); } std::this_thread::sleep_for(std::chrono::seconds(3)); if (g_helper->replica_num() != member_in) { - replace_member(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); LOGINFO("Replace member old leader done"); } @@ -313,12 +264,8 @@ TEST_F(ReplDevDynamicTest, LeaderReplace) { this->validate_data(); } + g_helper->sync_for_verify_start(num_members); LOGINFO("data synced, sync_for_verify_state replica={} ", g_helper->replica_num()); - this->run_on_leader(db, [this, db, &task_id, member_out, member_in] { - ASSERT_EQ( - check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)), - ReplaceMemberStatus::IN_PROGRESS); - }); if (g_helper->replica_num() == member_out) { // The out member will have the repl dev destroyed. 
auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); @@ -333,11 +280,6 @@ TEST_F(ReplDevDynamicTest, LeaderReplace) { } g_helper->sync_for_cleanup_start(num_members); - this->run_on_leader(db, [this, db, &task_id, member_out, member_in] { - ASSERT_EQ( - check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)), - ReplaceMemberStatus::COMPLETED); - }); LOGINFO("LeaderReplace test done replica={}", g_helper->replica_num()); } @@ -361,13 +303,13 @@ TEST_F(ReplDevDynamicTest, OneMemberRestart) { LOGINFO("Restart replica 1, "); this->restart_replica(15); } - std::string task_id = "task_id"; + if (g_helper->replica_num() == 0) { // With existing raft repl dev group, write IO's, validate and call replace_member on leader. LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); this->write_on_leader(num_io_entries, true /* wait_for_commit */); - replace_member(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); std::this_thread::sleep_for(std::chrono::seconds(3)); } else if (g_helper->replica_num() == member_in) { LOGINFO("Wait for commits replica={}", g_helper->replica_num()); @@ -382,12 +324,8 @@ TEST_F(ReplDevDynamicTest, OneMemberRestart) { this->validate_data(); } + g_helper->sync_for_verify_start(num_members); LOGINFO("data synced, sync_for_verify_state replica={} ", g_helper->replica_num()); - this->run_on_leader(db, [this, db, &task_id, member_out, member_in] { - ASSERT_EQ( - check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)), - ReplaceMemberStatus::IN_PROGRESS); - }); if (g_helper->replica_num() == member_out) { // The out member will have the repl dev destroyed. 
auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); @@ -401,11 +339,6 @@ TEST_F(ReplDevDynamicTest, OneMemberRestart) { } g_helper->sync_for_cleanup_start(num_members); - this->run_on_leader(db, [this, db, &task_id, member_out, member_in] { - ASSERT_EQ( - check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)), - ReplaceMemberStatus::COMPLETED); - }); LOGINFO("OneMemberRestart test done replica={}", g_helper->replica_num()); } @@ -429,11 +362,11 @@ TEST_F(ReplDevDynamicTest, ValidateRequest) { g_helper->sync_for_test_start(num_members); - // shut down before replace member + //shut down before replace member this->shutdown_replica(1); LOGINFO("Shutdown replica 1"); - // wait for shutdown + //wait for shutdown std::this_thread::sleep_for(std::chrono::seconds(3)); g_helper->sync_for_verify_start(num_members); if (g_helper->replica_num() == 0) { @@ -441,18 +374,17 @@ TEST_F(ReplDevDynamicTest, ValidateRequest) { LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); this->write_on_leader(num_io_entries, true /* wait_for_commit */); } - - std::string task_id = "task_id"; + g_helper->sync_for_verify_start(num_members); if (g_helper->replica_num() == 0) { // generate uuid replica_id_t fake_member_out = boost::uuids::random_generator()(); replica_id_t fake_member_in = boost::uuids::random_generator()(); LOGINFO("test SERVER_NOT_FOUND"); - replace_member(db, task_id, fake_member_out, fake_member_in, 0, ReplServiceError::SERVER_NOT_FOUND); + replace_member(db, fake_member_out, fake_member_in, 0, ReplServiceError::SERVER_NOT_FOUND); LOGINFO("test replace_member already complete"); - replace_member(db, task_id, fake_member_out, g_helper->replica_id(0)); + replace_member(db, fake_member_out, g_helper->replica_id(0)); LOGINFO("test QUORUM_NOT_MET", num_io_entries, g_helper->replica_num()); - replace_member(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 0, + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 0, ReplServiceError::QUORUM_NOT_MET); } diff --git a/src/tests/test_scripts/index_test.py b/src/tests/test_scripts/index_test.py index 0cfb3f497..b9e55a15e 100755 --- a/src/tests/test_scripts/index_test.py +++ b/src/tests/test_scripts/index_test.py @@ -12,7 +12,7 @@ class TestFailedError(Exception): def run_test(options, type): cmd_opts = f"--gtest_filter=BtreeConcurrentTest/{type}.ConcurrentAllOps --gtest_break_on_failure --cleanup_after_shutdown={options['cleanup_after_shutdown']} --init_device={options['init_device']} --preload_size={options['preload_size']} {options['log_mods']} --run_time={options['run_time']} --num_iters={options['num_iters']} --num_entries={options['num_entries']} --num_threads={options['threads']} --num_fibers={options['fibers']} {options['dev_list']} {options['op_list']}" - print(f"Running test with options: {cmd_opts}") + # print(f"Running test with options: {cmd_opts}") try: subprocess.check_call(f"{options['dirpath']}test_index_btree {cmd_opts}", stderr=subprocess.STDOUT, shell=True) except subprocess.CalledProcessError as e: @@ -21,9 +21,9 @@ def run_test(options, type): print("Test completed") -def run_crash_test(options, crash_type='put', type=0): - cmd_opts = f"--gtest_filter=IndexCrashTest/{type}.long_running_{crash_type}_crash --gtest_break_on_failure --min_keys_in_node={options['min_keys_in_node']} --max_keys_in_node={options['max_keys_in_node']} 
--num_entries_per_rounds={options['num_entries_per_rounds']} --init_device={options['init_device']} {options['log_mods']} --run_time={options['run_time']} --num_entries={options['num_entries']} --num_rounds={options['num_rounds']} {options['dev_list']} " - print(f"Running test with options: {cmd_opts}") +def run_crash_test(options): + cmd_opts = f"--gtest_filter=IndexCrashTest/0.long_running_put_crash --gtest_break_on_failure --log_mods=wbcache:trace --max_keys_in_node={options['max_keys_in_node']} --num_entries_per_rounds={options['num_entries_per_rounds']} --init_device={options['init_device']} {options['log_mods']} --run_time={options['run_time']} --num_entries={options['num_entries']} --num_rounds={options['num_rounds']} {options['dev_list']} " + # print(f"Running test with options: {cmd_opts}") try: subprocess.check_call(f"{options['dirpath']}test_index_crash_recovery {cmd_opts}", stderr=subprocess.STDOUT, shell=True) @@ -96,7 +96,7 @@ def long_running_crash_put(options): print("Long running crash put started") options['num_entries'] = 1310720 # 1280K options['init_device'] = True - options['run_time'] = 7200 # 2 hours + options['run_time'] = 14400 # 4 hours options['preload_size'] = 1024 print(f"options: {options}") run_crash_test(options, 'put', 0) @@ -104,9 +104,9 @@ def long_running_crash_put(options): def long_running_crash_remove(options): print("Long running crash remove started") - options['num_entries'] = 102400 # 100K + options['num_entries'] = 1000 options['init_device'] = True - options['run_time'] = 7200 # 2 hours + options['run_time'] = 14400 # 4 hours options['num_entries_per_rounds'] = 100 options['min_keys_in_node'] = 2 options['max_keys_in_node'] = 10 @@ -116,9 +116,9 @@ def long_running_crash_remove(options): def long_running_crash_put_remove(options): print("Long running crash put_remove started") - options['num_entries'] = 102400 # 100K + options['num_entries'] = 2000 # 1280K options['init_device'] = True - options['run_time'] = 7200 # 2 hours + options['run_time'] = 14400 # 4 hours options['preload_size'] = 1024 options['min_keys_in_node'] = 3 options['max_keys_in_node'] = 10 @@ -146,12 +146,12 @@ def long_running(*args): options = parse_arguments() long_runnig_index(options, 0) long_running_clean_shutdown(options, 0) - # long_runnig_index(options, 1) - # long_running_clean_shutdown(options, 1) - for i in range(5): + long_runnig_index(options, 1) + long_running_clean_shutdown(options, 1) + for i in range(20): print(f"Iteration {i + 1}") long_running_crash_put_remove(options) - for i in range(5): + for i in range(50): print(f"Iteration {i + 1}") long_running_crash_remove(options) for i in range(5): @@ -159,6 +159,7 @@ def long_running(*args): long_running_crash_put(options) long_runnig_index(options) long_running_clean_shutdown(options) + long_running_crash_put(options) if __name__ == "__main__": diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index 501871ec1..57247dad7 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -37,7 +36,6 @@ #include "common/homestore_utils.hpp" #include "test_common/homestore_test_common.hpp" #include "replication/service/generic_repl_svc.h" -#define private public #include "replication/repl_dev/solo_repl_dev.h" //////////////////////////////////////////////////////////////////////////// @@ -49,9 +47,8 @@ using namespace homestore; using namespace test_common; 
-SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) + SISL_OPTIONS_ENABLE(logging, test_solo_repl_dev, iomgr, test_common_setup) -SISL_LOGGING_DECL(test_solo_repl_dev) static thread_local std::random_device g_rd{}; static thread_local std::default_random_engine g_re{g_rd()}; @@ -133,14 +130,12 @@ class SoloReplDevTest : public testing::Test { cintrusive< repl_req_ctx >& ctx) override { LOGINFO("Received error={} on repl_dev", enum_name(error)); } - void on_start_replace_member(const std::string& task_id, const replica_member_info& member_out, - const replica_member_info& member_in, trace_id_t tid) override {} - void on_complete_replace_member(const std::string& task_id, const replica_member_info& member_out, - const replica_member_info& member_in, trace_id_t tid) override {} + void on_start_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) override {} + void on_complete_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) override {} void on_destroy(const group_id_t& group_id) override {} void notify_committed_lsn(int64_t lsn) override {} void on_config_rollback(int64_t lsn) override {} - void on_no_space_left(repl_lsn_t lsn, sisl::blob const& header) override {} + void on_no_space_left(repl_lsn_t lsn, chunk_num_t chunk_id) override {} }; class Application : public ReplApplication { @@ -187,9 +182,6 @@ class SoloReplDevTest : public testing::Test { m_repl_dev2 = hs()->repl_service().create_repl_dev(m_uuid2, {}).get().value(); } - shared< ReplDev > repl_dev1() { return m_repl_dev1; } - shared< ReplDev > repl_dev2() { return m_repl_dev2; } - virtual void TearDown() override { m_repl_dev1.reset(); m_repl_dev2.reset(); @@ -231,8 +223,7 @@ class SoloReplDevTest : public testing::Test { rdev->async_alloc_write(*req->header, req->key ? *req->key : sisl::blob{}, req->write_sgs, req); } - intrusive< test_repl_req > async_write_data_and_journal(uint32_t key_size, uint64_t data_size, - uint32_t max_size_per_iov, bool rand_dev = true) { + void async_write_data_and_journal(uint32_t key_size, uint64_t data_size, uint32_t max_size_per_iov) { data_size = data_size == 0 ? g_block_size : data_size; auto req = intrusive< test_repl_req >(new test_repl_req()); req->header = sisl::make_byte_array(sizeof(test_repl_req::journal_header)); @@ -249,8 +240,7 @@ class SoloReplDevTest : public testing::Test { req->write_sgs = HSTestHelper::create_sgs(data_size, max_size_per_iov, hdr->data_pattern); - auto rdev = m_repl_dev1; - if (rand_dev) { rdev = (rand() % 2) ? m_repl_dev1 : m_repl_dev2; } + auto& rdev = (rand() % 2) ? m_repl_dev1 : m_repl_dev2; auto const cap = hs()->repl_service().get_cap_stats(); LOGDEBUG("Before write, cap stats: used={} total={}", cap.used_capacity, cap.total_capacity); @@ -265,7 +255,6 @@ class SoloReplDevTest : public testing::Test { RELEASE_ASSERT(!err, "Error during async_write"); rdev->async_write_journal(blkids, *req->header, req->key ? 
*req->key : sisl::blob{}, data_size, req); }); - return req; } void validate_replay(ReplDev& rdev, int64_t lsn, sisl::blob const& header, sisl::blob const& key, @@ -307,22 +296,6 @@ class SoloReplDevTest : public testing::Test { } } - void validate_sync(shared< ReplDev > rdev, intrusive< test_repl_req > req) { - auto const hdr = r_cast< test_repl_req::journal_header const* >(req->header->cbytes()); - for (const auto& blkid : req->written_blkids) { - uint32_t size = blkid.blk_count() * g_block_size; - auto read_sgs = HSTestHelper::create_sgs(size, size); - auto err = rdev->async_read(blkid, read_sgs, size).get(); - RELEASE_ASSERT(!err, "Error during async_read"); - for (auto const& iov : read_sgs.iovs) { - HSTestHelper::validate_data_buf(uintptr_cast(iov.iov_base), iov.iov_len, hdr->data_pattern); - iomanager.iobuf_free(uintptr_cast(iov.iov_base)); - } - LOGDEBUG("[{}] Validating of blkid={} validated successfully", boost::uuids::to_string(rdev->group_id()), - blkid.to_string()); - } - } - void on_write_complete(ReplDev& rdev, intrusive< test_repl_req > req) { if (req->written_blkids.empty()) { m_io_runner.next_task(); @@ -362,36 +335,6 @@ class SoloReplDevTest : public testing::Test { } } } - - void trigger_cp_flush() { homestore::hs()->cp_mgr().trigger_cp_flush(true /* force */).get(); } - void truncate_and_verify(shared< ReplDev > repl_dev) { - auto solo_dev = std::dynamic_pointer_cast< SoloReplDev >(repl_dev); - // Truncate and verify the CP LSN's - solo_dev->truncate(); - - auto& sb = solo_dev->m_rd_sb; - RELEASE_ASSERT(sb->last_checkpoint_lsn_2 <= sb->last_checkpoint_lsn_1, "invalid cp lsn"); - RELEASE_ASSERT(sb->last_checkpoint_lsn_1 <= sb->checkpoint_lsn, "invalid cp lsn"); - - auto [last_trunc_lsn, trunc_ld_key, tail_lsn] = solo_dev->m_data_journal->truncate_info(); - RELEASE_ASSERT(sb->last_checkpoint_lsn_2 == last_trunc_lsn, "invalid trunc lsn"); - } - -#ifdef _PRERELEASE - void set_flip_point(const std::string flip_name) { - flip::FlipCondition null_cond; - flip::FlipFrequency freq; - freq.set_count(2); - freq.set_percent(100); - m_fc.inject_noreturn_flip(flip_name, {null_cond}, freq); - LOGINFO("Flip {} set", flip_name); - } -#endif - -private: -#ifdef _PRERELEASE - flip::FlipClient m_fc{iomgr_flip::instance()}; -#endif }; TEST_F(SoloReplDevTest, TestSingleDataBlock) { @@ -437,23 +380,6 @@ TEST_F(SoloReplDevTest, TestAsyncWriteJournal) { this->m_task_waiter.start([this]() { this->restart(); }).get(); } -#ifdef _PRERELEASE -TEST_F(SoloReplDevTest, TestTruncate) { - // Write and truncate on repl dev. - LOGINFO("Step 1: run on worker threads to schedule write and truncate"); - - set_flip_point("solo_repl_dev_manual_truncate"); - - m_io_runner.set_task([this]() mutable { - this->async_write_data_and_journal(0u, g_block_size, g_block_size, false /* rand_dev */); - std::this_thread::sleep_for(std::chrono::milliseconds(50)); - truncate_and_verify(repl_dev1()); - }); - m_io_runner.execute().get(); - std::this_thread::sleep_for(std::chrono::seconds(1)); -} -#endif - SISL_OPTION_GROUP(test_solo_repl_dev, (block_size, "", "block_size", "block size to io", ::cxxopts::value< uint32_t >()->default_value("4096"), "number")); @@ -465,13 +391,6 @@ int main(int argc, char* argv[]) { sisl::logging::SetLogger("test_solo_repl_dev"); spdlog::set_pattern("[%D %T%z] [%^%l%$] [%n] [%t] %v"); - // TODO make it part of the test case. 
- HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { - // Checkpoint taken every 1s - s.generic.cp_timer_us = 1000000; - }); - HS_SETTINGS_FACTORY().save(); - g_block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); return RUN_ALL_TESTS(); }
