diff --git a/.github/workflows/build_dependencies.yml b/.github/workflows/build_dependencies.yml index 4c92419ec..12d2093de 100644 --- a/.github/workflows/build_dependencies.yml +++ b/.github/workflows/build_dependencies.yml @@ -211,6 +211,8 @@ jobs: - name: Code Coverage Run run: | + du -sh ~/.conan2/p/* + df -h conan build \ -o "sisl/*:prerelease=${{ inputs.prerelease }}" \ -o "sisl/*:malloc_impl=${{ inputs.malloc-impl }}" \ diff --git a/.jenkins/Dockerfile b/.jenkins/Dockerfile index 20c4489b0..dcfdd9d65 100644 --- a/.jenkins/Dockerfile +++ b/.jenkins/Dockerfile @@ -1,5 +1,5 @@ # ########## ####### ############ -FROM hub.tess.io/sds/sds_develop:4.x-latest +FROM hub.tess.io/sds/sds_develop:7.x-latest LABEL description="Automated HomeStore compilation" WORKDIR /output diff --git a/.jenkins/jenkinsfile_nightly b/.jenkins/jenkinsfile_nightly index 7efd9b935..8083c816b 100644 --- a/.jenkins/jenkinsfile_nightly +++ b/.jenkins/jenkinsfile_nightly @@ -1,5 +1,5 @@ pipeline { - agent { label 'sds-builder-2204' } + agent { label 'sds-builder-v5' } triggers { cron('TZ=US/Pacific\nH H(0-2) * * *') } @@ -8,7 +8,7 @@ pipeline { ORG = 'sds' ECR_URL = 'hub.tess.io' ARTIFACTORY_PASS = credentials('ARTIFACTORY_PASS') - CONAN_USER = 'sds' + CONAN_USER = 'oss' failed_stage = "" } stages { @@ -26,6 +26,7 @@ pipeline { VER = sh(script: "grep -m 1 ' version =' conanfile.py | awk '{print \$3}' | tr -d '\n' | tr -d '\"'", returnStdout: true) NIGHTLY_TAG = "master-nightly-debug-4.0" ECR_PATH = "${ECR_URL}/${ORG}/${PROJECT}" + CONAN_FLAGS="--name ${PROJECT} --user ${CONAN_USER} --channel ${NIGHTLY_TAG}" failed_stage = "" } } @@ -40,20 +41,25 @@ pipeline { } stage("Build") { steps { - sh "conan create --build missing -o homestore:sanitize=True -pr debug . ${PROJECT}/${VER}@" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_index_btree' -exec cp {} .jenkins/test_index_btree \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_index_crash_recovery' -exec cp {} .jenkins/test_index_crash_recovery \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_meta_blk_mgr' -exec cp {} .jenkins/test_meta_blk_mgr \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_log_store' -exec cp {} .jenkins/test_log_store \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_home_raft_logstore' -exec cp {} .jenkins/test_home_raft_logstore \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_log_store_long_run' -exec cp {} .jenkins/test_log_store_long_run \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_data_service' -exec cp {} .jenkins/test_data_service \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_raft_repl_dev' -exec cp {} .jenkins/test_raft_repl_dev \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_solo_repl_dev' -exec cp {} .jenkins/test_solo_repl_dev \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*bin/scripts/index_test.py' -exec install -Dm755 {} .jenkins/index_test.py \\; " - sh "find ${CONAN_USER_HOME} -type f -wholename '*bin/scripts/log_meta_test.py' -exec install -Dm755 {} .jenkins/log_meta_test.py \\; " - sh "find ${CONAN_USER_HOME} -type f -wholename '*bin/scripts/data_test.py' -exec install -Dm755 {} .jenkins/data_test.py \\; " - sh "find ${CONAN_USER_HOME} -type f -wholename '*bin/scripts/long_running.py' -exec install -Dm755 {} .jenkins/long_running.py \\; " + sh ''' + hostname + echo $NODE_NAME + conan create --build missing -s:h build_type=Debug -o ${PROJECT}/*:sanitize=True 
${CONAN_FLAGS} . + + find /home/jenkins -type f -wholename '*/test_index_btree' -exec cp {} .jenkins/test_index_btree \\; + find /home/jenkins -type f -wholename '*/test_index_crash_recovery' -exec cp {} .jenkins/test_index_crash_recovery \\; + find /home/jenkins -type f -wholename '*/test_meta_blk_mgr' -exec cp {} .jenkins/test_meta_blk_mgr \\; + find /home/jenkins -type f -wholename '*/test_log_store' -exec cp {} .jenkins/test_log_store \\; + find /home/jenkins -type f -wholename '*/test_home_raft_logstore' -exec cp {} .jenkins/test_home_raft_logstore \\; + find /home/jenkins -type f -wholename '*/test_log_store_long_run' -exec cp {} .jenkins/test_log_store_long_run \\; + find /home/jenkins -type f -wholename '*/test_data_service' -exec cp {} .jenkins/test_data_service \\; + find /home/jenkins -type f -wholename '*/test_raft_repl_dev' -exec cp {} .jenkins/test_raft_repl_dev \\; + find /home/jenkins -type f -wholename '*/test_solo_repl_dev' -exec cp {} .jenkins/test_solo_repl_dev \\; + find /home/jenkins -type f -wholename '*/test_scripts/index_test.py' -exec install -Dm755 {} .jenkins/index_test.py \\; + find /home/jenkins -type f -wholename '*/test_scripts/log_meta_test.py' -exec install -Dm755 {} .jenkins/log_meta_test.py \\; + find /home/jenkins -type f -wholename '*/test_scripts/data_test.py' -exec install -Dm755 {} .jenkins/data_test.py \\; + find /home/jenkins -type f -wholename '*/test_scripts/long_running.py' -exec install -Dm755 {} .jenkins/long_running.py \\; + ''' } post { failure { diff --git a/CMakeLists.txt b/CMakeLists.txt index 2e90a498b..728a2bdbc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -87,6 +87,18 @@ endif () add_flags("-DPACKAGE_NAME=\\\"${PROJECT_NAME}\\\"") add_flags("-DPACKAGE_VERSION=\\\"${PACKAGE_REVISION}\\\"") +# add replication flag +if (DEFINED REPLICATION) + if (${REPLICATION} STREQUAL "ON") + add_flags("-DREPLICATION_SUPPORT") + message(STATUS "Building with REPLICATION enabled") + else() + message(STATUS "Building with REPLICATION disabled") + endif() +else() + message(STATUS "Building with REPLICATION disabled") +endif() + if(UNIX) # enable proper pread/pwrite and large file add_flags("-D_POSIX_C_SOURCE=200809L -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE") diff --git a/cmake/test_mode.cmake b/cmake/test_mode.cmake index 486186bd5..4195a68b1 100644 --- a/cmake/test_mode.cmake +++ b/cmake/test_mode.cmake @@ -39,6 +39,9 @@ if (DEFINED TEST_TARGET) set(${ret} true) endif() endmacro() + macro(can_build_repl_tests ret) + set(${ret} false) + endmacro() else() macro(can_build_io_tests ret) set(${ret} false) @@ -55,4 +58,7 @@ else() macro(can_build_epoll_io_tests ret) set(${ret} false) endmacro() + macro(can_build_repl_tests ret) + set(${ret} false) + endmacro() endif() diff --git a/conanfile.py b/conanfile.py index 445bd4e0a..fab1039da 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "5.2.2" + version = "5.3.1" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" @@ -25,6 +25,7 @@ class HomestoreConan(ConanFile): "coverage": ['True', 'False'], "sanitize": ['True', 'False'], "testing" : ['full', 'min', 'off', 'epoll_mode', 'spdk_mode'], + "replication" : ['off', 'on'], } default_options = { 'shared': False, @@ -32,6 +33,7 @@ class HomestoreConan(ConanFile): 'coverage': False, 'sanitize': False, 'testing': 'epoll_mode', + 'replication': 'off', } exports_sources = "cmake/*", "src/*", "CMakeLists.txt", "test_wrap.sh", "LICENSE" @@ -54,18 +56,27 
@@ def build_requirements(self): def requirements(self): self.requires("iomgr/[^12.1]@oss/master", transitive_headers=True) self.requires("sisl/[^13.3]@oss/master", transitive_headers=True) - self.requires("nuraft_mesg/[^4.1]@oss/main", transitive_headers=True) + if str(self.options.replication) == "on": + self.requires("nuraft_mesg/[^4.1]@oss/main", transitive_headers=True) self.requires("farmhash/cci.20190513@", transitive_headers=True) if self.settings.arch in ['x86', 'x86_64']: self.requires("isa-l/2.30.0", transitive_headers=True) + # Tests require OpenSSL 3.x + self.requires("openssl/[^3.1]", override=True) + def imports(self): self.copy(root_package="sisl", pattern="*", dst="bin/scripts/python/flip/", src="bindings/flip/python/", keep_path=False) def layout(self): self.folders.source = "." - self.folders.build = join("build", str(self.settings.build_type)) + if self.options.get_safe("sanitize"): + self.folders.build = join("build", "Sanitized") + elif self.options.get_safe("coverage"): + self.folders.build = join("build", "Coverage") + else: + self.folders.build = join("build", str(self.settings.build_type)) self.folders.generators = join(self.folders.build, "generators") self.cpp.source.includedirs = ["src/include"] @@ -94,6 +105,12 @@ def generate(self): tc.variables['BUILD_COVERAGE'] = 'ON' elif self.options.get_safe("sanitize"): tc.variables['MEMORY_SANITIZER_ON'] = 'ON' + tc.variables["CONAN_PACKAGE_NAME"] = self.name + tc.variables["CONAN_PACKAGE_VERSION"] = self.version + if str(self.options.replication) == "on": + tc.variables["REPLICATION"] = "ON" + else: + tc.variables["REPLICATION"] = "OFF" tc.generate() # This generates "boost-config.cmake" and "grpc-config.cmake" etc in self.generators_folder diff --git a/docs/imgs/HomeStore_Disk_Layout2.png b/docs/imgs/HomeStore_Disk_Layout2.png new file mode 100644 index 000000000..8775927ee Binary files /dev/null and b/docs/imgs/HomeStore_Disk_Layout2.png differ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c032ed95d..7b33a68e8 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -8,15 +8,27 @@ find_package(isa-l QUIET) find_package(iomgr QUIET REQUIRED) find_package(farmhash QUIET REQUIRED) find_package(GTest QUIET REQUIRED) -find_package(NuraftMesg QUIET REQUIRED) +if (DEFINED REPLICATION) + if (${REPLICATION} STREQUAL "ON") + find_package(NuraftMesg QUIET REQUIRED) + endif() +endif() list(APPEND COMMON_DEPS iomgr::iomgr farmhash::farmhash - nuraft_mesg::proto - nuraft::nuraft sisl::sisl ) + +if (DEFINED REPLICATION) + if (${REPLICATION} STREQUAL "ON") + list(APPEND COMMON_DEPS + nuraft_mesg::proto + nuraft::nuraft + ) + endif() +endif() + if (${isa-l_FOUND}) list(APPEND COMMON_DEPS isa-l::isa-l) else () @@ -42,7 +54,11 @@ add_subdirectory(lib/logstore) add_subdirectory(lib/meta) add_subdirectory(lib/index) add_subdirectory(lib/blkdata_svc/) -add_subdirectory(lib/replication/) +if (DEFINED REPLICATION) + if (${REPLICATION} STREQUAL "ON") + add_subdirectory(lib/replication/) + endif() +endif() if(NOT DEFINED BUILD_TESTING OR BUILD_TESTING) add_subdirectory(tests) @@ -59,20 +75,19 @@ set(HOMESTORE_OBJECTS $ $ $ - $ lib/homestore.cpp lib/crc.cpp ) -#target_link_libraries(homestore_objs ${COMMON_DEPS}) -if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") - add_library(homestore STATIC - ${HOMESTORE_OBJECTS} - ) -else() - add_library(homestore STATIC - ${HOMESTORE_OBJECTS} - ) + +if (DEFINED REPLICATION) + if (${REPLICATION} STREQUAL "ON") + list(APPEND HOMESTORE_OBJECTS $) + endif() endif() 
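Reviewer note: the effect of the new conan "replication" option is that -DREPLICATION_SUPPORT is only defined when it is set to on, so downstream code has to guard its replication usage. Below is a minimal, hedged sketch of such gating; the HomeStore::instance()-style accessor and the repl_app object are illustrative assumptions, not part of this change.

```cpp
#include <homestore/homestore.hpp>

// Hypothetical bring-up path; only the #ifdef gating reflects this change.
void start_homestore(std::shared_ptr< homestore::ReplApplication > const& repl_app) {
    auto& hs = *homestore::HomeStore::instance(); // accessor name is an assumption for the sketch
    hs.with_log_service();
#ifdef REPLICATION_SUPPORT
    // Compiled in only when the package is built with -o homestore/*:replication=on,
    // which turns on the new REPLICATION CMake flag and defines REPLICATION_SUPPORT.
    hs.with_repl_data_service(repl_app);
#endif
}
```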
+#target_link_libraries(homestore_objs ${COMMON_DEPS}) +add_library(homestore STATIC + ${HOMESTORE_OBJECTS} +) target_compile_definitions (homestore PRIVATE LOG_MODS_V2_SUPPORT) target_link_libraries(homestore PRIVATE ${COMMON_DEPS}) diff --git a/src/include/homestore/blk.h b/src/include/homestore/blk.h index b9e22740c..a3e0a7768 100644 --- a/src/include/homestore/blk.h +++ b/src/include/homestore/blk.h @@ -251,9 +251,14 @@ VENUM(BlkAllocStatus, uint32_t, struct blk_alloc_hints { blk_temp_t desired_temp{0}; // Temperature hint for the device - std::optional< uint32_t > pdev_id_hint{std::nullopt}; // which physical device to pick (hint if any) - std::optional< chunk_num_t > chunk_id_hint{std::nullopt}; // any specific chunk id to pick for this allocation + std::optional< uint32_t > reserved_blks{std::nullopt}; // Reserved blks in a chunk + std::optional< uint32_t > pdev_id_hint{std::nullopt}; // which physical device to pick (hint if any) + std::optional< chunk_num_t > chunk_id_hint{std::nullopt}; // any specific chunk id to pick for this allocation + std::optional< MultiBlkId > committed_blk_id{ + std::nullopt}; // blk id indicates the blk was already allocated and committed, don't allocate and commit again std::optional< stream_id_t > stream_id_hint{std::nullopt}; // any specific stream to pick + std::optional< uint64_t > application_hint{ + std::nullopt}; // hints in uint64 what will be passed opaque to select_chunk bool can_look_for_other_chunk{true}; // If alloc on device not available can I pick other device bool is_contiguous{true}; // Should the entire allocation be one contiguous block bool partial_alloc_ok{false}; // ok to allocate only portion of nblks? Mutually exclusive with is_contiguous diff --git a/src/include/homestore/blkdata_service.hpp b/src/include/homestore/blkdata_service.hpp index b82ec886b..69b2f2ee4 100644 --- a/src/include/homestore/blkdata_service.hpp +++ b/src/include/homestore/blkdata_service.hpp @@ -56,17 +56,19 @@ class BlkDataService { /** * @brief Creates a new virtual device with the specified size and block size, using the specified - * block allocator and chunk selector types. The virtual device will be composed of the specified - * number of chunks. + * block allocator and chunk selector types. The virtual device will be composed of a number of chunks. + * Either `num_chunks` or `chunk_size` must be specified. + * Prioritize `num_chunks` over `chunk_size` if both are provided. * * @param size The size of the virtual device, in bytes. * @param blk_size The size of each block in the virtual device, in bytes. * @param alloc_type The type of block allocator to use for the virtual device. * @param chunk_sel_type The type of chunk selector to use for the virtual device. * @param num_chunks The number of chunks to use for the virtual device. + * @param chunk_size The size of chunks to use for the virtual device, in bytes. */ void create_vdev(uint64_t size, HSDevType devType, uint32_t blk_size, blk_allocator_type_t alloc_type, - chunk_selector_type_t chunk_sel_type, uint32_t num_chunks); + chunk_selector_type_t chunk_sel_type, uint32_t num_chunks, uint32_t chunk_size); /** * @brief Opens a virtual device with the specified virtual device information. 
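To make the num_chunks/chunk_size contract above concrete, a small hedged sketch of a caller follows. The allocator and chunk-selector enum values are illustrative, and the assumption that passing 0 leaves chunk_size unspecified is mine, not stated by the header.

```cpp
#include <homestore/blkdata_service.hpp>

void format_data_vdev(homestore::HSDevType dev_type) {
    using namespace homestore;
    // Either num_chunks or chunk_size must be provided; num_chunks takes priority if both are set.
    data_service().create_vdev(100ull * 1024 * 1024 * 1024,        // size: 100 GiB
                               dev_type,                           // device tier to place the vdev on
                               4096,                               // blk_size in bytes
                               blk_allocator_type_t::append,       // allocator type (illustrative)
                               chunk_selector_type_t::ROUND_ROBIN, // chunk selector (illustrative)
                               64,                                 // num_chunks
                               0 /* chunk_size, assumed 0 == unspecified */);
}
```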
@@ -112,6 +114,18 @@ class BlkDataService { folly::Future< std::error_code > async_write(sisl::sg_list const& sgs, MultiBlkId const& in_blkids, bool part_of_batch = false); + /** + * @brief : Asynchronous write to the given input block ids; + * + * @param sgs : the data buffer that needs to be written + * @param in_blkids : input block ids that this write should be written to; + * @param part_of_batch : is this write part of a batch; + */ + folly::Future< std::error_code > async_write(sisl::sg_list const& sgs, std::vector< MultiBlkId > const& in_blkids, + bool part_of_batch = false); + /** * @brief Asynchronously reads data from the specified block ID into the provided buffer. * @@ -137,6 +151,13 @@ class BlkDataService { folly::Future< std::error_code > async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, bool part_of_batch = false); + /** + * @brief Submit the io batch. This is mandatory to call if reads/writes were issued with part_of_batch + * set to true; without this method, those IOs might never be issued. No-op if the previous io requests + * were not part of a batch. + * */ + void submit_io_batch(); + /** * @brief Commits the block with the given MultiBlkId. * @@ -145,7 +166,8 @@ class BlkDataService { BlkAllocStatus commit_blk(MultiBlkId const& bid); /** - * @brief Allocates a contiguous block of disk space of the given size. + * @brief Allocates a contiguous block of disk space of the given size. This API should be called when the consumer + * expects all blks to be allocated on the same chunk. * * @param size The size of the block to allocate, in bytes. * @param hints Hints for how to allocate the block. @@ -154,6 +176,17 @@ class BlkDataService { */ BlkAllocStatus alloc_blks(uint32_t size, blk_alloc_hints const& hints, MultiBlkId& out_blkids); + /** + * @brief Allocates blocks of disk space of the given size. This API should be called when the consumer can accept + * blk allocation spread across different chunks. + * + * @param size The size of the block to allocate, in bytes. + * @param hints Hints for how to allocate the block. + * @param out_blkids Output parameter that will be filled with the IDs of the allocated blocks. + * @return The status of the block allocation attempt. + */ + BlkAllocStatus alloc_blks(uint32_t size, blk_alloc_hints const& hints, std::vector< BlkId >& out_blkids); + /** * @brief Asynchronously frees the specified block IDs. * It is asynchronous because it might need to wait for pending read to complete if same block is being read and not @@ -194,10 +227,35 @@ class BlkDataService { */ void start(); + /** + * @brief Gets the total capacity of the block data service. + * + * This function returns the total capacity of the block data service, in bytes. + * + * @return The total capacity of the block data service, in bytes. + */ uint64_t get_total_capacity() const; + /** + * @brief Gets the used capacity of the block data service. + * + * This function returns the used capacity of the block data service, in bytes. + * + * @return The used capacity of the block data service, in bytes. + */ uint64_t get_used_capacity() const; + /** + * @brief Gets the drive type of the data service. + * + * Data Service doesn't support mixed drive types. + * + * @return The drive type of the data service, HDD or NVME.
+ */ + HSDevType get_dev_type() const; + + void stop(); + private: /** * @brief Initializes the block data service. diff --git a/src/include/homestore/btree/btree.hpp b/src/include/homestore/btree/btree.hpp index ee65a8d0d..c159dc2f3 100644 --- a/src/include/homestore/btree/btree.hpp +++ b/src/include/homestore/btree/btree.hpp @@ -261,8 +261,10 @@ class Btree : public BtreeBase { btree_status_t do_sweep_query(BtreeNodePtr& my_node, BtreeQueryRequest< K >& qreq, std::vector< std::pair< K, V > >& out_values); + btree_status_t do_traversal_query(const BtreeNodePtr& my_node, BtreeQueryRequest< K >& qreq, std::vector< std::pair< K, V > >& out_values); + #ifdef SERIALIZABLE_QUERY_IMPLEMENTATION btree_status_t do_serialzable_query(const BtreeNodePtr& my_node, BtreeSerializableQueryRequest& qreq, std::vector< std::pair< K, V > >& out_values); diff --git a/src/include/homestore/btree/detail/btree_mutate_impl.ipp b/src/include/homestore/btree/detail/btree_mutate_impl.ipp index 0df733575..0a8f57686 100644 --- a/src/include/homestore/btree/detail/btree_mutate_impl.ipp +++ b/src/include/homestore/btree/detail/btree_mutate_impl.ipp @@ -357,6 +357,11 @@ btree_status_t Btree< K, V >::split_node(const BtreeNodePtr& parent_node, const child_node1->inc_link_version(); // Update the existing parent node entry to point to second child ptr. + // Don't change the order: first update the parent node, then insert the new key. This is important for the case + // where the split key is the last key in the parent node. If we insert the split key first, the last key in the + // parent node will be lost, which leads to an inconsistency in the tree. In case of an empty parent (i.e., a new + // root) or when updating the edge, this order ensures that the edge is updated. parent_node->update(parent_ind, child_node2->link_info()); parent_node->insert(parent_ind, *out_split_key, child_node1->link_info()); diff --git a/src/include/homestore/btree/detail/btree_node.hpp b/src/include/homestore/btree/detail/btree_node.hpp index b58174dc3..8bf83966c 100644 --- a/src/include/homestore/btree/detail/btree_node.hpp +++ b/src/include/homestore/btree/detail/btree_node.hpp @@ -364,6 +364,7 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { template < typename K > K get_first_key() const { + if (total_entries() == 0) { return K{}; } return get_nth_key< K >(0, true); } @@ -463,6 +464,12 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { } fmt::format_to(std::back_inserter(str), "]"); } + + // Should not happen + if (this->is_node_deleted()) { + fmt::format_to(std::back_inserter(str), " **DELETED** "); + } + return str; } diff --git a/src/include/homestore/btree/detail/btree_remove_impl.ipp b/src/include/homestore/btree/detail/btree_remove_impl.ipp index 67acd7d5a..04e483377 100644 --- a/src/include/homestore/btree/detail/btree_remove_impl.ipp +++ b/src/include/homestore/btree/detail/btree_remove_impl.ipp @@ -469,6 +469,9 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const BT_NODE_LOG_ASSERT_EQ(child->is_node_deleted(), false, child); old_nodes.push_back(child); + // Todo: need a more precise calculation that considers the compacted size for prefix nodes, because when a merge + // happens compaction will occur for both the leftmost and the new nodes. The current calculation can leave the + // available size unbalanced between the leftmost node and the new nodes.
total_size += child->occupied_size(); } @@ -506,6 +509,13 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const auto const nentries = old_nodes[i]->num_entries_by_size(0, available_size); if ((old_nodes[i]->total_entries() - nentries) == 0) { // Entire node goes in available_size -= old_nodes[i]->occupied_size(); + // For prefix nodes, compaction will make the size smaller, so we can compact saving to available size; + // hence it cannot get negative. + if (old_nodes[i]->get_node_type() == btree_node_type::PREFIX) { + auto cur_node = static_cast< FixedPrefixNode< K, V >* >(old_nodes[i].get()); + available_size += cur_node->compact_saving(); + } + BT_NODE_DBG_ASSERT_EQ(available_size >= 0, true, leftmost_node, "negative available size"); if (i >= old_nodes.size() - 1) { src_cursor.ith_node = i + 1; src_cursor.nth_entry = std::numeric_limits< uint32_t >::max(); diff --git a/src/include/homestore/btree/node_variant/prefix_node.hpp b/src/include/homestore/btree/node_variant/prefix_node.hpp index 2892aec63..7db486f88 100644 --- a/src/include/homestore/btree/node_variant/prefix_node.hpp +++ b/src/include/homestore/btree/node_variant/prefix_node.hpp @@ -316,6 +316,7 @@ class FixedPrefixNode : public VariantNode< K, V > { ///////////////////////////// All overrides of BtreeNode /////////////////////////////////// void get_nth_key_internal(uint32_t idx, BtreeKey& out_key, bool) const override { + DEBUG_ASSERT_LT(idx, this->total_entries(), "node={}", to_string()); suffix_entry const* sentry = get_suffix_entry_c(idx); prefix_entry const* pentry = get_prefix_entry_c(sentry->prefix_slot); DEBUG_ASSERT(prefix_bitset_.is_bit_set(cbitset_blob(), sentry->prefix_slot), @@ -337,10 +338,16 @@ class FixedPrefixNode : public VariantNode< K, V > { } } + uint16_t get_nth_suffix_slot_num(uint32_t idx) const { return get_suffix_entry_c(idx)->prefix_slot; } + + uint16_t get_nth_prefix_ref_count(uint32_t idx) const { + return get_prefix_entry_c(get_suffix_entry_c(idx)->prefix_slot)->ref_count; + } + uint32_t available_size() const override { auto num_holes = num_prefix_holes(); if (num_holes > prefix_node_header::min_holes_to_compact) { - return available_size_without_compaction() + (num_holes * prefix_entry::size()); + return available_size_with_compaction(); } else { return available_size_without_compaction(); } @@ -424,7 +431,6 @@ class FixedPrefixNode : public VariantNode< K, V > { // part of Step 1, except generation count this->inc_gen(); dst_node.inc_gen(); - auto new_phdr = dst_node.prefix_header(); if (!this->is_leaf() && (dst_node.total_entries() != 0)) { // Incase this node is an edge node, move the stick to the right hand side node @@ -660,10 +666,10 @@ class FixedPrefixNode : public VariantNode< K, V > { } std::string to_string(bool print_friendly = false) const override { - auto str = fmt::format("{}id={} level={} nEntries={} {} next_node={} ", + auto str = fmt::format("{}id={} level={} nEntries={} {} next_node={} available_size={} ", (print_friendly ? "------------------------------------------------------------\n" : ""), this->node_id(), this->level(), this->total_entries(), - (this->is_leaf() ? "LEAF" : "INTERIOR"), this->next_bnode()); + (this->is_leaf() ? 
"LEAF" : "INTERIOR"), this->next_bnode(), this->available_size()); if (!this->is_leaf() && (this->has_valid_edge())) { fmt::format_to(std::back_inserter(str), "edge_id={}.{}", this->edge_info().m_bnodeid, this->edge_info().m_link_version); @@ -674,9 +680,10 @@ class FixedPrefixNode : public VariantNode< K, V > { prefix_bitset_.to_string(cbitset_blob())); for (uint32_t i{0}; i < this->total_entries(); ++i) { - fmt::format_to(std::back_inserter(str), "{}Entry{} [Key={} Val={}]", (print_friendly ? "\n\t" : " "), i + 1, - BtreeNode::get_nth_key< K >(i, false).to_string(), - this->get_nth_value(i, false).to_string()); + fmt::format_to(std::back_inserter(str), "{}Entry{} [Key={} Val={} slot#={} ref_count={}]", + (print_friendly ? "\n\t" : " "), i + 1, BtreeNode::get_nth_key< K >(i, false).to_string(), + this->get_nth_value(i, false).to_string(), this->get_nth_suffix_slot_num(i), + this->get_nth_prefix_ref_count(i)); } return str; } @@ -705,7 +712,10 @@ class FixedPrefixNode : public VariantNode< K, V > { auto phdr = prefix_header(); ++phdr->used_slots; - if (slot_num > phdr->tail_slot) { phdr->tail_slot = slot_num; } + if (s_cast< uint16_t >(slot_num) >= phdr->tail_slot) { phdr->tail_slot = slot_num + 1; } + + DEBUG_ASSERT_LE(phdr->used_slots, phdr->tail_slot, "Prefix slot number {} is not less than tail slot number {}", + slot_num, phdr->tail_slot); return slot_num; } @@ -720,9 +730,9 @@ class FixedPrefixNode : public VariantNode< K, V > { if (--pentry->ref_count == 0) { --phdr->used_slots; prefix_bitset_.reset_bit(sisl::blob{bitset_area(), uint32_cast(bitset_size())}, slot_num); - if ((slot_num != 0) && (slot_num == phdr->tail_slot)) { + if ((slot_num == phdr->tail_slot - 1)) { uint16_t prev_slot = prefix_bitset_.get_prev_set_bit(cbitset_blob(), slot_num); - if (prev_slot != std::numeric_limits< uint16_t >::max()) { phdr->tail_slot = prev_slot; } + phdr->tail_slot = prev_slot + 1u; } } } @@ -736,12 +746,14 @@ class FixedPrefixNode : public VariantNode< K, V > { uint32_t available_size_without_compaction() const { uint8_t const* suffix = r_cast< uint8_t const* >(get_suffix_entry_c(this->total_entries())); - uint8_t const* prefix = r_cast< uint8_t const* >(get_prefix_entry_c(cprefix_header()->tail_slot)); + uint8_t const* prefix = + r_cast< uint8_t const* >(get_prefix_entry_c(cprefix_header()->tail_slot)) + prefix_entry::size(); if (suffix <= prefix) { return prefix - suffix; } else { - DEBUG_ASSERT(false, "Node data is corrupted, suffix area is overlapping prefix area"); + DEBUG_ASSERT(false, "Node data is corrupted, suffix area is overlapping prefix area {}", + int64_t(suffix - prefix)); return 0; } } @@ -760,7 +772,8 @@ class FixedPrefixNode : public VariantNode< K, V > { uint32_t num_prefix_holes() const { auto phdr = cprefix_header(); - return (phdr->tail_slot + 1 - phdr->used_slots); + DEBUG_ASSERT_LE(phdr->used_slots, phdr->tail_slot, "Prefix slot number is not less than tail slot number"); + return (phdr->tail_slot - phdr->used_slots); } bool is_compaction_suggested() const { return (num_prefix_holes() > prefix_node_header::min_holes_to_compact); } @@ -803,6 +816,9 @@ class FixedPrefixNode : public VariantNode< K, V > { // Finally adjust the tail offset to the compacted area. 
auto phdr = prefix_header(); phdr->tail_slot = phdr->used_slots; + DEBUG_ASSERT_EQ(phdr->tail_slot, prefix_bitset_.get_next_reset_bit(cbitset_blob(), 0u), + "Tail slot is not equal to the next reset bit, not expected"); + DEBUG_ASSERT_EQ(this->num_prefix_holes(), 0, "Shouldn't be any hole after compression, not expected"); } #ifndef NDEBUG @@ -843,13 +859,15 @@ class FixedPrefixNode : public VariantNode< K, V > { uint8_t const* csuffix_kv_area() const { return cbitset_area() + bitset_size(); } prefix_entry* get_prefix_entry(uint16_t slot_num) { - return r_cast< prefix_entry* >(this->node_data_area() + - (this->node_data_size() - ((slot_num + 1) * prefix_entry::size()))); + return r_cast< prefix_entry* >( + this->node_data_area() + + (this->node_data_size() - (s_cast< uint16_t >(slot_num + 1) * prefix_entry::size()))); } prefix_entry const* get_prefix_entry_c(uint16_t slot_num) const { - return r_cast< prefix_entry const* >(this->node_data_area_const() + - (this->node_data_size() - ((slot_num + 1) * prefix_entry::size()))); + return r_cast< prefix_entry const* >( + this->node_data_area_const() + + (this->node_data_size() - (static_cast< uint16_t >(slot_num + 1) * prefix_entry::size()))); } suffix_entry* get_suffix_entry(uint16_t idx) { diff --git a/src/include/homestore/homestore.hpp b/src/include/homestore/homestore.hpp index 099cef8ac..1bdb86ab3 100644 --- a/src/include/homestore/homestore.hpp +++ b/src/include/homestore/homestore.hpp @@ -62,11 +62,6 @@ using HomeStoreSafePtr = std::shared_ptr< HomeStore >; using hs_before_services_starting_cb_t = std::function< void(void) >; -struct hs_stats { - uint64_t total_capacity{0ul}; - uint64_t used_capacity{0ul}; -}; - ENUM(ServiceType, uint32_t, // List of all services we support META = 0, // Meta Service LOG = 1, // Log Service @@ -83,6 +78,13 @@ ENUM(ServiceSubType, uint32_t, // All sub types within services. 
At this po INDEX_BTREE_MEMORY = 3, // Memory based index ); +using hs_before_services_starting_cb_t = std::function< void(void) >; + +struct hs_stats { + uint64_t total_capacity{0ul}; + uint64_t used_capacity{0ul}; +}; + VENUM(hs_vdev_type_t, uint32_t, DATA_VDEV = 1, INDEX_VDEV = 2, META_VDEV = 3, LOGDEV_VDEV = 4); #pragma pack(1) @@ -131,7 +133,9 @@ class HomeStore { std::unique_ptr< MetaBlkService > m_meta_service; std::unique_ptr< LogStoreService > m_log_service; std::unique_ptr< IndexService > m_index_service; +#ifdef REPLICATION_SUPPORT std::shared_ptr< ReplicationService > m_repl_service; +#endif std::unique_ptr< DeviceManager > m_dev_mgr; shared< sisl::logging::logger_t > m_periodic_logger; @@ -163,8 +167,10 @@ class HomeStore { HomeStore& with_log_service(); HomeStore& with_index_service(std::unique_ptr< IndexServiceCallbacks > cbs, std::vector< ServiceSubType > sub_types); +#ifdef REPLICATION_SUPPORT HomeStore& with_repl_data_service(cshared< ReplApplication >& repl_app, cshared< ChunkSelector >& custom_chunk_selector = nullptr); +#endif bool start(const hs_input_params& input, hs_before_services_starting_cb_t svcs_starting_cb = nullptr); void format_and_start(std::map< ServiceId, hs_format_params >&& format_opts); @@ -189,7 +195,9 @@ class HomeStore { if (!m_index_service) { throw std::runtime_error("index_service is nullptr"); } return *m_index_service; } +#ifdef REPLICATION_SUPPORT ReplicationService& repl_service() { return *m_repl_service; } +#endif DeviceManager* device_mgr() { return m_dev_mgr.get(); } ResourceMgr& resource_mgr() { return *m_resource_mgr.get(); } CPManager& cp_mgr() { return *m_cp_mgr.get(); } diff --git a/src/include/homestore/homestore_decl.hpp b/src/include/homestore/homestore_decl.hpp index 3d1f75135..859b4c59c 100644 --- a/src/include/homestore/homestore_decl.hpp +++ b/src/include/homestore/homestore_decl.hpp @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -169,6 +170,8 @@ struct hs_input_params { uint64_t app_mem_size{static_cast< uint64_t >(1024) * static_cast< uint64_t >(1024) * static_cast< uint64_t >(1024)}; // memory available for the app (including cache) uint64_t hugepage_size{0}; // memory available for the hugepage + int max_data_size{0}; // max data size in byte on the data plane + int max_snapshot_batch_size{0}; // max snapshot batch size in byte for the raft state machine bool is_read_only{false}; // Is read only bool auto_recovery{true}; // Recovery of data is automatic or controlled by the caller diff --git a/src/include/homestore/logstore/log_store.hpp b/src/include/homestore/logstore/log_store.hpp index a2091f114..91735be79 100644 --- a/src/include/homestore/logstore/log_store.hpp +++ b/src/include/homestore/logstore/log_store.hpp @@ -173,6 +173,15 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { logdev_key get_trunc_ld_key() const { return m_trunc_ld_key; } + /** + * @brief Get the truncation information for this log store. It is called during log device truncation + * + * @return tuple of (start_lsn, trunc_ld_key, tail_lsn) If the log store is empty, it will return + * an out_of_bound_ld_key as trunc_ld_key. + * + * @note ensure that no new logs are flushed between calling this function and completing the truncation, + * as this could result in an inaccurate out_of_bound_ld_key. 
+ * */ std::tuple< logstore_seq_num_t, logdev_key, logstore_seq_num_t > truncate_info() const; sisl::StreamTracker< logstore_record >& log_records() { return m_records; } @@ -231,6 +240,8 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { bool rollback(logstore_seq_num_t to_lsn); auto start_lsn() const { return m_start_lsn.load(std::memory_order_acquire); } + auto tail_lsn() const { return m_tail_lsn.load(std::memory_order_acquire); } + auto next_lsn() const { return m_next_lsn.load(std::memory_order_acquire); } nlohmann::json dump_log_store(const log_dump_req& dump_req = log_dump_req()); diff --git a/src/include/homestore/logstore/log_store_internal.hpp b/src/include/homestore/logstore/log_store_internal.hpp index 551f15ea8..7768086ee 100644 --- a/src/include/homestore/logstore/log_store_internal.hpp +++ b/src/include/homestore/logstore/log_store_internal.hpp @@ -52,6 +52,12 @@ typedef std::function< void(std::shared_ptr< HomeLogStore >, logstore_seq_num_t) typedef int64_t logid_t; +VENUM(flush_mode_t, uint32_t, // Various flush modes (can be or'ed together) + INLINE = 1 << 0, // Allow flush inline with the append + TIMER = 1 << 1, // Allow timer based automatic flush + EXPLICIT = 1 << 2, // Allow explicit flush calls from the user +); + struct logdev_key { logid_t idx; off_t dev_offset; @@ -85,7 +91,8 @@ struct logdev_key { std::string to_string() const { return fmt::format("Logid={} devoffset={}", idx, dev_offset); } static const logdev_key& out_of_bound_ld_key() { - static constexpr logdev_key s_out_of_bound_ld_key{std::numeric_limits< logid_t >::max(), 0}; + static constexpr logdev_key s_out_of_bound_ld_key{std::numeric_limits< logid_t >::max(), + std::numeric_limits< off_t >::max()}; return s_out_of_bound_ld_key; } }; @@ -171,4 +178,5 @@ struct logstore_superblk { logstore_seq_num_t m_first_seq_num{0}; }; #pragma pack() + } // namespace homestore \ No newline at end of file diff --git a/src/include/homestore/logstore_service.hpp b/src/include/homestore/logstore_service.hpp index 44ba1ab53..039e14114 100644 --- a/src/include/homestore/logstore_service.hpp +++ b/src/include/homestore/logstore_service.hpp @@ -93,7 +93,7 @@ class LogStoreService { * chunks. Logdev can start with zero chunks and dynamically add chunks based on write request. * @return Newly created log dev id. */ - logdev_id_t create_new_logdev(); + logdev_id_t create_new_logdev(flush_mode_t flush_mode); /** * @brief Open a log dev. * * @param logdev_id: Logdev ID * @return Newly created log dev id. */ - void open_logdev(logdev_id_t logdev_id); + void open_logdev(logdev_id_t logdev_id, flush_mode_t flush_mode); /** * @brief Destroy a log dev.
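Since flush_mode_t is documented above as OR-able, here is a short sketch of how the new create_new_logdev()/open_logdev() parameters might be used. The logstore_service() accessor mirrors how other services (e.g. data_service()) are reached elsewhere in HomeStore; treat the exact call site as an assumption.

```cpp
#include <homestore/logstore_service.hpp>

void setup_logdev() {
    using namespace homestore;
    // Flush inline with appends and also via the periodic timer.
    auto const mode = static_cast< flush_mode_t >(static_cast< uint32_t >(flush_mode_t::INLINE) |
                                                  static_cast< uint32_t >(flush_mode_t::TIMER));
    logdev_id_t const id = logstore_service().create_new_logdev(mode);

    // On a later restart the same policy is supplied when re-opening it.
    logstore_service().open_logdev(id, mode);
}
```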
@@ -132,7 +132,8 @@ class LogStoreService { * @return std::shared_ptr< HomeLogStore > */ folly::Future< shared< HomeLogStore > > open_log_store(logdev_id_t logdev_id, logstore_id_t store_id, - bool append_mode); + bool append_mode, log_found_cb_t log_found_cb = nullptr, + log_replay_done_cb_t log_replay_done_cb = nullptr); /** * @brief Close the log store instance and free-up the resources @@ -176,7 +177,7 @@ class LogStoreService { void delete_unopened_logdevs(); private: - std::shared_ptr< LogDev > create_new_logdev_internal(logdev_id_t logdev_id); + std::shared_ptr< LogDev > create_new_logdev_internal(logdev_id_t logdev_id, flush_mode_t flush_mode); void on_meta_blk_found(const sisl::byte_view& buf, void* meta_cookie); logdev_id_t get_next_logdev_id(); void logdev_super_blk_found(const sisl::byte_view& buf, void* meta_cookie); diff --git a/src/include/homestore/replication/repl_decls.h b/src/include/homestore/replication/repl_decls.h index 994da7d97..88a928aa3 100644 --- a/src/include/homestore/replication/repl_decls.h +++ b/src/include/homestore/replication/repl_decls.h @@ -15,20 +15,24 @@ namespace homestore { VENUM(ReplServiceError, int32_t, OK = 0, // Everything OK CANCELLED = -1, // Request was cancelled - TIMEOUT = -2, - NOT_LEADER = -3, - BAD_REQUEST = -4, - SERVER_ALREADY_EXISTS = -5, + TIMEOUT = -2, + NOT_LEADER = -3, + BAD_REQUEST = -4, + SERVER_ALREADY_EXISTS = -5, CONFIG_CHANGING = -6, - SERVER_IS_JOINING = -7, - SERVER_NOT_FOUND = -8, - CANNOT_REMOVE_LEADER = -9, + SERVER_IS_JOINING = -7, + SERVER_NOT_FOUND = -8, + CANNOT_REMOVE_LEADER = -9, SERVER_IS_LEAVING = -10, - TERM_MISMATCH = -11, - RESULT_NOT_EXIST_YET = -10000, + TERM_MISMATCH = -11, + RETRY_REQUEST = -12, + RESULT_NOT_EXIST_YET = -10000, NOT_IMPLEMENTED = -10001, NO_SPACE_LEFT = -20000, DRIVE_WRITE_ERROR = -20001, + DATA_DUPLICATED = -20002, + QUIENCE_STATE = -20003, + QUORUM_NOT_MET = -20004, FAILED = -32768); // clang-format on @@ -68,9 +72,20 @@ struct peer_info { // Peer ID. replica_id_t id_; // The last replication index that the peer has, from this server's point of view. - uint64_t replication_idx_; + uint64_t replication_idx_ = 0; // The elapsed time since the last successful response from this peer, set to 0 on leader - uint64_t last_succ_resp_us_; + uint64_t last_succ_resp_us_ = 0; + // The priority for leader election + uint32_t priority_ = 0; + // Whether the peer can vote. If a peer is learner, this will be false. Hide the raft details. 
+ bool can_vote = true; +}; + +struct replica_member_info { + static constexpr uint64_t max_name_len = 128; + replica_id_t id; + char name[max_name_len]; + int32_t priority{0}; }; } // namespace homestore diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index 9965ada5d..45e2488c6 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -11,6 +11,7 @@ #include #include #include +#include #include namespace nuraft { @@ -28,6 +29,7 @@ struct repl_req_ctx; using raft_buf_ptr_t = nuraft::ptr< nuraft::buffer >; using raft_cluster_config_ptr_t = nuraft::ptr< nuraft::cluster_config >; using repl_req_ptr_t = boost::intrusive_ptr< repl_req_ctx >; +using trace_id_t = u_int64_t; VENUM(repl_req_state_t, uint32_t, INIT = 0, // Initial state @@ -36,19 +38,27 @@ VENUM(repl_req_state_t, uint32_t, DATA_WRITTEN = 1 << 2, // Data has been written to the storage LOG_RECEIVED = 1 << 3, // Log is received and waiting for data LOG_FLUSHED = 1 << 4, // Log has been flushed - ERRORED = 1 << 5 // Error has happened and cleaned up + ERRORED = 1 << 5, // Error has happened and cleaned up + DATA_COMMITTED = 1 << 6 // Data has already been committed, used in duplication handling, will skip commit_blk ) VENUM(journal_type_t, uint16_t, HS_DATA_LINKED = 0, // Linked data where each entry will store physical blkid where data reside HS_DATA_INLINED = 1, // Data is inlined in the header of journal entry - HS_CTRL_DESTROY = 2 // Control message to destroy the repl_dev + HS_CTRL_DESTROY = 2, // Control message to destroy the repl_dev + HS_CTRL_START_REPLACE = 3, // Control message to start replace a member + HS_CTRL_COMPLETE_REPLACE = 4, // Control message to complete replace a member ) +// magic num comes from the first 8 bytes of 'echo homestore_resync_data | md5sum' +static constexpr uint64_t HOMESTORE_RESYNC_DATA_MAGIC = 0xa65dbd27c213f327; +static constexpr uint32_t HOMESTORE_RESYNC_DATA_PROTOCOL_VERSION_V1 = 0x01; + struct repl_key { - int32_t server_id{0}; // Server Id which this req is originated from - uint64_t term; // RAFT term number - uint64_t dsn{0}; // Data sequence number to tie the data with the raft journal entry + int32_t server_id{0}; // Server Id which this req is originated from + uint64_t term; // RAFT term number + uint64_t dsn{0}; // Data sequence number to tie the data with the raft journal entry + trace_id_t traceID{0}; // tracing ID provided by application that connects logs. struct Hasher { size_t operator()(repl_key const& rk) const { @@ -66,14 +76,13 @@ struct repl_key { using repl_snapshot = nuraft::snapshot; using repl_snapshot_ptr = nuraft::ptr< nuraft::snapshot >; -// Consumers of the ReplDevListener dont have to know what underlying -// snapshot implementation is used. Consumers can export and save the state -// of the snapshot using serialize and load the state using deserialize. +// Consumers of ReplDevListener don't have to know what underlying snapshot context implementation is used by the +// ReplDev. The state of the snapshot can be exported with serialize() and loaded with +// repl_dev.deserialize_snapshot_context(). 
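To illustrate the comment above (sketch only; the persistence helpers are placeholders, not HomeStore APIs): the consumer treats the snapshot state as opaque bytes from serialize() and, on the way back, lets the ReplDev pick the concrete snapshot_context implementation via deserialize_snapshot_context().

```cpp
#include <homestore/replication/repl_dev.h>

// Placeholder persistence hooks for the example only.
void save_to_meta_blk(sisl::io_blob_safe const& blob);
sisl::io_blob_safe load_from_meta_blk();

void persist_snapshot_state(std::shared_ptr< homestore::snapshot_context > const& ctx) {
    sisl::io_blob_safe blob = ctx->serialize(); // opaque to the consumer
    save_to_meta_blk(blob);
}

std::shared_ptr< homestore::snapshot_context > restore_snapshot_state(homestore::ReplDev& rdev) {
    sisl::io_blob_safe blob = load_from_meta_blk();
    return rdev.deserialize_snapshot_context(blob); // ReplDev owns the concrete type
}
```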
class snapshot_context { public: snapshot_context(int64_t lsn) : lsn_(lsn) {} virtual ~snapshot_context() = default; - virtual void deserialize(const sisl::io_blob_safe& snp_ctx) = 0; virtual sisl::io_blob_safe serialize() = 0; int64_t get_lsn() { return lsn_; } @@ -81,74 +90,69 @@ class snapshot_context { int64_t lsn_; }; -class nuraft_snapshot_context : public snapshot_context { -public: - nuraft_snapshot_context(nuraft::snapshot& snp) : snapshot_context(snp.get_last_log_idx()) { - auto snp_buf = snp.serialize(); - snapshot_ = nuraft::snapshot::deserialize(*snp_buf); - } - - void deserialize(const sisl::io_blob_safe& snp_ctx) override { - // Load the context from the io blob to nuraft buffer. - auto snp_buf = nuraft::buffer::alloc(snp_ctx.size()); - nuraft::buffer_serializer bs(snp_buf); - bs.put_raw(snp_ctx.cbytes(), snp_ctx.size()); - snapshot_ = nuraft::snapshot::deserialize(bs); - lsn_ = snapshot_->get_last_log_idx(); - } - - sisl::io_blob_safe serialize() override { - // Dump the context from nuraft buffer to the io blob. - auto snp_buf = snapshot_->serialize(); - sisl::io_blob_safe blob{s_cast< size_t >(snp_buf->size())}; - std::memcpy(blob.bytes(), snp_buf->data_begin(), snp_buf->size()); - return blob; - } - - nuraft::ptr< nuraft::snapshot > nuraft_snapshot() { return snapshot_; } - -private: - nuraft::ptr< nuraft::snapshot > snapshot_; -}; - -struct snapshot_data { +struct snapshot_obj { void* user_ctx{nullptr}; - int64_t offset{0}; + uint64_t offset{0}; sisl::io_blob_safe blob; bool is_first_obj{false}; bool is_last_obj{false}; }; +// HomeStore has some meta information to be transmitted during the baseline resync, +// Although now only dsn needs to be synced, this structure is defined as a general message, and we can easily add data +// if needed in the future. +struct snp_repl_dev_data { + uint64_t magic_num{HOMESTORE_RESYNC_DATA_MAGIC}; + uint32_t protocol_version{HOMESTORE_RESYNC_DATA_PROTOCOL_VERSION_V1}; + uint32_t crc{0}; + uint64_t dsn{0}; +}; + struct repl_journal_entry; struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost::thread_safe_counter >, sisl::ObjLifeCounter< repl_req_ctx > { friend class SoloReplDev; public: - repl_req_ctx() {} + repl_req_ctx() { m_start_time = Clock::now(); } virtual ~repl_req_ctx(); - void init(repl_key rkey, journal_type_t op_code, bool is_proposer, sisl::blob const& user_header, - sisl::blob const& key, uint32_t data_size); + ReplServiceError init(repl_key rkey, journal_type_t op_code, bool is_proposer, sisl::blob const& user_header, + sisl::blob const& key, uint32_t data_size, cshared< ReplDevListener >& listener); /////////////////////// All getters /////////////////////// repl_key const& rkey() const { return m_rkey; } uint64_t dsn() const { return m_rkey.dsn; } uint64_t term() const { return m_rkey.term; } + trace_id_t traceID() const { return m_rkey.traceID; } int64_t lsn() const { return m_lsn; } bool is_proposer() const { return m_is_proposer; } journal_type_t op_code() const { return m_op_code; } + bool is_volatile() const { return m_is_volatile.load(); } sisl::blob const& header() const { return m_header; } sisl::blob const& key() const { return m_key; } - MultiBlkId const& local_blkid() const { return m_local_blkid; } + MultiBlkId const& local_blkid() const { + // Currently used by raft repl dev only where a single blob is expected. + // Code checks if its a valid blkid so return a dummy blkid. 
+ if (!m_local_blkids.empty()) + return m_local_blkids[0]; + else + return dummy_blkid; + } + + std::vector< MultiBlkId >& local_blkids() { return m_local_blkids; } RemoteBlkId const& remote_blkid() const { return m_remote_blkid; } - const char* data() const { return r_cast< const char* >(m_data); } + const char* data() const { + DEBUG_ASSERT(m_data != nullptr, + "m_data is nullptr, use before save_pushed/fetched_data or after release_data()"); + return r_cast< const char* >(m_data); + } repl_req_state_t state() const { return repl_req_state_t(m_state.load()); } bool has_state(repl_req_state_t s) const { return m_state.load() & uint32_cast(s); } repl_journal_entry const* journal_entry() const { return m_journal_entry; } uint32_t journal_entry_size() const; + uint32_t blkids_serialized_size() const; bool is_localize_pending() const { return m_is_jentry_localize_pending; } - bool is_data_inlined() const { return (m_op_code == journal_type_t::HS_DATA_INLINED); } bool has_linked_data() const { return (m_op_code == journal_type_t::HS_DATA_LINKED); } raft_buf_ptr_t& raft_journal_buf(); @@ -156,6 +160,7 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: /////////////////////// Non modifiers methods ////////////////// std::string to_string() const; std::string to_compact_string() const; + std::string blkids_to_string() const; Clock::time_point created_time() const { return m_start_time; } void set_created_time() { m_start_time = Clock::now(); } bool is_expired() const; @@ -202,12 +207,14 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: bool save_fetched_data(sisl::GenericClientResponse const& fetched_data, uint8_t const* data, uint32_t data_size); void set_remote_blkid(RemoteBlkId const& rbid) { m_remote_blkid = rbid; } - void set_local_blkid(MultiBlkId const& lbid) { m_local_blkid = lbid; } // Only used during recovery + void set_local_blkids(std::vector< MultiBlkId > const& lbids) { m_local_blkids = std::move(lbids); } + void set_is_volatile(bool is_volatile) { m_is_volatile.store(is_volatile); } void set_lsn(int64_t lsn); void add_state(repl_req_state_t s); bool add_state_if_not_already(repl_req_state_t s); void set_lentry(nuraft::ptr< nuraft::log_entry > const& lentry) { m_lentry = lentry; } void clear(); + void release_data(); flatbuffers::FlatBufferBuilder& create_fb_builder() { return m_fb_builder; } void release_fb_builder() { m_fb_builder.Release(); } @@ -228,11 +235,13 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: bool m_is_proposer{false}; // Is the repl_req proposed by this node Clock::time_point m_start_time; // Start time of the request journal_type_t m_op_code{journal_type_t::HS_DATA_INLINED}; // Operation code for this request + std::atomic< bool > m_is_volatile{true}; // Is the log still in memory and not flushed to disk yet /////////////// Data related section ///////////////// - MultiBlkId m_local_blkid; // Local BlkId for the data - RemoteBlkId m_remote_blkid; // Corresponding remote blkid for the data - uint8_t const* m_data; // Raw data pointer containing the actual data + static inline MultiBlkId dummy_blkid; + std::vector< MultiBlkId > m_local_blkids; // Local BlkId for the data + RemoteBlkId m_remote_blkid; // Corresponding remote blkid for the data + uint8_t const* m_data; // Raw data pointer containing the actual data /////////////// Journal/Buf related section ///////////////// std::variant< std::unique_ptr< uint8_t[] >, raft_buf_ptr_t > m_journal_buf; // Buf for 
the journal entry @@ -268,11 +277,19 @@ class ReplDevListener { /// @param lsn - The log sequence number /// @param header - Header originally passed with replica_set::write() api /// @param key - Key originally passed with replica_set::write() api - /// @param blkids - List of blkids where data is written to the storage engine. + /// @param blkids - List of independent blkids where data is written to the storage engine. /// @param ctx - Context passed as part of the replica_set::write() api /// - virtual void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, MultiBlkId const& blkids, - cintrusive< repl_req_ctx >& ctx) = 0; + virtual void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, + std::vector< MultiBlkId > const& blkids, cintrusive< repl_req_ctx >& ctx) = 0; + + /// @brief Periodically called to notify the latest committed lsn to the listener. + /// NOTE: this callback blocks the thread that flushes the latest committed lsn into the repl_dev superblk as DC_LSN, + /// so please take care if there is any heavy or blocking operation in this callback. + /// + /// @param lsn - The latest committed log sequence number so far + /// + virtual void notify_committed_lsn(int64_t lsn) = 0; /// @brief Called when the log entry has been received by the replica dev. /// @@ -311,6 +328,10 @@ class ReplDevListener { virtual void on_rollback(int64_t lsn, const sisl::blob& header, const sisl::blob& key, cintrusive< repl_req_ctx >& ctx) = 0; + /// @brief Called when the config log entry has been rolled back. + /// @param lsn - The log sequence number getting rolled back + virtual void on_config_rollback(int64_t lsn) = 0; + /// @brief Called when the replDev is created after restart. The consumer is expected to recover all the modules /// necessary to replay/commit the logs. virtual void on_restart() = 0; @@ -339,12 +360,21 @@ class ReplDevListener { /// @return Expected to return blk_alloc_hints for this write. If the hints are not available, then return the /// error. It is to be noted this method should return error only in very abnornal cases as in some code flow, an /// error would result in a crash or stall of the entire commit thread. - virtual ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size) = 0; + virtual ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size, + cintrusive< homestore::repl_req_ctx >& hs_ctx) = 0; /// @brief Called when the repl_dev is being destroyed. The consumer is expected to clean up any related resources. /// However, it is expected that this call be idempotent. It is possible in rare scenarios that this can be called /// after restart in case crash happened during the destroy. - virtual void on_destroy() = 0; + virtual void on_destroy(const group_id_t& group_id) = 0; + + /// @brief Called when a replace member operation starts. + virtual void on_start_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, + trace_id_t tid) = 0; + + /// @brief Called when a replace member operation completes. + virtual void on_complete_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, + trace_id_t tid) = 0; /// @brief Called when the snapshot is being created by nuraft virtual AsyncReplResult<> create_snapshot(shared< snapshot_context > context) = 0; @@ -359,18 +389,37 @@ class ReplDevListener { /// uses offset given by the follower to the know the current state of the follower. /// Leader sends the snapshot data to the follower in batch. This callback is called multiple /// times on the leader till all the data is transferred to the follower. is_last_obj in - /// snapshot_data will be true once all the data has been trasnferred. After this the raft on + /// snapshot_obj will be true once all the data has been transferred. After this the raft on /// the follower side can do the incremental resync. - virtual int read_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data) = 0; + virtual int read_snapshot_obj(shared< snapshot_context > context, shared< snapshot_obj > snp_obj) = 0; /// @brief Called on the follower when the leader sends the data during the baseline resyc. - /// is_last_obj in in snapshot_data will be true once all the data has been transfered. + /// is_last_obj in snapshot_obj will be true once all the data has been transferred. /// After this the raft on the follower side can do the incremental resync. - virtual void write_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data) = 0; + virtual void write_snapshot_obj(shared< snapshot_context > context, shared< snapshot_obj > snp_obj) = 0; - /// @brief Free up user-defined context inside the snapshot_data that is allocated during read_snapshot_data. + /// @brief Free up user-defined context inside the snapshot_obj that is allocated during read_snapshot_obj. virtual void free_user_snp_ctx(void*& user_snp_ctx) = 0; + /// @brief Ask the upper layer to decide which data should be returned. + // @param header - header of the log entry. + // @param blkid - original blkid of the log entry + // @param sgs - sgs to be filled with data + // @param lsn - lsn of the log entry + virtual folly::Future< std::error_code > on_fetch_data(const int64_t lsn, const sisl::blob& header, + const MultiBlkId& blkid, sisl::sg_list& sgs) { + // default implementation is reading by blkid directly + return data_service().async_read(blkid, sgs, sgs.size); + } + + /// @brief Ask the upper layer to handle a no_space_left event + // @param lsn - on which repl_lsn no_space_left happened + // @param chunk_id - on which chunk no_space_left happened + virtual void on_no_space_left(repl_lsn_t lsn, chunk_num_t chunk_id) = 0; + + /// @brief On restart, after all the logs are replayed and before joining the raft group, notify the upper layer + virtual void on_log_replay_done(const group_id_t& group_id) {}; + private: std::weak_ptr< ReplDev > m_repl_dev; }; @@ -380,6 +429,39 @@ class ReplDev { ReplDev() = default; virtual ~ReplDev() { detach_listener(); } + /// @brief Allocates blkids from the storage engine to write the value into. Storage + /// engine returns a blkid_list in cases where single contiguous blocks are not + /// available. + /// + /// @param data_size - Size of the data. + /// @param hints - Specify block allocation hints. + /// @param out_blkids - List of blkid's which may not be contiguous. + virtual std::error_code alloc_blks(uint32_t data_size, const blk_alloc_hints& hints, + std::vector< MultiBlkId >& out_blkids) = 0; + + /// @brief Write data locally using the specified blkid's. Data is split across the blkids. + /// @param blkids - List of blkid's where data will be written. + /// @param value - vector of io buffers that contain value for the key. + /// @param part_of_batch - Is this write part of a batch.
If part of the batch, then submit_batch needs to be called + /// at the end + /// @return A Future with std::error_code to notify if it has successfully write the data or any error code in case + /// of failure + virtual folly::Future< std::error_code > async_write(const std::vector< MultiBlkId >& blkids, + sisl::sg_list const& value, bool part_of_batch = false, + trace_id_t tid = 0) = 0; + + /// @brief Creates a log/journal entry with and calls the on_commit listener callback. + /// @param blkids - List of blkid's where data was written. + /// @param header - Blob representing the header (it is opaque and will be copied + /// as-is to the journal entry) + /// @param key - Blob representing the key (it is opaque and will be copied as-is to + /// the journal entry). + /// @param data_size - Size of the data. + /// @param ctx - User supplied context which will be passed to listener callbacks + virtual void async_write_journal(const std::vector< MultiBlkId >& blkids, sisl::blob const& header, + sisl::blob const& key, uint32_t data_size, repl_req_ptr_t ctx, + trace_id_t tid = 0) = 0; + /// @brief Replicate the data to the replica set. This method goes through the /// following steps: /// Step 1: Allocates blkid from the storage engine to write the value into. Storage @@ -397,10 +479,11 @@ class ReplDev { /// cases /// @param value - vector of io buffers that contain value for the key. It is an optional field and if the value /// list size is 0, then only key is written to replicadev without data. - /// @param ctx - User supplied context which will be passed to listener - /// callbacks + /// @param ctx - User supplied context which will be passed to listener callbacks + /// @param part_of_batch Is write is part of a batch. If part of the batch, then submit_batch needs to be called at + /// the end virtual void async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, - repl_req_ptr_t ctx) = 0; + repl_req_ptr_t ctx, bool part_of_batch = false, trace_id_t tid = 0) = 0; /// @brief Reads the data and returns a future to continue on /// @param bid Block id to read @@ -411,13 +494,14 @@ class ReplDev { /// @return A Future with std::error_code to notify if it has successfully read the data or any error code in case /// of failure virtual folly::Future< std::error_code > async_read(MultiBlkId const& blkid, sisl::sg_list& sgs, uint32_t size, - bool part_of_batch = false) = 0; + bool part_of_batch = false, trace_id_t tid = 0) = 0; /// @brief After data is replicated and on_commit to the listener is called. the blkids can be freed. /// /// @param lsn - LSN of the old blkids that is being freed /// @param blkids - blkids to be freed. - virtual void async_free_blks(int64_t lsn, MultiBlkId const& blkid) = 0; + virtual folly::Future< std::error_code > async_free_blks(int64_t lsn, MultiBlkId const& blkid, + trace_id_t tid = 0) = 0; /// @brief Try to switch the current replica where this method called to become a leader. /// @return True if it is successful, false otherwise. @@ -438,10 +522,30 @@ class ReplDev { /// @return group_id virtual group_id_t group_id() const = 0; + /// @brief Sets a custom name for the repldev. Users can assign a meaningful name to the repldev for easy debugging. 
+ virtual void set_custom_rdev_name(std::string const& name) = 0; + /// @brief Gets the block size with which IO will happen on this device /// @return Block size virtual uint32_t get_blk_size() const = 0; + /// @brief Gets the last commit lsn of this repldev + /// @return last_commit_lsn + virtual repl_lsn_t get_last_commit_lsn() const = 0; + + /// @brief Gets the repl lsn of the last log in log store + /// @return last_append_repl_lsn + virtual repl_lsn_t get_last_append_lsn() = 0; + + /// @brief if this replica is ready for accepting client IO. + /// @return true if ready, false otherwise + virtual bool is_ready_for_traffic() const = 0; + + /// @brief Clean up resources on this repl dev. + virtual void purge() = 0; + + virtual std::shared_ptr< snapshot_context > deserialize_snapshot_context(sisl::io_blob_safe& snp_ctx) = 0; + virtual void attach_listener(shared< ReplDevListener > listener) { m_listener = std::move(listener); } virtual void detach_listener() { @@ -451,6 +555,30 @@ class ReplDev { } } + virtual shared< ReplDevListener > get_listener() { return m_listener; } + + // we have no shutdown for repl_dev, since shutdown repl_dev is done by repl_service + void stop() { +#if 0 + start_stopping(); + while (true) { + auto pending_request_num = get_pending_request_num(); + if (!pending_request_num) break; + + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } +#endif + } + + // complete all the requests that are in progress and start refusing new reqs + virtual void quiesce_reqs() = 0; + + // start accepting new reqs + virtual void resume_accepting_reqs() = 0; + + // clear reqs that has allocated blks on the given chunk. + virtual void clear_chunk_req(chunk_num_t chunk_id) = 0; + protected: shared< ReplDevListener > m_listener; }; diff --git a/src/include/homestore/replication_service.hpp b/src/include/homestore/replication_service.hpp index 8f535b855..f28704546 100644 --- a/src/include/homestore/replication_service.hpp +++ b/src/include/homestore/replication_service.hpp @@ -41,9 +41,18 @@ class ReplicationService { /// @return A Future which gets called after schedule to release (before garbage collection is kicked in) virtual folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) = 0; - virtual AsyncReplResult<> replace_member(group_id_t group_id, replica_id_t member_out, - replica_id_t member_in) const = 0; - + /// @brief Replace one of the members with a new one. + /// @param group_id Group where the replace member happens + /// @param member_out The member which is going to be replaced + /// @param member_in The member which is going to be added in place of member_out + /// @param commit_quorum Commit quorum to be used for this operation. If 0, it will use the default commit quorum. 
+ /// @return A Future on replace the member accepted or Future ReplServiceError upon error + virtual AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum = 0, + uint64_t trace_id = 0) const = 0; + + virtual AsyncReplResult<> flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, uint32_t commit_quorum, + bool wait_and_verify = true, uint64_t trace_id = 0) const = 0; /// @brief Get the repl dev for a given group id if it is already created or opened /// @param group_id Group id interested in /// @return ReplDev is opened or ReplServiceError::SERVER_NOT_FOUND if it doesn't exist @@ -74,6 +83,14 @@ class ReplApplication { // Listener corresponding to the ReplDev which will be used to perform the precommit/commit/rollback. virtual shared< ReplDevListener > create_repl_dev_listener(group_id_t group_id) = 0; + // Called when the repl dev is destroyed. This interface provides the application a chance to cleanup any resources + // assocated with this listener; + virtual void destroy_repl_dev_listener(group_id_t group_id) = 0; + + // Called after all the repl devs are found upon restart of the homestore instance. + // it is a nice place for upper layer to recovery anything depends on repl_devs + virtual void on_repl_devs_init_completed() = 0; + // Given the uuid of the peer, get their address and port virtual std::pair< std::string, uint16_t > lookup_peer(replica_id_t uuid) const = 0; diff --git a/src/include/homestore/vchunk.h b/src/include/homestore/vchunk.h index b52832faa..4b69b1332 100644 --- a/src/include/homestore/vchunk.h +++ b/src/include/homestore/vchunk.h @@ -35,6 +35,8 @@ class VChunk { uint32_t get_pdev_id() const; uint16_t get_chunk_id() const; cshared< Chunk > get_internal_chunk() const; + uint64_t size() const; + void reset(); private: shared< Chunk > m_internal_chunk; diff --git a/src/lib/blkalloc/append_blk_allocator.cpp b/src/lib/blkalloc/append_blk_allocator.cpp index 4a4c7fd18..2f6cec25c 100644 --- a/src/lib/blkalloc/append_blk_allocator.cpp +++ b/src/lib/blkalloc/append_blk_allocator.cpp @@ -67,14 +67,23 @@ BlkAllocStatus AppendBlkAllocator::alloc_contiguous(BlkId& bid) { return alloc(1 // If we want to change above design, we can open this api for vector allocation; // BlkAllocStatus AppendBlkAllocator::alloc(blk_count_t nblks, const blk_alloc_hints& hint, BlkId& out_bid) { - if (available_blks() < nblks) { + auto avail_blks = available_blks(); + if (hint.reserved_blks) { + avail_blks = avail_blks > hint.reserved_blks.value() ? avail_blks - hint.reserved_blks.value() : 0; + } + if (avail_blks < nblks) { // COUNTER_INCREMENT(m_metrics, num_alloc_failure, 1); - LOGERROR("No space left to serve request nblks: {}, available_blks: {}", nblks, available_blks()); + LOGERROR("No space left to serve request nblks: {}, available_blks: {}, actual available_blks(exclude reserved " + "blks): {}", + nblks, available_blks(), avail_blks); + // the caller can know in which chunk no_space_left happened; + out_bid = BlkId{0, 0, m_chunk_id}; return BlkAllocStatus::SPACE_FULL; } else if (nblks > max_blks_per_blkid()) { // consumer(vdev) already handles this case. 
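The SPACE_FULL path above now tells the caller which chunk ran out (out_bid carries m_chunk_id), which is what makes the per-chunk recovery hooks added to ReplDev (quiesce_reqs, clear_chunk_req, resume_accepting_reqs) and the on_no_space_left listener callback usable together. Below is a rough sketch of the sequence a consumer might run; the reclaim step is entirely hypothetical and the exact ordering is an assumption, not something this patch prescribes.

// Example arithmetic for the clamping above: available_blks() = 100, reserved_blks = 30 -> usable = 70,
// so a request for 80 blks returns SPACE_FULL while a request for 60 still succeeds.
void handle_no_space_left(std::shared_ptr< homestore::ReplDev > rd, homestore::chunk_num_t chunk_id) {
    rd->quiesce_reqs();              // finish in-flight requests and refuse new ones
    rd->clear_chunk_req(chunk_id);   // drop requests that already allocated blocks on the full chunk
    reclaim_chunk(chunk_id);         // hypothetical: move/expire data, then e.g. VChunk::reset() the chunk
    rd->resume_accepting_reqs();     // start taking traffic again
}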
// COUNTER_INCREMENT(m_metrics, num_alloc_failure, 1); LOGERROR("Can't serve request nblks: {} larger than max_blks_in_op: {}", nblks, max_blks_per_blkid()); + out_bid = BlkId{0, 0, m_chunk_id}; return BlkAllocStatus::FAILED; } @@ -127,33 +136,9 @@ void AppendBlkAllocator::cp_flush(CP* cp) { } } -// -// free operation does: -// 1. book keeping "total freeable" space -// 2. if the blk being freed happens to be last block, move last_append_offset backwards accordingly; -// +// free operation books keeping "total freeable" space void AppendBlkAllocator::free(const BlkId& bid) { - // If we are freeing the last block, just move the offset back - blk_num_t cur_last_offset = m_last_append_offset.load(); - auto const input_last_offset = bid.blk_num() + bid.blk_count(); - blk_num_t new_last_offset; - bool freeing_in_middle{false}; - do { - if (input_last_offset == cur_last_offset) { - new_last_offset = bid.blk_num(); - freeing_in_middle = false; - } else { - new_last_offset = cur_last_offset; - freeing_in_middle = true; - } - } while (!m_last_append_offset.compare_exchange_weak(cur_last_offset, new_last_offset)); - - if (freeing_in_middle) { - // Freeing something in the middle, increment the count - m_freeable_nblks.fetch_add(bid.blk_count()); - } else { - m_commit_offset.store(m_last_append_offset.load()); - } + m_freeable_nblks.fetch_add(bid.blk_count()); m_is_dirty.store(true); } @@ -162,6 +147,13 @@ bool AppendBlkAllocator::is_blk_alloced(const BlkId& in_bid, bool) const { return in_bid.blk_num() < get_used_blks(); } +void AppendBlkAllocator::reset() { + m_last_append_offset.store(0); + m_freeable_nblks.store(0); + m_commit_offset.store(0); + m_is_dirty.store(true); +} + bool AppendBlkAllocator::is_blk_alloced_on_disk(BlkId const& bid, bool) const { return bid.blk_num() < m_sb->commit_offset; } diff --git a/src/lib/blkalloc/append_blk_allocator.h b/src/lib/blkalloc/append_blk_allocator.h index 384a4936b..5e745c33a 100644 --- a/src/lib/blkalloc/append_blk_allocator.h +++ b/src/lib/blkalloc/append_blk_allocator.h @@ -38,21 +38,21 @@ struct append_blk_sb_t { }; #pragma pack() -//class AppendBlkAllocMetrics : public sisl::MetricsGroup { -//public: -// explicit AppendBlkAllocMetrics(const char* inst_name) : sisl::MetricsGroup("AppendBlkAlloc", inst_name) { -// REGISTER_COUNTER(num_alloc, "Number of blks alloc attempts"); -// REGISTER_COUNTER(num_alloc_failure, "Number of blk alloc failures"); +// class AppendBlkAllocMetrics : public sisl::MetricsGroup { +// public: +// explicit AppendBlkAllocMetrics(const char* inst_name) : sisl::MetricsGroup("AppendBlkAlloc", inst_name) { +// REGISTER_COUNTER(num_alloc, "Number of blks alloc attempts"); +// REGISTER_COUNTER(num_alloc_failure, "Number of blk alloc failures"); // -// register_me_to_farm(); -// } +// register_me_to_farm(); +// } // -// AppendBlkAllocMetrics(const AppendBlkAllocMetrics&) = delete; -// AppendBlkAllocMetrics(AppendBlkAllocMetrics&&) noexcept = delete; -// AppendBlkAllocMetrics& operator=(const AppendBlkAllocMetrics&) = delete; -// AppendBlkAllocMetrics& operator=(AppendBlkAllocMetrics&&) noexcept = delete; -// ~AppendBlkAllocMetrics() { deregister_me_from_farm(); } -//}; +// AppendBlkAllocMetrics(const AppendBlkAllocMetrics&) = delete; +// AppendBlkAllocMetrics(AppendBlkAllocMetrics&&) noexcept = delete; +// AppendBlkAllocMetrics& operator=(const AppendBlkAllocMetrics&) = delete; +// AppendBlkAllocMetrics& operator=(AppendBlkAllocMetrics&&) noexcept = delete; +// ~AppendBlkAllocMetrics() { deregister_me_from_farm(); } +// }; // // The 
assumption for AppendBlkAllocator: @@ -108,6 +108,11 @@ class AppendBlkAllocator : public BlkAllocator { std::string to_string() const override; + /** + * @brief : reset the allocator to initial state, so all the blks in this chunk are free. + */ + void reset() override; + void cp_flush(CP* cp) override; void recovery_completed() override {} nlohmann::json get_status(int log_level) const override; @@ -121,7 +126,7 @@ class AppendBlkAllocator : public BlkAllocator { std::atomic< blk_num_t > m_freeable_nblks{0}; // count of blks fragmentedly freed (both on-disk and in-memory) std::atomic< blk_num_t > m_commit_offset{0}; // offset in on-disk version std::atomic< bool > m_is_dirty{false}; - //AppendBlkAllocMetrics m_metrics; + // AppendBlkAllocMetrics m_metrics; superblk< append_blk_sb_t > m_sb; // only cp will be writing to this disk }; diff --git a/src/lib/blkalloc/bitmap_blk_allocator.h b/src/lib/blkalloc/bitmap_blk_allocator.h index 381767bef..a86e08757 100644 --- a/src/lib/blkalloc/bitmap_blk_allocator.h +++ b/src/lib/blkalloc/bitmap_blk_allocator.h @@ -77,6 +77,7 @@ class BitmapBlkAllocator : public BlkAllocator { void cp_flush(CP* cp) override; void recovery_completed() override {} + void reset() override {} blk_num_t get_num_portions() const { return (m_num_blks - 1) / m_blks_per_portion + 1; } blk_num_t get_blks_per_portion() const { return m_blks_per_portion; } diff --git a/src/lib/blkalloc/blk_allocator.h b/src/lib/blkalloc/blk_allocator.h index b381f71c5..8c64fc8e5 100644 --- a/src/lib/blkalloc/blk_allocator.h +++ b/src/lib/blkalloc/blk_allocator.h @@ -158,6 +158,7 @@ class BlkAllocator { virtual bool is_blk_alloced(BlkId const& b, bool use_lock = false) const = 0; virtual bool is_blk_alloced_on_disk(BlkId const& b, bool use_lock = false) const = 0; virtual void recovery_completed() = 0; + virtual void reset() = 0; virtual std::string to_string() const = 0; virtual void cp_flush(CP* cp) = 0; diff --git a/src/lib/blkalloc/fixed_blk_allocator.h b/src/lib/blkalloc/fixed_blk_allocator.h index fa28681f2..01f1e1138 100644 --- a/src/lib/blkalloc/fixed_blk_allocator.h +++ b/src/lib/blkalloc/fixed_blk_allocator.h @@ -41,6 +41,7 @@ class FixedBlkAllocator : public BitmapBlkAllocator { blk_num_t available_blks() const override; blk_num_t get_used_blks() const override; blk_num_t get_defrag_nblks() const override; + void reset() override{}; bool is_blk_alloced(BlkId const& in_bid, bool use_lock = false) const override; std::string to_string() const override; diff --git a/src/lib/blkalloc/varsize_blk_allocator.h b/src/lib/blkalloc/varsize_blk_allocator.h index 1a90de8da..03a507b03 100644 --- a/src/lib/blkalloc/varsize_blk_allocator.h +++ b/src/lib/blkalloc/varsize_blk_allocator.h @@ -222,6 +222,7 @@ class VarsizeBlkAllocator : public BitmapBlkAllocator { blk_num_t get_used_blks() const override; bool is_blk_alloced(BlkId const& in_bid, bool use_lock = false) const override; std::string to_string() const override; + void reset() override{}; nlohmann::json get_metrics_in_json(); private: diff --git a/src/lib/blkdata_svc/blkdata_service.cpp b/src/lib/blkdata_svc/blkdata_service.cpp index 4acd3d846..b17fc0a61 100644 --- a/src/lib/blkdata_svc/blkdata_service.cpp +++ b/src/lib/blkdata_svc/blkdata_service.cpp @@ -34,11 +34,12 @@ BlkDataService::BlkDataService(shared< ChunkSelector > chunk_selector) : m_custom_chunk_selector{std::move(chunk_selector)} { m_blk_read_tracker = std::make_unique< BlkReadTracker >(); } + BlkDataService::~BlkDataService() = default; // first-time boot path void 
BlkDataService::create_vdev(uint64_t size, HSDevType devType, uint32_t blk_size, blk_allocator_type_t alloc_type, - chunk_selector_type_t chunk_sel_type, uint32_t num_chunks) { + chunk_selector_type_t chunk_sel_type, uint32_t num_chunks, uint32_t chunk_size) { hs_vdev_context vdev_ctx; vdev_ctx.type = hs_vdev_type_t::DATA_VDEV; @@ -48,6 +49,7 @@ void BlkDataService::create_vdev(uint64_t size, HSDevType devType, uint32_t blk_ .vdev_size = size, .num_chunks = num_chunks, .blk_size = blk_size, + .chunk_size = chunk_size, .dev_type = devType, .alloc_type = alloc_type, .chunk_sel_type = chunk_sel_type, @@ -188,8 +190,28 @@ folly::Future< std::error_code > BlkDataService::async_write(sisl::sg_list const } } +folly::Future< std::error_code > +BlkDataService::async_write(sisl::sg_list const& sgs, std::vector< MultiBlkId > const& blkids, bool part_of_batch) { + static thread_local std::vector< folly::Future< std::error_code > > s_futs; + s_futs.clear(); + for (const auto& blkid : blkids) { + s_futs.emplace_back(async_write(sgs, blkid, part_of_batch)); + } + return collect_all_futures(s_futs); +} + +void BlkDataService::submit_io_batch() { m_vdev->submit_batch(); } + BlkAllocStatus BlkDataService::alloc_blks(uint32_t size, const blk_alloc_hints& hints, MultiBlkId& out_blkids) { - HS_DBG_ASSERT_EQ(size % m_blk_size, 0, "Non aligned size requested"); + HS_DBG_ASSERT_EQ(size % m_blk_size, 0, "Non aligned size requested size={} blk_size={}", size, m_blk_size); + blk_count_t nblks = static_cast< blk_count_t >(size / m_blk_size); + + return m_vdev->alloc_blks(nblks, hints, out_blkids); +} + +BlkAllocStatus BlkDataService::alloc_blks(uint32_t size, const blk_alloc_hints& hints, + std::vector< BlkId >& out_blkids) { + HS_DBG_ASSERT_EQ(size % m_blk_size, 0, "Non aligned size requested size={} blk_size={}", size, m_blk_size); blk_count_t nblks = static_cast< blk_count_t >(size / m_blk_size); return m_vdev->alloc_blks(nblks, hints, out_blkids); @@ -234,10 +256,14 @@ void BlkDataService::start() { std::move(std::make_unique< DataSvcCPCallbacks >(m_vdev))); } +void BlkDataService::stop() {} + uint64_t BlkDataService::get_total_capacity() const { return m_vdev->size(); } uint64_t BlkDataService::get_used_capacity() const { return m_vdev->used_size(); } +HSDevType BlkDataService::get_dev_type() const { return static_cast< HSDevType >(m_vdev->get_dev_type()); } + uint32_t BlkDataService::get_align_size() const { return m_vdev->align_size(); } } // namespace homestore diff --git a/src/lib/checkpoint/cp_mgr.cpp b/src/lib/checkpoint/cp_mgr.cpp index 33d22090a..a387b5da5 100644 --- a/src/lib/checkpoint/cp_mgr.cpp +++ b/src/lib/checkpoint/cp_mgr.cpp @@ -235,7 +235,8 @@ void CPManager::cp_start_flush(CP* cp) { for (size_t svcid = 0; svcid < (size_t)cp_consumer_t::SENTINEL; svcid++) { if (svcid == (size_t)cp_consumer_t::REPLICATION_SVC) { continue; } auto& consumer = m_cp_cb_table[svcid]; - if (consumer) { futs.emplace_back(std::move(consumer->cp_flush(cp))); } + bool participated = (cp->m_contexts[svcid] != nullptr); + if (consumer && participated) { futs.emplace_back(std::move(consumer->cp_flush(cp))); } } folly::collectAllUnsafe(futs).thenValue([this, cp](auto) { @@ -314,8 +315,12 @@ void CPManager::start_cp_thread() { }; auto ctx = std::make_shared< Context >(); - // Start a reactor with 9 fibers (8 for sync io) - iomanager.create_reactor("cp_io", iomgr::INTERRUPT_LOOP, 8u, [this, ctx](bool is_started) { + // Start a reactor with 2 fibers (1 for sync io) + // Prevent deadlock with sync_io fibers. 
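The fiber-count reduction above (its rationale continues in the comment just below) is easier to see with a concrete failure shape. The following is only an illustration of the hazard, not code from the patch:

#include <mutex>

std::mutex mtx; // a thread-level mutex shared by two fibers on the same reactor thread
// fiber A: takes mtx, issues a synchronous io_uring IO and is suspended while still holding mtx.
void fiber_a() { std::lock_guard lg(mtx); /* sync IO here; the fiber yields, the lock stays held */ }
// fiber B: gets scheduled on the same thread, tries to take mtx and blocks the whole thread,
// so fiber A can never resume to unlock it -- the reactor is wedged. With a single sync-io
// fiber per cp_io reactor there is no second contender.
void fiber_b() { std::lock_guard lg(mtx); }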
+ // Multiple sync_io fibers may acquire a thread-level mutex and perform synchronous I/O using io_uring. + // This can block the fiber and allow other fibers to be scheduled. + // If another fiber tries to acquire the same mutex, a deadlock can occur. + iomanager.create_reactor("cp_io", iomgr::INTERRUPT_LOOP, 2u, [this, ctx](bool is_started) { if (is_started) { { std::unique_lock< std::mutex > lk{ctx->mtx}; diff --git a/src/lib/common/crash_simulator.hpp b/src/lib/common/crash_simulator.hpp index 788de1eac..e8826b61d 100644 --- a/src/lib/common/crash_simulator.hpp +++ b/src/lib/common/crash_simulator.hpp @@ -42,8 +42,12 @@ class CrashSimulator { } } + bool will_crash() const { return m_will_crash.load(); } + void set_will_crash(bool crash) { m_will_crash.store(crash); } + private: std::function< void(void) > m_restart_cb{nullptr}; + std::atomic m_will_crash{false}; sisl::urcu_scoped_ptr< bool > m_crashed; }; } // namespace homestore diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs index cd8858863..4a7f9bd8b 100644 --- a/src/lib/common/homestore_config.fbs +++ b/src/lib/common/homestore_config.fbs @@ -143,7 +143,11 @@ table Generic { cache_max_throttle_cnt : uint32 = 4; // writeback cache max q depth - cache_min_throttle_cnt : uint32 = 4; // writeback cache min q deoth + cache_min_throttle_cnt : uint32 = 4; // writeback cache min q depth + + cache_hashmap_nbuckets : uint32 = 1000000; // num buckets for sisl::SimpleHashmap used in wbcache + + cache_evictor_npartitions: uint32 = 1000; // num partitions for lru evictor in the cache // if this value is set to 0, no sanity check will be run; sanity_check_level: uint32 = 1 (hotswap); @@ -255,6 +259,10 @@ table Consensus { // Max append batch size max_append_batch_size: int32 = 64; + // Max grpc message size, use 64M (max data size on data channel) + 128M (max snasphot batch size) + 1M + // Please adjust it if data_fetch_max_size_kb is increased as well + max_grpc_message_size: int32 = 202375168; + // Threshold of log gap from leader to consider a replica as stale stale_log_gap_hi_threshold: int32 = 200; @@ -262,7 +270,8 @@ table Consensus { stale_log_gap_lo_threshold: int32 = 30; // Minimum log gap a replica has to be from leader before joining the replica set. - min_log_gap_to_join: int32 = 30; + // 0 indicates the new member will join in cluster immediately. + min_log_gap_to_join: int32 = 0; // amount of time in millis to wait on data write before fetch data from remote; wait_data_write_timer_ms: uint64 = 1500 (hotswap); @@ -279,11 +288,38 @@ table Consensus { // ReplDev Reqs timeout in seconds. repl_req_timeout_sec: uint32 = 300; + // Timeout for snapshot sync context in ms. If the follower doesn't response + // within this timeout during snapshot resync, the leader will release snapshot sync context. + snapshot_sync_ctx_timeout_ms: int32 = 60000; + // Frequency to flush durable commit LSN in millis flush_durable_commit_interval_ms: uint64 = 500; // Log difference to determine if the follower is in resync mode resync_log_idx_threshold: int64 = 100; + + // Log difference from leader's point of view, to determine if the + // follower is laggy and if so, leader will stop pushing data until it drops under this threshold. 
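For the max_grpc_message_size default above, the magic number is just the sum of the budgets named in the comment; a small compile-time check (not part of the patch) makes the relationship explicit:

// 64 MiB data channel + 128 MiB snapshot batch + 1 MiB headroom = 202375168 bytes
constexpr int64_t kMaxDataSize = 64ll * 1024 * 1024;       // 67108864
constexpr int64_t kMaxSnapshotBatch = 128ll * 1024 * 1024; // 134217728
constexpr int64_t kHeadroom = 1ll * 1024 * 1024;           // 1048576
static_assert(kMaxDataSize + kMaxSnapshotBatch + kHeadroom == 202375168, "default must cover both channels");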
+ laggy_threshold: int64 = 2000; + + // Reading snapshot objects will be done by a background thread asynchronously + // instead of synchronous read by Raft worker threads + use_bg_thread_for_snapshot_io: bool = true; + + // Maximum number of election timeout rounds to wait during a prioritized leader election process. + // Every election timeout will compare its priority with the target_priority(max priority of the peers initially) + // then decay the target_priority and wait again until its priority >= target_priority. This setting helps us to set proper priority for peers. + // 0 means all members have the same priority. + max_wait_rounds_of_priority_election: uint32 = 2; + + // Maximum number of retries when raft is undergoing config changing + config_changing_error_retries: int32 = 3; + + // The time to wait for config change to be applied in ms + wait_for_config_change_ms: uint32 = 500; + + // The interval in ms to check if the new member in replace_member is fully synced and ready to take over + replace_member_sync_check_interval_ms: uint64 = 60000; } table HomeStoreSettings { diff --git a/src/lib/common/homestore_utils.hpp b/src/lib/common/homestore_utils.hpp index 2ee51b03d..b6989ff48 100644 --- a/src/lib/common/homestore_utils.hpp +++ b/src/lib/common/homestore_utils.hpp @@ -53,4 +53,8 @@ class hs_utils { static bool topological_sort(std::unordered_map< std::string, std::vector< std::string > >& DAG, std::vector< std::string >& ordered_entries); }; + +static bool wait_and_check(const std::function< bool() >& check_func, uint32_t timeout_ms, + uint32_t interval_ms = 100); + } // namespace homestore diff --git a/src/lib/common/resource_mgr.cpp b/src/lib/common/resource_mgr.cpp index 8719089b9..8440d6f68 100644 --- a/src/lib/common/resource_mgr.cpp +++ b/src/lib/common/resource_mgr.cpp @@ -15,13 +15,15 @@ *********************************************************************************/ #include #include -#include #include #include #include "resource_mgr.hpp" #include "homestore_assert.hpp" -#include "replication/repl_dev/raft_repl_dev.h" +#ifdef REPLICATION_SUPPORT +#include +#include "replication/repl_dev/raft_repl_dev.h" +#endif namespace homestore { ResourceMgr& resource_mgr() { return hs()->resource_mgr(); } @@ -48,14 +50,16 @@ void ResourceMgr::stop() { // void ResourceMgr::trigger_truncate() { if (hs()->has_repl_data_service()) { - // first make sure all repl dev's underlying raft log store make corresponding reservation during - // truncate -- set the safe truncate boundary for each raft log store; - hs()->repl_service().iterate_repl_devs([](cshared< ReplDev >& rd) { - // lock is already taken by repl service layer; - std::dynamic_pointer_cast< RaftReplDev >(rd)->truncate( - HS_DYNAMIC_CONFIG(resource_limits.raft_logstore_reserve_threshold)); - }); - + /* + * DO NOT NEED : raft will truncate logs. 
+ * // first make sure all repl dev's underlying raft log store make corresponding reservation during + * // truncate -- set the safe truncate boundary for each raft log store; + * hs()->repl_service().iterate_repl_devs([](cshared< ReplDev >& rd) { + * // lock is already taken by repl service layer; + * std::dynamic_pointer_cast< RaftReplDev >(rd)->truncate( + * HS_DYNAMIC_CONFIG(resource_limits.raft_logstore_reserve_threshold)); + * }); + */ // next do device truncate which go through all logdevs and truncate them; hs()->logstore_service().device_truncate(); } diff --git a/src/lib/device/README.md b/src/lib/device/README.md new file mode 100644 index 000000000..99f83ecc0 --- /dev/null +++ b/src/lib/device/README.md @@ -0,0 +1,7 @@ +# Device Manager + +## HomeStore 4.x Disk Layout +1. max_num_chunks is decided by device size and min_chunk_size which is configurable by HomeStore consumer +2. Super SuperBlk (SSB) is the first meta blk to load Meta Service. All other System Meta Blks are chained together by loading the SSB + +![HomeStore_Disk_Layout](../../../docs/imgs/HomeStore_Disk_Layout2.png) diff --git a/src/lib/device/chunk.cpp b/src/lib/device/chunk.cpp index 9eb8563de..4962be386 100644 --- a/src/lib/device/chunk.cpp +++ b/src/lib/device/chunk.cpp @@ -29,6 +29,10 @@ std::string Chunk::to_string() const { vdev_ordinal(), stream_id()); } +float Chunk::get_blk_usage() const { + return s_cast(m_blk_allocator->get_used_blks()) / s_cast(m_blk_allocator->get_total_blks()); +} + void Chunk::set_user_private(const sisl::blob& data) { std::unique_lock lg{m_mgmt_mutex}; m_chunk_info.set_user_private(data); diff --git a/src/lib/device/chunk.h b/src/lib/device/chunk.h index 77b275e4b..b9d84abdb 100644 --- a/src/lib/device/chunk.h +++ b/src/lib/device/chunk.h @@ -27,6 +27,7 @@ class Chunk { const uint32_t m_stream_id; uint32_t m_vdev_ordinal{0}; shared< BlkAllocator > m_blk_allocator; + float blk_usage_report_threshold{0.9}; public: static constexpr auto MAX_CHUNK_SIZE = std::numeric_limits< uint32_t >::max(); @@ -66,6 +67,8 @@ class Chunk { nlohmann::json get_status([[maybe_unused]] int log_level) const; const BlkAllocator* blk_allocator() const { return m_blk_allocator.get(); } BlkAllocator* blk_allocator_mutable() { return m_blk_allocator.get(); } + float get_blk_usage_report_threshold() const { return blk_usage_report_threshold; } + float get_blk_usage() const; ////////////// Setters ///////////////////// void set_user_private(const sisl::blob& data); diff --git a/src/lib/device/device.h b/src/lib/device/device.h index beefdfc7f..1c3843534 100644 --- a/src/lib/device/device.h +++ b/src/lib/device/device.h @@ -36,6 +36,7 @@ VENUM(vdev_multi_pdev_opts_t, uint8_t, // Indicates the style of vdev when multi struct vdev_info { static constexpr size_t size = 512; static constexpr size_t user_private_size = 256; + static constexpr size_t max_name_len = 64; uint64_t vdev_size{0}; // 0: Size of the vdev uint32_t vdev_id{0}; // 8: Id for this vdev. 
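Going back to the chunk.h/chunk.cpp hunks above: get_blk_usage together with the 0.9 blk_usage_report_threshold looks intended for a periodic usage check. A hypothetical consumer-side sketch, assuming the usual const getters; the reporting callback is invented for illustration:

// Illustration only: report_high_usage is a hypothetical hook, not a homestore API.
void maybe_report_usage(const homestore::Chunk& chunk) {
    float const usage = chunk.get_blk_usage(); // used_blks / total_blks
    if (usage >= chunk.get_blk_usage_report_threshold()) { // 0.9 by default per the hunk above
        report_high_usage(chunk, usage);
    }
}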
It is unique per homestore instance @@ -48,7 +49,7 @@ struct vdev_info { uint8_t failed{0}; // 30: set to true if disk is replaced uint8_t hs_dev_type{0}; // 31: PDev dev type (as in fast or data) uint8_t multi_pdev_choice{0}; // 32: Choice when multiple pdevs are present (vdev_multi_pdev_opts_t) - char name[64]; // 33: Name of the vdev + char name[max_name_len]; // 33: Name of the vdev uint16_t checksum{0}; // 97: Checksum of this entire Block uint8_t alloc_type; // 98: Allocator type of this vdev uint8_t chunk_sel_type; // 99: Chunk Selector type of this vdev_id @@ -59,7 +60,10 @@ struct vdev_info { uint32_t get_vdev_id() const { return vdev_id; } uint64_t get_size() const { return vdev_size; } - void set_name(const std::string& n) { std::strncpy(charptr_cast(name), n.c_str(), 63); } + void set_name(const std::string& n) { + std::strncpy(charptr_cast(name), n.c_str(), max_name_len - 1); + name[max_name_len - 1] = '\0'; + } std::string get_name() const { return std::string{c_charptr_cast(name)}; } void set_allocated() { slot_allocated = s_cast< uint8_t >(0x01); }; diff --git a/src/lib/device/device_manager.cpp b/src/lib/device/device_manager.cpp index cac91237f..28eb37e33 100644 --- a/src/lib/device/device_manager.cpp +++ b/src/lib/device/device_manager.cpp @@ -99,7 +99,8 @@ void DeviceManager::format_devices() { ++m_first_blk_hdr.gen_number; m_first_blk_hdr.version = first_block_header::CURRENT_SUPERBLOCK_VERSION; std::strncpy(m_first_blk_hdr.product_name, first_block_header::PRODUCT_NAME, - first_block_header::s_product_name_size); + first_block_header::s_product_name_size - 1); + m_first_blk_hdr.product_name[first_block_header::s_product_name_size - 1] = '\0'; m_first_blk_hdr.num_pdevs = uint32_cast(m_dev_infos.size()); m_first_blk_hdr.max_vdevs = hs_super_blk::MAX_VDEVS_IN_SYSTEM; m_first_blk_hdr.max_system_chunks = hs_super_blk::MAX_CHUNKS_IN_SYSTEM; diff --git a/src/lib/device/hs_super_blk.h b/src/lib/device/hs_super_blk.h index a539c1e56..9d0a3140d 100644 --- a/src/lib/device/hs_super_blk.h +++ b/src/lib/device/hs_super_blk.h @@ -75,7 +75,7 @@ struct disk_attr { }; struct first_block_header { - static constexpr const char* PRODUCT_NAME{"OmStore"}; + static constexpr const char* PRODUCT_NAME{"HomeStore4x"}; static constexpr size_t s_product_name_size{64}; static constexpr uint32_t CURRENT_SUPERBLOCK_VERSION{4}; @@ -128,7 +128,7 @@ struct first_block { static constexpr uint32_t s_atomic_fb_size{512}; // increase 512 to actual size if in the future first_block // can be larger; static constexpr uint32_t s_io_fb_size{4096}; // This is the size we do IO on, with padding - static constexpr uint32_t HOMESTORE_MAGIC{0xCEEDDEEB}; // Magic written as first bytes on each device + static constexpr uint32_t HOMESTORE_MAGIC{0xABBECDCD}; // Magic written as first bytes on each device public: uint64_t magic{0}; // Header magic expected to be at the top of block diff --git a/src/lib/device/journal_vdev.cpp b/src/lib/device/journal_vdev.cpp index 5c3e5b34f..6ca2678fc 100644 --- a/src/lib/device/journal_vdev.cpp +++ b/src/lib/device/journal_vdev.cpp @@ -24,8 +24,6 @@ #include #include #include -#include -#include "replication/repl_dev/raft_repl_dev.h" #include "device/chunk.h" #include "device/device.h" #include "device/physical_dev.hpp" diff --git a/src/lib/device/physical_dev.cpp b/src/lib/device/physical_dev.cpp index 1b6914cf5..ba52ba2f2 100644 --- a/src/lib/device/physical_dev.cpp +++ b/src/lib/device/physical_dev.cpp @@ -35,6 +35,8 @@ namespace homestore { static std::mutex s_cached_dev_mtx; 
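Both strncpy call sites touched above (vdev_info::set_name and the product_name copy in format_devices) get the same fix: copy at most len-1 bytes and force a terminator, because strncpy does not null-terminate when the source fills the destination. The general pattern as a standalone sketch:

#include <cstring>

// strncpy leaves dst unterminated when strlen(src) >= cap; cap the copy and terminate explicitly.
void safe_copy_name(char* dst, size_t cap, const char* src) {
    std::strncpy(dst, src, cap - 1);
    dst[cap - 1] = '\0';
}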
static std::unordered_map< std::string, iomgr::io_device_ptr > s_cached_opened_devs; +__attribute__((no_sanitize_address)) static auto get_current_time() { return Clock::now(); } + iomgr::io_device_ptr open_and_cache_dev(const std::string& devname, int oflags) { std::unique_lock lg(s_cached_dev_mtx); @@ -136,26 +138,50 @@ void PhysicalDev::close_device() { close_and_uncache_dev(m_devname, m_iodev); } folly::Future< std::error_code > PhysicalDev::async_write(const char* data, uint32_t size, uint64_t offset, bool part_of_batch) { - HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); - return m_drive_iface->async_write(m_iodev.get(), data, size, offset, part_of_batch); + auto const start_time = get_current_time(); + return m_drive_iface->async_write(m_iodev.get(), data, size, offset, part_of_batch) + .thenValue([this, start_time, size](std::error_code ec) { + HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); + HISTOGRAM_OBSERVE(m_metrics, drive_write_latency, get_elapsed_time_us(start_time)); + COUNTER_INCREMENT(m_metrics, drive_async_write_count, 1); + return ec; + }); } folly::Future< std::error_code > PhysicalDev::async_writev(const iovec* iov, int iovcnt, uint32_t size, uint64_t offset, bool part_of_batch) { - HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); - return m_drive_iface->async_writev(m_iodev.get(), iov, iovcnt, size, offset, part_of_batch); + auto const start_time = get_current_time(); + return m_drive_iface->async_writev(m_iodev.get(), iov, iovcnt, size, offset, part_of_batch) + .thenValue([this, start_time, size](std::error_code ec) { + HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); + HISTOGRAM_OBSERVE(m_metrics, drive_write_latency, get_elapsed_time_us(start_time)); + COUNTER_INCREMENT(m_metrics, drive_async_write_count, 1); + return ec; + }); } folly::Future< std::error_code > PhysicalDev::async_read(char* data, uint32_t size, uint64_t offset, bool part_of_batch) { - HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); - return m_drive_iface->async_read(m_iodev.get(), data, size, offset, part_of_batch); + auto const start_time = get_current_time(); + return m_drive_iface->async_read(m_iodev.get(), data, size, offset, part_of_batch) + .thenValue([this, start_time, size](std::error_code ec) { + HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); + HISTOGRAM_OBSERVE(m_metrics, drive_read_latency, get_elapsed_time_us(start_time)); + COUNTER_INCREMENT(m_metrics, drive_async_read_count, 1); + return ec; + }); } folly::Future< std::error_code > PhysicalDev::async_readv(iovec* iov, int iovcnt, uint32_t size, uint64_t offset, bool part_of_batch) { - HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); - return m_drive_iface->async_readv(m_iodev.get(), iov, iovcnt, size, offset, part_of_batch); + auto const start_time = get_current_time(); + return m_drive_iface->async_readv(m_iodev.get(), iov, iovcnt, size, offset, part_of_batch) + .thenValue([this, start_time, size](std::error_code ec) { + HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); + HISTOGRAM_OBSERVE(m_metrics, drive_read_latency, get_elapsed_time_us(start_time)); + COUNTER_INCREMENT(m_metrics, drive_async_read_count, 1); + return ec; + }); } folly::Future< std::error_code > PhysicalDev::async_write_zero(uint64_t size, uint64_t offset) { @@ -174,46 +200,50 @@ folly::Future< std::error_code > PhysicalDev::async_write_zero(uint64_t size, ui folly::Future< 
std::error_code > PhysicalDev::queue_fsync() { return m_drive_iface->queue_fsync(m_iodev.get()); } -__attribute__((no_sanitize_address)) static auto get_current_time() { return Clock::now(); } - std::error_code PhysicalDev::sync_write(const char* data, uint32_t size, uint64_t offset) { - HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); - COUNTER_INCREMENT(m_metrics, drive_sync_write_count, 1); auto const start_time = get_current_time(); auto const ret = m_drive_iface->sync_write(m_iodev.get(), data, size, offset); HISTOGRAM_OBSERVE(m_metrics, drive_write_latency, get_elapsed_time_us(start_time)); + HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); + COUNTER_INCREMENT(m_metrics, drive_sync_write_count, 1); return ret; } std::error_code PhysicalDev::sync_writev(const iovec* iov, int iovcnt, uint32_t size, uint64_t offset) { - HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); - COUNTER_INCREMENT(m_metrics, drive_sync_write_count, 1); auto const start_time = Clock::now(); auto const ret = m_drive_iface->sync_writev(m_iodev.get(), iov, iovcnt, size, offset); HISTOGRAM_OBSERVE(m_metrics, drive_write_latency, get_elapsed_time_us(start_time)); + HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); + COUNTER_INCREMENT(m_metrics, drive_sync_write_count, 1); + return ret; } std::error_code PhysicalDev::sync_read(char* data, uint32_t size, uint64_t offset) { - HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); - COUNTER_INCREMENT(m_metrics, drive_sync_read_count, 1); auto const start_time = Clock::now(); auto const ret = m_drive_iface->sync_read(m_iodev.get(), data, size, offset); HISTOGRAM_OBSERVE(m_metrics, drive_read_latency, get_elapsed_time_us(start_time)); + HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); + COUNTER_INCREMENT(m_metrics, drive_sync_read_count, 1); return ret; } std::error_code PhysicalDev::sync_readv(iovec* iov, int iovcnt, uint32_t size, uint64_t offset) { - HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); - COUNTER_INCREMENT(m_metrics, drive_sync_read_count, 1); auto const start_time = Clock::now(); auto const ret = m_drive_iface->sync_readv(m_iodev.get(), iov, iovcnt, size, offset); HISTOGRAM_OBSERVE(m_metrics, drive_read_latency, get_elapsed_time_us(start_time)); + HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); + COUNTER_INCREMENT(m_metrics, drive_sync_read_count, 1); return ret; } std::error_code PhysicalDev::sync_write_zero(uint64_t size, uint64_t offset) { - return m_drive_iface->sync_write_zero(m_iodev.get(), size, offset); + auto const start_time = Clock::now(); + auto const ret = m_drive_iface->sync_write_zero(m_iodev.get(), size, offset); + HISTOGRAM_OBSERVE(m_metrics, drive_write_latency, get_elapsed_time_us(start_time)); + HISTOGRAM_OBSERVE(m_metrics, wirte_io_size, (((size - 1) / 1024) + 1)); + COUNTER_INCREMENT(m_metrics, drive_sync_write_count, 1); + return ret; } void PhysicalDev::submit_batch() { m_drive_iface->submit_batch(); } diff --git a/src/lib/device/vchunk.cpp b/src/lib/device/vchunk.cpp index 1a7aaeac5..a809450d1 100644 --- a/src/lib/device/vchunk.cpp +++ b/src/lib/device/vchunk.cpp @@ -25,6 +25,8 @@ const uint8_t* VChunk::get_user_private() const { return m_internal_chunk->user_ blk_num_t VChunk::get_total_blks() const { return m_internal_chunk->blk_allocator()->get_total_blks(); } +void VChunk::reset() { m_internal_chunk->blk_allocator_mutable()->reset(); } + blk_num_t 
VChunk::available_blks() const { return m_internal_chunk->blk_allocator()->available_blks(); } blk_num_t VChunk::get_defrag_nblks() const { return m_internal_chunk->blk_allocator()->get_defrag_nblks(); } @@ -33,5 +35,7 @@ uint32_t VChunk::get_pdev_id() const { return m_internal_chunk->physical_dev()-> uint16_t VChunk::get_chunk_id() const { return m_internal_chunk->chunk_id(); } +uint64_t VChunk::size() const { return m_internal_chunk->size(); } + cshared< Chunk > VChunk::get_internal_chunk() const { return m_internal_chunk; } } // namespace homestore diff --git a/src/lib/device/virtual_dev.cpp b/src/lib/device/virtual_dev.cpp index 591540995..a3f060e4a 100644 --- a/src/lib/device/virtual_dev.cpp +++ b/src/lib/device/virtual_dev.cpp @@ -431,6 +431,8 @@ std::error_code VirtualDev::sync_write(const char* buf, uint32_t size, BlkId con Chunk* chunk; uint64_t const dev_offset = to_dev_offset(bid, &chunk); + HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", chunk->physical_dev_mutable()->pdev_id(), + dev_offset); if (sisl_unlikely(dev_offset == INVALID_DEV_OFFSET)) { return std::make_error_code(std::errc::resource_unavailable_try_again); } @@ -443,6 +445,9 @@ std::error_code VirtualDev::sync_write(const char* buf, uint32_t size, cshared< if (hs()->crash_simulator().is_in_crashing_phase()) { return std::error_code{}; } #endif + HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", chunk->physical_dev_mutable()->pdev_id(), + chunk->start_offset() + offset_in_chunk); + if (sisl_unlikely(!is_chunk_available(chunk))) { return std::make_error_code(std::errc::resource_unavailable_try_again); } @@ -464,6 +469,8 @@ std::error_code VirtualDev::sync_writev(const iovec* iov, int iovcnt, BlkId cons auto const size = get_len(iov, iovcnt); auto* pdev = chunk->physical_dev_mutable(); + HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", pdev->pdev_id(), dev_offset); + COUNTER_INCREMENT(m_metrics, vdev_write_count, 1); if (sisl_unlikely(!hs_utils::mod_aligned_sz(dev_offset, pdev->align_size()))) { COUNTER_INCREMENT(m_metrics, unalign_writes, 1); @@ -486,6 +493,8 @@ std::error_code VirtualDev::sync_writev(const iovec* iov, int iovcnt, cshared< C auto const size = get_len(iov, iovcnt); auto* pdev = chunk->physical_dev_mutable(); + HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", pdev->pdev_id(), dev_offset); + COUNTER_INCREMENT(m_metrics, vdev_write_count, 1); if (sisl_unlikely(!hs_utils::mod_aligned_sz(dev_offset, pdev->align_size()))) { COUNTER_INCREMENT(m_metrics, unalign_writes, 1); diff --git a/src/lib/device/virtual_dev.hpp b/src/lib/device/virtual_dev.hpp index 36032954e..eb6b63192 100644 --- a/src/lib/device/virtual_dev.hpp +++ b/src/lib/device/virtual_dev.hpp @@ -292,6 +292,7 @@ class VirtualDev { virtual nlohmann::json get_status(int log_level) const; virtual uint64_t get_total_chunk_num() const { return m_total_chunk_num; } + uint8_t get_dev_type() const { return m_vdev_info.hs_dev_type; } uint32_t align_size() const; uint32_t optimal_page_size() const; uint32_t atomic_page_size() const; diff --git a/src/lib/homestore.cpp b/src/lib/homestore.cpp index e2bbcbc21..f7e4f9019 100644 --- a/src/lib/homestore.cpp +++ b/src/lib/homestore.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -39,7 +40,9 @@ #include "device/virtual_dev.hpp" #include "common/resource_mgr.hpp" #include "meta/meta_sb.hpp" +#ifdef REPLICATION_SUPPORT #include "replication/service/generic_repl_svc.h" +#endif #include "common/crash_simulator.hpp" /* @@ 
-57,6 +60,7 @@ HomeStoreSafePtr HomeStore::s_instance{nullptr}; static std::unique_ptr< IndexServiceCallbacks > s_index_cbs; static shared< ChunkSelector > s_custom_chunk_selector{nullptr}; static shared< ReplApplication > s_repl_app{nullptr}; +std::string version = PACKAGE_VERSION; HomeStore* HomeStore::instance() { if (s_instance == nullptr) { s_instance = std::make_shared< HomeStore >(); } @@ -92,6 +96,7 @@ HomeStore& HomeStore::with_log_service() { return *this; } +#ifdef REPLICATION_SUPPORT HomeStore& HomeStore::with_repl_data_service(cshared< ReplApplication >& repl_app, cshared< ChunkSelector >& custom_chunk_selector) { m_services[uint32_cast(ServiceType::REPLICATION)] = std::vector< ServiceSubType >{1u, ServiceSubType::DEFAULT}; @@ -101,6 +106,7 @@ HomeStore& HomeStore::with_repl_data_service(cshared< ReplApplication >& repl_ap s_custom_chunk_selector = std::move(custom_chunk_selector); return *this; } +#endif #ifdef _PRERELEASE HomeStore& HomeStore::with_crash_simulator(std::function< void(void) > cb) { @@ -149,6 +155,12 @@ bool HomeStore::start(const hs_input_params& input, hs_before_services_starting_ static std::once_flag flag1; std::call_once(flag1, [this]() { +#ifndef NDEBUG + LOGINFO("HomeStore DEBUG version: {}", version); +#else + LOGINFO("HomeStore RELEASE version: {}", version); +#endif + sisl::VersionMgr::addVersion(PACKAGE_NAME, version::Semver200_version(PACKAGE_VERSION)); m_periodic_logger = sisl::logging::CreateCustomLogger("homestore", "_periodic", false, true /* tee_to_stdout_stderr */); sisl::logging::SetLogPattern("[%D %T.%f] [%^%L%$] [%t] %v", m_periodic_logger); @@ -156,6 +168,19 @@ bool HomeStore::start(const hs_input_params& input, hs_before_services_starting_ HomeStoreDynamicConfig::init_settings_default(); + // Check if the max_grpc_message_size is large enough to hold the data and snapshot batch size + auto data_fetch_max_size_in_byte = HS_DYNAMIC_CONFIG(consensus.data_fetch_max_size_kb) * 1024ull; + RELEASE_ASSERT(data_fetch_max_size_in_byte <= INT_MAX, "data fetch size is larger than the grpc limit"); + if (HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size) < input.max_data_size || + HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size) < input.max_snapshot_batch_size || + HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size) < s_cast< int >(data_fetch_max_size_in_byte)) { + LOGERROR("max_grpc_message_size {} is too small to hold max_data_size {}, max_snapshot_batch_size {} and " + "data_fetch_max_size {}", + HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size), input.max_data_size, + input.max_snapshot_batch_size, data_fetch_max_size_in_byte); + throw std::invalid_argument("max_grpc_message_size is insufficient for the configured data or snapshot sizes"); + } + #ifdef _PRERELEASE // Start a default crash simulator which raises SIGKILL, in case user has not provided with_crash_simulator() // callback @@ -171,7 +196,9 @@ bool HomeStore::start(const hs_input_params& input, hs_before_services_starting_ if (has_repl_data_service()) { m_log_service = std::make_unique< LogStoreService >(); m_data_service = std::make_unique< BlkDataService >(std::move(s_custom_chunk_selector)); +#ifdef REPLICATION_SUPPORT m_repl_service = GenericReplService::create(std::move(s_repl_app)); +#endif } else { if (has_log_service()) { m_log_service = std::make_unique< LogStoreService >(); } if (has_data_service()) { @@ -243,11 +270,11 @@ void HomeStore::format_and_start(std::map< ServiceId, hs_format_params >&& forma } else if ((svc_id.type == ServiceType::DATA) && has_data_service()) { 
m_data_service->create_vdev(pct_to_size(fparams.size_pct, fparams.dev_type), fparams.dev_type, fparams.block_size, fparams.alloc_type, fparams.chunk_sel_type, - fparams.num_chunks); + fparams.num_chunks, fparams.chunk_size); } else if ((svc_id.type == ServiceType::REPLICATION) && has_repl_data_service()) { m_data_service->create_vdev(pct_to_size(fparams.size_pct, fparams.dev_type), fparams.dev_type, fparams.block_size, fparams.alloc_type, fparams.chunk_sel_type, - fparams.num_chunks); + fparams.num_chunks, fparams.chunk_size); } } @@ -265,7 +292,7 @@ void HomeStore::do_start() { const auto& inp_params = HomeStoreStaticConfig::instance().input; uint64_t cache_size = resource_mgr().get_cache_size(); - m_evictor = std::make_shared< sisl::LRUEvictor >(cache_size, 1000); + m_evictor = std::make_shared< sisl::LRUEvictor >(cache_size, HS_DYNAMIC_CONFIG(generic.cache_evictor_npartitions)); if (m_before_services_starting_cb) { m_before_services_starting_cb(); } @@ -279,7 +306,9 @@ void HomeStore::do_start() { if (has_index_service()) { m_index_service->start(); } if (has_repl_data_service()) { +#ifdef REPLICATION_SUPPORT s_cast< GenericReplService* >(m_repl_service.get())->start(); // Replservice starts logstore & data service +#endif } else { if (has_data_service()) { m_data_service->start(); } if (has_log_service() && inp_params.auto_recovery) { @@ -317,11 +346,13 @@ void HomeStore::shutdown() { m_resource_mgr->stop(); if (has_repl_data_service()) { +#ifdef REPLICATION_SUPPORT // Log and Data services are stopped by repl service s_cast< GenericReplService* >(m_repl_service.get())->stop(); m_log_service.reset(); m_data_service.reset(); m_repl_service.reset(); +#endif } else { if (has_log_service()) { m_log_service->stop(); @@ -346,8 +377,6 @@ void HomeStore::shutdown() { #ifdef _PRERELEASE flip::Flip::instance().stop_rpc_server(); #endif - - HomeStore::reset_instance(); LOGINFO("Homestore is completed its shutdown"); } diff --git a/src/lib/index/inplace_btree/index_cp.hpp b/src/lib/index/inplace_btree/index_cp.hpp index c8292c47f..b04b8f052 100644 --- a/src/lib/index/inplace_btree/index_cp.hpp +++ b/src/lib/index/inplace_btree/index_cp.hpp @@ -92,12 +92,12 @@ struct IndexCPContext : public VDevCPContext { } std::string parent_id_string() const { - return (has_inplace_parent == 0x1) ? fmt::format("chunk={}, blk={}", ids[0].second, ids[0].first) : "empty"; + return (has_inplace_parent == 0x1) ? fmt::format("{}", blk_id(0).to_integer()) : "empty"; } std::string child_id_string() const { auto const idx = (has_inplace_parent == 0x1) ? 1 : 0; - return (has_inplace_child == 0x1) ? fmt::format("chunk={}, blk={}", ids[idx].second, ids[idx].first) + return (has_inplace_child == 0x1) ? 
fmt::format("{}", blk_id(idx).to_integer()) : "empty"; } @@ -160,6 +160,7 @@ struct IndexCPContext : public VDevCPContext { std::optional< IndexBufferPtr > next_dirty(); std::string to_string(); std::string to_string_with_dags(); + uint16_t num_dags(); void to_string_dot(const std::string& filename); private: diff --git a/src/lib/index/inplace_btree/inplace_btree_store.h b/src/lib/index/inplace_btree/inplace_btree_store.h index 4552c2516..63e141bda 100644 --- a/src/lib/index/inplace_btree/inplace_btree_store.h +++ b/src/lib/index/inplace_btree/inplace_btree_store.h @@ -59,7 +59,28 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { superblk< index_table_sb > m_sb; shared< MetaIndexBuffer > m_sb_buffer; + // graceful shutdown +private: + std::atomic_bool m_stopping{false}; + mutable std::atomic_uint64_t pending_request_num{0}; + + bool is_stopping() const { return m_stopping.load(); } + void start_stopping() { m_stopping = true; } + + uint64_t get_pending_request_num() const { return pending_request_num.load(); } + + void incr_pending_request_num() const { pending_request_num++; } + void decr_pending_request_num() const { pending_request_num--; } + public: + void stop() { + start_stopping(); + while (true) { + if (!get_pending_request_num()) break; + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + } + IndexTable(uuid_t uuid, uuid_t parent_uuid, uint32_t user_sb_size, const BtreeConfig& cfg) : Btree< K, V >{cfg}, m_sb{"index"} { // Create a superblk for the index table and create MetaIndexBuffer corresponding to that @@ -100,9 +121,20 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } } - void destroy() override { - Btree< K, V >::destroy_btree(nullptr); + void audit_tree() override { + cp_mgr().cp_guard(); + Btree< K, V >::sanity_sub_tree(); + } + + btree_status_t destroy() override { + if (is_stopping()) return btree_status_t::stopping; + incr_pending_request_num(); + auto cpg = cp_mgr().cp_guard(); + Btree< K, V >::destroy_btree(cpg.context(cp_consumer_t::INDEX_SVC)); m_sb.destroy(); + m_sb_buffer->m_valid = false; + decr_pending_request_num(); + return btree_status_t::success; } uuid_t uuid() const override { return m_sb->uuid; } @@ -114,6 +146,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { template < typename ReqT > btree_status_t put(ReqT& put_req) { + if (is_stopping()) return btree_status_t::stopping; + incr_pending_request_num(); auto ret = btree_status_t::success; do { auto cpg = cp_mgr().cp_guard(); @@ -121,11 +155,14 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { ret = Btree< K, V >::put(put_req); if (ret == btree_status_t::cp_mismatch) { LOGTRACEMOD(wbcache, "CP Mismatch, retrying put"); } } while (ret == btree_status_t::cp_mismatch); + decr_pending_request_num(); return ret; } template < typename ReqT > btree_status_t remove(ReqT& remove_req) { + if (is_stopping()) return btree_status_t::stopping; + incr_pending_request_num(); auto ret = btree_status_t::success; do { auto cpg = cp_mgr().cp_guard(); @@ -133,14 +170,66 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { ret = Btree< K, V >::remove(remove_req); if (ret == btree_status_t::cp_mismatch) { LOGTRACEMOD(wbcache, "CP Mismatch, retrying remove"); } } while (ret == btree_status_t::cp_mismatch); + decr_pending_request_num(); return ret; } + template < typename ReqT > + btree_status_t get(ReqT& greq) const { + if (is_stopping()) return btree_status_t::stopping; + incr_pending_request_num(); + auto ret = Btree< 
K, V >::get(greq); + decr_pending_request_num(); + return ret; + } + + void repair_root_node(IndexBufferPtr const& idx_buf) override { + LOGTRACEMOD(wbcache, "check if this was the previous root node {} for buf {} ", m_sb->root_node, + idx_buf->to_string()); + if (m_sb->root_node == idx_buf->blkid().to_integer()) { + // This is the root node, we need to update the root node in superblk + LOGTRACEMOD(wbcache, "{} is old root so we need to update the meta node ", idx_buf->to_string()); + BtreeNode* n = this->init_node(idx_buf->raw_buffer(), idx_buf->blkid().to_integer(), false /* init_buf */, + BtreeNode::identify_leaf_node(idx_buf->raw_buffer())); + static_cast< IndexBtreeNode* >(n)->attach_buf(idx_buf); + auto edge_id = n->next_bnode(); + + BT_DBG_ASSERT(!n->has_valid_edge(), + "root {} already has a valid edge {}, so we should have found the new root node", + n->to_string(), n->get_edge_value().bnode_id()); + n->set_next_bnode(empty_bnodeid); + n->set_edge_value(BtreeLinkInfo{edge_id, 0}); + LOGTRACEMOD(wbcache, "change root node {}: edge updated to {} and invalidate the next node! ", n->node_id(), + edge_id); + auto cpg = cp_mgr().cp_guard(); + write_node_impl(n, (void*)cpg.context(cp_consumer_t::INDEX_SVC)); + + } else { + LOGTRACEMOD(wbcache, "This is not the root node, so we can ignore this repair call for buf {}", + idx_buf->to_string()); + } + } + + void delete_stale_children(IndexBufferPtr const& idx_buf) override { + BtreeNode::identify_leaf_node(idx_buf->raw_buffer())); + static_cast< IndexBtreeNode* >(n)->attach_buf(idx_buf); + auto cpg = cp_mgr().cp_guard(); + idx_buf->m_dirtied_cp_id = cpg->id(); + BtreeNodePtr bn = BtreeNodePtr{n}; + + if (!bn->is_leaf()) { + LOGTRACEMOD(wbcache, "delete_stale_links cp={} buf={}", cpg->id(), idx_buf->to_string()); + delete_stale_links(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC)); + } + } + } + void repair_node(IndexBufferPtr const& idx_buf) override { if (idx_buf->is_meta_buf()) { - // We cannot repair the meta buf on its own, we need to repair the root node which modifies the // meta_buf. It is ok to ignore this call, because repair will be done from root before meta_buf is // attempted to repair, which would have updated the meta_buf already. 
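The incr/decr bookkeeping repeated across put, remove, get and destroy above is the usual drain-on-shutdown idiom: refuse new work once stopping, and let stop() spin until the in-flight count reaches zero. A small RAII wrapper expresses the same idea without having to remember the decrement on every early return; this is a sketch of the idiom, not what the patch actually does:

#include <atomic>

// Every operation holds a PendingGuard for its lifetime, so all exit paths decrement the counter.
class PendingGuard {
    std::atomic_uint64_t& m_cnt;
public:
    explicit PendingGuard(std::atomic_uint64_t& cnt) : m_cnt{cnt} { ++m_cnt; }
    ~PendingGuard() { --m_cnt; }
};
// Usage inside an operation:
//   if (is_stopping()) return btree_status_t::stopping;
//   PendingGuard g{pending_request_num};
//   ... all subsequent returns automatically drop the in-flight count ...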
+ LOGTRACEMOD(wbcache, "Ignoring repair on meta buf {} root id {} ", idx_buf->to_string(), + this->root_node_id()); return; } BtreeNode* n = this->init_node(idx_buf->raw_buffer(), idx_buf->blkid().to_integer(), false /* init_buf */, @@ -153,12 +242,16 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { idx_buf->m_dirtied_cp_id = cpg->id(); BtreeNodePtr bn = BtreeNodePtr{n}; - LOGTRACEMOD(wbcache, "repair_node cp={} buf={}", cpg->id(), idx_buf->to_string()); - repair_links(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC)); + // Only for interior nodes we need to repair its links + if (!bn->is_leaf()) { + LOGTRACEMOD(wbcache, "repair_node cp={} buf={}", cpg->id(), idx_buf->to_string()); + repair_links(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC)); + } if (idx_buf->m_up_buffer && idx_buf->m_up_buffer->is_meta_buf()) { // Our up buffer is a meta buffer, which means that we are the new root node, we need to update the // meta_buf with new root as well + LOGTRACEMOD(wbcache, "root change for after repairing {}\n\n", idx_buf->to_string()); on_root_changed(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC)); } } @@ -179,6 +272,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { node->set_checksum(); auto prev_state = idx_node->m_idx_buf->m_state.exchange(index_buf_state_t::DIRTY); + idx_node->m_idx_buf->m_node_level = node->level(); if (prev_state == index_buf_state_t::CLEAN) { // It was clean before, dirtying it first time, add it to the wb_cache list to flush if (idx_node->m_idx_buf->m_dirtied_cp_id != -1) { @@ -192,9 +286,9 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { (int)prev_state, (int)index_buf_state_t::FLUSHING, "Writing on a node buffer which was currently in flushing state on cur_cp={} buffer_cp_id={}", cp_ctx->id(), idx_node->m_idx_buf->m_dirtied_cp_id); + BT_DBG_ASSERT_EQ(idx_node->m_idx_buf->m_dirtied_cp_id, cp_ctx->id(), } return btree_status_t::success; - } btree_status_t transact_nodes(const BtreeNodeList& new_nodes, const BtreeNodeList& freed_nodes, const BtreeNodePtr& left_child_node, const BtreeNodePtr& parent_node, @@ -243,14 +337,19 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { void free_node_impl(const BtreeNodePtr& node, void* context) override { auto n = static_cast< IndexBtreeNode* >(node.get()); + n->m_idx_buf->m_node_level = node->level(); wb_cache().free_buf(n->m_idx_buf, r_cast< CPContext* >(context)); } btree_status_t on_root_changed(BtreeNodePtr const& new_root, void* context) override { + // todo: if(m_sb->root_node == new_root->node_id() && m_sb->root_link_version == new_root->link_version()){ + // return btree_status_t::success;} + LOGTRACEMOD(wbcache, "root changed for index old_root={} new_root={}", m_sb->root_node, new_root->node_id()); m_sb->root_node = new_root->node_id(); m_sb->root_link_version = new_root->link_version(); if (!wb_cache().refresh_meta_buf(m_sb_buffer, r_cast< CPContext* >(context))) { + LOGTRACEMOD(wbcache, "CP mismatch error - discard transact for meta node"); return btree_status_t::cp_mismatch; } @@ -259,23 +358,132 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { return btree_status_t::success; } + btree_status_t delete_stale_links(BtreeNodePtr const& parent_node, void* cp_ctx) { + LOGTRACEMOD(wbcache, "deleting stale links for parent node [{}]", parent_node->to_string()); + BtreeNodeList free_nodes; + auto nentries = parent_node->total_entries(); + uint32_t deleted = 0; + for (uint32_t i = nentries; i-- > 0;) { + BtreeLinkInfo 
cur_child_info; + BtreeNodePtr child_node; + parent_node->get_nth_value(i, &cur_child_info, false /* copy */); + if (auto ret = read_node_impl(cur_child_info.bnode_id(), child_node); ret == btree_status_t::success) { + if (child_node->is_node_deleted()) { + LOGTRACEMOD(wbcache, "Deleting stale child node [{}] for parent node [{}]", child_node->to_string(), + parent_node->to_string()); + child_node->set_node_deleted(); + free_node_impl(child_node, cp_ctx); + + if (i > 0) { + BtreeLinkInfo pre_child_info; + parent_node->get_nth_value(i - 1, &pre_child_info, false /* copy */); + // auto ckey = parent_node->get_nth_key< K >(i-1, true); + // parent_node->set_nth_key(i-1, ckey); + parent_node->update(i, pre_child_info); + parent_node->remove(i - 1); + } else { + parent_node->remove(i); + } + + LOGTRACEMOD(wbcache, "so far parent node [{}]", parent_node->to_string()); + // free_nodes.push_back(child_node); + deleted++; + } + } else { + LOGTRACEMOD(wbcache, "Failed to read child node {} for parent node [{}] reason {}", + cur_child_info.bnode_id(), parent_node->to_string(), ret); + } + } + if (parent_node->has_valid_edge()) { + auto edge_info = parent_node->get_edge_value(); + BtreeNodePtr edge_node; + if (auto ret = read_node_impl(edge_info.bnode_id(), edge_node); ret == btree_status_t::success) { + if (edge_node->is_node_deleted()) { + LOGTRACEMOD(wbcache, "Deleting stale edge node [{}] for parent node [{}]", edge_node->to_string(), + parent_node->to_string()); + edge_node->set_node_deleted(); + free_node_impl(edge_node, cp_ctx); + if (parent_node->total_entries() == 0) { + parent_node->invalidate_edge(); + } else { + BtreeLinkInfo last_child_info; + parent_node->get_nth_value(parent_node->total_entries() - 1, &last_child_info, + false /* copy */); + parent_node->set_edge_value(last_child_info); + parent_node->remove(parent_node->total_entries() - 1); + LOGTRACEMOD(wbcache, "Replacing edge with previous child node [{}] for parent node [{}]", + last_child_info.bnode_id(), parent_node->to_string()); + } + + deleted++; + } + } else { + LOGTRACEMOD(wbcache, "Failed to read edge node {} for parent node [{}] reason {}", + edge_node->to_string(), parent_node->to_string(), ret); + } + } + if (deleted /*free_nodes.size()*/) { + btree_status_t ret = btree_status_t::success; + + if ((parent_node->total_entries() == 0) && !parent_node->has_valid_edge()) { + parent_node->set_node_deleted(); + LOGTRACEMOD(wbcache, + "Freeing parent node=[{}] because it is empty and not an edge node but had stale children", + parent_node->to_string()); + ret = write_node_impl(parent_node, cp_ctx); + free_node_impl(parent_node, cp_ctx); + LOGTRACEMOD(wbcache, + "Accomplishing deleting stale links. After removing {} stale links, parent node is [{}]", + deleted, parent_node->to_string()); + } else { + ret = write_node_impl(parent_node, cp_ctx); + if (ret != btree_status_t::success) { + LOGTRACEMOD(wbcache, "Failed to write parent node [{}] after deleting stale links", + parent_node->to_string()); + } else { + LOGTRACEMOD( + wbcache, + "Accomplishing deleting stale links. After removing {} stale links, parent node is [{}]", + deleted, parent_node->to_string()); + } + } + // auto ret = transact_nodes({}, free_nodes, parent_node, nullptr, cp_ctx); + return ret; + } else { + LOGTRACEMOD(wbcache, "Accomplishing deleting stale links. 
No stale links found for parent node [{}]", + parent_node->to_string()); + } + return btree_status_t::success; + } + + // btree_status_t repair_links(BtreeNodePtr const& parent_node, void* cp_ctx) { - BT_LOG(DEBUG, "Repairing links for parent node {}", parent_node->to_string()); + LOGTRACEMOD(wbcache, "Repairing links for parent node [{}]", parent_node->to_string()); + // TODO: is it possible that repairing many nodes causes an increase to level of btree? If so, then this + // needs to be handled. Get the last key in the node - // Get the last key in the node - auto const last_parent_key = parent_node->get_last_key< K >(); + auto last_parent_key = parent_node->get_last_key< K >(); auto const is_parent_edge_node = parent_node->has_valid_edge(); if ((parent_node->total_entries() == 0) && !is_parent_edge_node) { BT_LOG_ASSERT(false, "Parent node={} is empty and not an edge node but was asked to repair", parent_node->node_id()); return btree_status_t::not_found; } - BT_LOG(INFO, "Repairing node={} with last_parent_key={}", parent_node->to_string(), - last_parent_key.to_string()); + + // Get all original child ids as a support to check if we are beyond the last child node + std::unordered_map< bnodeid_t, K > orig_child_infos; + for (uint32_t i = 0; i < parent_node->total_entries(); ++i) { + BtreeLinkInfo link_info; + parent_node->get_nth_value(i, &link_info, true); + orig_child_infos[link_info.bnode_id()] = parent_node->get_nth_key< K >(i, false /* copy */); + } + LOGTRACEMOD(wbcache, "Repairing node=[{}] with last_parent_key={}", parent_node->to_string(), + last_parent_key.to_string()); // Get the first child node and its link info BtreeLinkInfo child_info; BtreeNodePtr child_node; + BtreeNodePtr pre_child_node; auto ret = this->get_child_and_lock_node(parent_node, 0, child_info, child_node, locktype_t::READ, locktype_t::READ, cp_ctx); if (ret != btree_status_t::success) { @@ -284,9 +492,122 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { return ret; } + // update the last key of parent for issue + // start from first child and store the last key of the child node, then traverse to next sibling + // 2-1- if this is greater than parent last key, traverse for sibling of parent until reaches to + // siblings which has keys more than Y or end of list (name this parent sibling node F), + // 2-2- Put last key of F to last key of P + // 2-3 - set F as Next of A + BtreeNodeList siblings; + BtreeNodePtr next_cur_child; + BT_DBG_ASSERT(parent_node->has_valid_edge() || parent_node->total_entries(), + "parent node {} doesn't have valid edge and no entries ", parent_node->to_string()); + if (parent_node->total_entries() > 0) { + auto updated_last_key = last_parent_key; + K last_child_last_key; + K last_child_neighbor_key; + BtreeNodePtr cur_child; + BtreeLinkInfo cur_child_info; + + bool found_child = false; + uint32_t nentries = parent_node->total_entries() + parent_node->has_valid_edge() ? 
1 : 0; + + for (uint32_t i = nentries; i-- > 0;) { + parent_node->get_nth_value(i, &cur_child_info, false /* copy */); + if (auto ret = read_node_impl(cur_child_info.bnode_id(), cur_child); ret == btree_status_t::success) { + if (!cur_child->is_node_deleted() && cur_child->total_entries()) { + last_child_last_key = cur_child->get_last_key< K >(); + if (cur_child->next_bnode() != empty_bnodeid && + read_node_impl(cur_child->next_bnode(), next_cur_child) == btree_status_t::success) { + LOGTRACEMOD( + wbcache, + "Last child last key {} for child_node [{}] parent node [{}], next neigbor is [{}]", + last_child_last_key.to_string(), cur_child->to_string(), parent_node->to_string(), + next_cur_child->to_string()); + found_child = true; + break; + } + found_child = true; + break; + } + LOGTRACEMOD(wbcache, "PASSING child node {} so we need to check next child node", + cur_child->to_string()); + } + } + + if (found_child) { + LOGTRACEMOD(wbcache, "Last child last key {} for parent node {}, child_node {}", + last_child_last_key.to_string(), parent_node->to_string(), cur_child->to_string()); + if (last_child_last_key.compare(last_parent_key) > 0) { + if (next_cur_child) { + last_child_neighbor_key = next_cur_child->get_last_key< K >(); + LOGTRACEMOD(wbcache, + "Voila !! last child_key of child [{}] is greater than its parents [{}] and its " + "next neighbor key is {}", + cur_child->to_string(), parent_node->to_string(), + last_child_neighbor_key.to_string()); + } else { + LOGTRACEMOD( + wbcache, + "Last child_key of child [{}] is greater than its parents [{}] and it has no next neighbor", + cur_child->to_string(), parent_node->to_string()); + } + + // 2-1 traverse for sibling of parent until reaches to siblings which has keys more than 7563 + // or end + // of list (put all siblings in a list, here is F) , + BtreeNodePtr sibling; + BtreeNodePtr true_sibling; + BtreeLinkInfo sibling_info; + + auto sibling_node_id = parent_node->next_bnode(); + while (sibling_node_id != empty_bnodeid) { + if (auto ret = read_node_impl(sibling_node_id, sibling); ret == btree_status_t::success) { + if (sibling->is_node_deleted()) { + // Do we need to free the sibling node here? 
+ siblings.push_back(sibling); + sibling_node_id = sibling->next_bnode(); + LOGTRACEMOD(wbcache, "Sibling node [{}] is deleted, continue to next sibling", + sibling->to_string()); + continue; + } + auto sibling_last_key = sibling->get_last_key< K >(); + if (next_cur_child && sibling_last_key.compare(last_child_neighbor_key) < 0) { + siblings.push_back(sibling); + sibling_node_id = sibling->next_bnode(); + } else { + true_sibling = sibling; + break; + } + } + } + if (true_sibling) { + LOGTRACEMOD(wbcache, "True sibling [{}] for parent_node {}", true_sibling->to_string(), + parent_node->to_string()); + } else { + LOGTRACEMOD(wbcache, "No true sibling found for parent_node [{}]", parent_node->to_string()); + } + if (sibling_node_id != empty_bnodeid) { + last_parent_key = last_child_last_key; + parent_node->set_next_bnode(true_sibling->node_id()); + for (auto sibling : siblings) { + LOGTRACEMOD(wbcache, "Sibling list [{}]", sibling->to_string()); + } + LOGTRACEMOD(wbcache, "True sibling [{}]", true_sibling->to_string()); + BtreeLinkInfo first_child_info; + parent_node->get_nth_value(0, &first_child_info, false); + } + } else { + LOGTRACEMOD(wbcache, + "No undeleted child found for parent_node [{}], keep normal repair (regular recovery)", + parent_node->to_string()); + next_cur_child = nullptr; + } + } + } + // Keep a copy of the node buffer, in case we need to revert back uint8_t* tmp_buffer = new uint8_t[this->m_node_size]; - std::memcpy(tmp_buffer, parent_node->m_phys_node_buf, this->m_node_size); // Remove all the entries in parent_node and let walk across child_nodes rebuild this node parent_node->remove_all(); @@ -295,22 +616,111 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { auto cur_parent = parent_node; BtreeNodeList new_parent_nodes; do { - if (child_node->has_valid_edge() || - (child_node->is_leaf() && (child_node->next_bnode() == empty_bnodeid))) { - BT_DBG_ASSERT(is_parent_edge_node, - "Child node={} is an edge node but parent_node={} is not an edge node", - child_node->node_id(), cur_parent->node_id()); - cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); + if (child_node->has_valid_edge() || (child_node->is_leaf() && child_node->next_bnode() == empty_bnodeid)) { + if (child_node->is_node_deleted()) { + // Edge node is merged, we need to set the current last entry as edge + if (cur_parent->total_entries() > 0) { + auto prev_val = V{}; + cur_parent->get_nth_value(cur_parent->total_entries() - 1, &prev_val, true); + cur_parent->remove(cur_parent->total_entries() - 1); + cur_parent->set_edge_value(prev_val); + LOGTRACEMOD(wbcache, + "Reparing node={}, child_node=[{}] is deleted, set previous as edge_value={}", + cur_parent->node_id(), child_node->to_string(), prev_val.to_string()); + } else { + LOGTRACEMOD(wbcache, "Found an empty interior node {} with maybe all childs deleted", + cur_parent->node_id()); + } + } else { + // Update edge and finish + if (is_parent_edge_node) { + cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); + } else { + auto tsib_id = find_true_sibling(cur_parent); + if (tsib_id != empty_bnodeid) { + cur_parent->set_next_bnode(tsib_id); + LOGTRACEMOD(wbcache, + "True sibling [{}] for parent_node [{}], So don't add child [{}] here ", + tsib_id, cur_parent->to_string(), child_node->to_string()); + } else { + cur_parent->set_next_bnode(empty_bnodeid); + // if this child node previously belonged to this parent node, we need to add it but as edge + // o.w, not 
this node + if (orig_child_infos.contains(child_node->node_id())) { + cur_parent->set_edge_value( + BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); + LOGTRACEMOD(wbcache, + "Child node [{}] is an edge node and previously belong to this parent, so " + "we need to add it as edge", + child_node->to_string()); + } else { + LOGTRACEMOD(wbcache, "No true sibling found for parent_node [{}]", + cur_parent->to_string()); + } + BT_REL_ASSERT(cur_parent->total_entries() != 0 || cur_parent->has_valid_edge(), + "Parent node [{}] cannot be empty", cur_parent->to_string()); + } + } + + // + // } + break; + } break; } - auto const child_last_key = child_node->get_last_key< K >(); - BT_LOG(INFO, "Repairing node={} child_node={} child_last_key={}", cur_parent->node_id(), - child_node->to_string(), child_last_key.to_string()); + auto child_last_key = child_node->get_last_key< K >(); + LOGTRACEMOD(wbcache, "Repairing node={}, child_node=[{}] child_last_key={}", cur_parent->node_id(), + child_node->to_string(), child_last_key.to_string()); + + // Check if we are beyond the last child node. + // + // There can be cases where the child level merge is successfully persisted but the parent level is + // not. In this case, you may have your rightmost child node with last key greater than the + // last_parent_key. That's why here we have to check if the child node is one of the original child + // nodes first. + if (!is_parent_edge_node && !orig_child_infos.contains(child_node->node_id())) { + if (child_last_key.compare(last_parent_key) > 0) { + // We have reached a child beyond this parent, we can stop now + // TODO this case if child last key is less than last parent key to update the parent node. + // this case can potentially break the btree for put and remove op. 
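Reviewer note: the early exit above can be stated as a standalone predicate. The helper below is purely illustrative (child_beyond_parent is not part of this patch) and only restates the rule the rebuild loop relies on, using the same K::compare convention as the surrounding code.

// Hypothetical predicate (not in the patch): a child that was not one of the
// parent's original children and whose last key sorts after the parent's last
// key belongs to a sibling parent, so the rebuild loop must stop consuming it.
template < typename K >
static bool child_beyond_parent(K const& child_last_key, K const& last_parent_key,
                                bool is_original_child, bool is_parent_edge_node) {
    if (is_parent_edge_node || is_original_child) { return false; }
    return child_last_key.compare(last_parent_key) > 0;
}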
+ break; + } + if (child_node->total_entries() == 0) { + // this child has no entries, but maybe in the middle of the parent node, we need to update the key + // of parent as previous one and go on + LOGTRACEMOD(wbcache, + "Reach to an empty child node {}, and this child doesn't belong to this parent; Hence " + "loop ends", + child_node->to_string()); + // now update the next of parent node by skipping all deleted siblings of this parent node + auto valid_sibling = cur_parent->next_bnode(); + while (valid_sibling != empty_bnodeid) { + BtreeNodePtr sibling; + if (read_node_impl(valid_sibling, sibling) == btree_status_t::success) { + if (sibling->is_node_deleted()) { + valid_sibling = sibling->next_bnode(); + continue; + } + // cur_parent->set_next_bnode(sibling->node_id()); + break; + } + LOGTRACEMOD(wbcache, "Failed to read child node {} for parent node [{}] reason {}", + valid_sibling, cur_parent->to_string(), ret); + } + if (valid_sibling != empty_bnodeid) { + cur_parent->set_next_bnode(valid_sibling); + LOGTRACEMOD(wbcache, "Repairing node={}, child_node=[{}] is an edge node, end loop", + cur_parent->node_id(), child_node->to_string()); + + } else { + cur_parent->set_next_bnode(empty_bnodeid); + LOGTRACEMOD(wbcache, "Repairing node={}, child_node=[{}] is an edge node, end loop", + cur_parent->node_id(), child_node->to_string()); + } - if (child_last_key.compare(last_parent_key) > 0) { - // We have reached the last key, we can stop now - break; + break; + } } if (!cur_parent->has_room_for_put(btree_put_type::INSERT, K::get_max_size(), @@ -332,33 +742,135 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } // Insert the last key of the child node into parent node - cur_parent->insert(cur_parent->total_entries(), child_last_key, - BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); + if (!child_node->is_node_deleted()) { + if (child_node->total_entries() == 0) { + if (orig_child_infos.contains(child_node->node_id())) { + child_last_key = orig_child_infos[child_node->node_id()]; + LOGTRACEMOD(wbcache, + "Reach to an empty child node [{}], but not the end of the parent node, so we need " + "to update the key of parent node as original one {}", + child_node->to_string(), child_last_key.to_string()); + } else { + LOGTRACEMOD(wbcache, + "Reach to an empty child node [{}] but not belonging to this parent (probably next " + "parent sibling); Hence end loop", + child_node->to_string()); + break; + } + } + cur_parent->insert(cur_parent->total_entries(), child_last_key, + BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); + } else { + // Node deleted indicates it's freed & no longer used during recovery + LOGTRACEMOD(wbcache, "Repairing node={}, child node=[{}] is deleted, skipping the insert", + cur_parent->node_id(), child_node->to_string()); + if (pre_child_node) { + // We need to update the next of the previous child node to this child node + + LOGTRACEMOD(wbcache, + "Repairing node={}, child_node=[{}] is deleted, set next of previous child node [{}] " + "to this child node [{}]", + cur_parent->node_id(), child_node->to_string(), pre_child_node->to_string(), + child_node->next_bnode()); + pre_child_node->set_next_bnode(child_node->next_bnode()); + // repairing the next of previous child node + // We need to set the state of the previous child node to clean, so that it can be flushed + IndexBtreeNode* idx_node = static_cast< IndexBtreeNode* >(pre_child_node.get()); + idx_node->m_idx_buf->set_state(index_buf_state_t::CLEAN); + 
write_node_impl(pre_child_node, cp_ctx); + // update the key of last entry of the parent with the last key of deleted child + child_last_key = orig_child_infos[child_node->node_id()]; + LOGTRACEMOD(wbcache, "updating parent [{}] current last key with {}", cur_parent->to_string(), + child_last_key.to_string()); + // update it here to go to the next child node and unlock this node + LOGTRACEMOD(wbcache, "update the child node next to the next of previous child node"); + child_node->set_next_bnode(child_node->next_bnode()); + } + } - BT_LOG(INFO, "Repairing node={}, repaired so_far={}", cur_parent->node_id(), cur_parent->to_string()); + LOGTRACEMOD(wbcache, "Repairing node={}, repaired so_far=[{}]", cur_parent->node_id(), + cur_parent->to_string()); // Move to the next child node - this->unlock_node(child_node, locktype_t::READ); auto const next_node_id = child_node->next_bnode(); + this->unlock_node(child_node, locktype_t::READ); + if (!child_node->is_node_deleted()) { + // We need to free the child node + pre_child_node = child_node; + } if (next_node_id == empty_bnodeid) { - BT_LOG_ASSERT(false, - "Child node={} next_node_id is empty, while its not a edge node, parent_node={} " - "repair is partial", - child_node->node_id(), parent_node->node_id()); - ret = btree_status_t::not_found; + // This can be a deleted edge node - only check if it is still valid + if (!child_node->is_node_deleted()) { + BT_LOG_ASSERT(false, + "Child node={} next_node_id is empty, while its not a edge node, parent_node={} " + "repair is partial", + child_node->node_id(), parent_node->node_id()); + ret = btree_status_t::not_found; + } + child_node = nullptr; + break; + } + if (next_cur_child && next_node_id == next_cur_child->node_id()) { + // We are at the last child node, we can stop now + LOGTRACEMOD( + wbcache, + "REACH Repairing node={}, child_node=[{}] is the true child of sibling parent; Hence, end loop", + child_node->node_id(), next_cur_child->to_string()); + child_node = nullptr; break; } - ret = this->read_and_lock_node(next_node_id, child_node, locktype_t::READ, locktype_t::READ, cp_ctx); if (ret != btree_status_t::success) { BT_LOG_ASSERT(false, "Parent node={} repair is partial, because child_node get has failed with ret={}", parent_node->node_id(), enum_name(ret)); + child_node = nullptr; break; } + } while (true); - this->unlock_node(child_node, locktype_t::READ); + + if (child_node) { this->unlock_node(child_node, locktype_t::READ); } + // if last parent has the key less than the last child key, then we need to update the parent node with + // the last child key if it doesn't have edge. 
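Reviewer note: the fix-up described above (and applied just below) exists to restore a single ordering invariant. The helper below is hypothetical and not part of the patch; it only spells out that invariant with the same compare convention used elsewhere in this file.

// Hypothetical post-repair invariant check (not in the patch): every separator
// key stored in the parent must sort at or after the last key actually present
// in the child it points to, otherwise keys between the two would be routed
// past the correct child on get/put.
template < typename K >
static bool separator_covers_child(K const& parent_separator_key, K const& child_last_key) {
    return parent_separator_key.compare(child_last_key) >= 0;
}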
+ auto last_parent = parent_node; + if (new_parent_nodes.size() > 0) { last_parent = new_parent_nodes[new_parent_nodes.size() - 1]; } + if (last_parent->total_entries() && !last_parent->has_valid_edge()) { + if (last_parent->compare_nth_key(last_parent_key, last_parent->total_entries() - 1) < 0) { + BtreeLinkInfo child_info; + last_parent->get_nth_value(last_parent->total_entries() - 1, &child_info, false /* copy */); + parent_node->update(parent_node->total_entries() - 1, last_parent_key, child_info); + LOGTRACEMOD(wbcache, "Repairing parent node={} with last_parent_key={} and child_info={}", + parent_node->node_id(), last_parent_key.to_string(), child_info.to_string()); + } + // if last key of children is less than the last key of parent, then we need to update the last key of non + // interior child + if (last_parent->level() > 1 && !last_parent->has_valid_edge()) { + // read last child + BtreeNodePtr last_child; + BtreeLinkInfo child_info; + auto total_entries = last_parent->total_entries(); + last_parent->get_nth_value(total_entries - 1, &child_info, false /* copy */); + if (ret = read_node_impl(child_info.bnode_id(), last_child); ret == btree_status_t::success) { + // get last key of cur child + auto last_child_key = last_child->get_last_key< K >(); + BtreeLinkInfo last_child_info; + last_child->get_nth_value(last_child->total_entries() - 1, &last_child_info, false /* copy*/); + if (last_parent->compare_nth_key(last_child_key, total_entries - 1) > 0) { + auto cur_child_st = last_child->to_string(); + last_child->update(last_child->total_entries() - 1, last_parent_key, last_child_info); + LOGTRACEMOD(wbcache, + "Updating interior child node={} with last_parent_key={} and child_info={}", + cur_child_st, last_parent_key.to_string(), last_child_info.to_string()); + write_node_impl(last_child, cp_ctx); + } + } + } + } if (ret == btree_status_t::success) { + // Make write_buf happy for the parent node in case of multiple write (stale repair and link repair) + IndexBtreeNode* p_node = static_cast< IndexBtreeNode* >(parent_node.get()); + p_node->m_idx_buf->set_state(index_buf_state_t::CLEAN); ret = transact_nodes(new_parent_nodes, {}, parent_node, nullptr, cp_ctx); } @@ -371,6 +883,49 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { delete[] tmp_buffer; return ret; } + + bnodeid_t find_true_sibling(BtreeNodePtr const& node) { + if (node == nullptr) return empty_bnodeid; + bnodeid_t sibling_id = empty_bnodeid; + if (node->has_valid_edge()) { + sibling_id = node->get_edge_value().bnode_id(); + } else { + sibling_id = node->next_bnode(); + } + if (sibling_id == empty_bnodeid) { + return empty_bnodeid; + } else { + BtreeNodePtr sibling_node; + if (read_node_impl(sibling_id, sibling_node) != btree_status_t::success) { return empty_bnodeid; } + + if (sibling_node->is_node_deleted()) { + LOGTRACEMOD(wbcache, "Sibling node [{}] is not the sibling for parent_node {}", + sibling_node->to_string(), node->to_string()); + return find_true_sibling(sibling_node); + } else { + return sibling_id; + } + } + return sibling_id; + } + + K get_last_true_child_key(BtreeNodePtr const& parent_node) { + uint32_t nentries = parent_node->total_entries() + parent_node->has_valid_edge() ? 
1 : 0; + BtreeLinkInfo cur_child_info; + BtreeNodePtr cur_child; + for (uint32_t i = nentries; i-- > 0;) { + parent_node->get_nth_value(i, &cur_child_info, false /* copy */); + if (auto ret = read_node_impl(cur_child_info.bnode_id(), cur_child); ret == btree_status_t::success) { + if (!cur_child->is_node_deleted()) { + if (cur_child->total_entries()) { + return cur_child->get_last_key< K >(); + } else { + LOGTRACEMOD(wbcache, "Last valid child {} has no entries", cur_child->to_string()); + } + } + } + } + } }; } // namespace homestore diff --git a/src/lib/index/inplace_btree/wb_cache.cpp b/src/lib/index/inplace_btree/wb_cache.cpp index 899d7475a..b888a8f71 100644 --- a/src/lib/index/inplace_btree/wb_cache.cpp +++ b/src/lib/index/inplace_btree/wb_cache.cpp @@ -43,13 +43,13 @@ IndexWBCacheBase& wb_cache() { IndexWBCache::IndexWBCache(const std::shared_ptr< VirtualDev >& vdev, std::pair< meta_blk*, sisl::byte_view > sb, const std::shared_ptr< sisl::Evictor >& evictor, uint32_t node_size) : m_vdev{vdev}, - m_cache{evictor, 100000, node_size, + m_cache{evictor, HS_DYNAMIC_CONFIG(generic.cache_hashmap_nbuckets), node_size, [](const BtreeNodePtr& node) -> BlkId { return static_cast< IndexBtreeNode* >(node.get())->m_idx_buf->m_blkid; }, [](const sisl::CacheRecord& rec) -> bool { const auto& hnode = (sisl::SingleEntryHashNode< BtreeNodePtr >&)rec; - return (hnode.m_value->m_refcount.test_le(1)); + return static_cast< IndexBtreeNode* >(hnode.m_value.get())->m_idx_buf->is_clean(); }}, m_node_size{node_size}, m_meta_blk{sb.first} { @@ -194,14 +194,19 @@ bool IndexWBCache::refresh_meta_buf(shared< MetaIndexBuffer >& meta_buf, CPConte return false; // meta_buf modified by a newer CP, we shouldn't overwrite that } else if (meta_buf->m_dirtied_cp_id == cp_ctx->id()) { // Modified by the same cp, no need to create new index buffer, but we only copy the superblk to the buffer + LOGTRACEMOD(wbcache, "meta buf {} is already dirtied in cp {} now is in recovery {}", meta_buf->to_string(), + cp_ctx->id(), m_in_recovery); meta_buf->copy_sb_to_buf(); + // TODO: corner case , meta buffer is dirtied by the same cp but not added to dirty list due to previously + // recovery mode } else { // We always create a new meta index buffer on every meta buf update, which copies the superblk auto new_buf = std::make_shared< MetaIndexBuffer >(meta_buf); new_buf->m_dirtied_cp_id = cp_ctx->id(); write_buf(nullptr, new_buf, cp_ctx); meta_buf = new_buf; // Replace the meta_buf with new buf - LOGTRACEMOD(wbcache, "meta buf {} is created in cp {}", meta_buf->to_string(), cp_ctx->id()); + LOGTRACEMOD(wbcache, "meta buf {} is created in cp {} in recovery = {}", meta_buf->to_string(), cp_ctx->id(), + m_in_recovery); } return true; } @@ -211,39 +216,55 @@ static void set_crash_flips(IndexBufferPtr const& parent_buf, IndexBufferPtr con IndexBufferPtrList const& new_node_bufs, IndexBufferPtrList const& freed_node_bufs) { // TODO: Need an API from flip to quickly check if flip is enabled, so this method doesn't check flip_enabled a // bunch of times. + // TODO: Need an API to check if a flip is triggered easilly to avoid the use of several atomics. 
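Reviewer note: the TODO above asks for a cheaper way to detect a triggered flip; independent of that, the flip branches that follow all repeat the same three steps (test the flip, set the crash flag, arm the crash simulator). A small wrapper built only from calls already used in this diff could collapse each branch to one line; arm_crash_flip below is a sketch, not an existing API.

// Hypothetical wrapper (not part of the patch): probe a crash flip and, if it
// fired, mark the buffer and arm the crash simulator in one place.
static bool arm_crash_flip(const char* flip_name, IndexBufferPtr const& buf) {
    if ((buf == nullptr) || !iomgr_flip::instance()->test_flip(flip_name)) { return false; }
    buf->set_crash_flag();
    hs()->crash_simulator().set_will_crash(true);
    return true;
}

With such a helper each case would read, for example, if (arm_crash_flip("crash_flush_on_split_at_parent", parent_buf)) {} else if (...), which also keeps the set_will_crash bookkeeping from being missed in future branches.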
if (parent_buf && parent_buf->is_meta_buf()) { // Split or merge happening on root if (iomgr_flip::instance()->test_flip("crash_flush_on_meta")) { parent_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_root")) { child_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } } else if ((new_node_bufs.size() == 1) && freed_node_bufs.empty()) { // Its a split node situation if (iomgr_flip::instance()->test_flip("crash_flush_on_split_at_parent")) { parent_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_split_at_left_child")) { child_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_split_at_right_child")) { new_node_bufs[0]->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } } else if (!freed_node_bufs.empty() && (new_node_bufs.size() != freed_node_bufs.size())) { // Its a merge nodes sitation if (iomgr_flip::instance()->test_flip("crash_flush_on_merge_at_parent")) { parent_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_merge_at_left_child")) { child_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_merge_at_right_child")) { - if (!new_node_bufs.empty()) { new_node_bufs[0]->set_crash_flag(); } + if (!new_node_bufs.empty()) { + new_node_bufs[0]->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); + } } } else if (!freed_node_bufs.empty() && (new_node_bufs.size() == freed_node_bufs.size())) { // Its a rebalance node situation if (iomgr_flip::instance()->test_flip("crash_flush_on_rebalance_at_parent")) { parent_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_rebalance_at_left_child")) { child_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_rebalance_at_right_child")) { - if (!new_node_bufs.empty()) { new_node_bufs[0]->set_crash_flag(); } + if (!new_node_bufs.empty()) { + new_node_bufs[0]->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); + } } } } @@ -282,18 +303,52 @@ void IndexWBCache::transact_bufs(uint32_t index_ordinal, IndexBufferPtr const& p } icp_ctx->add_to_txn_journal(index_ordinal, parent_buf, nullptr, {child_buf}, {}); } else { - icp_ctx->add_to_txn_journal(index_ordinal, // Ordinal - child_buf->m_up_buffer, // real up buffer - new_node_bufs.empty() ? freed_node_bufs[0]->m_up_buffer - : new_node_bufs[0]->m_up_buffer, // real in place child - new_node_bufs, // new node bufs - freed_node_bufs // free_node_bufs - ); + icp_ctx->add_to_txn_journal(index_ordinal, child_buf->m_up_buffer /* real up buffer */, child_buf, + new_node_bufs, freed_node_bufs); + } +#ifdef _PRERELEASE + // log new nodes and freed nodes and parent and child + static uint32_t txn_id = 0; + static int last_cp_id = -2; + static std::string txn = ""; + if (last_cp_id != icp_ctx->id()) { + last_cp_id = icp_ctx->id(); + txn_id = 0; + txn = ""; } + + if (new_node_bufs.empty() && freed_node_bufs.empty()) { + fmt::format_to(std::back_inserter(txn), "\n{} - parent=[{}] child=[{}] new=[{}] freed=[{}]", txn_id, + (parent_buf && parent_buf->blkid().to_integer() != 0) + ? 
std::to_string(parent_buf->blkid().to_integer()) + : "empty", + child_buf->blkid().to_integer(), "empty", "empty"); + } else { + std::string new_nodes; + for (auto const& buf : new_node_bufs) { + new_nodes += std::to_string(buf->blkid().to_integer()) + ", "; + } + std::string freed_nodes; + for (auto const& buf : freed_node_bufs) { + freed_nodes += std::to_string(buf->blkid().to_integer()) + ", "; + } + std::string parent_str = (parent_buf && parent_buf->blkid().to_integer() != 0) + ? std::to_string(parent_buf->blkid().to_integer()) + : "empty"; + std::string child_str = (child_buf && child_buf->blkid().to_integer() != 0) + ? std::to_string(child_buf->blkid().to_integer()) + : "empty"; + + fmt::format_to(std::back_inserter(txn), "\n{} - parent={} child={} new=[{}] freed=[{}]", txn_id, parent_str, + child_str, new_nodes, freed_nodes); + } + LOGTRACEMOD(wbcache, "\ttranasction till now: cp: {} \n{}\n", icp_ctx->id(), txn); + txn_id++; +#endif #if 0 static int id = 0; - auto filename = "transact_bufs_"+std::to_string(id++)+ "_" +std::to_string(rand()%100)+".dot"; - LOGINFO("Transact cp is in cp\n{} and storing in {}\n\n\n", icp_ctx->to_string(), filename); + auto filename = fmt::format("txn_buf_{}_{}.dot", icp_ctx->id(), id++); + LOGTRACEMOD(wbcache,"Writing txn to file: {}", filename); icp_ctx->to_string_dot(filename); #endif } @@ -355,25 +410,20 @@ void IndexWBCache::link_buf(IndexBufferPtr const& up_buf, IndexBufferPtr const& HS_DBG_ASSERT((real_up_buf->m_dirtied_cp_id == down_buf->m_dirtied_cp_id) || (real_up_buf->is_meta_buf()), "Up buffer is not modified by current cp, but down buffer is linked to it"); #ifndef NDEBUG - bool found{false}; - for (auto const& dbuf : real_up_buf->m_down_buffers) { - if (dbuf.lock() == down_buf) { - found = true; - break; - } - } - HS_DBG_ASSERT(found, "Down buffer is linked to Up buf, but up_buf doesn't have down_buf in its list"); + HS_DBG_ASSERT(real_up_buf->is_in_down_buffers(down_buf), + "Down buffer is linked to Up buf, but up_buf doesn't have down_buf in its list"); #endif return; } } // Now we link the down_buffer to the real up_buffer - real_up_buf->m_wait_for_down_buffers.increment(1); + if (down_buf->m_up_buffer) { + // release existing up_buffer's wait count + down_buf->m_up_buffer->remove_down_buffer(down_buf); + } down_buf->m_up_buffer = real_up_buf; -#ifndef NDEBUG - real_up_buf->m_down_buffers.emplace_back(down_buf); -#endif + real_up_buf->add_down_buffer(down_buf); } void IndexWBCache::free_buf(const IndexBufferPtr& buf, CPContext* cp_ctx) { @@ -382,12 +432,92 @@ void IndexWBCache::free_buf(const IndexBufferPtr& buf, CPContext* cp_ctx) { bool done = m_cache.remove(buf->m_blkid, node); HS_REL_ASSERT_EQ(done, true, "Race on cache removal of btree blkid?"); } - + buf->m_node_freed = true; resource_mgr().inc_free_blk(m_node_size); m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(cp_ctx)); } //////////////////// Recovery Related section ///////////////////////////////// +void IndexWBCache::load_buf(IndexBufferPtr const& buf) { + if (buf->m_bytes == nullptr) { + buf->m_bytes = hs_utils::iobuf_alloc(m_node_size, sisl::buftag::btree_node, m_vdev->align_size()); + m_vdev->sync_read(r_cast< char* >(buf->m_bytes), m_node_size, buf->blkid()); + buf->m_dirtied_cp_id = BtreeNode::get_modified_cp_id(buf->m_bytes); + } +} + +struct DagNode { + IndexBufferPtr buffer; + std::vector< shared< DagNode > > children; +}; + +using DagPtr = std::shared_ptr< DagNode >; +using DagMap = std::map< IndexBufferPtr, DagPtr >; + +static DagMap 
generate_dag_buffers(std::map< BlkId, IndexBufferPtr >& bufmap) { + std::vector< IndexBufferPtr > bufs; + std::ranges::transform(bufmap, std::back_inserter(bufs), [](const auto& pair) { return pair.second; }); + + auto buildReverseMapping = [](const std::vector< IndexBufferPtr >& buffers) { + std::unordered_map< IndexBufferPtr, std::vector< IndexBufferPtr > > parentToChildren; + for (const auto& buffer : buffers) { + if (buffer->m_up_buffer) { parentToChildren[buffer->m_up_buffer].push_back(buffer); } + } + return parentToChildren; + }; + + std::function< DagPtr(IndexBufferPtr, std::unordered_map< IndexBufferPtr, std::vector< IndexBufferPtr > >&) > + buildDag; + buildDag = + [&buildDag](IndexBufferPtr buffer, + std::unordered_map< IndexBufferPtr, std::vector< IndexBufferPtr > >& parentToChildren) -> DagPtr { + auto dagNode = std::make_shared< DagNode >(); + dagNode->buffer = buffer; + if (parentToChildren.count(buffer)) { + for (const auto& child : parentToChildren[buffer]) { + dagNode->children.push_back(buildDag(child, parentToChildren)); + } + } + return dagNode; + }; + + auto generateDagMap = [&](const std::vector< IndexBufferPtr >& buffers) { + DagMap dagMap; + auto parentToChildren = buildReverseMapping(buffers); + for (const auto& buffer : buffers) { + if (!buffer->m_up_buffer) { // This is a root buffer + auto dagRoot = buildDag(buffer, parentToChildren); + dagMap[buffer] = dagRoot; + } + } + return dagMap; + }; + + return generateDagMap(bufs); +} + +static std::string to_string_dag_bufs(DagMap& dags, cp_id_t cp_id = 0) { + std::string str{fmt::format("#_of_dags={}\n", dags.size())}; + int cnt = 1; + for (const auto& [_, dag] : dags) { + std::vector< std::tuple< std::shared_ptr< DagNode >, int, int > > stack; + stack.emplace_back(dag, 0, cnt++); + while (!stack.empty()) { + auto [node, level, index] = stack.back(); + stack.pop_back(); + auto snew = node->buffer->m_created_cp_id == cp_id ? "NEW" : ""; + auto sfree = node->buffer->m_node_freed ? "FREED" : ""; + fmt::format_to(std::back_inserter(str), "{}{}-{} {} {}\n", std::string(level * 4, ' '), index, + node->buffer->to_string(), snew, sfree); + int c = node->children.size(); + for (const auto& d : node->children) { + stack.emplace_back(d, level + 1, c--); + } + } + } + return str; +} + void IndexWBCache::recover(sisl::byte_view sb) { // If sb is empty, its possible a first time boot. if ((sb.bytes() == nullptr) || (sb.size() == 0)) { @@ -406,6 +536,31 @@ void IndexWBCache::recover(sisl::byte_view sb) { LOGINFOMOD(wbcache, "Detected unclean shutdown, prior cp={} had to flush {} nodes, recovering... 
", icp_ctx->id(), bufs.size()); +#ifdef _PRERELEASE + auto detailed_log = [this](std::map< BlkId, IndexBufferPtr > const& bufs, + std::vector< IndexBufferPtr > const& pending_bufs) { + std::string log = fmt::format("\trecovered bufs (#of bufs = {})\n", bufs.size()); + for (auto const& [_, buf] : bufs) { + load_buf(buf); + fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); + } + + // list of new_bufs + if (!pending_bufs.empty()) { + fmt::format_to(std::back_inserter(log), "\n\tpending_bufs (#of bufs = {})\n", pending_bufs.size()); + for (auto const& buf : pending_bufs) { + fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); + } + } + return log; + }; + + std::string log = fmt::format("Recovering bufs (#of bufs = {}) before processing them\n", bufs.size()); + LOGTRACEMOD(wbcache, "{}\n{}", log, detailed_log(bufs, {})); + auto dags = generate_dag_buffers(bufs); + LOGTRACEMOD(wbcache, "Before recovery: {}", to_string_dag_bufs(dags, icp_ctx->id())); +#endif + // At this point, we have the DAG structure (up/down dependency graph), exactly the same as prior to crash, with one // addition of all freed buffers also put in the DAG structure. // @@ -416,6 +571,68 @@ void IndexWBCache::recover(sisl::byte_view sb) { // This has to be done before doing any repair, because repair can allocate blkids and we don't want to allocate // the same blkid which could clash with the blkid next in the buf list. // + // On the second pass, we only take part of the parents/siblings and then repair them, if needed. + std::vector< IndexBufferPtr > pending_bufs; + std::vector< IndexBufferPtr > deleted_bufs; + for (auto const& [_, buf] : bufs) { + if (buf->m_node_freed) { + // Freed node + load_buf(buf); + if (was_node_committed(buf)) { + // Mark this buffer as deleted, so that we can avoid using it anymore when repairing its parent's link + r_cast< persistent_hdr_t* >(buf->m_bytes)->node_deleted = true; + write_buf(nullptr, buf, icp_ctx); + deleted_bufs.push_back(buf); + pending_bufs.push_back(buf->m_up_buffer); + } else { + // (Up) buffer is not committed, node need to be kept and (potentially) repaired later + buf->m_node_freed = false; + if (buf->m_created_cp_id == icp_ctx->id()) { + // New nodes need to be commited first + m_vdev->commit_blk(buf->m_blkid); + // it can happen when children moved to one of right parent sibling and then the previous node is + // deleted but not commited during crash (upbuffer is not committed). but its children already + // committed. and freed (or changed) + if (buf->m_node_level) { potential_parent_recovered_bufs.insert(buf); } + } else { + LOGINFO("deleting and creating new buf {}", buf->to_string()); + deleted_bufs.push_back(buf); + } + // 1- upbuffer was dirtied by the same cp, so it is not commited, so we don't need to repair it. + // remove it from down_waiting list (probably recursively going up) 2- upbuffer was created and + // freed at the same cp, so it is not commited, so we don't need to repair it. + if (buf->m_up_buffer) { + LOGTRACEMOD(wbcache, "remove_down_buffer {} from up buffer {}", buf->to_string(), + buf->m_up_buffer->to_string()); + buf->m_up_buffer->remove_down_buffer(buf); + if (buf->m_up_buffer->m_wait_for_down_buffers.testz()) { + // if up buffer has upbuffer, then we need to decrement its wait_for_down_buffers + LOGINFOMOD(wbcache, + "\n\npruning up_buffer due to zero dependency of child\n up buffer {}\n buffer {}", + buf->m_up_buffer ? 
buf->m_up_buffer->to_string() : std::string("nullptr"), + buf->to_string()); + update_up_buffer_counters(buf->m_up_buffer /*,visited_bufs*/); + } + buf->m_up_buffer = nullptr; + } + pending_bufs.push_back(buf); + buf->m_wait_for_down_buffers.increment(1); // Purely for recover_buf() counter consistency + } + } else if (buf->m_created_cp_id == icp_ctx->id()) { + // New node + if (was_node_committed(buf) && was_node_committed(buf->m_up_buffer)) { + // Both current and up buffer is commited, we can safely commit the current block + m_vdev->commit_blk(buf->m_blkid); + pending_bufs.push_back(buf->m_up_buffer); + } else { + // Up buffer is not committed, we need to repair it first + buf->m_up_buffer->remove_down_buffer(buf); + // buf->m_up_buffer = nullptr; + if (buf->m_up_buffer->m_wait_for_down_buffers.testz()) { + // if up buffer has upbuffer, then we need to decrement its wait_for_down_buffers + update_up_buffer_counters(buf->m_up_buffer); + } + // On the second pass, we only take the new nodes/bufs and then repair their up buffers, if needed. std::vector< IndexBufferPtr > l0_bufs; for (auto const& [_, buf] : bufs) { @@ -436,38 +653,65 @@ void IndexWBCache::recover(sisl::byte_view sb) { } } +#ifdef _PRERELEASE LOGINFOMOD(wbcache, "Index Recovery detected {} nodes out of {} as new/freed nodes to be recovered in prev cp={}", - l0_bufs.size(), bufs.size(), icp_ctx->id()); - - auto detailed_log = [this](std::map< BlkId, IndexBufferPtr > const& bufs, - std::vector< IndexBufferPtr > const& l0_bufs) { - // Logs to detect down_waits are set correctly for up buffers list of all recovered bufs - std::string log = fmt::format("\trecovered bufs (#of bufs = {})\n", bufs.size()); - for (auto const& [_, buf] : bufs) { - fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); - } + pending_bufs.size(), bufs.size(), icp_ctx->id()); + LOGTRACEMOD(wbcache, "All unclean bufs list\n{}", detailed_log(bufs, pending_bufs)); + LOGTRACEMOD(wbcache, "After recovery: {}", to_string_dag_bufs(dags, icp_ctx->id())); +#endif - // list of new_bufs - fmt::format_to(std::back_inserter(log), "\n\tl0_bufs (#of bufs = {})\n", l0_bufs.size()); - for (auto const& buf : l0_bufs) { - fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); + uint32_t cnt = 0; + LOGTRACEMOD(wbcache, "Potential parent recovered bufs (#of bufs = {})", potential_parent_recovered_bufs.size()); + for (auto const& buf : potential_parent_recovered_bufs) { + LOGTRACEMOD(wbcache, " {} - check stale recovered buf {}", cnt++, buf->to_string()); + } + // This step is needed since there is a case where all(or some) children of an interior node is freed (after moving + // to a previous sibling parent) and after crash, this node has stale links to its children + cnt = 0; + std::vector< IndexBufferPtr > buffers_to_repair; + for (auto const& buf : potential_parent_recovered_bufs) { + LOGTRACEMOD(wbcache, " {} - potential parent recovered buf {}", cnt, buf->to_string()); + parent_recover(buf); + if (buf->m_bytes == nullptr || r_cast< persistent_hdr_t* >(buf->m_bytes)->node_deleted) { + // This buffer was marked as deleted during repair, so we also need to free it + deleted_bufs.push_back(buf); + } else { + // This buffer was not marked as deleted during repair, so we need to repair it + buffers_to_repair.push_back(buf); } - return log; - }; - LOGTRACEMOD(wbcache, "All unclean bufs list\n{}", detailed_log(bufs, l0_bufs)); + } - // Second iteration we start from the lowest levels (which are all new_bufs) and check if up_buffers need to be - // 
repaired. All L1 buffers are not needed to repair, because they are sibling nodes and so we pass false in - // do_repair flag. - for (auto const& buf : l0_bufs) { - recover_buf(buf->m_up_buffer); + for (auto const& buf : deleted_bufs) { + m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(icp_ctx)); } + m_in_recovery = false; m_vdev->recovery_completed(); } +// if buf->m_wait_for_down_buffers.testz() is true (which means that it has no dependency on any other buffer) then we +// can decrement the wait_for_down_buffers of its up buffer. If the up buffer has up buffer, then we need to decrement +// its wait_for_down_buffers. If the up buffer of up buffer has wait_for_down_buffers as 0, then we need to decrement +// its wait_for_down_buffers. This process continues until we reach the root buffer. If the root buffer has +// wait_for_down_buffers as 0, then we need to decrement its wait_for_down_buffers. +void IndexWBCache::update_up_buffer_counters(IndexBufferPtr const& buf) { + if (buf == nullptr || !buf->m_wait_for_down_buffers.testz() || buf->m_up_buffer == nullptr) { + LOGINFOMOD(wbcache, "Finish decrementing wait_for_down_buffers"); + return; + } + auto grand_buf = buf->m_up_buffer; + grand_buf->remove_down_buffer(buf); + LOGINFOMOD(wbcache, + "Decrementing wait_for_down_buffers for buffer {} due to zero dependency of child {}, Keep going up", + grand_buf->to_string(), buf->to_string()); + update_up_buffer_counters(grand_buf); +} + void IndexWBCache::recover_buf(IndexBufferPtr const& buf) { - if (!buf->m_wait_for_down_buffers.decrement_testz()) { return; } + if (!buf->m_wait_for_down_buffers.decrement_testz()) { + // TODO: remove the buf_>m_up_buffer from down_buffers list of buf->m_up_buffer + return; + } // All down buffers are completed and given a nod saying that they are committed. If this buffer is not committed, // then we need to repair this node/buffer. After that we will keep going to the next up level to repair them if @@ -478,6 +722,12 @@ void IndexWBCache::recover_buf(IndexBufferPtr const& buf) { } else { LOGTRACEMOD(wbcache, "Index Recovery detected up node [{}] as committed no need to repair that", buf->to_string()); + if (buf->m_up_buffer && buf->m_up_buffer->is_meta_buf()) { + // Our up buffer is a meta buffer, which means old root is dirtied and may need no repair but possible of + // new root on upper level so needs to be retore the edge + LOGTRACEMOD(wbcache, "check root change for without repairing {}", buf->to_string()); + index_service().update_root(buf->m_index_ordinal, buf); + } } if (buf->m_up_buffer) { recover_buf(buf->m_up_buffer); } @@ -493,21 +743,21 @@ bool IndexWBCache::was_node_committed(IndexBufferPtr const& buf) { } // All down_buf has indicated that they have seen this up buffer, now its time to repair them. 
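Reviewer note: update_up_buffer_counters above walks the dependency chain recursively; the same pruning can be written iteratively. The fragment below is a sketch only and assumes (consistently with this diff) that remove_down_buffer also drops the up buffer's wait_for_down_buffers count.

// Iterative equivalent of update_up_buffer_counters (sketch, not in the patch):
// keep pruning upward while the current buffer no longer waits on any down buffer.
void prune_up_chain(IndexBufferPtr buf) {
    while (buf && buf->m_wait_for_down_buffers.testz() && buf->m_up_buffer) {
        auto up = buf->m_up_buffer;
        up->remove_down_buffer(buf); // assumed to decrement up->m_wait_for_down_buffers
        buf = up;
    }
}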
- if (buf->m_bytes == nullptr) { - // Read the btree node and get its modified cp_id - buf->m_bytes = hs_utils::iobuf_alloc(m_node_size, sisl::buftag::btree_node, m_vdev->align_size()); - m_vdev->sync_read(r_cast< char* >(buf->m_bytes), m_node_size, buf->blkid()); - if (!BtreeNode::is_valid_node(sisl::blob{buf->m_bytes, m_node_size})) { return false; } - - buf->m_dirtied_cp_id = BtreeNode::get_modified_cp_id(buf->m_bytes); - } - auto cpg = cp_mgr().cp_guard(); - return (buf->m_dirtied_cp_id == cpg->id()); + load_buf(buf); + if (!BtreeNode::is_valid_node(sisl::blob{buf->m_bytes, m_node_size})) { return false; } + return (buf->m_dirtied_cp_id == cp_mgr().cp_guard()->id()); } //////////////////// CP Related API section ///////////////////////////////// folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) { - LOGTRACEMOD(wbcache, "Starting Index CP Flush with cp context={}", cp_ctx->to_string_with_dags()); + LOGTRACEMOD(wbcache, "Starting Index CP Flush with cp \ndag={}\n\n cp context {}", cp_ctx->to_string_with_dags(), + cp_ctx->to_string()); + // #ifdef _PRERELEASE + // static int id = 0; + // auto filename = "cp_" + std::to_string(id++) + "_" + std::to_string(rand() % 100) + ".dot"; + // LOGTRACEMOD(wbcache, "Transact cp storing in file {}\n\n\n", filename); + // cp_ctx->to_string_dot(filename); + // #endif if (!cp_ctx->any_dirty_buffers()) { if (cp_ctx->id() == 0) { // For the first CP, we need to flush the journal buffer to the meta blk @@ -521,17 +771,20 @@ folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) { #ifdef _PRERELEASE if (hs()->crash_simulator().is_crashed()) { - LOGINFOMOD(wbcache, "crash simulation is ongoing, so skip the cp flush"); + LOGINFO("crash simulation is ongoing, so skip the cp flush"); return folly::makeFuture< bool >(true); } #endif - // First thing is to flush the new_blks created as part of the CP. + // First thing is to flush the journal created as part of the CP. 
auto const& journal_buf = cp_ctx->journal_buf(); + auto txn = r_cast< IndexCPContext::txn_journal const* >(journal_buf.cbytes()); if (journal_buf.size() != 0) { if (m_meta_blk) { + LOGTRACEMOD(wbcache, " journal {} ", txn->to_string()); meta_service().update_sub_sb(journal_buf.cbytes(), journal_buf.size(), m_meta_blk); } else { + LOGTRACEMOD(wbcache, " First time journal {} ", txn->to_string()); meta_service().add_sub_sb("wb_cache", journal_buf.cbytes(), journal_buf.size(), m_meta_blk); } } @@ -554,44 +807,44 @@ folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) { void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const& buf, bool part_of_batch) { #ifdef _PRERELEASE + static std::once_flag flag; + if (hs()->crash_simulator().is_crashed()) { + std::call_once(flag, []() { LOGINFO("Crash simulation is ongoing; aid simulation by not flushing."); }); + return; + } if (buf->m_crash_flag_on) { -// std::string filename = "crash_buf_" + std::to_string(cp_ctx->id()) + ".dot"; -// LOGINFOMOD(wbcache, "Simulating crash while writing buffer {}, stored in file {}", buf->to_string(), filename); -// cp_ctx->to_string_dot(filename); - LOGINFOMOD(wbcache, "Simulating crash while writing buffer {}", buf->to_string()); + std::string filename = "crash_buf_" + std::to_string(cp_ctx->id()) + ".dot"; + LOGINFO("Simulating crash while writing buffer {}, stored in file {}", buf->to_string(), filename); + // cp_ctx->to_string_dot(filename); hs()->crash_simulator().crash(); cp_ctx->complete(true); return; - } else if (hs()->crash_simulator().is_crashed()) { - LOGINFOMOD(wbcache, "crash simulation is ongoing, aid simulation by not flushing"); - return; } #endif - LOGTRACEMOD(wbcache, "cp={} {}", cp_ctx->id(), buf->to_string()); buf->set_state(index_buf_state_t::FLUSHING); if (buf->is_meta_buf()) { - LOGTRACEMOD(wbcache, "flushing cp {} meta buf {} possibly because of root split", cp_ctx->id(), + LOGTRACEMOD(wbcache, "Flushing cp {} meta buf {} possibly because of root split", cp_ctx->id(), buf->to_string()); - auto const& sb = r_cast< MetaIndexBuffer* >(buf.get())->m_sb; - meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk()); + auto const sb_buf = r_cast< MetaIndexBuffer* >(buf.get()); + if (sb_buf->m_valid) { + auto const& sb = sb_buf->m_sb; + if (!sb.is_empty()) { meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk()); } + } process_write_completion(cp_ctx, buf); } else if (buf->m_node_freed) { LOGTRACEMOD(wbcache, "Not flushing buf {} as it was freed, its here for merely dependency", cp_ctx->id(), buf->to_string()); process_write_completion(cp_ctx, buf); } else { - LOGTRACEMOD(wbcache, "flushing cp {} buf {} info: {}", cp_ctx->id(), buf->to_string(), - BtreeNode::to_string_buf(buf->raw_buffer())); + LOGTRACEMOD(wbcache, "Flushing cp {} buf {}", cp_ctx->id(), buf->to_string()); m_vdev->async_write(r_cast< const char* >(buf->raw_buffer()), m_node_size, buf->m_blkid, part_of_batch) .thenValue([buf, cp_ctx](auto) { try { auto& pthis = s_cast< IndexWBCache& >(wb_cache()); pthis.process_write_completion(cp_ctx, buf); - } catch (const std::runtime_error& e) { - LOGERROR("Failed to access write-back cache: {}", e.what()); - } + } catch (const std::runtime_error& e) { LOGERROR("Failed to access write-back cache: {}", e.what()); } }); if (!part_of_batch) { m_vdev->submit_batch(); } @@ -600,8 +853,10 @@ void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const void IndexWBCache::process_write_completion(IndexCPContext* cp_ctx, 
IndexBufferPtr const& buf) { #ifdef _PRERELEASE + static std::once_flag flag; if (hs()->crash_simulator().is_crashed()) { - LOGINFOMOD(wbcache, "Crash simulation is ongoing, ignore all process_write_completion"); + std::call_once( + flag, []() { LOGINFOMOD(wbcache, "Crash simulation is ongoing, ignore all process_write_completion"); }); return; } #endif @@ -635,7 +890,10 @@ std::pair< IndexBufferPtr, bool > IndexWBCache::on_buf_flush_done_internal(Index IndexBufferPtr const& buf) { IndexBufferPtrList buf_list; #ifndef NDEBUG - buf->m_down_buffers.clear(); + { + std::lock_guard lg(buf->m_down_buffers_mtx); + buf->m_down_buffers.clear(); + } #endif buf->set_state(index_buf_state_t::CLEAN); @@ -683,7 +941,7 @@ void IndexWBCache::get_next_bufs_internal(IndexCPContext* cp_ctx, uint32_t max_c std::optional< IndexBufferPtr > buf = cp_ctx->next_dirty(); if (!buf) { break; } // End of list - if ((*buf)->m_wait_for_down_buffers.testz()) { + if ((*buf)->state() == index_buf_state_t::DIRTY && (*buf)->m_wait_for_down_buffers.testz()) { bufs.emplace_back(std::move(*buf)); ++count; } else { diff --git a/src/lib/index/inplace_btree/wb_cache.hpp b/src/lib/index/inplace_btree/wb_cache.hpp index 209d3845e..7d10d7f54 100644 --- a/src/lib/index/inplace_btree/wb_cache.hpp +++ b/src/lib/index/inplace_btree/wb_cache.hpp @@ -78,5 +78,7 @@ class IndexWBCache : public IndexWBCacheBase { void recover_buf(IndexBufferPtr const& buf); bool was_node_committed(IndexBufferPtr const& buf); + void load_buf(IndexBufferPtr const& buf); + void update_up_buffer_counters(IndexBufferPtr const& buf); }; } // namespace homestore diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp index 7cae168f3..2b3f88c30 100644 --- a/src/lib/logstore/log_dev.cpp +++ b/src/lib/logstore/log_dev.cpp @@ -65,10 +65,11 @@ void LogDev::start(bool format, std::shared_ptr< JournalVirtualDev > vdev) { // First read the info block if (format) { HS_LOG_ASSERT(m_logdev_meta.is_empty(), "Expected meta to be not present"); - m_logdev_meta.create(m_logdev_id); + m_logdev_meta.create(m_logdev_id, m_flush_mode); m_vdev_jd->update_data_start_offset(0); } else { - HS_LOG_ASSERT(!m_logdev_meta.is_empty(), "Expected meta data to be read already before loading"); + HS_LOG_ASSERT(!m_logdev_meta.is_empty(), + "Expected meta data to be read already before loading this log dev id: {}", m_logdev_id); auto const store_list = m_logdev_meta.load(); // Notify to the caller that a new log store was reserved earlier and it is being loaded, with its meta info @@ -133,6 +134,7 @@ void LogDev::stop() { m_log_idx.store(0); m_pending_flush_size.store(0); m_last_flush_idx = -1; + m_last_flush_ld_key = logdev_key{0, 0}; m_last_truncate_idx = -1; m_last_crc = INVALID_CRC32_VALUE; @@ -144,10 +146,37 @@ void LogDev::stop() { m_hs.reset(); } -bool LogDev::is_stopped() { - std::unique_lock lg = flush_guard(); - return m_stopped; +#if 0 +void LogDev::stop() { + start_stopping(); + while (true) { + if (!get_pending_request_num()) break; + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + { + std::unique_lock lg = flush_guard(); + // waiting under lock to make sure no new flush is started + while (m_pending_callback.load() > 0) { + THIS_LOGDEV_LOG(INFO, "Waiting for pending callbacks to complete, pending callbacks {}", + m_pending_callback.load()); + std::this_thread::sleep_for(std::chrono::milliseconds{1000}); + } + } + + folly::SharedMutexWritePriority::ReadHolder holder(m_store_map_mtx); + for (auto& [_, store] : m_id_logstore_map) { + 
store.log_store->stop(); + } + + // after we call stop, we need to do any pending device truncations + truncate(); + m_id_logstore_map.clear(); + if (allow_timer_flush()) { + auto f = stop_timer(); + std::move(f).get(); + } } +#endif void LogDev::destroy() { THIS_LOGDEV_LOG(INFO, "Logdev destroy metablks log_dev={}", m_logdev_id); @@ -164,12 +193,19 @@ void LogDev::start_timer() { }); } -void LogDev::stop_timer() { - if (m_flush_timer_hdl != iomgr::null_timer_handle) { - // cancel the timer - iomanager.run_on_wait(logstore_service().flush_thread(), - [this]() { iomanager.cancel_timer(m_flush_timer_hdl, true); }); - } +folly::Future< int > LogDev::stop_timer() { + // return future to the caller; + // this future will be completed when the timer is stopped + auto p = std::make_shared< folly::Promise< int > >(); + auto f = p->getFuture(); + iomanager.run_on_forget(logstore_service().flush_thread(), [this, p]() mutable { + if (m_flush_timer_hdl != iomgr::null_timer_handle) { + iomanager.cancel_timer(m_flush_timer_hdl, true); + m_flush_timer_hdl = iomgr::null_timer_handle; + } + p->setValue(0); + }); + return f; } void LogDev::do_load(off_t device_cursor) { @@ -202,8 +238,7 @@ void LogDev::do_load(off_t device_cursor) { // Loop through each record within the log group and do a callback decltype(header->nrecords()) i{0}; HS_REL_ASSERT_GT(header->nrecords(), 0, "nrecords greater then zero"); - const auto flush_ld_key = - logdev_key{header->start_idx() + header->nrecords(), group_dev_offset + header->total_size()}; + const auto flush_ld_key = logdev_key{header->start_idx(), group_dev_offset}; while (i < header->nrecords()) { const auto* rec = header->nth_record(i); const uint32_t data_offset = (rec->offset + (rec->get_inlined() ? 0 : header->oob_data_offset)); @@ -262,6 +297,7 @@ int64_t LogDev::append_async(logstore_id_t store_id, logstore_seq_num_t seq_num, } log_buffer LogDev::read(const logdev_key& key) { + std::unique_lock lg = flush_guard(); auto buf = sisl::make_byte_array(initial_read_size, m_flush_size_multiple, sisl::buftag::logread); auto ec = m_vdev_jd->sync_pread(buf->bytes(), initial_read_size, key.dev_offset); if (ec) { @@ -290,6 +326,7 @@ log_buffer LogDev::read(const logdev_key& key) { } void LogDev::read_record_header(const logdev_key& key, serialized_log_record& return_record_header) { + std::unique_lock lg = flush_guard(); auto buf = sisl::make_byte_array(initial_read_size, m_flush_size_multiple, sisl::buftag::logread); auto ec = m_vdev_jd->sync_pread(buf->bytes(), initial_read_size, key.dev_offset); if (ec) LOGERROR("Failed to read from Journal vdev log_dev={} {} {}", m_logdev_id, ec.value(), ec.message()); @@ -497,11 +534,12 @@ void LogDev::on_flush_completion(LogGroup* lg) { free_log_group(lg); m_log_records->truncate(upto_indx); m_last_flush_idx = upto_indx; + m_last_flush_ld_key = logdev_key{from_indx, dev_offset}; // since we support out-of-order lsn write, so no need to guarantee the order of logstore write completion for (auto const& [idx, req] : req_map) { m_pending_callback++; - iomanager.run_on_forget(iomgr::reactor_regex::random_worker, iomgr::fiber_regex::syncio_only, + iomanager.run_on_forget(iomgr::reactor_regex::random_worker, /* iomgr::fiber_regex::syncio_only, */ [this, dev_offset, idx, req]() { auto ld_key = logdev_key{idx, dev_offset}; auto comp_cb = req->log_store->get_comp_cb(); @@ -526,20 +564,31 @@ uint64_t LogDev::truncate() { auto lstore = store.log_store; if (lstore == nullptr) { continue; } auto const [trunc_lsn, trunc_ld_key, tail_lsn] = 
lstore->truncate_info(); - if (trunc_lsn == tail_lsn) { - THIS_LOGDEV_LOG(DEBUG, "Store_id={} didn't have any writes since last truncation, skipping ", store_id); - m_logdev_meta.remove_all_rollback_records(store_id, m_stopped /* persist_now */); - continue; - } - HS_DBG_ASSERT_GE(trunc_ld_key.idx, m_last_truncate_idx, "Trying to truncate logid which is already truncated"); m_logdev_meta.update_store_superblk(store_id, logstore_superblk(trunc_lsn + 1), m_stopped /* persist_now */); - // We found a new minimum logdev_key that we can truncate to - if (trunc_ld_key.idx > 0 && trunc_ld_key.idx < min_safe_ld_key.idx) { min_safe_ld_key = trunc_ld_key; } + if (trunc_ld_key.idx < min_safe_ld_key.idx) { min_safe_ld_key = trunc_ld_key; } + } + + // All log stores are empty, we can truncate logs depends on the last flushed logdev_key + if (min_safe_ld_key == logdev_key::out_of_bound_ld_key()) { + min_safe_ld_key = m_last_flush_ld_key; } // There are no writes or no truncation called for any of the store, so we can't truncate anything - if (min_safe_ld_key == logdev_key::out_of_bound_ld_key() || min_safe_ld_key.idx <= m_last_truncate_idx) return 0; + if (min_safe_ld_key.idx <= 0 || min_safe_ld_key.idx <= m_last_truncate_idx) { + // Persist the logstore superblock to ensure correct start LSN during recovery. Avoid such scenario: + // 1. Follower1 appends logs up to 100, then is stopped by a sigkill. + // 2. Upon restart, a baseline resync is triggered using snapshot 2000. + // 3. Baseline resync completed with start_lsn=2001, but m_trunc_ld_key remains {0,0} since we cannot get a + // valid + // device offset for LSN 2000 to update it. + // 4. Follower1 appends logs from 2001 to 2500, making tail_lsn > 2000. + // 5. Get m_trunc_ld_key={0,0}, goto here and return 0 without persist. + // 6. Follower1 is killed again, after restart, its start index remains 0, misinterpreting the range as + // [1,2500]. 
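Reviewer note: the new truncation decision (fall back to the last flushed logdev_key when every store is empty, and persist the per-store superblocks before returning 0 to avoid the resync scenario described above) can be summarized on its own. pick_truncation_point below is a hypothetical helper, not code from this patch.

// Hypothetical summary of the decision (not in the patch): returns true only if
// there is something safe to truncate, after applying the same fallbacks as above.
static bool pick_truncation_point(logdev_key& min_key, logdev_key const& last_flushed, logid_t last_truncate_idx) {
    if (min_key == logdev_key::out_of_bound_ld_key()) { min_key = last_flushed; }
    return (min_key.idx > 0) && (min_key.idx > last_truncate_idx);
}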
+ m_logdev_meta.persist(); + return 0; + } uint64_t const num_records_to_truncate = uint64_cast(min_safe_ld_key.idx - m_last_truncate_idx); @@ -615,7 +664,9 @@ std::shared_ptr< HomeLogStore > LogDev::create_new_log_store(bool append_mode) { return lstore; } -folly::Future< shared< HomeLogStore > > LogDev::open_log_store(logstore_id_t store_id, bool append_mode) { +folly::Future< shared< HomeLogStore > > LogDev::open_log_store(logstore_id_t store_id, bool append_mode, + log_found_cb_t log_found_cb, + log_replay_done_cb_t log_replay_done_cb) { folly::SharedMutexWritePriority::WriteHolder holder(m_store_map_mtx); auto it = m_id_logstore_map.find(store_id); if (it == m_id_logstore_map.end()) { @@ -624,6 +675,8 @@ folly::Future< shared< HomeLogStore > > LogDev::open_log_store(logstore_id_t sto logstore_info{ .log_store = nullptr, .append_mode = append_mode, + .log_found_cb = log_found_cb, + .log_replay_done_cb = log_replay_done_cb, })); HS_REL_ASSERT_EQ(happened, true, "Unable to insert logstore into id_logstore_map"); } @@ -635,7 +688,10 @@ void LogDev::remove_log_store(logstore_id_t store_id) { { folly::SharedMutexWritePriority::WriteHolder holder(m_store_map_mtx); auto ret = m_id_logstore_map.erase(store_id); - HS_REL_ASSERT((ret == 1), "try to remove invalid store_id {}-{}", m_logdev_id, store_id); + if (ret == 0) { + LOGWARN("try to remove invalid store_id {}-{}", m_logdev_id, store_id); + return; + } } unreserve_store_id(store_id); } @@ -656,6 +712,8 @@ void LogDev::on_log_store_found(logstore_id_t store_id, const logstore_superblk& logstore_info& info = it->second; info.log_store = std::make_shared< HomeLogStore >(shared_from_this(), store_id, info.append_mode, sb.m_first_seq_num); + info.log_store->register_log_found_cb(info.log_found_cb); + info.log_store->register_log_replay_done_cb(info.log_replay_done_cb); info.promise.setValue(info.log_store); } @@ -729,7 +787,7 @@ nlohmann::json LogDev::get_status(int verbosity) const { /////////////////////////////// LogDevMetadata Section /////////////////////////////////////// LogDevMetadata::LogDevMetadata() : m_sb{logdev_sb_meta_name}, m_rollback_sb{logdev_rollback_sb_meta_name} {} -logdev_superblk* LogDevMetadata::create(logdev_id_t id) { +logdev_superblk* LogDevMetadata::create(logdev_id_t id, flush_mode_t flush_mode) { logdev_superblk* sb = m_sb.create(logdev_sb_size_needed(0)); rollback_superblk* rsb = m_rollback_sb.create(rollback_superblk::size_needed(1)); @@ -738,6 +796,7 @@ logdev_superblk* LogDevMetadata::create(logdev_id_t id) { m_id_reserver = std::make_unique< sisl::IDReserver >(); m_sb->logdev_id = id; + m_sb->flush_mode = flush_mode; m_sb.write(); m_rollback_sb->logdev_id = id; diff --git a/src/lib/logstore/log_dev.hpp b/src/lib/logstore/log_dev.hpp index cf09e57bc..f3cc03f1d 100644 --- a/src/lib/logstore/log_dev.hpp +++ b/src/lib/logstore/log_dev.hpp @@ -404,6 +404,8 @@ struct logdev_superblk { uint32_t num_stores{0}; uint64_t start_dev_offset{0}; logid_t key_idx{0}; + flush_mode_t flush_mode; + // The meta data starts immediately after the super block // Equivalent of: // logstore_superblk meta[0]; @@ -481,7 +483,7 @@ class LogDevMetadata { LogDevMetadata& operator=(LogDevMetadata&&) noexcept = delete; ~LogDevMetadata() = default; - logdev_superblk* create(logdev_id_t id); + logdev_superblk* create(logdev_id_t id, flush_mode_t); void reset(); std::vector< std::pair< logstore_id_t, logstore_superblk > > load(); void persist(); @@ -564,18 +566,14 @@ class log_stream_reader { struct logstore_info { std::shared_ptr< HomeLogStore > 
log_store; bool append_mode; + log_found_cb_t log_found_cb{nullptr}; + log_replay_done_cb_t log_replay_done_cb{nullptr}; folly::SharedPromise< std::shared_ptr< HomeLogStore > > promise{}; }; static std::string const logdev_sb_meta_name{"Logdev_sb"}; static std::string const logdev_rollback_sb_meta_name{"Logdev_rollback_sb"}; -VENUM(flush_mode_t, uint32_t, // Various flush modes (can be or'ed together) - INLINE = 1 << 0, // Allow flush inline with the append - TIMER = 1 << 1, // Allow timer based automatic flush - EXPLICIT = 1 << 2, // Allow explcitly user calling flush -); - class LogDev : public std::enable_shared_from_this< LogDev > { friend class HomeLogStore; @@ -708,7 +706,9 @@ class LogDev : public std::enable_shared_from_this< LogDev > { /// @param append_mode Is this log store is append mode or not. If append mode, write_async call fails and only /// append_async calls succeed. /// @return future< shared< HomeLogStore > > : Future which will be set with the log store once it is opened - folly::Future< shared< HomeLogStore > > open_log_store(logstore_id_t store_id, bool append_mode); + folly::Future< shared< HomeLogStore > > open_log_store(logstore_id_t store_id, bool append_mode, + log_found_cb_t log_found_cb = nullptr, + log_replay_done_cb_t log_replay_done_cb = nullptr); /// @brief Remove the log store and its associated resources /// @param store_id Store id that was created/opened @@ -727,7 +727,7 @@ class LogDev : public std::enable_shared_from_this< LogDev > { private: void start_timer(); - void stop_timer(); + folly::Future< int > stop_timer(); bool allow_inline_flush() const { return uint32_cast(m_flush_mode) & uint32_cast(flush_mode_t::INLINE); } bool allow_timer_flush() const { return uint32_cast(m_flush_mode) & uint32_cast(flush_mode_t::TIMER); } @@ -791,8 +791,9 @@ class LogDev : public std::enable_shared_from_this< LogDev > { std::multimap< logid_t, logstore_id_t > m_garbage_store_ids; Clock::time_point m_last_flush_time; - logid_t m_last_flush_idx{-1}; // Track last flushed, last device offset and truncated log idx - logid_t m_last_truncate_idx{std::numeric_limits< logid_t >::min()}; // logdev truncate up to this idx + logid_t m_last_flush_idx{-1}; // Track last flushed, last device offset and truncated log idx + logdev_key m_last_flush_ld_key{0,0}; // Left interval of the last flush, 0 indicates the very beginning of logdev + logid_t m_last_truncate_idx{-1}; // Logdev truncate up to this idx crc32_t m_last_crc{INVALID_CRC32_VALUE}; // LogDev Info block related fields diff --git a/src/lib/logstore/log_store.cpp b/src/lib/logstore/log_store.cpp index e2ea0f333..1e3a1bea6 100644 --- a/src/lib/logstore/log_store.cpp +++ b/src/lib/logstore/log_store.cpp @@ -188,12 +188,27 @@ void HomeLogStore::truncate(logstore_seq_num_t upto_lsn, bool in_memory_truncate #endif + // In normal write and compact path, upto_lsn is expected to be no larger than m_tail_lsn after the flush. + // So upto_lsn > m_tail_lsn is expected to exist only in baseline resync path. + // In baseline resync path, we truncate all entries up to upto_lsn, and update m_tail_lsn and m_next_lsn + // to make sure logstore's idx is always = raft's idx - 1. 
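// Illustrative sketch (not part of the patch): the baseline-resync branch that follows advances m_tail_lsn and
// m_next_lsn with compare_exchange_weak loops. The underlying pattern is a lock-free "monotonic max": move the
// atomic forward only, never backward, even with concurrent updaters. A minimal generic form, with a
// hypothetical helper name (atomic_update_max), assuming nothing beyond standard <atomic>:
#include <atomic>
#include <cstdint>

// Advance `target` to `candidate` only if `candidate` is larger; returns true if this call did the update.
inline bool atomic_update_max(std::atomic< int64_t >& target, int64_t candidate,
                              std::memory_order order = std::memory_order_relaxed) {
    auto current = target.load(order);
    while (current < candidate) {
        if (target.compare_exchange_weak(current, candidate, order)) { return true; }
        // On failure (including spurious failures) compare_exchange_weak reloads `current`, so we simply retry.
    }
    return false;
}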
if (upto_lsn > m_tail_lsn) { THIS_LOGSTORE_LOG(WARN, - "Truncating issued on lsn={} which is greater than tail_lsn={}, truncating upto tail_lsn", + "Truncating issued on lsn={} which is greater than tail_lsn={}", upto_lsn, m_tail_lsn.load(std::memory_order_relaxed)); - m_trunc_ld_key = m_records.at(m_tail_lsn).m_trunc_key; - upto_lsn = m_tail_lsn; + // update m_tail_lsn if it is less than upto_lsn + auto current_tail_lsn = m_tail_lsn.load(std::memory_order_relaxed); + while (current_tail_lsn < upto_lsn && + !m_tail_lsn.compare_exchange_weak(current_tail_lsn, upto_lsn, std::memory_order_relaxed)) {} + + // update m_next_lsn if it is less than upto_lsn + 1 + auto current_next_lsn = m_next_lsn.load(std::memory_order_relaxed); + while (current_next_lsn < upto_lsn + 1 && + !m_next_lsn.compare_exchange_weak(current_next_lsn, upto_lsn + 1, std::memory_order_relaxed)) {} + + // insert an empty record to make sure m_records has enough size to truncate + logdev_key empty_ld_key; + m_records.create_and_complete(upto_lsn, logstore_record(empty_ld_key, empty_ld_key)); } else { m_trunc_ld_key = m_records.at(upto_lsn).m_trunc_key; THIS_LOGSTORE_LOG(TRACE, "Truncating logstore upto lsn={} , m_trunc_ld_key index {} offset {}", upto_lsn, @@ -206,7 +221,12 @@ void HomeLogStore::truncate(logstore_seq_num_t upto_lsn, bool in_memory_truncate std::tuple< logstore_seq_num_t, logdev_key, logstore_seq_num_t > HomeLogStore::truncate_info() const { auto const trunc_lsn = m_start_lsn.load(std::memory_order_relaxed) - 1; - return std::make_tuple(trunc_lsn, m_trunc_ld_key, m_tail_lsn.load(std::memory_order_relaxed)); + auto const tail_lsn = m_tail_lsn.load(std::memory_order_relaxed); + + // If the store is empty, return out_of_bound_ld_key as trunc_ld_key, allowing the caller to truncate freely. + // Otherwise, return the actual trunc_ld_key. + return (trunc_lsn == tail_lsn) ? 
std::make_tuple(trunc_lsn, logdev_key::out_of_bound_ld_key(), tail_lsn) + : std::make_tuple(trunc_lsn, m_trunc_ld_key, tail_lsn); } void HomeLogStore::fill_gap(logstore_seq_num_t seq_num) { @@ -276,10 +296,7 @@ void HomeLogStore::flush(logstore_seq_num_t upto_lsn) { return; } - if (upto_lsn == invalid_lsn()) { upto_lsn = m_records.active_upto(); } - - // if we have flushed already, we are done, else issue a flush - if (m_records.status(upto_lsn).is_active) m_logdev->flush_under_guard(); + m_logdev->flush_under_guard(); } bool HomeLogStore::rollback(logstore_seq_num_t to_lsn) { diff --git a/src/lib/logstore/log_store_service.cpp b/src/lib/logstore/log_store_service.cpp index c44291d69..7270a6184 100644 --- a/src/lib/logstore/log_store_service.cpp +++ b/src/lib/logstore/log_store_service.cpp @@ -135,10 +135,10 @@ logdev_id_t LogStoreService::get_next_logdev_id() { return id; } -logdev_id_t LogStoreService::create_new_logdev() { +logdev_id_t LogStoreService::create_new_logdev(flush_mode_t flush_mode) { folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); logdev_id_t logdev_id = get_next_logdev_id(); - auto logdev = create_new_logdev_internal(logdev_id); + auto logdev = create_new_logdev_internal(logdev_id, flush_mode); logdev->start(true /* format */, m_logdev_vdev); COUNTER_INCREMENT(m_metrics, logdevs_count, 1); HS_LOG(INFO, logstore, "Created log_dev={}", logdev_id); @@ -146,6 +146,8 @@ logdev_id_t LogStoreService::create_new_logdev() { } void LogStoreService::destroy_log_dev(logdev_id_t logdev_id) { + HS_LOG(INFO, logstore, "Destroying logdev {}", logdev_id); + folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); const auto it = m_id_logdev_map.find(logdev_id); if (it == m_id_logdev_map.end()) { @@ -155,20 +157,20 @@ void LogStoreService::destroy_log_dev(logdev_id_t logdev_id) { // Stop the logdev and release all the chunks from the journal vdev. auto& logdev = it->second; - if (!logdev->is_stopped()) { - // Stop the logdev if its started. - logdev->stop(); - } + // if (!logdev->is_stopped()) { + // Stop the logdev if its started. + logdev->stop(); + //} - // First release all chunks. - m_logdev_vdev->destroy(logdev_id); + // First release all chunks. + m_logdev_vdev->destroy(logdev_id); - // Destroy the metablks for logdev. - logdev->destroy(); + // Destroy the metablks for logdev. 
+ logdev->destroy(); - m_id_logdev_map.erase(it); - COUNTER_DECREMENT(m_metrics, logdevs_count, 1); - HS_LOG(INFO, logstore, "Removed log_dev={}", logdev_id); + m_id_logdev_map.erase(it); + COUNTER_DECREMENT(m_metrics, logdevs_count, 1); + HS_LOG(INFO, logstore, "Removed log_dev={}", logdev_id); } void LogStoreService::delete_unopened_logdevs() { @@ -179,19 +181,20 @@ void LogStoreService::delete_unopened_logdevs() { m_unopened_logdev.clear(); } -std::shared_ptr< LogDev > LogStoreService::create_new_logdev_internal(logdev_id_t logdev_id) { - auto logdev = std::make_shared< LogDev >(logdev_id); +std::shared_ptr< LogDev > LogStoreService::create_new_logdev_internal(logdev_id_t logdev_id, flush_mode_t flush_mode) { + auto logdev = std::make_shared< LogDev >(logdev_id, flush_mode); const auto it = m_id_logdev_map.find(logdev_id); HS_REL_ASSERT((it == m_id_logdev_map.end()), "logdev id {} already exists", logdev_id); m_id_logdev_map.insert(std::make_pair<>(logdev_id, logdev)); + LOGINFO("Created logdev {}", logdev_id); return logdev; } -void LogStoreService::open_logdev(logdev_id_t logdev_id) { +void LogStoreService::open_logdev(logdev_id_t logdev_id, flush_mode_t flush_mode) { folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); const auto it = m_id_logdev_map.find(logdev_id); if (it == m_id_logdev_map.end()) { - auto logdev = std::make_shared< LogDev >(logdev_id); + auto logdev = std::make_shared< LogDev >(logdev_id, flush_mode); m_id_logdev_map.emplace(logdev_id, logdev); LOGDEBUGMOD(logstore, "log_dev={} does not exist, created!", logdev_id); } @@ -224,13 +227,14 @@ void LogStoreService::logdev_super_blk_found(const sisl::byte_view& buf, void* m folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); std::shared_ptr< LogDev > logdev; auto id = sb->logdev_id; + auto flush_mode = sb->flush_mode; const auto it = m_id_logdev_map.find(id); // We could update the logdev map either with logdev or rollback superblks found callbacks. if (it != m_id_logdev_map.end()) { logdev = it->second; HS_LOG(DEBUG, logstore, "Log dev superblk found log_dev={}", id); } else { - logdev = std::make_shared< LogDev >(id); + logdev = std::make_shared< LogDev >(id, flush_mode); m_id_logdev_map.emplace(id, logdev); // when recover logdev meta blk, we get all the logdevs from the superblk. we put them in m_unopened_logdev // too. 
after logdev meta blks are all recovered, when a client opens a logdev, we remove it from @@ -272,20 +276,28 @@ std::shared_ptr< HomeLogStore > LogStoreService::create_new_log_store(logdev_id_ } folly::Future< shared< HomeLogStore > > LogStoreService::open_log_store(logdev_id_t logdev_id, logstore_id_t store_id, - bool append_mode) { + bool append_mode, log_found_cb_t log_found_cb, + log_replay_done_cb_t log_replay_done_cb) { folly::SharedMutexWritePriority::ReadHolder holder(m_logdev_map_mtx); const auto it = m_id_logdev_map.find(logdev_id); HS_REL_ASSERT((it != m_id_logdev_map.end()), "logdev id {} doesnt exist", logdev_id); COUNTER_INCREMENT(m_metrics, logstores_count, 1); - return it->second->open_log_store(store_id, append_mode); + return it->second->open_log_store(store_id, append_mode, log_found_cb, log_replay_done_cb); } void LogStoreService::remove_log_store(logdev_id_t logdev_id, logstore_id_t store_id) { + HS_LOG(INFO, logstore, "Removing logstore {} from logdev {}", store_id, logdev_id); + folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); COUNTER_INCREMENT(m_metrics, logstores_count, 1); const auto it = m_id_logdev_map.find(logdev_id); - HS_REL_ASSERT((it != m_id_logdev_map.end()), "logdev id {} doesnt exist", logdev_id); + if (it == m_id_logdev_map.end()) { + HS_LOG(WARN, logstore, "logdev id {} doesnt exist", logdev_id); + return; + } it->second->remove_log_store(store_id); + HS_LOG(INFO, logstore, "Successfully removed logstore {} from logdev {}", store_id, logdev_id); + COUNTER_DECREMENT(m_metrics, logstores_count, 1); } diff --git a/src/lib/replication/log_store/home_raft_log_store.cpp b/src/lib/replication/log_store/home_raft_log_store.cpp index 4d80987d1..37ef04bee 100644 --- a/src/lib/replication/log_store/home_raft_log_store.cpp +++ b/src/lib/replication/log_store/home_raft_log_store.cpp @@ -47,6 +47,9 @@ static uint64_t extract_term(const log_buffer& log_bytes) { return (*r_cast< uint64_t const* >(raw_ptr)); } +#if 0 +// Since truncate_lsn can not accross compact_lsn passed down by raft server +// and compact will truncate logs upto compact_lsn, we don't need to re-truncate in this function now. 
void HomeRaftLogStore::truncate(uint32_t num_reserved_cnt, repl_lsn_t compact_lsn) { auto const last_lsn = last_index(); auto const start_lsn = start_index(); @@ -77,6 +80,7 @@ void HomeRaftLogStore::truncate(uint32_t num_reserved_cnt, repl_lsn_t compact_ls m_log_store->truncate(truncate_lsn); } } +#endif HomeRaftLogStore::HomeRaftLogStore(logdev_id_t logdev_id, logstore_id_t logstore_id, log_found_cb_t const& log_found_cb, log_replay_done_cb_t const& log_replay_done_cb) : @@ -86,7 +90,7 @@ HomeRaftLogStore::HomeRaftLogStore(logdev_id_t logdev_id, logstore_id_t logstore m_dummy_log_entry = nuraft::cs_new< nuraft::log_entry >(0, nuraft::buffer::alloc(0), nuraft::log_val_type::app_log); if (logstore_id == UINT32_MAX) { - m_logdev_id = logstore_service().create_new_logdev(); + m_logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); m_log_store = logstore_service().create_new_log_store(m_logdev_id, true); if (!m_log_store) { throw std::runtime_error("Failed to create log store"); } m_logstore_id = m_log_store->get_store_id(); @@ -95,15 +99,13 @@ HomeRaftLogStore::HomeRaftLogStore(logdev_id_t logdev_id, logstore_id_t logstore m_logdev_id = logdev_id; m_logstore_id = logstore_id; LOGDEBUGMOD(replication, "Opening existing home log_dev={} log_store={}", m_logdev_id, logstore_id); - logstore_service().open_logdev(m_logdev_id); + logstore_service().open_logdev(m_logdev_id, flush_mode_t::EXPLICIT); m_log_store_future = logstore_service() - .open_log_store(m_logdev_id, logstore_id, true) - .thenValue([this, log_found_cb, log_replay_done_cb](auto log_store) { + .open_log_store(m_logdev_id, logstore_id, true, log_found_cb, log_replay_done_cb) + .thenValue([this](auto log_store) { m_log_store = std::move(log_store); DEBUG_ASSERT_EQ(m_logstore_id, m_log_store->get_store_id(), "Mismatch in passed and create logstore id"); - m_log_store->register_log_found_cb(log_found_cb); - m_log_store->register_log_replay_done_cb(log_replay_done_cb); REPL_STORE_LOG(DEBUG, "Home Log store created/opened successfully"); }); } @@ -147,8 +149,11 @@ nuraft::ptr< nuraft::log_entry > HomeRaftLogStore::last_entry() const { auto log_bytes = m_log_store->read_sync(max_seq); nle = to_nuraft_log_entry(log_bytes); } catch (const std::exception& e) { - REPL_STORE_LOG(ERROR, "last_entry() out_of_range={}", max_seq); - throw e; + // all the log entries are truncated, so we should return a dummy log entry. + REPL_STORE_LOG(ERROR, "last_entry() out_of_range={}, {}", max_seq, e.what()); + // according to the contract, we should return a dummy log entry if the index is out of range. + // https://github.com/eBay/NuRaft/blob/50e2f949503081262cb21923e633eaa8dacad8fa/include/libnuraft/log_store.hxx#L56 + nle = m_dummy_log_entry; } return nle; @@ -182,6 +187,20 @@ void HomeRaftLogStore::write_at(ulong index, nuraft::ptr< nuraft::log_entry >& e m_log_store->append_async(sisl::io_blob{buf->data_begin(), uint32_cast(buf->size()), false /* is_aligned */}, nullptr /* cookie */, [buf](int64_t, sisl::io_blob&, logdev_key, void*) {}); + + auto position_in_cache = index % m_log_entry_cache.size(); + { + std::unique_lock lk(m_mutex); + m_log_entry_cache[position_in_cache] = std::make_pair(index, entry); + + // remove all cached entries after this index + for (size_t i{0}; i < m_log_entry_cache.size(); ++i) { + if (m_log_entry_cache[i].first > index) { m_log_entry_cache[i] = std::make_pair(0, nullptr); } + } + } + + // flushing the log before returning to ensure new(over-written) log is persisted to disk. 
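// Illustrative sketch (not part of the patch): write_at() above caches recently appended entries in a fixed-size
// array indexed by `index % cache_size`, and because write_at() rewrites the log suffix it also drops every cached
// entry with a higher index; the end_of_append_batch() call right below then persists the overwritten entry. A
// minimal standalone version of that "modular cache with suffix invalidation" idea, with hypothetical names
// (cached_entry, recent_entry_cache) standing in for nuraft::log_entry and m_log_entry_cache:
#include <cstdint>
#include <memory>
#include <utility>
#include <vector>

struct cached_entry {}; // stand-in for nuraft::log_entry

class recent_entry_cache {
public:
    explicit recent_entry_cache(size_t capacity) : m_slots(capacity) {}

    // Cache `entry` at `index` and invalidate anything cached beyond it (the suffix being overwritten).
    void put_and_truncate_after(uint64_t index, std::shared_ptr< cached_entry > entry) {
        m_slots[index % m_slots.size()] = {index, std::move(entry)};
        for (auto& slot : m_slots) {
            if (slot.first > index) { slot = {0, nullptr}; }
        }
    }

    // Return the cached entry only if this slot still holds exactly `index`.
    std::shared_ptr< cached_entry > get(uint64_t index) const {
        auto const& slot = m_slots[index % m_slots.size()];
        return (slot.first == index) ? slot.second : nullptr;
    }

private:
    std::vector< std::pair< uint64_t, std::shared_ptr< cached_entry > > > m_slots;
};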
+ end_of_append_batch(index, 1); } void HomeRaftLogStore::end_of_append_batch(ulong start, ulong cnt) { @@ -205,6 +224,31 @@ nuraft::ptr< std::vector< nuraft::ptr< nuraft::log_entry > > > HomeRaftLogStore: return out_vec; } +nuraft::ptr< std::vector< nuraft::ptr< nuraft::log_entry > > > +HomeRaftLogStore::log_entries_ext(ulong start, ulong end, int64_t batch_size_hint_in_bytes) { + // WARNING: we interpret batch_size_hint_in_bytes as a count for now. + auto batch_size_hint_cnt = batch_size_hint_in_bytes; + auto new_end = end; + // batch_size_hint_in_bytes < 0 indicates that the follower is busy now and does not want to receive any more log entries. + if (batch_size_hint_cnt < 0) + new_end = start; + else if (batch_size_hint_cnt > 0) { + // limit to the hint, also prevent overflow by a huge batch_size_hint_cnt + if (sisl_unlikely(start + (uint64_t)batch_size_hint_cnt < start)) { + new_end = end; + } else { + new_end = start + (uint64_t)batch_size_hint_cnt; + } + // limit to original end + new_end = std::min(new_end, end); + } + DEBUG_ASSERT(new_end <= end, "new end {} should be <= original end {}", new_end, end); + DEBUG_ASSERT(start <= new_end, "start {} should be <= new_end {}", start, new_end); + REPL_STORE_LOG(TRACE, "log_entries_ext, start={} end={}, hint {}, adjusted range {} ~ {}, cnt {}", start, end, + batch_size_hint_cnt, start, new_end, new_end - start); + return log_entries(start, new_end); +} + nuraft::ptr< nuraft::log_entry > HomeRaftLogStore::entry_at(ulong index) { auto positio_in_cache = index % m_log_entry_cache.size(); { @@ -315,14 +359,12 @@ bool HomeRaftLogStore::compact(ulong compact_lsn) { // release this assert if for some use case, we should tolorant this case; // for now, don't expect this case to happen. // RELEASE_ASSERT(false, "compact_lsn={} is beyond the current max_lsn={}", compact_lsn, cur_max_lsn); - REPL_STORE_LOG(DEBUG, "Adding dummy entries during compact from={} upto={}", cur_max_lsn + 1, - to_store_lsn(compact_lsn)); - // We need to fill the remaining entries with dummy data. - for (auto lsn{cur_max_lsn + 1}; lsn <= to_store_lsn(compact_lsn); ++lsn) { - append(m_dummy_log_entry); - } + + // if compact_lsn is beyond the current max_lsn, it indicates a hole from cur_max_lsn to compact_lsn. + // we directly compact and truncate up to compact_lsn assuming there are dummy logs.
+ REPL_STORE_LOG(DEBUG, "Compact with log holes from {} to={}", cur_max_lsn + 1, to_store_lsn(compact_lsn)); } - m_log_store->truncate(to_store_lsn(compact_lsn)); + m_log_store->truncate(to_store_lsn(compact_lsn), false); return true; } @@ -336,6 +378,13 @@ ulong HomeRaftLogStore::last_durable_index() { return to_repl_lsn(m_last_durable_lsn); } +void HomeRaftLogStore::purge_all_logs() { + auto last_lsn = m_log_store->get_contiguous_issued_seq_num(m_last_durable_lsn); + REPL_STORE_LOG(INFO, "Store={} LogDev={}: Purging all logs in the log store, last_lsn={}", m_logstore_id, + m_logdev_id, last_lsn); + m_log_store->truncate(last_lsn, false /* in_memory_truncate_only */); +} + void HomeRaftLogStore::wait_for_log_store_ready() { m_log_store_future.wait(); } void HomeRaftLogStore::set_last_durable_lsn(repl_lsn_t lsn) { m_last_durable_lsn = to_store_lsn(lsn); } diff --git a/src/lib/replication/log_store/home_raft_log_store.h b/src/lib/replication/log_store/home_raft_log_store.h index ccf46ef92..846b1de3c 100644 --- a/src/lib/replication/log_store/home_raft_log_store.h +++ b/src/lib/replication/log_store/home_raft_log_store.h @@ -99,12 +99,34 @@ class HomeRaftLogStore : public nuraft::log_store { /** * Get log entries with index [start, end). * + * Return nullptr to indicate error if any log entry within the requested range + * could not be retrieved (e.g. due to external log truncation). + * * @param start The start log index number (inclusive). * @param end The end log index number (exclusive). * @return The log entries between [start, end). */ virtual nuraft::ptr< std::vector< nuraft::ptr< nuraft::log_entry > > > log_entries(ulong start, ulong end) override; + /** + * Get log entries with index [start, end). + * + * The total size of the returned entries is limited by batch_size_hint. + * + * Return nullptr to indicate error if any log entry within the requested range + * could not be retrieved (e.g. due to external log truncation). + * + * @param start The start log index number (inclusive). + * @param end The end log index number (exclusive). + * @param batch_size_hint_in_bytes Total size (in bytes) of the returned entries, + * see the detailed comment at + * `state_machine::get_next_batch_size_hint_in_bytes()`. + * @return The log entries between [start, end) and limited by the total size + * given by the batch_size_hint_in_bytes. + */ + virtual nuraft::ptr< std::vector< nuraft::ptr< nuraft::log_entry > > > + log_entries_ext(ulong start, ulong end, int64_t batch_size_hint_in_bytes = 0) override; + /** * Get the log entry at the specified log index number. * @@ -182,6 +204,7 @@ class HomeRaftLogStore : public nuraft::log_store { */ ulong last_index() const; +#if 0 /** * Truncates the log store * @@ -190,6 +213,13 @@ class HomeRaftLogStore : public nuraft::log_store { * LSN; */ void truncate(uint32_t num_reserved_cnt, repl_lsn_t compact_lsn); +#endif + + /** + * Purge all logs in the log store + * It is a dangerous operation and not be used currently. 
+ */ + void purge_all_logs(); void wait_for_log_store_ready(); void set_last_durable_lsn(repl_lsn_t lsn); diff --git a/src/lib/replication/log_store/repl_log_store.cpp b/src/lib/replication/log_store/repl_log_store.cpp index 4271d8b88..f9b3d454e 100644 --- a/src/lib/replication/log_store/repl_log_store.cpp +++ b/src/lib/replication/log_store/repl_log_store.cpp @@ -10,16 +10,16 @@ uint64_t ReplLogStore::append(nuraft::ptr< nuraft::log_entry >& entry) { // We don't want to transform anything that is not an app log if (entry->get_val_type() != nuraft::log_val_type::app_log || entry->get_buf_ptr()->size() == 0) { ulong lsn = HomeRaftLogStore::append(entry); - RD_LOGD("append entry term={}, log_val_type={} lsn={} size={}", entry->get_term(), + RD_LOGD(NO_TRACE_ID, "None-APP log: append entry term={}, log_val_type={} lsn={} size={}", entry->get_term(), static_cast< uint32_t >(entry->get_val_type()), lsn, entry->get_buf().size()); return lsn; } repl_req_ptr_t rreq = m_sm.localize_journal_entry_finish(*entry); + RELEASE_ASSERT_NE(nullptr != rreq, "Failed to localize journal entry before appending log"); ulong lsn = HomeRaftLogStore::append(entry); m_sm.link_lsn_to_req(rreq, int64_cast(lsn)); - - RD_LOGD("Raft Channel: Received append log entry rreq=[{}]", rreq->to_compact_string()); + RD_LOGT(rreq->traceID(), "Raft Channel: Received append log entry rreq=[{}]", rreq->to_compact_string()); return lsn; } @@ -31,9 +31,10 @@ void ReplLogStore::write_at(ulong index, nuraft::ptr< nuraft::log_entry >& entry } repl_req_ptr_t rreq = m_sm.localize_journal_entry_finish(*entry); + RELEASE_ASSERT(nullptr != rreq, "Failed to localize journal entry before overwriting log at index {}", index); HomeRaftLogStore::write_at(index, entry); m_sm.link_lsn_to_req(rreq, int64_cast(index)); - RD_LOGD("Raft Channel: Received write_at log entry rreq=[{}]", rreq->to_compact_string()); + RD_LOGT(rreq->traceID(), "Raft Channel: Received write_at log entry rreq=[{}]", rreq->to_compact_string()); } void ReplLogStore::end_of_append_batch(ulong start_lsn, ulong count) { @@ -44,11 +45,8 @@ void ReplLogStore::end_of_append_batch(ulong start_lsn, ulong count) { auto proposer_reqs = sisl::VectorPool< repl_req_ptr_t >::alloc(); for (int64_t lsn = int64_cast(start_lsn); lsn <= end_lsn; ++lsn) { auto rreq = m_sm.lsn_to_req(lsn); - // Skip this call in proposer, since this method will synchronously flush the data, which is not required for - // leader. Proposer will call the flush as part of commit after receiving quorum, upon which time, there is a - // high possibility the log entry is already flushed. Skip it for rreq == nullptr which is the case for raft - // config entries. - if ((rreq == nullptr) /*|| rreq->is_proposer()*/) { + // Skip it for rreq == nullptr which is the case for raft config entries. 
+ if ((rreq == nullptr)) { continue; } else if (rreq->is_proposer()) { proposer_reqs->emplace_back(std::move(rreq)); @@ -57,49 +55,66 @@ void ReplLogStore::end_of_append_batch(ulong start_lsn, ulong count) { } } - RD_LOGT("Raft Channel: end_of_append_batch start_lsn={} count={} num_data_to_be_written={} {}", start_lsn, count, - reqs->size(), proposer_reqs->size()); + RD_LOGT(NO_TRACE_ID, "Raft Channel: end_of_append_batch start_lsn={} count={} num_data_to_be_written={} {}", + start_lsn, count, reqs->size(), proposer_reqs->size()); - // All requests are from proposer for data write, so as mentioned above we can skip the flush for now if (!reqs->empty()) { // Check the map if data corresponding to all of these requsts have been received and written. If not, schedule // a fetch and write. Once all requests are completed and written, these requests are poped out of the map and // the future will be ready. - auto fut = m_rd.notify_after_data_written(reqs); - - // In the meanwhile, we can flush the journal for this lsn batch. It is ok to flush the entries in log before - // actual data is written, because, even if we have the log, it doesn't mean data is committed, until state - // machine reports that. This way the flush and fetch both can run in parallel. auto cur_time = std::chrono::steady_clock::now(); - HomeRaftLogStore::end_of_append_batch(start_lsn, count); - HISTOGRAM_OBSERVE(m_rd.metrics(), raft_end_of_append_batch_latency_us, get_elapsed_time_us(cur_time)); - - cur_time = std::chrono::steady_clock::now(); + auto fut = m_rd.notify_after_data_written(reqs); // Wait for the fetch and write to be completed successfully. + // It is essential to complete the data write before appending to the log. If the logs are flushed + // before the data is written, a restart and subsequent log replay occurs, as the in-memory state is lost, + // it leaves us uncertain about whether the data was actually written, potentially leading to data + // inconsistency. std::move(fut).wait(); HISTOGRAM_OBSERVE(m_rd.metrics(), data_channel_wait_latency_us, get_elapsed_time_us(cur_time)); + } + + // Flushing logs now. + auto cur_time = std::chrono::steady_clock::now(); + HomeRaftLogStore::end_of_append_batch(start_lsn, count); + HISTOGRAM_OBSERVE(m_rd.metrics(), raft_end_of_append_batch_latency_us, get_elapsed_time_us(cur_time)); - // Mark all the reqs also completely written - for (auto const& rreq : *reqs) { - if (rreq) { rreq->add_state(repl_req_state_t::LOG_FLUSHED); } + // Mark all the reqs completely written + for (auto const& rreq : *reqs) { + if (rreq) { rreq->add_state(repl_req_state_t::LOG_FLUSHED); } + } + + // Data corresponding to proposer reqs have already been written before propose reqs to raft, + // so skip waiting data written and mark reqs as flushed here. + for (auto const& rreq : *proposer_reqs) { + if (rreq) { + RD_LOGT(rreq->traceID(), + "Raft Channel: end_of_append_batch, I am proposer for lsn {}, only flushed log for it", + rreq->lsn()); + rreq->add_state(repl_req_state_t::LOG_FLUSHED); } - } else if (!proposer_reqs->empty()) { - RD_LOGT("Raft Channel: end_of_append_batch, I am proposer, only flush log s from {} , count {}", start_lsn, - count); - // Mark all the reqs also completely written - HomeRaftLogStore::end_of_append_batch(start_lsn, count); - for (auto const& rreq : *proposer_reqs) { - if (rreq) { rreq->add_state(repl_req_state_t::LOG_FLUSHED); } + } + + // Convert volatile logs to non-volatile logs in state machine. 
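// Illustrative sketch (not part of the patch): the reworked end_of_append_batch() above enforces a strict
// "data first, then log" order: wait for the data-write futures, flush the log device, and only then mark
// requests LOG_FLUSHED (the loop below additionally clears the volatile flag). A compressed rendering of that
// ordering with hypothetical types (pending_req, flush_batch_after_data) and std::future in place of folly:
#include <future>
#include <vector>

struct pending_req { // stand-in for repl_req_ctx: just the two state transitions used here
    bool log_flushed{false};
    bool is_volatile{true};
};

template < typename FlushFn >
void flush_batch_after_data(std::vector< std::future< void > >& data_written, FlushFn&& flush_log,
                            std::vector< pending_req* > const& reqs) {
    for (auto& f : data_written) { f.wait(); } // 1. the linked data must be written before the log is made durable
    flush_log();                               // 2. only now flush the corresponding log entries
    for (auto* r : reqs) {                     // 3. finally publish the new per-request state
        if (r == nullptr) { continue; }
        r->log_flushed = true;
        r->is_volatile = false;
    }
}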
+ for (int64_t lsn = int64_cast(start_lsn); lsn <= end_lsn; ++lsn) { + auto rreq = m_sm.lsn_to_req(lsn); + if (rreq != nullptr) { + if (rreq->has_state(repl_req_state_t::ERRORED)) { + RD_LOGE(rreq->traceID(), "Raft Channel: rreq=[{}] met some errors before", rreq->to_compact_string()); + continue; + } + rreq->set_is_volatile(false); } } + sisl::VectorPool< repl_req_ptr_t >::free(reqs); sisl::VectorPool< repl_req_ptr_t >::free(proposer_reqs); } std::string ReplLogStore::rdev_name() const { return m_rd.rdev_name(); } +std::string ReplLogStore::identify_str() const { return m_rd.identify_str(); } bool ReplLogStore::compact(ulong compact_upto_lsn) { - RD_LOG(DEBUG, "Raft Channel: compact_to_lsn={}", compact_upto_lsn); + RD_LOGD(NO_TRACE_ID, "Raft Channel: compact_to_lsn={}", compact_upto_lsn); m_rd.on_compact(compact_upto_lsn); return HomeRaftLogStore::compact(compact_upto_lsn); } diff --git a/src/lib/replication/log_store/repl_log_store.h b/src/lib/replication/log_store/repl_log_store.h index a386d397b..bb19df119 100644 --- a/src/lib/replication/log_store/repl_log_store.h +++ b/src/lib/replication/log_store/repl_log_store.h @@ -30,6 +30,7 @@ class ReplLogStore : public HomeRaftLogStore { private: std::string rdev_name() const; + std::string identify_str() const; }; } // namespace homestore diff --git a/src/lib/replication/push_data_rpc.fbs b/src/lib/replication/push_data_rpc.fbs index 1f6d20546..d9a981e7c 100644 --- a/src/lib/replication/push_data_rpc.fbs +++ b/src/lib/replication/push_data_rpc.fbs @@ -2,6 +2,7 @@ native_include "boost/uuid/uuid.hpp"; namespace homestore; table PushDataRequest { + trace_id: uint64; // traceID for the REQ issuer_replica_id : int32; // Replica id of the issuer raft_term : uint64; // Raft term number dsn : uint64; // Data Sequence number diff --git a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp index 71927a3ad..6b8ce122b 100644 --- a/src/lib/replication/repl_dev/common.cpp +++ b/src/lib/replication/repl_dev/common.cpp @@ -6,11 +6,13 @@ #include #include "replication/repl_dev/common.h" #include +#include namespace homestore { -void repl_req_ctx::init(repl_key rkey, journal_type_t op_code, bool is_proposer, sisl::blob const& user_header, - sisl::blob const& key, uint32_t data_size) { +ReplServiceError repl_req_ctx::init(repl_key rkey, journal_type_t op_code, bool is_proposer, + sisl::blob const& user_header, sisl::blob const& key, uint32_t data_size, + cshared< ReplDevListener >& listener) { m_rkey = std::move(rkey); #ifndef NDEBUG if (data_size > 0) { @@ -24,6 +26,36 @@ void repl_req_ctx::init(repl_key rkey, journal_type_t op_code, bool is_proposer, m_header = user_header; m_key = key; m_is_jentry_localize_pending = (!is_proposer && (data_size > 0)); // Pending on the applier and with linked data + + // We need to allocate the block if the req has data linked, since entry doesn't exist or if it exist, two + // threads(data channel and raft channel) are trying to do the same thing. So take state mutex and allocate the blk + std::unique_lock< std::mutex > lg(m_state_mtx); + if (has_linked_data() && !has_state(repl_req_state_t::BLK_ALLOCATED)) { + ReplServiceError alloc_status; +#ifdef _PRERELEASE + if (iomgr_flip::instance()->test_flip("simulate_no_space_left") && !is_proposer) { + LOGERROR("Simulate no space left on follower for testing purposes"); + // TODO: support `simulate_no_space_left` for the leader, do not throw exception in on-error in the test + // framework, it will cause the leader to fail and exit. 
+ alloc_status = ReplServiceError::NO_SPACE_LEFT; + } else { + alloc_status = alloc_local_blks(listener, data_size); + if (alloc_status != ReplServiceError::OK) { + LOGERRORMOD(replication, "[traceID={}] Allocate blk for rreq failed error={}", m_rkey.traceID, + alloc_status); + } + } +#else + alloc_status = alloc_local_blks(listener, data_size); + if (alloc_status != ReplServiceError::OK) { + LOGERRORMOD(replication, "[traceID={}] Allocate blk for rreq failed error={}", m_rkey.traceID, + alloc_status); + } +#endif + return alloc_status; + } + + return ReplServiceError::OK; } repl_req_ctx::~repl_req_ctx() { @@ -31,7 +63,7 @@ repl_req_ctx::~repl_req_ctx() { } void repl_req_ctx::create_journal_entry(bool is_raft_buf, int32_t server_id) { - uint32_t val_size = has_linked_data() ? m_local_blkid.serialized_size() : 0; + uint32_t val_size = has_linked_data() ? blkids_serialized_size() : 0; uint32_t entry_size = sizeof(repl_journal_entry) + m_header.size() + m_key.size() + val_size; if (is_raft_buf) { @@ -43,6 +75,7 @@ void repl_req_ctx::create_journal_entry(bool is_raft_buf, int32_t server_id) { } m_journal_entry->code = m_op_code; + m_journal_entry->traceID = m_rkey.traceID; m_journal_entry->server_id = server_id; m_journal_entry->dsn = m_rkey.dsn; m_journal_entry->user_header_size = m_header.size(); @@ -61,14 +94,25 @@ void repl_req_ctx::create_journal_entry(bool is_raft_buf, int32_t server_id) { } if (has_linked_data()) { - auto const b = m_local_blkid.serialize(); - std::memcpy(raw_ptr, b.cbytes(), b.size()); + for (const auto& blkid : m_local_blkids) { + auto const b = blkid.serialize(); + std::memcpy(raw_ptr, b.cbytes(), b.size()); + raw_ptr += b.size(); + } } } uint32_t repl_req_ctx::journal_entry_size() const { return sizeof(repl_journal_entry) + m_header.size() + m_key.size() + - (has_linked_data() ? m_local_blkid.serialized_size() : 0); + (has_linked_data() ? 
blkids_serialized_size() : 0); +} + +uint32_t repl_req_ctx::blkids_serialized_size() const { + uint32_t blkids_serialized_size = 0; + for (const auto& blkid : m_local_blkids) { + blkids_serialized_size += blkid.serialized_size(); + } + return blkids_serialized_size; } void repl_req_ctx::change_raft_journal_buf(raft_buf_ptr_t new_buf, bool adjust_hdr_key) { @@ -88,15 +132,36 @@ void repl_req_ctx::change_raft_journal_buf(raft_buf_ptr_t new_buf, bool adjust_h ReplServiceError repl_req_ctx::alloc_local_blks(cshared< ReplDevListener >& listener, uint32_t data_size) { DEBUG_ASSERT(has_linked_data(), "Trying to allocate a block for non-inlined block"); - auto const hints_result = listener->get_blk_alloc_hints(m_header, data_size); + auto const hints_result = listener->get_blk_alloc_hints(m_header, data_size, repl_req_ptr_t(this)); if (hints_result.hasError()) { return hints_result.error(); } + if (hints_result.value().committed_blk_id.has_value()) { + // if the committed_blk_id is already present, use it and skip allocation and commitment + LOGINFOMOD(replication, "[traceID={}] For Repl_key=[{}] data already exists, skip", rkey().traceID, + rkey().to_string()); + m_local_blkids.emplace_back(hints_result.value().committed_blk_id.value()); + add_state(repl_req_state_t::BLK_ALLOCATED); + add_state(repl_req_state_t::DATA_RECEIVED); + add_state(repl_req_state_t::DATA_WRITTEN); + add_state(repl_req_state_t::DATA_COMMITTED); + m_data_received_promise.setValue(); + m_data_written_promise.setValue(); + return ReplServiceError::OK; + } + + std::vector< BlkId > blkids; auto status = data_service().alloc_blks(sisl::round_up(uint32_cast(data_size), data_service().get_blk_size()), - hints_result.value(), m_local_blkid); + hints_result.value(), blkids); if (status != BlkAllocStatus::SUCCESS) { + LOGWARNMOD(replication, "[traceID={}] block allocation failure, repl_key=[{}], status=[{}]", rkey().traceID, + rkey(), status); DEBUG_ASSERT_EQ(status, BlkAllocStatus::SUCCESS, "Unable to allocate blks"); return ReplServiceError::NO_SPACE_LEFT; } + + for (auto& blkid : blkids) { + m_local_blkids.emplace_back(blkid); + } add_state(repl_req_state_t::BLK_ALLOCATED); return ReplServiceError::OK; } @@ -109,7 +174,7 @@ void repl_req_ctx::set_lsn(int64_t lsn) { "Changing lsn for request={} on the fly can cause race condition, not expected. lsn {}, m_lsn {}", to_string(), lsn, m_lsn); m_lsn = lsn; - LOGTRACEMOD(replication, "Setting lsn={} for request={}", lsn, to_string()); + LOGTRACEMOD(replication, "[traceID={}] Setting lsn={} for request={}", rkey().traceID, lsn, to_string()); } bool repl_req_ctx::save_pushed_data(intrusive< sisl::GenericRpcData > const& pushed_data, uint8_t const* data, @@ -164,12 +229,21 @@ bool repl_req_ctx::add_state_if_not_already(repl_req_state_t s) { void repl_req_ctx::clear() { m_header = sisl::blob{}; m_key = sisl::blob{}; + m_pkts.clear(); +} + +// FIXME: Use lock to avoid concurrent release of data. 
+void repl_req_ctx::release_data() { + m_data = nullptr; + // explicitly clear m_buf_for_unaligned_data as unaligned pushdata/fetchdata will be saved here + m_buf_for_unaligned_data = sisl::io_blob_safe{}; if (m_pushed_data) { + LOGTRACEMOD(replication, "[traceID={}] m_pushed_data addr={}, m_rkey={}, m_lsn={}", rkey().traceID, + static_cast< void* >(m_pushed_data.get()), m_rkey.to_string(), m_lsn); m_pushed_data->send_response(); m_pushed_data = nullptr; } m_fetched_data = sisl::GenericClientResponse{}; - m_pkts.clear(); } static std::string req_state_name(uint32_t state) { @@ -188,15 +262,25 @@ std::string repl_req_ctx::to_string() const { return fmt::format("repl_key=[{}], lsn={} state=[{}] m_headersize={} m_keysize={} is_proposer={} " "local_blkid={} remote_blkid={}", m_rkey.to_string(), m_lsn, req_state_name(uint32_cast(state())), m_header.size(), m_key.size(), - m_is_proposer, m_local_blkid.to_string(), m_remote_blkid.blkid.to_string()); + m_is_proposer, blkids_to_string(), m_remote_blkid.blkid.to_string()); } std::string repl_req_ctx::to_compact_string() const { - if (m_op_code == journal_type_t::HS_CTRL_DESTROY) { + if (m_op_code == journal_type_t::HS_CTRL_DESTROY || m_op_code == journal_type_t::HS_CTRL_START_REPLACE || m_op_code == journal_type_t::HS_CTRL_COMPLETE_REPLACE) { return fmt::format("term={} lsn={} op={}", m_rkey.term, m_lsn, enum_name(m_op_code)); } + return fmt::format("dsn={} term={} lsn={} op={} local_blkid={} state=[{}]", m_rkey.dsn, m_rkey.term, m_lsn, - enum_name(m_op_code), m_local_blkid.to_string(), req_state_name(uint32_cast(state()))); + enum_name(m_op_code), blkids_to_string(), req_state_name(uint32_cast(state()))); +} + +std::string repl_req_ctx::blkids_to_string() const { + std::string str = fmt::format("["); + for (const auto& blkid : m_local_blkids) { + fmt::format_to(std::back_inserter(str), "{} ", blkid.to_string()); + } + fmt::format_to(std::back_inserter(str), "]"); + return str; } bool repl_req_ctx::is_expired() const { diff --git a/src/lib/replication/repl_dev/common.h b/src/lib/replication/repl_dev/common.h index cb8a57931..c3433083f 100644 --- a/src/lib/replication/repl_dev/common.h +++ b/src/lib/replication/repl_dev/common.h @@ -15,7 +15,7 @@ #pragma once #include - +#include #include #include #include @@ -35,8 +35,9 @@ struct repl_journal_entry { uint16_t minor_version{JOURNAL_ENTRY_MINOR}; journal_type_t code; - int32_t server_id; // Server id from where journal entry is originated - uint64_t dsn; // Data seq number + trace_id_t traceID; // traceID provided by application, mostly for consolidate logs. 
+ int32_t server_id; // Server id from where journal entry is originated + uint64_t dsn; // Data seq number uint32_t user_header_size; uint32_t key_size; uint32_t value_size; @@ -57,6 +58,7 @@ struct repl_journal_entry { struct repl_dev_superblk { static constexpr uint64_t REPL_DEV_SB_MAGIC = 0xABCDF00D; static constexpr uint32_t REPL_DEV_SB_VERSION = 1; + static constexpr size_t max_name_len = 64; uint64_t magic{REPL_DEV_SB_MAGIC}; uint32_t version{REPL_DEV_SB_VERSION}; @@ -67,9 +69,14 @@ struct repl_dev_superblk { repl_lsn_t checkpoint_lsn; // LSN upto which this replica have checkpointed the Data repl_lsn_t compact_lsn; // maximum LSN that can be compacted to uint64_t group_ordinal; // Ordinal number which will be used to indicate the rdevXYZ for debugging + char rdev_name[max_name_len]; // Short name for the group for easy debugging uint64_t get_magic() const { return magic; } uint32_t get_version() const { return version; } + void set_rdev_name(std::string const& name) { + std::strncpy(rdev_name, name.c_str(), max_name_len - 1); + rdev_name[max_name_len - 1] = '\0'; + } }; #pragma pack() @@ -88,4 +95,11 @@ auto make_async_success() { return folly::makeSemiFuture< ReplResult< folly::Unit > >(folly::Unit{}); } +inline uint64_t generateRandomTraceId() { + std::random_device rd; + std::mt19937_64 gen(rd()); + std::uniform_int_distribution< uint64_t > dis; + return dis(gen); +} + } // namespace homestore diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 45a018d92..2303fda68 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -15,9 +15,10 @@ #include "common/homestore_assert.hpp" #include "common/homestore_config.hpp" -// #include "common/homestore_flip.hpp" +#include "common/homestore_utils.hpp" #include "replication/service/raft_repl_service.h" #include "replication/repl_dev/raft_repl_dev.h" +#include "device/chunk.h" #include "device/device.h" #include "push_data_rpc_generated.h" #include "fetch_data_rpc_generated.h" @@ -39,14 +40,16 @@ RaftReplDev::RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk m_data_journal = std::make_shared< ReplLogStore >( *this, *m_state_machine, m_rd_sb->logdev_id, m_rd_sb->logstore_id, [this](logstore_seq_num_t lsn, log_buffer buf, void* key) { on_log_found(lsn, buf, key); }, - [this](std::shared_ptr< HomeLogStore > hs, logstore_seq_num_t lsn) { m_log_store_replay_done = true; }); + [this](std::shared_ptr< HomeLogStore > hs, logstore_seq_num_t lsn) { + m_log_store_replay_done = true; + set_log_store_last_durable_lsn(hs->tail_lsn()); + }); m_next_dsn = m_rd_sb->last_applied_dsn + 1; m_commit_upto_lsn = m_rd_sb->durable_commit_lsn; m_last_flushed_commit_lsn = m_commit_upto_lsn; m_compact_lsn = m_rd_sb->compact_lsn; - m_rdev_name = fmt::format("rdev{}", m_rd_sb->group_ordinal); - + m_rdev_name = m_rd_sb->rdev_name; // Its ok not to do compare exchange, because loading is always single threaded as of now if (m_rd_sb->group_ordinal >= s_next_group_ordinal.load()) { s_next_group_ordinal.store(m_rd_sb->group_ordinal + 1); @@ -66,54 +69,502 @@ RaftReplDev::RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk m_rd_sb->logstore_id = m_data_journal->logstore_id(); m_rd_sb->last_applied_dsn = 0; m_rd_sb->destroy_pending = 0x0; + m_rd_sb->last_snapshot_lsn = 0; m_rd_sb->group_ordinal = s_next_group_ordinal.fetch_add(1); m_rdev_name = fmt::format("rdev{}", m_rd_sb->group_ordinal); + 
m_rd_sb->set_rdev_name(m_rdev_name); if (m_rd_sb->is_timeline_consistent) { m_free_blks_journal = logstore_service().create_new_log_store(m_rd_sb->logdev_id, false /* append_mode */); m_rd_sb->free_blks_journal_id = m_free_blks_journal->get_store_id(); } m_rd_sb.write(); + bind_data_service(); } - RD_LOG(INFO, - "Started {} RaftReplDev group_id={}, replica_id={}, raft_server_id={} commited_lsn={}, " - "compact_lsn={}, checkpoint_lsn:{}, next_dsn={} " - "log_dev={} log_store={}", - (load_existing ? "Existing" : "New"), group_id_str(), my_replica_id_str(), m_raft_server_id, - m_commit_upto_lsn.load(), m_compact_lsn.load(), m_rd_sb->checkpoint_lsn, m_next_dsn.load(), - m_rd_sb->logdev_id, m_rd_sb->logstore_id); + m_identify_str = m_rdev_name + ":" + group_id_str(); + + RD_LOGI(NO_TRACE_ID, + "Started {} RaftReplDev group_id={}, replica_id={}, raft_server_id={} commited_lsn={}, " + "compact_lsn={}, checkpoint_lsn:{}, next_dsn={} " + "log_dev={} log_store={}", + (load_existing ? "Existing" : "New"), group_id_str(), my_replica_id_str(), m_raft_server_id, + m_commit_upto_lsn.load(), m_compact_lsn.load(), m_rd_sb->checkpoint_lsn, m_next_dsn.load(), + m_rd_sb->logdev_id, m_rd_sb->logstore_id); +} +bool RaftReplDev::bind_data_service() { + RD_LOGI(NO_TRACE_ID, "Starting data channel, group_id={}, replica_id={}", group_id_str(), my_replica_id_str()); + bool success = false; #ifdef _PRERELEASE - m_msg_mgr.bind_data_service_request(PUSH_DATA, m_group_id, [this](intrusive< sisl::GenericRpcData >& rpc_data) { - if (iomgr_flip::instance()->delay_flip("slow_down_data_channel", [this, rpc_data]() mutable { - RD_LOGI("Resuming after slow down data channel flip"); + success = + m_msg_mgr.bind_data_service_request(PUSH_DATA, m_group_id, [this](intrusive< sisl::GenericRpcData >& rpc_data) { + if (iomgr_flip::instance()->delay_flip("slow_down_data_channel", [this, rpc_data]() mutable { + RD_LOGI(NO_TRACE_ID, "Resuming after slow down data channel flip"); + on_push_data_received(rpc_data); + })) { + RD_LOGI(NO_TRACE_ID, "Slow down data channel flip is enabled, scheduling to call later"); + } else { on_push_data_received(rpc_data); - })) { - RD_LOGI("Slow down data channel flip is enabled, scheduling to call later"); - } else { - on_push_data_received(rpc_data); - } - }); + } + }); #else - m_msg_mgr.bind_data_service_request(PUSH_DATA, m_group_id, bind_this(RaftReplDev::on_push_data_received, 1)); + success = + m_msg_mgr.bind_data_service_request(PUSH_DATA, m_group_id, bind_this(RaftReplDev::on_push_data_received, 1)); #endif - - m_msg_mgr.bind_data_service_request(FETCH_DATA, m_group_id, bind_this(RaftReplDev::on_fetch_data_received, 1)); + if (!success) { + RD_LOGE(NO_TRACE_ID, "Failed to bind data service request for PUSH_DATA"); + return false; + } + success = + m_msg_mgr.bind_data_service_request(FETCH_DATA, m_group_id, bind_this(RaftReplDev::on_fetch_data_received, 1)); + if (!success) { + RD_LOGE(NO_TRACE_ID, "Failed to bind data service request for FETCH_DATA"); + return false; + } + return true; } bool RaftReplDev::join_group() { + bind_data_service(); auto raft_result = m_msg_mgr.join_group(m_group_id, "homestore_replication", std::dynamic_pointer_cast< nuraft_mesg::mesg_state_mgr >(shared_from_this())); if (!raft_result) { - HS_DBG_ASSERT(false, "Unable to join the group_id={} with error={}", boost::uuids::to_string(m_group_id), - raft_result.error()); + HS_DBG_ASSERT(false, "Unable to join the group_id={} with error={}", group_id_str(), raft_result.error()); return false; } return true; } +// All the 
steps in the implementation should be idempotent and retryable. +AsyncReplResult<> RaftReplDev::start_replace_member(const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum, + uint64_t trace_id) { + if (is_stopping()) { + RD_LOGI(trace_id, "repl dev is being shutdown!"); + return make_async_error<>(ReplServiceError::STOPPING); + } + incr_pending_request_num(); + + RD_LOGI(trace_id, "Start replace member, member_out={} member_in={}", boost::uuids::to_string(member_out.id), + boost::uuids::to_string(member_in.id)); + + if (commit_quorum >= 1) { + // Two members are down and leader cant form the quorum. Reduce the quorum size. + reset_quorum_size(commit_quorum, trace_id); + } + + // Step1, validate request + auto out_srv_cfg = raft_server()->get_config()->get_server(nuraft_mesg::to_server_id(member_out.id)); + if (!out_srv_cfg) { + auto in_srv_cfg = raft_server()->get_config()->get_server(nuraft_mesg::to_server_id(member_in.id)); + if (in_srv_cfg) { + RD_LOGI( + trace_id, + "Step1. Replace member, the intent has already been fulfilled, ignore it, member_out={} member_in={}", + boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_success<>(); + } + RD_LOGE(trace_id, "Step1. Replace member invalid parameter, out member is not found"); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); + } + if (m_my_repl_id != get_leader_id()) { + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(ReplServiceError::NOT_LEADER); + } + // Check if leader itself is requested to move out. + if (m_my_repl_id == member_out.id) { + // immediate=false successor=-1, nuraft will choose an alive peer with highest priority as successor, and wait + // until the successor finishes the catch-up of the latest log, and then resign. Return NOT_LEADER and let + // client retry. + raft_server()->yield_leadership(false /* immediate */, -1 /* successor */); + RD_LOGI(trace_id, "Step1. Replace member, leader is the member_out so yield leadership"); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(ReplServiceError::NOT_LEADER); + } + // quorum safety check. TODO currently only consider lsn, need to check last response time. + auto active_peers = get_active_peers(); + // active_peers doesn't include leader itself. + auto quorum = active_peers.size() + 1; + for (const auto& p : active_peers) { + quorum = p == member_out.id ? quorum - 1 : quorum; + quorum = p == member_in.id ? quorum - 1 : quorum; + } + RD_LOGD(trace_id, + "Step1. Replace member, quorum safety check, active_peers={}, active_peers_exclude_out/in_member={}, " + "commit_quorum={}", + active_peers.size(), quorum, commit_quorum); + // commit_quorum=0 means actual commit quorum is the majority. In this case, active normal member count should be + // greater than 1. To be more specific, if we have S1(leader), S2, S3(out), S4(in), we don't allow + // replace_member(S3, S4) if S2 is down or laggy. Needs to recover S2 first or retry with commit_quorum=1. + if (quorum <= 1 && commit_quorum == 0) { + RD_LOGE(trace_id, "Step1. 
Replace member, quorum safety check failed, active_peers={}, active_peers_exclude_out/in_member={}, commit_quorum={}", + active_peers.size(), quorum, commit_quorum); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(ReplServiceError::QUORUM_NOT_MET); + } + + // Step 2: Handle out member. +#ifdef _PRERELEASE + if (iomgr_flip::instance()->test_flip("replace_member_set_learner_failure")) { + RD_LOGE(trace_id, "Simulating set member to learner failure"); + return make_async_error(ReplServiceError::FAILED); + } +#endif + RD_LOGI(trace_id, "Step2. Replace member, flip out member to learner"); + auto learner_ret = do_flip_learner(member_out, true, true, trace_id); + if (learner_ret != ReplServiceError::OK) { + RD_LOGE(trace_id, "Step2. Replace member, failed to flip out member to learner {}", learner_ret); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error(std::move(learner_ret)); + } + RD_LOGI(trace_id, "Step2. Replace member, flip out member to learner and set priority to 0"); + + // Step 3. Append log entry to mark the old member is out and new member is added. + RD_LOGI(trace_id, "Step3. Replace member, propose to raft for HS_CTRL_START_REPLACE req, group_id={}", + group_id_str()); + auto rreq = repl_req_ptr_t(new repl_req_ctx{}); + replace_member_ctx members; + members.replica_out = member_out; + members.replica_in = member_in; + + sisl::blob header(r_cast< uint8_t* >(&members), sizeof(replace_member_ctx)); + rreq->init(repl_key{.server_id = server_id(), + .term = raft_server()->get_term(), + .dsn = m_next_dsn.fetch_add(1), + .traceID = trace_id}, + journal_type_t::HS_CTRL_START_REPLACE, true, header, sisl::blob{}, 0, m_listener); + + auto err = m_state_machine->propose_to_raft(std::move(rreq)); + if (err != ReplServiceError::OK) { + RD_LOGE(trace_id, "Step3. Replace member, propose to raft for HS_CTRL_START_REPLACE req failed {}", err); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(std::move(err)); + } + + // Step 4. Add the new member, new member will inherit the priority of the out member. +#ifdef _PRERELEASE + if (iomgr_flip::instance()->test_flip("replace_member_add_member_failure")) { + RD_LOGE(trace_id, "Simulating add member failure"); + return make_async_error(ReplServiceError::FAILED); + } +#endif + RD_LOGI(trace_id, "Step4. Replace member, propose to raft to add new member, group_id={}", group_id_str()); + auto ret = do_add_member(member_in, trace_id); + if (ret != ReplServiceError::OK) { + RD_LOGE(trace_id, "Step4. Replace member, add member failed {}", ret); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(std::move(ret)); + } + RD_LOGI(trace_id, "Step4. 
Replace member, proposed to raft to add member, member={}", boost::uuids::to_string(member_in.id)); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_success<>(); +} + +AsyncReplResult<> RaftReplDev::complete_replace_member(const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum, + uint64_t trace_id) { + if (is_stopping()) { + RD_LOGI(trace_id, "repl dev is being shutdown!"); + return make_async_error<>(ReplServiceError::STOPPING); + } + incr_pending_request_num(); + + RD_LOGI(trace_id, "Complete replace member, member={}", boost::uuids::to_string(member_out.id), + boost::uuids::to_string(member_out.id)); + + if (commit_quorum >= 1) { + // Two members are down and leader cant form the quorum. Reduce the quorum size. + reset_quorum_size(commit_quorum, trace_id); + } + + // Step 5: Remove member + RD_LOGI(trace_id, "Step5. Replace member, remove old member, member={}", boost::uuids::to_string(member_out.id)); +#ifdef _PRERELEASE + if (iomgr_flip::instance()->test_flip("replace_member_remove_member_failure")) { + RD_LOGE(trace_id, "Simulating remove member failure"); + return make_async_error(ReplServiceError::FAILED); + } +#endif + auto ret = do_remove_member(member_out, trace_id); + if (ret != ReplServiceError::OK) { + RD_LOGE(trace_id, "Step5. Replace member, failed to remove member, member={}, err={}", + boost::uuids::to_string(member_out.id), ret); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(std::move(ret)); + } + RD_LOGI(trace_id, "Step5. Replace member, proposed to raft to remove member, member={}", + boost::uuids::to_string(member_out.id)); + auto timeout = HS_DYNAMIC_CONFIG(consensus.wait_for_config_change_ms); + // TODO Move wait logic to nuraft_mesg + if (!wait_and_check( + [&]() { + auto srv_conf = raft_server()->get_srv_config(nuraft_mesg::to_server_id(member_out.id)); + if (srv_conf) { + RD_LOGD(trace_id, "out member still exists in raft group, member={}", + boost::uuids::to_string(member_out.id)); + return false; + } + return true; + }, + timeout)) { + RD_LOGD(trace_id, + "Step5. Replace member, wait for old member removed timed out, cancel the request, timeout: {}", + timeout); + // If the member_out is down, leader will force remove it after + // leave_timeout=leave_limit_(default=5)*heart_beat_interval_, it's better for client to retry it. + return make_async_error<>(ReplServiceError::CANCELLED); + } + RD_LOGD(trace_id, "Step5. Replace member, old member is removed, member={}", + boost::uuids::to_string(member_out.id)); + + // Step 2. Append log entry to complete replace member + RD_LOGI(trace_id, "Step6. Replace member, propose to raft for HS_CTRL_COMPLETE_REPLACE req, group_id={}", + group_id_str()); + auto rreq = repl_req_ptr_t(new repl_req_ctx{}); + replace_member_ctx members; + members.replica_out = member_out; + members.replica_in = member_in; + + sisl::blob header(r_cast< uint8_t* >(&members), sizeof(replace_member_ctx)); + rreq->init(repl_key{.server_id = server_id(), + .term = raft_server()->get_term(), + .dsn = m_next_dsn.fetch_add(1), + .traceID = trace_id}, + journal_type_t::HS_CTRL_COMPLETE_REPLACE, true, header, sisl::blob{}, 0, m_listener); + + auto err = m_state_machine->propose_to_raft(std::move(rreq)); + if (err != ReplServiceError::OK) { + RD_LOGE(trace_id, "Step6. 
Replace member, propose to raft for HS_CTRL_COMPLETE_REPLACE req failed , err={}", + err); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(std::move(err)); + } + + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + RD_LOGI(trace_id, "Complete replace member done, group_id={}, member_out={} member_in={}", group_id_str(), + boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); + return make_async_success<>(); +} + +ReplServiceError RaftReplDev::do_add_member(const replica_member_info& member, uint64_t trace_id) { + if (m_my_repl_id != get_leader_id()) { + RD_LOGI(trace_id, "Member to add failed, not leader"); + return ReplServiceError::BAD_REQUEST; + } + auto ret = retry_when_config_changing( + [&] { + auto rem_ret = m_msg_mgr.add_member(m_group_id, member.id) + .via(&folly::InlineExecutor::instance()) + .thenValue([this, member, trace_id](auto&& e) -> nuraft::cmd_result_code { + return e.hasError() ? e.error() : nuraft::cmd_result_code::OK; + }); + return rem_ret.value(); + }, + trace_id); + if (ret == nuraft::cmd_result_code::SERVER_ALREADY_EXISTS) { + RD_LOGW(trace_id, "Ignoring error returned from nuraft add_member, member={}, err={}", + boost::uuids::to_string(member.id), ret); + } else if (ret != nuraft::cmd_result_code::OK) { + // Its ok to retry this request as the request + // of replace member is idempotent. + RD_LOGE(trace_id, "Add member failed, member={}, err={}", boost::uuids::to_string(member.id), ret); + return ReplServiceError::RETRY_REQUEST; + } + RD_LOGI(trace_id, "Proposed to raft to add member, member={}", boost::uuids::to_string(member.id)); + return ReplServiceError::OK; +} + +ReplServiceError RaftReplDev::do_remove_member(const replica_member_info& member, uint64_t trace_id) { + // The member should not be the leader. + if (m_my_repl_id == member.id && m_my_repl_id == get_leader_id()) { + // If leader is the member requested to move out, then give up leadership and return error. + // Client will retry replace_member request to the new leader. + raft_server()->yield_leadership(false /* immediate */, -1 /* successor */); + RD_LOGI(trace_id, "Member to remove is the leader so yield leadership"); + return ReplServiceError::NOT_LEADER; + } + auto ret = retry_when_config_changing( + [&] { + auto rem_ret = m_msg_mgr.rem_member(m_group_id, member.id) + .via(&folly::InlineExecutor::instance()) + .thenValue([this, member, trace_id](auto&& e) -> nuraft::cmd_result_code { + return e.hasError() ? e.error() : nuraft::cmd_result_code::OK; + }); + return rem_ret.value(); + }, + trace_id); + if (ret == nuraft::cmd_result_code::SERVER_NOT_FOUND) { + RD_LOGW(trace_id, "Remove member not found in group error, ignoring, member={}", + boost::uuids::to_string(member.id)); + } else if (ret != nuraft::cmd_result_code::OK) { + // Its ok to retry this request as the request + // of replace member is idempotent. 
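+ // Any other result code from nuraft is reported as RETRY_REQUEST below; since replace member is idempotent, the caller can safely re-issue the request.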
+ RD_LOGE(trace_id, "Replace member failed to remove member, member={}, err={}", + boost::uuids::to_string(member.id), ret); + return ReplServiceError::RETRY_REQUEST; + } + RD_LOGI(trace_id, "Proposed to raft to remove member, member={}", boost::uuids::to_string(member.id)); + return ReplServiceError::OK; +} + +AsyncReplResult<> RaftReplDev::flip_learner_flag(const replica_member_info& member, bool target, uint32_t commit_quorum, + bool wait_and_verify, uint64_t trace_id) { + RD_LOGI(trace_id, "Flip learner flag to {}, member={}", target, boost::uuids::to_string(member.id)); + if (is_stopping()) { + RD_LOGI(trace_id, "repl dev is being shutdown!"); + return make_async_error<>(ReplServiceError::STOPPING); + } + incr_pending_request_num(); + + if (commit_quorum >= 1) { + // Two members are down and leader cant form the quorum. Reduce the quorum size. + reset_quorum_size(commit_quorum, trace_id); + } + auto ret = do_flip_learner(member, target, wait_and_verify, trace_id); + if (ret != ReplServiceError::OK) { + RD_LOGE(trace_id, "Flip learner flag failed {}, member={}", ret, boost::uuids::to_string(member.id)); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(std::move(ret)); + } + RD_LOGI(trace_id, "Learner flag has been set to {}, member={}", target, boost::uuids::to_string(member.id)); + return make_async_success<>(); +} + +ReplServiceError RaftReplDev::do_flip_learner(const replica_member_info& member, bool target, bool wait_and_verify, + uint64_t trace_id) { + // 1. Prerequisite check + if (m_my_repl_id != get_leader_id()) { + RD_LOGI(trace_id, "flip learner flag failed, not leader"); + return ReplServiceError::NOT_LEADER; + } + if (!target && member.priority == 0) { + // If the intent is to take the learner back to normal member, then priority should not be 0(never has chance to + // become leader). Client need to trace the peers' priority, and give a meaningful value, currently default + // priorities of the quorum: leader=100, follower=66. + RD_LOGI(trace_id, "clear learner flag failed, priority is 0, member={}", boost::uuids::to_string(member.id)); + return ReplServiceError::BAD_REQUEST; + } + + // 2. Flip learner + RD_LOGI(trace_id, "flip learner flag to {}, member={}", target, boost::uuids::to_string(member.id)); + auto srv_cfg = raft_server()->get_config()->get_server(nuraft_mesg::to_server_id(member.id)); + if (!srv_cfg) { + RD_LOGE(trace_id, "invalid parameter, member is not found, member={}", boost::uuids::to_string(member.id)); + return ReplServiceError::SERVER_NOT_FOUND; + } + if (srv_cfg->is_learner() != target) { + auto ret = retry_when_config_changing( + [&] { + auto learner_ret = raft_server()->flip_learner_flag(nuraft_mesg::to_server_id(member.id), target); + return learner_ret->get_result_code(); + }, + trace_id); + if (ret != nuraft::cmd_result_code::OK) { + RD_LOGE(trace_id, "Propose to raft to flip learner failed, err: {}", ret); + return ReplServiceError::RETRY_REQUEST; + } + } else { + RD_LOGD(trace_id, "learner flag has already been set to {}, skip, member={}", target, + boost::uuids::to_string(member.id)); + } + + // 3. Set priority + // Based on the current nuraft implementation, learner could be elected as leader, so we set priority to 0 to avoid + // it. And in turn, we need to revert prioiry change if the member is going to become a normal member. + // FIXME after nuraft fixes the bug, we can remove this logic. + auto priority = target ? 
0 : member.priority; + RD_LOGI(trace_id, "Set the priority of the member to {}, member={}", priority, boost::uuids::to_string(member.id)); + if (srv_cfg->get_priority() != priority) { + auto priority_ret = set_priority(member.id, priority); + if (priority_ret != ReplServiceError::OK) { return ReplServiceError::NOT_LEADER; } + } else { + RD_LOGD(trace_id, "Priority has already been set to {}, skip, member={}", priority, + boost::uuids::to_string(member.id)); + } + + // 4. Verification + if (wait_and_verify) { + auto timeout = HS_DYNAMIC_CONFIG(consensus.wait_for_config_change_ms); + if (!wait_and_check( + [&]() { + auto srv_conf = raft_server()->get_srv_config(nuraft_mesg::to_server_id(member.id)); + return srv_conf->is_learner() && srv_conf->get_priority() == 0; + }, + timeout)) { + RD_LOGD(trace_id, "Wait for learner and priority config change timed out, cancel the request, timeout: {}", + timeout); + return ReplServiceError::CANCELLED; + } + } + + return ReplServiceError::OK; +} + +nuraft::cmd_result_code RaftReplDev::retry_when_config_changing(const std::function< nuraft::cmd_result_code() >& func, + uint64_t trace_id) { + auto ret = nuraft::cmd_result_code::OK; + int32_t retries = HS_DYNAMIC_CONFIG(consensus.config_changing_error_retries); + for (auto i = 0; i < retries; i++) { + ret = func(); + if (ret == nuraft::cmd_result_code::CONFIG_CHANGING) { + RD_LOGW(trace_id, "Propose to raft failed due to config_changing, attempt: {}", i); + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + continue; + } + break; + } + return ret; +} + +bool RaftReplDev::wait_and_check(const std::function< bool() >& check_func, uint32_t timeout_ms, uint32_t interval_ms) { + auto times = timeout_ms / interval_ms; + if (times == 0) { times = 1; } + for (auto i = 0; i < static_cast< int32_t >(times); i++) { + if (check_func()) { return true; } + std::this_thread::sleep_for(std::chrono::milliseconds(interval_ms)); + } + return false; +} + +ReplServiceError RaftReplDev::set_priority(const replica_id_t& member, int32_t priority, uint64_t trace_id) { + auto priority_ret = raft_server()->set_priority(nuraft_mesg::to_server_id(member), priority); + // Set_priority should be handled by leader, but if the intent is to set the leader's priority to 0, it returns + // BROADCAST. In this case return NOT_LEADER to let the client retry on the new leader. + // If there is an uncommitted_config, nuraft set_priority will honor this uncommitted config and generate new + // config based on it and won't have config_changing error. + if (priority_ret != nuraft::raft_server::PrioritySetResult::SET) { + RD_LOGE(trace_id, "Propose to raft to set priority failed, result: {}", + priority_ret == nuraft::raft_server::PrioritySetResult::BROADCAST ?
"BROADCAST" : "IGNORED"); + return ReplServiceError::NOT_LEADER; + } + return ReplServiceError::OK; +} + +void RaftReplDev::reset_quorum_size(uint32_t commit_quorum, uint64_t trace_id) { + RD_LOGI(trace_id, "Reset raft quorum size={}", commit_quorum); + nuraft::raft_params params = raft_server()->get_current_params(); + params.with_custom_commit_quorum_size(commit_quorum); + params.with_custom_election_quorum_size(commit_quorum); + raft_server()->update_params(params); +} + folly::SemiFuture< ReplServiceError > RaftReplDev::destroy_group() { // Set the intent to destroy the group m_stage.update([](auto* stage) { *stage = repl_dev_stage_t::DESTROYING; }); @@ -131,24 +582,34 @@ folly::SemiFuture< ReplServiceError > RaftReplDev::destroy_group() { // here, we set the dsn to a new one , which is definitely unique in the follower, so that the new rreq will not // have a conflict with the old rreq. - rreq->init(repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)}, - journal_type_t::HS_CTRL_DESTROY, true, sisl::blob{}, sisl::blob{}, 0); + auto err = init_req_ctx(rreq, + repl_key{.server_id = server_id(), + .term = raft_server()->get_term(), + .dsn = m_next_dsn.fetch_add(1), + .traceID = std::numeric_limits< uint64_t >::max()}, + journal_type_t::HS_CTRL_DESTROY, true, sisl::blob{}, sisl::blob{}, 0, m_listener); - auto err = m_state_machine->propose_to_raft(std::move(rreq)); + if (err != ReplServiceError::OK) { + // Failed to initialize the repl_req_ctx for replace member. + LOGERROR("Failed to initialize repl_req_ctx for destorying group, error={}", err); + return folly::makeSemiFuture< ReplServiceError >(std::move(err)); + } + + err = m_state_machine->propose_to_raft(std::move(rreq)); if (err != ReplServiceError::OK) { m_stage.update([](auto* stage) { *stage = repl_dev_stage_t::ACTIVE; }); return folly::makeSemiFuture< ReplServiceError >(std::move(err)); LOGERROR("RaftReplDev::destroy_group failed {}", err); } - LOGINFO("Raft repl dev destroy_group={}", boost::uuids::to_string(m_group_id)); + LOGINFO("Raft repl dev destroy_group={}", group_id_str()); return m_destroy_promise.getSemiFuture(); } void RaftReplDev::use_config(json_superblk raft_config_sb) { m_raft_config_sb = std::move(raft_config_sb); } void RaftReplDev::on_create_snapshot(nuraft::snapshot& s, nuraft::async_result< bool >::handler_type& when_done) { - RD_LOG(DEBUG, "create_snapshot last_idx={}/term={}", s.get_last_log_idx(), s.get_last_log_term()); + RD_LOGD(NO_TRACE_ID, "create_snapshot last_idx={}/term={}", s.get_last_log_idx(), s.get_last_log_term()); auto snp_ctx = std::make_shared< nuraft_snapshot_context >(s); auto result = m_listener->create_snapshot(snp_ctx).get(); auto null_except = std::shared_ptr< std::exception >(); @@ -159,13 +620,13 @@ void RaftReplDev::on_create_snapshot(nuraft::snapshot& s, nuraft::async_result< } void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& data, - repl_req_ptr_t rreq) { + repl_req_ptr_t rreq, bool part_of_batch, trace_id_t tid) { if (!rreq) { auto rreq = repl_req_ptr_t(new repl_req_ctx{}); } { auto const guard = m_stage.access(); if (auto const stage = *guard.get(); stage != repl_dev_stage_t::ACTIVE) { - RD_LOGW("Raft channel: Not ready to accept writes, stage={}", enum_name(stage)); + RD_LOGW(tid, "Raft channel: Not ready to accept writes, stage={}", enum_name(stage)); handle_error(rreq, (stage == repl_dev_stage_t::INIT) ? 
ReplServiceError::SERVER_IS_JOINING : ReplServiceError::SERVER_IS_LEAVING); @@ -173,9 +634,22 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& } } - rreq->init(repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)}, - data.size ? journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true /* is_proposer */, - header, key, data.size); + auto status = init_req_ctx(rreq, + repl_key{.server_id = server_id(), + .term = raft_server()->get_term(), + .dsn = m_next_dsn.fetch_add(1), + .traceID = tid}, + data.size ? journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, + true /* is_proposer */, header, key, data.size, m_listener); + + if (status != ReplServiceError::OK) { + RD_LOGI(tid, "Initializing rreq failed error={}, failing this req", status); + handle_error(rreq, status); + return; + } + + RD_LOGD(tid, "repl_key [{}], header size [{}] bytes, user_key size [{}] bytes, data size [{}] bytes", rreq->rkey(), + header.size(), key.size(), data.size); // Add the request to the repl_dev_rreq map, it will be accessed throughout the life cycle of this request auto const [it, happened] = m_repl_key_req_map.emplace(rreq->rkey(), rreq); @@ -183,16 +657,21 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& // If it is header only entry, directly propose to the raft if (rreq->has_linked_data()) { - push_data_to_all_followers(rreq, data); - - // Step 1: Alloc Blkid - auto const status = rreq->alloc_local_blks(m_listener, data.size); - if (status != ReplServiceError::OK) { - RD_LOGD("Allocating blks failed error={}, failing this req", status); - handle_error(rreq, status); + if (rreq->is_proposer() && rreq->has_state(repl_req_state_t::DATA_COMMITTED)) { + RD_LOGE(tid, "data blks has already been allocated and committed, failing this req"); + handle_error(rreq, ReplServiceError::DATA_DUPLICATED); return; } +#ifdef _PRERELEASE + if (iomgr_flip::instance()->test_flip("disable_leader_push_data")) { + RD_LOGD(tid, "Simulating push data failure, so that all the follower will have to fetch data"); + } else + push_data_to_all_followers(rreq, data); +#else + push_data_to_all_followers(rreq, data); +#endif + COUNTER_INCREMENT(m_metrics, total_write_cnt, 1); COUNTER_INCREMENT(m_metrics, outstanding_data_write_cnt, 1); @@ -221,7 +700,7 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& } }); } else { - RD_LOGD("Skipping data channel send since value size is 0"); + RD_LOGT(tid, "Skipping data channel send since value size is 0"); rreq->add_state(repl_req_state_t::DATA_WRITTEN); auto raft_status = m_state_machine->propose_to_raft(rreq); if (raft_status != ReplServiceError::OK) { handle_error(rreq, raft_status); } @@ -233,7 +712,7 @@ void RaftReplDev::push_data_to_all_followers(repl_req_ptr_t rreq, sisl::sg_list // Prepare the rpc request packet with all repl_reqs details builder.FinishSizePrefixed(CreatePushDataRequest( - builder, server_id(), rreq->term(), rreq->dsn(), + builder, rreq->traceID(), server_id(), rreq->term(), rreq->dsn(), builder.CreateVector(rreq->header().cbytes(), rreq->header().size()), builder.CreateVector(rreq->key().cbytes(), rreq->key().size()), data.size, get_time_since_epoch_ms())); @@ -244,30 +723,37 @@ void RaftReplDev::push_data_to_all_followers(repl_req_ptr_t rreq, sisl::sg_list flatbuffers::FlatBufferToString(builder.GetBufferPointer() + sizeof(flatbuffers::uoffset_t), PushDataRequestTypeTable()));*/ - RD_LOGD("Data Channel: 
Pushing data to all followers: rreq=[{}]", rreq->to_string()); - - group_msg_service() - ->data_service_request_unidirectional(nuraft_mesg::role_regex::ALL, PUSH_DATA, rreq->m_pkts) - .via(&folly::InlineExecutor::instance()) - .thenValue([this, rreq = std::move(rreq)](auto e) { - if (e.hasError()) { - RD_LOGE("Data Channel: Error in pushing data to all followers: rreq=[{}] error={}", rreq->to_string(), - e.error()); - handle_error(rreq, RaftReplService::to_repl_error(e.error())); - return; + auto peers = get_active_peers(); + auto calls = std::vector< nuraft_mesg::NullAsyncResult >(); + for (auto peer : peers) { + RD_LOGD(rreq->traceID(), "Data Channel: Pushing data to follower {}, rreq=[{}]", peer, rreq->to_string()); + calls.push_back(group_msg_service() + ->data_service_request_unidirectional(peer, PUSH_DATA, rreq->m_pkts) + .via(&folly::InlineExecutor::instance())); + } + folly::collectAllUnsafe(calls).thenValue([this, rreq](auto&& v_res) { + for (auto const& res : v_res) { + if (sisl_likely(res.value())) { + auto r = res.value(); + if (r.hasError()) { + // Just logging PushData error, no action is needed as follower can try by fetchData. + RD_LOGI(rreq->traceID(), "Data Channel: Error in pushing data to all followers: rreq=[{}] error={}", + rreq->to_string(), r.error()); + } } - // Release the buffer which holds the packets - RD_LOGD("Data Channel: Data push completed for rreq=[{}]", rreq->to_string()); - rreq->release_fb_builder(); - rreq->m_pkts.clear(); - }); + } + RD_LOGD(rreq->traceID(), "Data Channel: Data push completed for rreq=[{}]", rreq->to_compact_string()); + // Release the buffer which holds the packets + rreq->release_fb_builder(); + rreq->m_pkts.clear(); + }); } void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_data) { auto const push_data_rcv_time = Clock::now(); auto const& incoming_buf = rpc_data->request_blob(); if (!incoming_buf.cbytes()) { - RD_LOGW("Data Channel: PushData received with empty buffer, ignoring this call"); + RD_LOGW(NO_TRACE_ID, "Data Channel: PushData received with empty buffer, ignoring this call"); rpc_data->send_response(); return; } @@ -275,20 +761,30 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d auto const fb_size = flatbuffers::ReadScalar< flatbuffers::uoffset_t >(incoming_buf.cbytes()) + sizeof(flatbuffers::uoffset_t); auto push_req = GetSizePrefixedPushDataRequest(incoming_buf.cbytes()); - HS_DBG_ASSERT_EQ(fb_size + push_req->data_size(), incoming_buf.size(), "Size mismatch of data size vs buffer size"); - + if (fb_size + push_req->data_size() != incoming_buf.size()) { + RD_LOGW(NO_TRACE_ID, + "Data Channel: PushData received with size mismatch, header size {}, data size {}, received size {}", + fb_size, push_req->data_size(), incoming_buf.size()); + rpc_data->send_response(); + return; + } sisl::blob header = sisl::blob{push_req->user_header()->Data(), push_req->user_header()->size()}; sisl::blob key = sisl::blob{push_req->user_key()->Data(), push_req->user_key()->size()}; - repl_key rkey{.server_id = push_req->issuer_replica_id(), .term = push_req->raft_term(), .dsn = push_req->dsn()}; + repl_key rkey{.server_id = push_req->issuer_replica_id(), + .term = push_req->raft_term(), + .dsn = push_req->dsn(), + .traceID = push_req->trace_id()}; auto const req_orig_time_ms = push_req->time_ms(); - RD_LOGD("Data Channel: PushData received: time diff={} ms.", get_elapsed_time_ms(req_orig_time_ms)); + RD_LOGD(rkey.traceID, "Data Channel: PushData received: time diff={} ms.", 
get_elapsed_time_ms(req_orig_time_ms)); #ifdef _PRERELEASE if (iomgr_flip::instance()->test_flip("drop_push_data_request")) { - LOGINFO("Data Channel: Flip is enabled, skip on_push_data_received to simulate fetch remote data, " + RD_LOGI(rkey.traceID, + "Data Channel: Flip is enabled, skip on_push_data_received to simulate fetch remote data, " "server_id={}, term={}, dsn={}", push_req->issuer_replica_id(), push_req->raft_term(), push_req->dsn()); + rpc_data->send_response(); return; } #endif @@ -296,15 +792,18 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d auto rreq = applier_create_req(rkey, journal_type_t::HS_DATA_LINKED, header, key, push_req->data_size(), true /* is_data_channel */); if (rreq == nullptr) { - RD_LOG(ERROR, - "Data Channel: Creating rreq on applier has failed, will ignore the push and let Raft channel send " - "trigger a fetch explicitly if needed. rkey={}", - rkey.to_string()); + RD_LOGE(rkey.traceID, + "Data Channel: Creating rreq on applier has failed, will ignore the push and let Raft channel send " + "trigger a fetch explicitly if needed. rkey={}", + rkey.to_string()); + rpc_data->send_response(); return; } if (!rreq->save_pushed_data(rpc_data, incoming_buf.cbytes() + fb_size, push_req->data_size())) { - RD_LOGD("Data Channel: Data already received for rreq=[{}], ignoring this data", rreq->to_string()); + RD_LOGT(rkey.traceID, "Data Channel: Data already received for rreq=[{}], ignoring this data", + rreq->to_string()); + rpc_data->send_response(); return; } @@ -323,12 +822,15 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d RD_DBG_ASSERT(false, "Error in writing data, error_code={}", err.value()); handle_error(rreq, ReplServiceError::DRIVE_WRITE_ERROR); } else { + rreq->release_data(); rreq->add_state(repl_req_state_t::DATA_WRITTEN); rreq->m_data_written_promise.setValue(); + // if rreq create time is earlier than push_data receive time, that means the rreq was created by raft + // channel log. Otherwise set to zero as rreq is created by data channel. const auto data_log_diff_us = push_data_rcv_time.time_since_epoch().count() > rreq->created_time().time_since_epoch().count() ? 
get_elapsed_time_us(rreq->created_time(), push_data_rcv_time) - : get_elapsed_time_us(push_data_rcv_time, rreq->created_time()); + : 0; auto const data_write_latency = get_elapsed_time_us(push_data_rcv_time); auto const total_data_write_latency = get_elapsed_time_us(rreq->created_time()); @@ -338,67 +840,64 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d HISTOGRAM_OBSERVE(m_metrics, rreq_push_data_latency_us, data_write_latency); HISTOGRAM_OBSERVE(m_metrics, rreq_total_data_write_latency_us, total_data_write_latency); - RD_LOGD("Data Channel: Data write completed for rreq=[{}], time_diff_data_log_us={}, " + RD_LOGD(rreq->traceID(), + "Data Channel: Data write completed for rreq=[{}], time_diff_data_log_us={}, " "data_write_latency_us={}, total_data_write_latency_us(rreq creation to write complete)={}, " "local_blkid.num_pieces={}", - rreq->to_string(), data_log_diff_us, data_write_latency, total_data_write_latency, + rreq->to_compact_string(), data_log_diff_us, data_write_latency, total_data_write_latency, write_num_pieces); } }); } repl_req_ptr_t RaftReplDev::applier_create_req(repl_key const& rkey, journal_type_t code, sisl::blob const& user_header, - sisl::blob const& key, uint32_t data_size, - [[maybe_unused]] bool is_data_channel) { + sisl::blob const& key, uint32_t data_size, bool is_data_channel, + int64_t lsn) { + if (is_data_channel) RD_DBG_ASSERT(-1 == lsn, "lsn from data channel should always be -1 , got lsn {}", lsn); + auto const [it, happened] = m_repl_key_req_map.try_emplace(rkey, repl_req_ptr_t(new repl_req_ctx())); RD_DBG_ASSERT((it != m_repl_key_req_map.end()), "Unexpected error in map_repl_key_to_req"); auto rreq = it->second; if (!happened) { - // We already have the entry in the map, check if we are already allocated the blk by previous caller, in - // that case we need to return the req. + // We already have the entry in the map, reset its start time to prevent it from being incorrectly gc during use. + rreq->set_created_time(); + // Check if we are already allocated the blk by previous caller, in that case we need to return the req. if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { // Do validation if we have the correct mapping // RD_REL_ASSERT(blob_equals(user_header, rreq->header), "User header mismatch for repl_key={}", // rkey.to_string()); // RD_REL_ASSERT(blob_equals(user_key, rreq->key), "User key mismatch for repl_key={}", rkey.to_string()); - RD_LOGD("Repl_key=[{}] already received ", rkey.to_string()); + RD_LOGT(rkey.traceID, "Repl_key=[{}] already received ", rkey.to_string()); return rreq; } } - // We need to allocate the block, since entry doesn't exist or if it exist, two threads are trying to do the same - // thing. 
So take state mutex and allocate the blk - std::unique_lock< std::mutex > lg(rreq->m_state_mtx); - rreq->init(rkey, code, false /* is_proposer */, user_header, key, data_size); - - // There is no data portion, so there is not need to allocate - if (!rreq->has_linked_data()) { return rreq; } - if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { return rreq; } - - auto alloc_status = rreq->alloc_local_blks(m_listener, data_size); -#ifdef _PRERELEASE - if (is_data_channel) { - if (iomgr_flip::instance()->test_flip("fake_reject_append_data_channel")) { - LOGINFO("Data Channel: Reject append_entries flip is triggered for rkey={}", rkey.to_string()); - alloc_status = ReplServiceError::NO_SPACE_LEFT; - } - } else { - if (iomgr_flip::instance()->test_flip("fake_reject_append_raft_channel")) { - LOGINFO("Raft Channel: Reject append_entries flip is triggered for rkey={}", rkey.to_string()); - alloc_status = ReplServiceError::NO_SPACE_LEFT; + // rreq->init will allocate the block if it has linked data. + auto status = init_req_ctx(rreq, rkey, code, false /* is_proposer */, user_header, key, data_size, m_listener); + + if (status != ReplServiceError::OK) { + RD_LOGD(rkey.traceID, "For Repl_key=[{}] alloc hints returned error={}, failing this req", rkey.to_string(), + status); + if (status == ReplServiceError::NO_SPACE_LEFT && !is_data_channel && !rreq->is_proposer()) { + const auto& chunk_id = rreq->local_blkid().chunk_num(); + RD_LOGD(rkey.traceID, + "For Repl_key=[{}] alloc hints returned error={} when trying to allocate blk on chunk={}", + rkey.to_string(), status, chunk_id); + m_listener->on_no_space_left(lsn, chunk_id); + } else { + RD_LOGD( + rkey.traceID, + "For Repl_key=[{}] alloc hints returned error={}, failing this req, data_channl: {}, is_proposer: {} ", + rkey.to_string(), status, is_data_channel, rreq->is_proposer()); } - } -#endif - - if (alloc_status != ReplServiceError::OK) { - RD_LOGE("For Repl_key=[{}] alloc hints returned error={}, failing this req", rkey.to_string(), alloc_status); // Do not call handle_error here, because handle_error is for rreq which needs to be terminated. This one can be // retried. 
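+ // Returning nullptr lets the raft channel (raft_event) reject the current append-entries batch so the leader resends it and the allocation is retried later, while the data channel simply drops the push and relies on a subsequent fetch.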
return nullptr; } - RD_LOGD("in follower_create_req: rreq={}, addr={}", rreq->to_string(), reinterpret_cast< uintptr_t >(rreq.get())); + RD_LOGD(rkey.traceID, "in follower_create_req: rreq={}, addr=0x{:x}", rreq->to_string(), + reinterpret_cast< uintptr_t >(rreq.get())); return rreq; } @@ -412,7 +911,7 @@ folly::Future< folly::Unit > RaftReplDev::notify_after_data_written(std::vector< if (!rreq->has_linked_data()) { continue; } auto const status = uint32_cast(rreq->state()); if (status & uint32_cast(repl_req_state_t::DATA_WRITTEN)) { - RD_LOGD("Raft Channel: Data write completed and blkid mapped: rreq=[{}]", rreq->to_string()); + RD_LOGD(rreq->traceID(), "Data written and blkid mapped: rkey=[{}]", rreq->to_compact_string()); continue; } @@ -455,15 +954,16 @@ folly::Future< folly::Unit > RaftReplDev::notify_after_data_written(std::vector< HS_DBG_ASSERT(rreq->has_state(repl_req_state_t::DATA_WRITTEN), "Data written promise raised without updating DATA_WRITTEN state for rkey={}", rreq->rkey().to_string()); - RD_LOGD("Raft Channel: Data write completed and blkid mapped: rreq=[{}]", rreq->to_string()); + RD_LOGD(rreq->traceID(), "Data write completed and blkid mapped: rreq=[{}]", rreq->to_compact_string()); } #endif - RD_LOGT("Data Channel: {} pending reqs's data are written", rreqs->size()); + RD_LOGT(NO_TRACE_ID, "{} pending reqs' data are written", rreqs->size()); return folly::makeFuture< folly::Unit >(folly::Unit{}); }); } -bool RaftReplDev::wait_for_data_receive(std::vector< repl_req_ptr_t > const& rreqs, uint64_t timeout_ms) { +bool RaftReplDev::wait_for_data_receive(std::vector< repl_req_ptr_t > const& rreqs, uint64_t timeout_ms, + std::vector< repl_req_ptr_t >* timeout_rreqs) { std::vector< folly::Future< folly::Unit > > futs; std::vector< repl_req_ptr_t > only_wait_reqs; only_wait_reqs.reserve(rreqs.size()); @@ -484,20 +984,27 @@ bool RaftReplDev::wait_for_data_receive(std::vector< repl_req_ptr_t > const& rre // sometime before do an explicit fetch. This is so that, it is possible raft channel has come ahead of data // channel and waiting for sometime avoid expensive fetch. On steady state, after a little bit of wait data // would be reached automatically. - RD_LOG(DEBUG, - "We haven't received data for {} out {} in reqs batch, will fetch and wait for {} ms, in_resync_mode()={} ", - only_wait_reqs.size(), rreqs.size(), timeout_ms, is_resync_mode()); + RD_LOGD(NO_TRACE_ID, + "We haven't received data for {} out of {} in reqs batch, will fetch and wait for {} ms, in_resync_mode()={} ", + only_wait_reqs.size(), rreqs.size(), timeout_ms, is_resync_mode()); // We are yet to support reactive fetch from remote.
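+ // Callers that need to know which requests were left unsatisfied pass a vector through timeout_rreqs, as the append-entries path in raft_event() does, e.g.: std::vector< repl_req_ptr_t > timed_out; if (!wait_for_data_receive(*reqs, HS_DYNAMIC_CONFIG(consensus.data_receive_timeout_ms), &timed_out)) { for (auto const& r : timed_out) { handle_error(r, ReplServiceError::TIMEOUT); } }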
if (is_resync_mode()) { - check_and_fetch_remote_data(std::move(only_wait_reqs)); + check_and_fetch_remote_data(only_wait_reqs); } else { - m_repl_svc.add_to_fetch_queue(shared_from_this(), std::move(only_wait_reqs)); + m_repl_svc.add_to_fetch_queue(shared_from_this(), only_wait_reqs); } // block waiting here until all the futs are ready (data channel filled in and promises are made); - auto all_futs = folly::collectAllUnsafe(futs).wait(std::chrono::milliseconds(timeout_ms)); - return (all_futs.isReady()); + auto all_futs_ready = folly::collectAllUnsafe(futs).wait(std::chrono::milliseconds(timeout_ms)).isReady(); + if (!all_futs_ready && timeout_rreqs != nullptr) { + timeout_rreqs->clear(); + for (size_t i{0}; i < futs.size(); ++i) { + if (!futs[i].isReady()) { timeout_rreqs->emplace_back(only_wait_reqs[i]); } + } + all_futs_ready = timeout_rreqs->empty(); + } + return all_futs_ready; } void RaftReplDev::check_and_fetch_remote_data(std::vector< repl_req_ptr_t > rreqs) { @@ -509,12 +1016,12 @@ void RaftReplDev::check_and_fetch_remote_data(std::vector< repl_req_ptr_t > rreq for (auto const& rreq : rreqs) { auto const cur_state = uint32_cast(rreq->state()); if (cur_state == uint32_cast(repl_req_state_t::ERRORED)) { - // We already received the data before, just ignore this data - RD_LOGD("Raft Channel: rreq=[{}] already errored out, ignoring the fetch", rreq->to_string()); + RD_LOGD(rreq->traceID(), "rreq=[{}] already errored out, ignoring the fetch", rreq->to_compact_string()); continue; } else if (cur_state == uint32_cast(repl_req_state_t::DATA_RECEIVED)) { // We already received the data before, just ignore this data - RD_LOGD("Raft Channel: Data already received for rreq=[{}], ignoring the fetch", rreq->to_string()); + RD_LOGD(rreq->traceID(), "Data already received for rreq=[{}], ignoring the fetch", + rreq->to_compact_string()); continue; } @@ -542,7 +1049,8 @@ void RaftReplDev::fetch_data_from_remote(std::vector< repl_req_ptr_t > rreqs) { entries.reserve(rreqs.size()); shared< flatbuffers::FlatBufferBuilder > builder = std::make_shared< flatbuffers::FlatBufferBuilder >(); - RD_LOGD("Data Channel : FetchData from remote: rreq.size={}, my server_id={}", rreqs.size(), server_id()); + RD_LOGD(NO_TRACE_ID, "Data Channel : FetchData from remote: rreq.size={}, my server_id={}", rreqs.size(), + server_id()); auto const& originator = rreqs.front()->remote_blkid().server_id; for (auto const& rreq : rreqs) { @@ -558,7 +1066,8 @@ void RaftReplDev::fetch_data_from_remote(std::vector< repl_req_ptr_t > rreqs) { RD_DBG_ASSERT_EQ(rreq->remote_blkid().server_id, originator, "Unexpected originator for rreq={}", rreq->to_string()); - RD_LOGT("Fetching data from originator={}, remote: rreq=[{}], remote_blkid={}, my server_id={}", originator, + RD_LOGT(rreq->traceID(), + "Fetching data from originator={}, remote: rreq=[{}], remote_blkid={}, my server_id={}", originator, rreq->to_string(), rreq->remote_blkid().blkid.to_string(), server_id()); } @@ -583,15 +1092,15 @@ void RaftReplDev::fetch_data_from_remote(std::vector< repl_req_ptr_t > rreqs) { auto const fetch_latency_us = get_elapsed_time_us(fetch_start_time); HISTOGRAM_OBSERVE(m_metrics, rreq_data_fetch_latency_us, fetch_latency_us); - RD_LOGD("Data Channel: FetchData from remote completed, time taken={} us", fetch_latency_us); + RD_LOGT(NO_TRACE_ID, "Data Channel: FetchData from remote completed, time taken={} us", fetch_latency_us); if (!response) { // if we are here, it means the original who sent the log entries are down. 
// we need to handle error and when the other member becomes leader, it will resend the log entries; - RD_LOG(ERROR, - "Not able to fetching data from originator={}, error={}, probably originator is down. Will " - "retry when new leader start appending log entries", - rreqs.front()->remote_blkid().server_id, response.error()); + RD_LOGE(NO_TRACE_ID, + "Not able to fetching data from originator={}, error={}, probably originator is down. Will " + "retry when new leader start appending log entries", + rreqs.front()->remote_blkid().server_id, response.error()); for (auto const& rreq : rreqs) { // TODO: Set the data_received promise with error, so that waiting threads can be unblocked and // reject the request. Without that, it will timeout and then reject it. @@ -619,13 +1128,14 @@ void RaftReplDev::fetch_data_from_remote(std::vector< repl_req_ptr_t > rreqs) { void RaftReplDev::on_fetch_data_received(intrusive< sisl::GenericRpcData >& rpc_data) { auto const& incoming_buf = rpc_data->request_blob(); if (!incoming_buf.cbytes()) { - RD_LOGW("Data Channel: PushData received with empty buffer, ignoring this call"); + RD_LOGW(NO_TRACE_ID, "Data Channel: PushData received with empty buffer, ignoring this call"); rpc_data->send_response(); return; } auto fetch_req = GetSizePrefixedFetchData(incoming_buf.cbytes()); - RD_LOGD("Data Channel: FetchData received: fetch_req.size={}", fetch_req->request()->entries()->size()); + RD_LOGT(NO_TRACE_ID, "Data Channel: FetchData received: fetch_req.size={}", + fetch_req->request()->entries()->size()); std::vector< sisl::sg_list > sgs_vec; std::vector< folly::Future< bool > > futs; @@ -636,33 +1146,29 @@ void RaftReplDev::on_fetch_data_received(intrusive< sisl::GenericRpcData >& rpc_ auto const& lsn = req->lsn(); auto const& originator = req->blkid_originator(); auto const& remote_blkid = req->remote_blkid(); - - // release this assert if in the future we want to fetch from non-originator; - RD_REL_ASSERT_EQ(originator, server_id(), - "Not expect to receive fetch data from remote when I am not the originator of this request"); - - // fetch data based on the remote_blkid - if (originator == server_id()) { - // We are the originator of the blkid, read data locally; - MultiBlkId local_blkid; - - // convert remote_blkid serialized data to local blkid - local_blkid.deserialize(sisl::blob{remote_blkid->Data(), remote_blkid->size()}, true /* copy */); - - RD_LOGD("Data Channel: FetchData received: dsn={} lsn={} my_blkid={}", req->dsn(), lsn, - local_blkid.to_string()); - - // prepare the sgs data buffer to read into; - auto const total_size = local_blkid.blk_count() * get_blk_size(); - sisl::sg_list sgs; - sgs.size = total_size; - sgs.iovs.emplace_back( - iovec{.iov_base = iomanager.iobuf_alloc(get_blk_size(), total_size), .iov_len = total_size}); - - // accumulate the sgs for later use (send back to the requester)); - sgs_vec.push_back(sgs); - futs.emplace_back(async_read(local_blkid, sgs, total_size)); + MultiBlkId local_blkid; + local_blkid.deserialize(sisl::blob{remote_blkid->Data(), remote_blkid->size()}, true /* copy */); + // prepare the sgs data buffer to read into; + auto const total_size = local_blkid.blk_count() * get_blk_size(); + sisl::sg_list sgs; + sgs.size = total_size; + sgs.iovs.emplace_back( + iovec{.iov_base = iomanager.iobuf_alloc(get_blk_size(), total_size), .iov_len = total_size}); + + // accumulate the sgs for later use (send back to the requester)); + sgs_vec.push_back(sgs); + + if (originator != server_id()) { + RD_LOGD(NO_TRACE_ID, 
"non-originator FetchData received: dsn={} lsn={} originator={}, my_server_id={}", + req->dsn(), lsn, originator, server_id()); + } else { + RD_LOGT(NO_TRACE_ID, "Data Channel: FetchData received: dsn={} lsn={}", req->dsn(), lsn); } + + auto const& header = req->user_header(); + sisl::blob user_header = sisl::blob{header->Data(), header->size()}; + RD_LOGT(NO_TRACE_ID, "Data Channel: FetchData handled, my_blkid={}", local_blkid.to_string()); + futs.emplace_back(std::move(m_listener->on_fetch_data(lsn, user_header, local_blkid, sgs))); } folly::collectAllUnsafe(futs).thenValue( @@ -677,7 +1183,7 @@ void RaftReplDev::on_fetch_data_received(intrusive< sisl::GenericRpcData >& rpc_ } } - RD_LOGD("Data Channel: FetchData data read completed for {} buffers", sgs_vec.size()); + RD_LOGT(NO_TRACE_ID, "Data Channel: FetchData data read completed for {} buffers", sgs_vec.size()); // now prepare the io_blob_list to response back to requester; nuraft_mesg::io_blob_list_t pkts = sisl::io_blob_list_t{}; @@ -709,7 +1215,7 @@ void RaftReplDev::handle_fetch_data_response(sisl::GenericClientResponse respons RD_DBG_ASSERT_GT(total_size, 0, "Empty response from remote"); RD_DBG_ASSERT(raw_data, "Empty response from remote"); - RD_LOGD("Data Channel: FetchData completed for {} requests", rreqs.size()); + RD_LOGD(NO_TRACE_ID, "Data Channel: FetchData completed for {} requests", rreqs.size()); for (auto const& rreq : rreqs) { auto const data_size = rreq->remote_blkid().blkid.blk_count() * get_blk_size(); @@ -720,8 +1226,9 @@ void RaftReplDev::handle_fetch_data_response(sisl::GenericClientResponse respons RD_DBG_ASSERT_EQ(data_size, local_size, "Data size mismatch for rreq={} remote size: {}, local size: {}", rreq->to_string(), data_size, local_size); - RD_LOGD("Data Channel: Data already received for rreq=[{}], skip and move on to next rreq.", - rreq->to_string()); + RD_LOGT(rreq->traceID(), + "Data Channel: Data already received for rreq=[{}], skip and move on to next rreq.", + rreq->to_compact_string()); } else { auto const data_write_start_time = Clock::now(); COUNTER_INCREMENT(m_metrics, total_write_cnt, 1); @@ -741,16 +1248,19 @@ void RaftReplDev::handle_fetch_data_response(sisl::GenericClientResponse respons RD_REL_ASSERT(!err, "Error in writing data"); // TODO: Find a way to return error to the Listener + rreq->release_data(); rreq->add_state(repl_req_state_t::DATA_WRITTEN); rreq->m_data_written_promise.setValue(); - RD_LOGD("Data Channel: Data Write completed rreq=[{}], data_write_latency_us={}, " + RD_LOGD(rreq->traceID(), + "Data Channel: Data Write completed rreq=[{}], data_write_latency_us={}, " "total_write_latency_us={}, write_num_pieces={}", - rreq->to_string(), data_write_latency, total_data_write_latency, write_num_pieces); + rreq->to_compact_string(), data_write_latency, total_data_write_latency, write_num_pieces); }); - RD_LOGD("Data Channel: Data fetched from remote: rreq=[{}], data_size: {}, total_size: {}, local_blkid: {}", - rreq->to_string(), data_size, total_size, rreq->local_blkid().to_string()); + RD_LOGT(rreq->traceID(), + "Data Channel: Data fetched from remote: rreq=[{}], data_size: {}, total_size: {}, local_blkid: {}", + rreq->to_compact_string(), data_size, total_size, rreq->local_blkid().to_string()); } raw_data += data_size; total_size -= data_size; @@ -770,24 +1280,46 @@ void RaftReplDev::commit_blk(repl_req_ptr_t rreq) { } } +void RaftReplDev::handle_rollback(repl_req_ptr_t rreq) { + // 1. 
call the listener to rollback + RD_LOGD(rreq->traceID(), "Rolling back rreq: {}", rreq->to_compact_string()); + m_listener->on_rollback(rreq->lsn(), rreq->header(), rreq->key(), rreq); + // 2. remove the request from maps + m_state_machine->unlink_lsn_to_req(rreq->lsn(), rreq); + m_repl_key_req_map.erase(rreq->rkey()); + + // 3. free the allocated blocks + if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { + auto blkid = rreq->local_blkid(); + data_service().async_free_blk(blkid).thenValue([this, blkid, rreq](auto&& err) { + HS_LOG_ASSERT(!err, "freeing blkid={} upon error failed, potential to cause blk leak", blkid.to_string()); + RD_LOGD(rreq->traceID(), "Releasing blkid={} freed successfully", blkid.to_string()); + }); + } +} + void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { - commit_blk(rreq); + if (!rreq->has_state(repl_req_state_t::DATA_COMMITTED)) { commit_blk(rreq); } // Remove the request from repl_key map. m_repl_key_req_map.erase(rreq->rkey()); // Remove the request from lsn map. - m_state_machine->unlink_lsn_to_req(rreq->lsn()); + m_state_machine->unlink_lsn_to_req(rreq->lsn(), rreq); auto cur_dsn = m_next_dsn.load(std::memory_order_relaxed); while (cur_dsn <= rreq->dsn()) { m_next_dsn.compare_exchange_strong(cur_dsn, rreq->dsn() + 1); } - RD_LOGD("Raft channel: Commit rreq=[{}]", rreq->to_string()); + RD_LOGD(rreq->traceID(), "Raft channel: Commit rreq=[{}]", rreq->to_compact_string()); if (rreq->op_code() == journal_type_t::HS_CTRL_DESTROY) { leave(); + } else if (rreq->op_code() == journal_type_t::HS_CTRL_START_REPLACE) { + start_replace_member(rreq); + } else if (rreq->op_code() == journal_type_t::HS_CTRL_COMPLETE_REPLACE) { + complete_replace_member(rreq); } else { - m_listener->on_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq->local_blkid(), rreq); + m_listener->on_commit(rreq->lsn(), rreq->header(), rreq->key(), {rreq->local_blkid()}, rreq); } if (!recovery) { @@ -796,23 +1328,55 @@ void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { "Out of order commit of lsns, it is not expected in RaftReplDev. cur_lsns={}, prev_lsns={}", rreq->lsn(), prev_lsn); } - if (!rreq->is_proposer()) { rreq->clear(); } + + if (!rreq->is_proposer()) rreq->clear(); +} + +void RaftReplDev::handle_config_commit(const repl_lsn_t lsn, raft_cluster_config_ptr_t& new_conf) { + // when reaching here, the new config has already been applied to the cluster. + // since we didn't create repl req for config change, we just need to update m_commit_upto_lsn here. 
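+ // m_commit_upto_lsn only moves forward here: if a data log with a higher lsn has already been committed, the compare_exchange below fails and an error is logged instead of rolling the counter back.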
+ RD_LOGD(NO_TRACE_ID, "config commit on lsn {}", lsn); + // keep this variable in case it is needed later + (void) new_conf; + auto prev_lsn = m_commit_upto_lsn.load(std::memory_order_relaxed); + if (prev_lsn >= lsn || !m_commit_upto_lsn.compare_exchange_strong(prev_lsn, lsn)) { + RD_LOGE(NO_TRACE_ID, "Raft Channel: unexpected log {} commited before config {} committed", prev_lsn, lsn); + } +} + +void RaftReplDev::handle_config_rollback(const repl_lsn_t lsn, raft_cluster_config_ptr_t& conf) { + RD_LOGD(NO_TRACE_ID, "roll back config on lsn {}", lsn); + // keep this variable in case it is needed later + (void)conf; + m_listener->on_config_rollback(lsn); } void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) { if (err == ReplServiceError::OK) { return; } + RD_LOGE(rreq->traceID(), "Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); if (!rreq->add_state_if_not_already(repl_req_state_t::ERRORED)) { - RD_LOGE("Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); + RD_LOGE(rreq->traceID(), "Raft Channel: Error has been added for rreq=[{}] error={}", rreq->to_string(), err); return; } // Remove from the map and thus its no longer accessible from applier_create_req m_repl_key_req_map.erase(rreq->rkey()); - if (rreq->op_code() == journal_type_t::HS_DATA_INLINED) { + // Ensure non-volatile lsn not exist because handle_error should not be called after append entries. + auto exist_rreq = m_state_machine->lsn_to_req(rreq->lsn()); + if (exist_rreq != nullptr && !exist_rreq->is_volatile()) { + HS_REL_ASSERT(false, "Unexpected: LSN={} is already ready to commit, exist_rreq=[{}]", rreq->lsn(), + exist_rreq->to_string()); + } + if (err == ReplServiceError::DATA_DUPLICATED) { + RD_LOGE(rreq->traceID(), "Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); + m_listener->on_error(err, rreq->header(), rreq->key(), rreq); + rreq->clear(); + return; + } + if (rreq->op_code() == journal_type_t::HS_DATA_LINKED) { // Free the blks which is allocated already - RD_LOGE("Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { auto blkid = rreq->local_blkid(); data_service().async_free_blk(blkid).thenValue([blkid](auto&& err) { @@ -820,8 +1384,12 @@ void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) blkid.to_string()); }); } - } else if (rreq->op_code() == journal_type_t::HS_CTRL_DESTROY) { - if (rreq->is_proposer()) { m_destroy_promise.setValue(err); } + } else if (rreq->op_code() == journal_type_t::HS_CTRL_DESTROY || + rreq->op_code() == journal_type_t::HS_CTRL_COMPLETE_REPLACE) { + if (rreq->is_proposer()) { + RD_LOGE(rreq->traceID(), "Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); + m_destroy_promise.setValue(err); + } } // TODO: Validate if this is a correct assert or not. 
Is it possible that the log is already flushed and we receive @@ -836,6 +1404,35 @@ void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) rreq->clear(); } +void RaftReplDev::start_replace_member(repl_req_ptr_t rreq) { + auto members = r_cast< const replace_member_ctx* >(rreq->header().cbytes()); + + RD_LOGI(rreq->traceID(), "Raft repl start_replace_member commit member_out={} member_in={}", + boost::uuids::to_string(members->replica_out.id), boost::uuids::to_string(members->replica_in.id)); + + m_listener->on_start_replace_member(members->replica_out, members->replica_in, rreq->traceID()); + // record the replace_member intent + std::unique_lock lg{m_sb_mtx}; + m_rd_sb->replace_member_ctx.replica_in = members->replica_in.id; + m_rd_sb->replace_member_ctx.replica_out = members->replica_out.id; + m_rd_sb.write(); +} + +void RaftReplDev::complete_replace_member(repl_req_ptr_t rreq) { + auto members = r_cast< const replace_member_ctx* >(rreq->header().cbytes()); + + RD_LOGI(rreq->traceID(), "Raft repl complete_replace_member commit member_out={} member_in={}", + boost::uuids::to_string(members->replica_out.id), boost::uuids::to_string(members->replica_in.id)); + + m_listener->on_complete_replace_member(members->replica_out, members->replica_in, rreq->traceID()); + + // clear the replace_member intent + std::unique_lock lg{m_sb_mtx}; + m_rd_sb->replace_member_ctx = replace_member_ctx_superblk{}; + m_rd_sb.write(); + RD_LOGI(rreq->traceID(), "Raft repl replace_member_ctx has been cleared."); +} + static bool blob_equals(sisl::blob const& a, sisl::blob const& b) { if (a.size() != b.size()) { return false; } return (std::memcmp(a.cbytes(), b.cbytes(), a.size()) == 0); @@ -848,11 +1445,15 @@ repl_req_ptr_t RaftReplDev::repl_key_to_req(repl_key const& rkey) const { } folly::Future< std::error_code > RaftReplDev::async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, - bool part_of_batch) { + bool part_of_batch, trace_id_t tid) { + if (is_stopping()) { + LOGINFO("repl dev is being shutdown!"); + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); + } return data_service().async_read(bid, sgs, size, part_of_batch); } -void RaftReplDev::async_free_blks(int64_t, MultiBlkId const& bid) { +folly::Future< std::error_code > RaftReplDev::async_free_blks(int64_t, MultiBlkId const& bid, trace_id_t tid) { // TODO: For timeline consistency required, we should retain the blkid that is changed and write that to another // journal. 
data_service().async_free_blk(bid); @@ -861,7 +1462,8 @@ void RaftReplDev::async_free_blks(int64_t, MultiBlkId const& bid) { AsyncReplResult<> RaftReplDev::become_leader() { return m_msg_mgr.become_leader(m_group_id).via(&folly::InlineExecutor::instance()).thenValue([this](auto&& e) { if (e.hasError()) { - RD_LOGE("Error in becoming leader: {}", e.error()); + RD_LOGE(NO_TRACE_ID, "Error in becoming leader: {}", e.error()); + decr_pending_request_num(); return make_async_error<>(RaftReplService::to_repl_error(e.error())); } return make_async_success<>(); @@ -882,11 +1484,42 @@ std::vector< peer_info > RaftReplDev::get_replication_status() const { for (auto const& pinfo : rep_status) { pi.emplace_back(peer_info{.id_ = boost::lexical_cast< replica_id_t >(pinfo.id_), .replication_idx_ = pinfo.last_log_idx_, - .last_succ_resp_us_ = pinfo.last_succ_resp_us_}); + .last_succ_resp_us_ = pinfo.last_succ_resp_us_, + .priority_ = pinfo.priority_, + .can_vote = !pinfo.is_learner_}); } return pi; } +std::set< replica_id_t > RaftReplDev::get_active_peers() const { + auto repl_status = get_replication_status(); + std::set< replica_id_t > res; + auto my_committed_idx = m_commit_upto_lsn.load(); + auto laggy=HS_DYNAMIC_CONFIG(consensus.laggy_threshold); + uint64_t least_active_repl_idx = my_committed_idx > HS_DYNAMIC_CONFIG(consensus.laggy_threshold) + ? my_committed_idx - HS_DYNAMIC_CONFIG(consensus.laggy_threshold) + : 0; + // peer's last log idx should also >= leader's start_index-1(ensure existence), otherwise leader can't append log entries to it + // and baseline resync will be triggerred. Try to avoid conflict between baseline resync and normal replication. + least_active_repl_idx = std::max(least_active_repl_idx, m_data_journal->start_index() - 1); + for (auto p : repl_status) { + if (p.id_ == m_my_repl_id) { continue; } + if (p.replication_idx_ >= least_active_repl_idx) { + res.insert(p.id_); + RD_LOGT(NO_TRACE_ID, + "Found active peer {}, lag {}, my lsn {}, peer lsn {}, least_active_repl_idx {}, laggy={}", p.id_, + my_committed_idx - p.replication_idx_, my_committed_idx, p.replication_idx_, least_active_repl_idx, + laggy); + } else { + RD_LOGW(NO_TRACE_ID, + "Excluding peer {} from active_peers, lag {}, my lsn {}, peer lsn {}, least_active_repl_idx {}", + p.id_, my_committed_idx - p.replication_idx_, my_committed_idx, p.replication_idx_, + least_active_repl_idx); + } + } + return res; +} + uint32_t RaftReplDev::get_blk_size() const { return data_service().get_blk_size(); } nuraft_mesg::repl_service_ctx* RaftReplDev::group_msg_service() { return m_repl_svc_ctx.get(); } @@ -960,8 +1593,8 @@ nuraft::ptr< nuraft::cluster_config > RaftReplDev::load_config() { if (!js.contains("config")) { auto cluster_conf = nuraft::cs_new< nuraft::cluster_config >(); - cluster_conf->get_servers().push_back( - nuraft::cs_new< nuraft::srv_config >(m_raft_server_id, my_replica_id_str())); + cluster_conf->get_servers().push_back(nuraft::cs_new< nuraft::srv_config >( + m_raft_server_id, 0, my_replica_id_str(), "", false, raft_leader_priority)); js["config"] = serialize_cluster_config(*cluster_conf); } return deserialize_cluster_config(js["config"]); @@ -971,12 +1604,17 @@ void RaftReplDev::save_config(const nuraft::cluster_config& config) { std::unique_lock lg{m_config_mtx}; (*m_raft_config_sb)["config"] = serialize_cluster_config(config); m_raft_config_sb.write(); + RD_LOGI(NO_TRACE_ID, "Saved config {}", (*m_raft_config_sb)["config"].dump()); } void RaftReplDev::save_state(const nuraft::srv_state& state) { 
std::unique_lock lg{m_config_mtx}; - (*m_raft_config_sb)["state"] = nlohmann::json{{"term", state.get_term()}, {"voted_for", state.get_voted_for()}}; + (*m_raft_config_sb)["state"] = nlohmann::json{{"term", state.get_term()}, + {"voted_for", state.get_voted_for()}, + {"election_timer_allowed", state.is_election_timer_allowed()}, + {"catching_up", state.is_catching_up()}}; m_raft_config_sb.write(); + RD_LOGI(NO_TRACE_ID, "Saved state {}", (*m_raft_config_sb)["state"].dump()); } nuraft::ptr< nuraft::srv_state > RaftReplDev::read_state() { @@ -984,11 +1622,16 @@ nuraft::ptr< nuraft::srv_state > RaftReplDev::read_state() { auto& js = *m_raft_config_sb; auto state = nuraft::cs_new< nuraft::srv_state >(); if (js["state"].empty()) { - js["state"] = nlohmann::json{{"term", state->get_term()}, {"voted_for", state->get_voted_for()}}; + js["state"] = nlohmann::json{{"term", state->get_term()}, + {"voted_for", state->get_voted_for()}, + {"election_timer_allowed", state->is_election_timer_allowed()}, + {"catching_up", state->is_catching_up()}}; } else { try { state->set_term(uint64_cast(js["state"]["term"])); state->set_voted_for(static_cast< int >(js["state"]["voted_for"])); + state->allow_election_timer(static_cast< bool >(js["state"]["election_timer_allowed"])); + state->set_catching_up(static_cast< bool >(js["state"]["catching_up"])); } catch (std::out_of_range const&) { LOGWARN("State data was not in the expected format [group_id={}]!", m_group_id) } @@ -1000,7 +1643,7 @@ nuraft::ptr< nuraft::log_store > RaftReplDev::load_log_store() { return m_data_j int32_t RaftReplDev::server_id() { return m_raft_server_id; } -bool RaftReplDev::is_destroy_pending() const { return (m_rd_sb->destroy_pending == 0x1); } +bool RaftReplDev::is_destroy_pending() const { return (*m_stage.access().get() == repl_dev_stage_t::DESTROYED); } bool RaftReplDev::is_destroyed() const { return (*m_stage.access().get() == repl_dev_stage_t::PERMANENT_DESTROYED); } /////////////////////////////////// nuraft_mesg::mesg_state_mgr overrides //////////////////////////////////// @@ -1013,88 +1656,218 @@ uint32_t RaftReplDev::get_logstore_id() const { return m_data_journal->logstore_ std::shared_ptr< nuraft::state_machine > RaftReplDev::get_state_machine() { return m_state_machine; } void RaftReplDev::permanent_destroy() { - RD_LOGI("Permanent destroy for raft repl dev"); - m_rd_sb.destroy(); + RD_LOGI(NO_TRACE_ID, "Permanent destroy for raft repl dev group_id={}", group_id_str()); + // let the listener know at first, so that they can cleanup persistent structures before raft repl dev is destroyed + m_listener->on_destroy(group_id()); m_raft_config_sb.destroy(); m_data_journal->remove_store(); logstore_service().destroy_log_dev(m_data_journal->logdev_id()); m_stage.update([](auto* stage) { *stage = repl_dev_stage_t::PERMANENT_DESTROYED; }); + + // we should destroy repl_dev superblk only after all the resources are cleaned up, so that is crash recovery + // occurs, we have a chance to find the stale repl_dev and reclaim all the stale resources. + m_rd_sb.destroy(); } void RaftReplDev::leave() { + // this will be called in 3 cases : + // 1. commit log entry of journal_type_t::HS_CTRL_DESTROY + // 2. it is removed from the cluster and the new config(excluding this node) is being committed on this node + // 3. it is removed from the cluster , but the node is down and new config log(excluding this node) is not + // replicated to this removed node. 
when the node restarts, leader will not send any append entry to this node, + // since it is not a member of the raft group. it will become a candidate and send request-vote requests to other + // members of this raft group. a member will send RemovedFromCluster to the node if this member finds the node + // is no longer a member of the raft group. + + // leave() will never be called concurrently, since config change and journal_type_t::HS_CTRL_DESTROY are all log + // entries, which will be committed sequentially. + if (is_destroy_pending()) return; + + // We update that this repl_dev is in destroyed state, actual clean up of resources happens in reaper thread later m_stage.update([](auto* stage) { *stage = repl_dev_stage_t::DESTROYED; }); m_destroyed_time = Clock::now(); - // We let the listener know right away, so that they can cleanup persistent structures soonest. This will - // reduce the time window of leaked resources if any - m_listener->on_destroy(); - // Persist that destroy pending in superblk, so that in case of crash before cleanup of resources, it can be done // post restart. m_rd_sb->destroy_pending = 0x1; m_rd_sb.write(); - RD_LOGI("RaftReplDev leave group"); + RD_LOGI(NO_TRACE_ID, "RaftReplDev leave group_id={}", group_id_str()); m_destroy_promise.setValue(ReplServiceError::OK); // In case proposer is waiting for the destroy to complete } -std::pair< bool, nuraft::cb_func::ReturnCode > RaftReplDev::handle_raft_event(nuraft::cb_func::Type type, - nuraft::cb_func::Param* param) { +nuraft::cb_func::ReturnCode RaftReplDev::raft_event(nuraft::cb_func::Type type, nuraft::cb_func::Param* param) { auto ret = nuraft::cb_func::ReturnCode::Ok; - if (type == nuraft::cb_func::Type::GotAppendEntryReqFromLeader) { + switch (type) { + case nuraft::cb_func::Type::GotAppendEntryReqFromLeader: { auto raft_req = r_cast< nuraft::req_msg* >(param->ctx); auto const& entries = raft_req->log_entries(); auto start_lsn = raft_req->get_last_log_idx() + 1; - RD_LOGD("Raft channel: Received {} append entries on follower from leader, term {}, lsn {} ~ {} , my commited " "lsn {} , leader commmited lsn {}", + if (entries.size() == 0) { + RD_LOGT(NO_TRACE_ID, "Raft channel: Received no entry, leader committed lsn {}", + raft_req->get_commit_idx()); + return ret; + } + RD_LOGT(NO_TRACE_ID, + "Raft channel: Received {} append entries on follower from leader, term {}, lsn {} ~ {} , my " + "committed lsn {} , leader committed lsn {}", entries.size(), raft_req->get_last_log_term(), start_lsn, start_lsn + entries.size() - 1, m_commit_upto_lsn.load(), raft_req->get_commit_idx()); - if (!entries.empty()) { - RD_LOGT("Raft channel: Received {} append entries on follower from leader, localizing them", - entries.size()); - - auto reqs = sisl::VectorPool< repl_req_ptr_t >::alloc(); - for (auto& entry : entries) { - if (entry->get_val_type() != nuraft::log_val_type::app_log) { continue; } - if (entry->get_buf_ptr()->size() == 0) { continue; } - auto req = m_state_machine->localize_journal_entry_prepare(*entry); - if (req == nullptr) { - sisl::VectorPool< repl_req_ptr_t >::free(reqs); - return {true, nuraft::cb_func::ReturnCode::ReturnNull}; - } - reqs->emplace_back(std::move(req)); + auto reqs = sisl::VectorPool< repl_req_ptr_t >::alloc(); + auto last_commit_lsn = uint64_cast(get_last_commit_lsn()); + for (unsigned long i = 0; i < entries.size(); i++) { + auto& entry = entries[i]; + auto lsn = start_lsn + i; + auto term = entry->get_term(); + if (entry->get_val_type() != nuraft::log_val_type::app_log) { continue; } + if
(entry->get_buf_ptr()->size() == 0) { continue; } + // skipping localize for already committed log(dup), they anyway will be discarded + // by nuraft before append_log. + if (lsn <= last_commit_lsn) { + RD_LOGT(NO_TRACE_ID, "Raft channel: term {}, lsn {}, skipping dup, last_commit_lsn {}", term, lsn, + last_commit_lsn); + continue; } + // Those LSNs already in logstore but not yet committed, will be deduped here, + // applier_create_req will return same req as previous one + auto req = m_state_machine->localize_journal_entry_prepare(*entry, lsn); + if (req == nullptr) { + sisl::VectorPool< repl_req_ptr_t >::free(reqs); + // The hint set here will be used by the next after next appendEntry, the next one + // always goes with -1 from NuRaft code. + // + // We are rejecting this log entry, meaning we can accept previous log entries. + // If there is nothing we can accept (i==0), that means we are waiting for commit + // of previous lsn, set it to 1 in this case. + m_state_machine->reset_next_batch_size_hint(std::max(1ul, i)); + return nuraft::cb_func::ReturnCode::ReturnNull; + } + report_blk_metrics_if_needed(req); + reqs->emplace_back(std::move(req)); + } - // Wait till we receive the data from its originator for all the requests - if (!wait_for_data_receive(*reqs, HS_DYNAMIC_CONFIG(consensus.data_receive_timeout_ms))) { - for (auto const& rreq : *reqs) { - handle_error(rreq, ReplServiceError::TIMEOUT); - } - ret = nuraft::cb_func::ReturnCode::ReturnNull; + // Wait till we receive the data from its originator for all the requests + std::vector< repl_req_ptr_t > timeout_rreqs; + if (!wait_for_data_receive(*reqs, HS_DYNAMIC_CONFIG(consensus.data_receive_timeout_ms), &timeout_rreqs)) { + for (auto const& rreq : timeout_rreqs) { + handle_error(rreq, ReplServiceError::TIMEOUT); } - sisl::VectorPool< repl_req_ptr_t >::free(reqs); + ret = nuraft::cb_func::ReturnCode::ReturnNull; } - return {true, ret}; - } else { - return {false, ret}; + sisl::VectorPool< repl_req_ptr_t >::free(reqs); + if (ret == nuraft::cb_func::ReturnCode::Ok) { m_state_machine->inc_next_batch_size_hint(); } + return ret; + } + case nuraft::cb_func::Type::JoinedCluster: + RD_LOGD(NO_TRACE_ID, "Raft channel: Received JoinedCluster, implies become_follower"); + become_follower_cb(); + return nuraft::cb_func::ReturnCode::Ok; + case nuraft::cb_func::Type::BecomeFollower: { + RD_LOGD(NO_TRACE_ID, "Raft channel: Received BecomeFollower"); + become_follower_cb(); + return nuraft::cb_func::ReturnCode::Ok; + } + case nuraft::cb_func::Type::BecomeLeader: { + RD_LOGD(NO_TRACE_ID, "Raft channel: Received BecomeLeader"); + become_leader_cb(); + return nuraft::cb_func::ReturnCode::Ok; + } + + // RemovedFromCluster will be handled in nuraft_mesg::generic_raft_event_handler where leave() is called + + // TODO: Add more type handlers if necessary + default: + break; } + return nuraft::cb_func::ReturnCode::Ok; } void RaftReplDev::flush_durable_commit_lsn() { auto const lsn = m_commit_upto_lsn.load(); + m_listener->notify_committed_lsn(lsn); + + if (is_destroyed()) { + RD_LOGI(NO_TRACE_ID, "Raft repl dev is destroyed, ignore flush durable commit lsn"); + return; + } + + RD_LOGT(NO_TRACE_ID, "Flushing durable commit lsn to {}", lsn); std::unique_lock lg{m_sb_mtx}; m_rd_sb->durable_commit_lsn = lsn; m_rd_sb.write(); } +void RaftReplDev::check_replace_member_status() { + if (is_destroyed()) { + RD_LOGI(NO_TRACE_ID, "Raft repl dev is destroyed, ignore check replace member status"); + return; + } + if (!m_repl_svc_ctx || !is_leader()) { return; } + if
(m_rd_sb->replace_member_ctx.replica_in == boost::uuids::nil_uuid() || + m_rd_sb->replace_member_ctx.replica_out == boost::uuids::nil_uuid()) { + RD_LOGT(NO_TRACE_ID, "No replace member in progress, return"); + return; + } + + auto peers = get_replication_status(); + auto replica_in = m_rd_sb->replace_member_ctx.replica_in; + auto replica_out = m_rd_sb->replace_member_ctx.replica_out; + repl_lsn_t in_lsn = 0; + repl_lsn_t out_lsn = 0; + repl_lsn_t laggy = HS_DYNAMIC_CONFIG(consensus.laggy_threshold); + + for (auto& peer : peers) { + if (peer.id_ == replica_out) { + out_lsn = peer.replication_idx_; + RD_LOGD(NO_TRACE_ID, "Replica out {} with lsn {}", boost::uuids::to_string(peer.id_), out_lsn); + } else if (peer.id_ == replica_in) { + in_lsn = peer.replication_idx_; + RD_LOGD(NO_TRACE_ID, "Replica in {} with lsn {}", boost::uuids::to_string(peer.id_), in_lsn); + } + } + // TODO optimize the condition + bool catch_up = in_lsn + laggy >= out_lsn; + + if (!catch_up) { + RD_LOGD(NO_TRACE_ID, "Checking replace member status, replica_in={} with lsn={}, replica_out={} with lsn={}", + boost::uuids::to_string(replica_in), in_lsn, boost::uuids::to_string(replica_out), out_lsn); + return; + } + + RD_LOGD(NO_TRACE_ID, + "Checking replace member status, new member has caught up, replica_in={} with lsn={}, replica_out={} with " + "lsn={}", + boost::uuids::to_string(replica_in), in_lsn, boost::uuids::to_string(replica_out), out_lsn); + + trace_id_t trace_id = generateRandomTraceId(); + + RD_LOGD(trace_id, "Trigger complete_replace_member, replica_in={}, replica_out={}", + boost::uuids::to_string(replica_in), boost::uuids::to_string(replica_out)); + + replica_member_info out{replica_out, ""}; + replica_member_info in{replica_in, ""}; + auto ret = complete_replace_member(out, in, 0, trace_id).get(); + if (ret.hasError()) { + RD_LOGE(trace_id, "Failed to complete replace member, next time will retry it, error={}", ret.error()); + return; + } + RD_LOGI(trace_id, "Complete replace member, replica_in={}, replica_out={}", + boost::uuids::to_string(replica_in), boost::uuids::to_string(replica_out)) +} + /////////////////////////////////// Private metohds //////////////////////////////////// -void RaftReplDev::cp_flush(CP* cp) { - auto const lsn = m_commit_upto_lsn.load(); - auto const clsn = m_compact_lsn.load(); +void RaftReplDev::cp_flush(CP* cp, cshared< ReplDevCPContext > ctx) { + if (is_destroyed()) { + RD_LOGI(NO_TRACE_ID, "Raft repl dev is destroyed, ignore cp flush"); + return; + } + + auto const lsn = ctx->cp_lsn; + auto const clsn = ctx->compacted_to_lsn; + auto const dsn = ctx->last_applied_dsn; if (lsn == m_last_flushed_commit_lsn) { // Not dirtied since last flush ignore @@ -1103,58 +1876,116 @@ void RaftReplDev::cp_flush(CP* cp) { std::unique_lock lg{m_sb_mtx}; m_rd_sb->compact_lsn = clsn; - m_rd_sb->durable_commit_lsn = lsn; + // dc_lsn is also flushed in flush_durable_commit_lsn() + // we need to take a max to avoid rolling back. 
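// Illustrative standalone sketch, not part of the patch: why the durable commit LSN is
// persisted with std::max here. Two flush paths (the CP flush in this function and the
// periodic flush_durable_commit_lsn()) both write the field, and the CP context captures
// its LSN earlier, so it may be older than what the timer path already persisted. Taking
// the max keeps the on-disk value monotonic. FakeSuperblk is a made-up stand-in.
#include <algorithm>
#include <cassert>
#include <cstdint>

struct FakeSuperblk { int64_t durable_commit_lsn{0}; };

void persist_durable_commit_lsn(FakeSuperblk& sb, int64_t candidate_lsn) {
    // Never move the persisted LSN backwards, regardless of which path runs last.
    sb.durable_commit_lsn = std::max(candidate_lsn, sb.durable_commit_lsn);
}

int main() {
    FakeSuperblk sb;
    persist_durable_commit_lsn(sb, 100);   // timer path already flushed up to 100
    persist_durable_commit_lsn(sb, 80);    // an older CP context arrives later
    assert(sb.durable_commit_lsn == 100);  // no rollback of the durable LSN
    return 0;
}
// end of sketch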
+ m_rd_sb->durable_commit_lsn = std::max(lsn, m_rd_sb->durable_commit_lsn); m_rd_sb->checkpoint_lsn = lsn; - m_rd_sb->last_applied_dsn = m_next_dsn.load(); + m_rd_sb->last_applied_dsn = dsn; m_rd_sb.write(); m_last_flushed_commit_lsn = lsn; - RD_LOGD("cp flush in raft repl dev, lsn={}, clsn={}, next_dsn={}, cp string:{}", lsn, clsn, m_next_dsn.load(), - cp->to_string()); + RD_LOGD(NO_TRACE_ID, "cp flush in raft repl dev, lsn={}, clsn={}, next_dsn={}, cp string:{}", lsn, clsn, + m_next_dsn.load(), cp->to_string()); +} + +cshared< ReplDevCPContext > RaftReplDev::get_cp_ctx(CP* cp) { + auto const cp_lsn = m_commit_upto_lsn.load(); + auto const clsn = m_compact_lsn.load(); + auto const dsn = m_next_dsn.load(); + + RD_LOGD(NO_TRACE_ID, "getting cp_ctx for raft repl dev {}, cp_lsn={}, clsn={}, next_dsn={}, cp string:{}", + (void*)this, cp_lsn, clsn, dsn, cp->to_string()); + auto dev_ctx = std::make_shared< ReplDevCPContext >(); + dev_ctx->cp_lsn = cp_lsn; + dev_ctx->compacted_to_lsn = clsn; + dev_ctx->last_applied_dsn = dsn; + return dev_ctx; } void RaftReplDev::cp_cleanup(CP*) {} void RaftReplDev::gc_repl_reqs() { - std::vector< int64_t > expired_keys; - m_state_machine->iterate_repl_reqs([this, &expired_keys](auto key, auto rreq) { + auto cur_dsn = m_next_dsn.load(); + if (cur_dsn != 0) cur_dsn = cur_dsn - 1; + // On follower, DSN below cur_dsn should very likely be commited. + // It is not guaranteed because DSN and LSN are generated separately, + // DSN in async_alloc_write before pushing data, LSN later when + // proposing to raft. Two simultaneous write requests on leader can have + // and during the window. + std::vector< repl_req_ptr_t > expired_rreqs; + + auto req_map_size = m_repl_key_req_map.size(); + RD_LOGI(NO_TRACE_ID, "m_repl_key_req_map size is {};", req_map_size); + for (auto [key, rreq] : m_repl_key_req_map) { + // FIXME: Skipping proposer for now, the DSN in proposer increased in proposing stage, not when commit(). + // Need other mechanism. + if (rreq->is_proposer()) { + // don't clean up proposer's request + continue; + } + if (rreq->dsn() < cur_dsn && rreq->is_expired()) { + // The DSN can be out of order, wait till rreq expired. + RD_LOGD(rreq->traceID(), + "legacy req with commited DSN, rreq=[{}] , dsn = {}, next_dsn = {}, gap= {}, elapsed_time_sec {}", + rreq->to_string(), rreq->dsn(), cur_dsn, cur_dsn - rreq->dsn(), + get_elapsed_time_sec(rreq->created_time())); + expired_rreqs.push_back(rreq); + } + } + int sm_req_cnt = 0; + // FIXME: we ensured data written before appending log to log store, in which we add rreq to state_machine + // and during pre-commit/commit we retrieve rreq from state_machine. Removing requests outside of state + // machine is risky. + // Below logs are logging only, can be removed once we get more confidence. + m_state_machine->iterate_repl_reqs([this, cur_dsn, &sm_req_cnt](auto key, auto rreq) { + sm_req_cnt++; if (rreq->is_proposer()) { // don't clean up proposer's request return; } - if (rreq->is_expired()) { - expired_keys.push_back(key); - RD_LOGD("rreq=[{}] is expired, cleaning up; elapsed_time_sec{};", rreq->to_string(), + RD_LOGD(rreq->traceID(), "StateMachine: rreq=[{}] is expired, elapsed_time_sec{};", rreq->to_string(), get_elapsed_time_sec(rreq->created_time())); - - // do garbage collection - // 1. 
free the allocated blocks - if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { - auto blkid = rreq->local_blkid(); - data_service().async_free_blk(blkid).thenValue([this, blkid](auto&& err) { - HS_LOG_ASSERT(!err, "freeing blkid={} upon error failed, potential to cause blk leak", - blkid.to_string()); - RD_LOGD("blkid={} freed successfully", blkid.to_string()); - }); - } - - // 2. remove from the m_repl_key_req_map - // handle_error during fetch data response might have already removed the rreq from the this map - if (m_repl_key_req_map.find(rreq->rkey()) != m_repl_key_req_map.end()) { - m_repl_key_req_map.erase(rreq->rkey()); - } } }); + RD_LOGT(NO_TRACE_ID, "state_machine req map size is {};", sm_req_cnt); - for (auto const& l : expired_keys) { - m_state_machine->unlink_lsn_to_req(l); + for (auto removing_rreq : expired_rreqs) { + // once log flushed, the commit progress controlled by raft + if (removing_rreq->has_state(repl_req_state_t::LOG_FLUSHED)) { + RD_LOGT(removing_rreq->traceID(), "Skipping GC rreq [{}] because it is in state machine", + removing_rreq->to_string()); + continue; + } + // do garbage collection + // 1. free the allocated blocks + RD_LOGD(removing_rreq->traceID(), "Removing rreq [{}]", removing_rreq->to_string()); + if (removing_rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { + auto blkid = removing_rreq->local_blkid(); + data_service().async_free_blk(blkid).thenValue([this, blkid, removing_rreq](auto&& err) { + HS_LOG_ASSERT(!err, "freeing blkid={} upon error failed, potential to cause blk leak", + blkid.to_string()); + RD_LOGD(removing_rreq->traceID(), "GC rreq: Releasing blkid={} freed successfully", blkid.to_string()); + }); + } + // 2. remove from the m_repl_key_req_map + if (m_repl_key_req_map.find(removing_rreq->rkey()) != m_repl_key_req_map.end()) { + m_repl_key_req_map.erase(removing_rreq->rkey()); + } } } +void RaftReplDev::set_log_store_last_durable_lsn(store_lsn_t lsn) { m_data_journal->set_last_durable_lsn(lsn); } + void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx) { auto repl_lsn = to_repl_lsn(lsn); + if (need_skip_processing(repl_lsn)) { + RD_LOGI(NO_TRACE_ID, + "Raft Channel: Log {} is outdated and will be handled by baseline resync. Ignoring replay.", lsn); + return; + } + // apply the log entry if the lsn is between checkpoint lsn and durable commit lsn - if (repl_lsn < m_rd_sb->checkpoint_lsn) { return; } + if (repl_lsn <= m_rd_sb->checkpoint_lsn) { return; } // 1. 
Get the log entry and prepare rreq auto const lentry = to_nuraft_log_entry(buf); @@ -1166,8 +1997,9 @@ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx RELEASE_ASSERT_EQ(jentry->major_version, repl_journal_entry::JOURNAL_ENTRY_MAJOR, "Mismatched version of journal entry received from RAFT peer"); - RD_LOGT("Raft Channel: Applying Raft log_entry upon recovery: server_id={}, term={}, journal_entry=[{}] ", - jentry->server_id, lentry->get_term(), jentry->to_string()); + RD_LOGT(jentry->traceID, + "Raft Channel: Applying Raft log_entry upon recovery: server_id={}, term={}, lsn={}, journal_entry=[{}] ", + jentry->server_id, lentry->get_term(), repl_lsn, jentry->to_string()); auto entry_to_hdr = [](repl_journal_entry* jentry) { return sisl::blob{uintptr_cast(jentry) + sizeof(repl_journal_entry), jentry->user_header_size}; @@ -1184,7 +2016,8 @@ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx jentry->value_size}; }; - repl_key const rkey{.server_id = jentry->server_id, .term = lentry->get_term(), .dsn = jentry->dsn}; + repl_key const rkey{ + .server_id = jentry->server_id, .term = lentry->get_term(), .dsn = jentry->dsn, .traceID = jentry->traceID}; auto const [it, happened] = m_repl_key_req_map.try_emplace(rkey, repl_req_ptr_t(new repl_req_ctx())); RD_DBG_ASSERT((it != m_repl_key_req_map.end()), "Unexpected error in map_repl_key_to_req"); @@ -1192,33 +2025,97 @@ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx RD_DBG_ASSERT(happened, "rreq already exists for rkey={}", rkey.to_string()); uint32_t data_size{0u}; + // If the data is linked and value_size is non-zero, it means blks have been allocated for data. + // Since the log is flushed after data is written, the data has already been received. if ((jentry->code == journal_type_t::HS_DATA_LINKED) && (jentry->value_size > 0)) { MultiBlkId entry_blkid; entry_blkid.deserialize(entry_to_val(jentry), true /* copy */); data_size = entry_blkid.blk_count() * get_blk_size(); - rreq->set_local_blkid(entry_blkid); + rreq->set_local_blkids({entry_blkid}); + rreq->add_state(repl_req_state_t::BLK_ALLOCATED); + rreq->add_state(repl_req_state_t::DATA_RECEIVED); } rreq->set_lsn(repl_lsn); // keep lentry in scope for the lyfe cycle of the rreq rreq->set_lentry(lentry); - rreq->init(rkey, jentry->code, false /* is_proposer */, entry_to_hdr(jentry), entry_to_key(jentry), data_size); - RD_LOGD("Replay log on restart, rreq=[{}]", rreq->to_string()); + auto status = init_req_ctx(rreq, rkey, jentry->code, false /* is_proposer */, entry_to_hdr(jentry), + entry_to_key(jentry), data_size, m_listener); + if (status != ReplServiceError::OK) { + RD_LOGE(jentry->traceID, "Initializing rreq failed, rreq=[{}], error={}", rreq->to_string(), status); + } + + // we load the log from log device, implies log flushed. We only flush log after data is written to data device. + rreq->add_state(repl_req_state_t::DATA_WRITTEN); + rreq->add_state(repl_req_state_t::LOG_RECEIVED); + rreq->add_state(repl_req_state_t::LOG_FLUSHED); + RD_LOGD(rreq->traceID(), "Replay log on restart, rreq=[{}]", rreq->to_string()); + // 2. Pre-commit the log entry as in nuraft pre-commit was called once log appended to logstore. + m_listener->on_pre_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq); + + // LSN above dc_lsn we forgot their states, they can either + // a. be committed before, but DC_LSN not yet flushed + // b. 
not yet committed, might be committed or rollback if (repl_lsn > m_rd_sb->durable_commit_lsn) { // In memory state of these blks is lost. Commit them now to avoid usage of same blk twice. commit_blk(rreq); + // add rreq to state machine, state-machine will decide to commit or rollback this rreq. m_state_machine->link_lsn_to_req(rreq, int64_cast(repl_lsn)); return; } - // 2. Pre-commit the log entry - m_listener->on_pre_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq); - // 3. Commit the log entry handle_commit(rreq, true /* recovery */); } +void RaftReplDev::create_snp_resync_data(raft_buf_ptr_t& data_out) { + snp_repl_dev_data msg; + auto msg_size = sizeof(snp_repl_dev_data); + msg.dsn = m_next_dsn; + auto crc = crc32_ieee(init_crc32, reinterpret_cast< const unsigned char* >(&msg), msg_size); + RD_LOGD(NO_TRACE_ID, "create snapshot resync msg, dsn={}, crc={}", msg.dsn, crc); + msg.crc = crc; + data_out = nuraft::buffer::alloc(msg_size); + std::memcpy(data_out->data_begin(), &msg, msg_size); +} + +bool RaftReplDev::save_snp_resync_data(nuraft::buffer& data, nuraft::snapshot& s) { + auto msg = r_cast< snp_repl_dev_data* >(data.data_begin()); + if (msg->magic_num != HOMESTORE_RESYNC_DATA_MAGIC || + msg->protocol_version != HOMESTORE_RESYNC_DATA_PROTOCOL_VERSION_V1) { + RD_LOGE(NO_TRACE_ID, "Snapshot resync data validation failed, magic={}, version={}", msg->magic_num, + msg->protocol_version); + return false; + } + auto received_crc = msg->crc; + RD_LOGD(NO_TRACE_ID, "received snapshot resync msg, dsn={}, crc={}, received crc={}", msg->dsn, msg->crc, + received_crc); + // Clear the crc field before verification, because the crc value computed by leader doesn't contain it. + msg->crc = 0; + auto computed_crc = + crc32_ieee(init_crc32, reinterpret_cast< const unsigned char* >(msg), sizeof(snp_repl_dev_data)); + if (received_crc != computed_crc) { + RD_LOGE(NO_TRACE_ID, "Snapshot resync data crc mismatch, received_crc={}, computed_crc={}", received_crc, + computed_crc); + return false; + } + { + // Save last_snapshot_lsn, so that we can skip the replay/commit operation for logs included in baseline resync. 
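// Illustrative standalone sketch, not part of the patch: the checksum-with-field-zeroed
// pattern used by the resync message above. The sender computes the checksum while the
// crc field is still zero and only then fills it in, so the receiver must clear the field
// again before recomputing. A trivial byte-sum stands in for crc32_ieee, and the struct
// layout here is a placeholder, not the real snp_repl_dev_data.
#include <cassert>
#include <cstdint>
#include <cstring>

struct ResyncMsg { uint64_t dsn; uint32_t crc; };

static uint32_t toy_checksum(const void* p, std::size_t n) {
    uint32_t sum = 0;
    const unsigned char* b = static_cast<const unsigned char*>(p);
    for (std::size_t i = 0; i < n; ++i) { sum = sum * 131u + b[i]; }
    return sum;
}

int main() {
    ResyncMsg msg;
    std::memset(&msg, 0, sizeof(msg));            // deterministic bytes, crc field == 0
    msg.dsn = 42;
    msg.crc = toy_checksum(&msg, sizeof(msg));    // computed while crc is still zero

    // Receiver side: stash the received crc, zero the field, recompute and compare.
    uint32_t received = msg.crc;
    msg.crc = 0;
    assert(received == toy_checksum(&msg, sizeof(msg)));
    return 0;
}
// end of sketch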
+ // The reason is baseline resync will clear existing resources on the upper layer, skipping replay/commit + // operations can avoid accessing unavailable resources + std::unique_lock lg{m_sb_mtx}; + m_rd_sb->last_snapshot_lsn = s_cast< repl_lsn_t >(s.get_last_log_idx()); + m_rd_sb.write(); + } + if (msg->dsn > m_next_dsn) { + m_next_dsn = msg->dsn; + RD_LOGD(NO_TRACE_ID, "Update next_dsn from {} to {}", m_next_dsn.load(), msg->dsn); + return true; + } + return true; +} + void RaftReplDev::on_restart() { m_listener->on_restart(); } bool RaftReplDev::is_resync_mode() { @@ -1227,10 +2124,116 @@ bool RaftReplDev::is_resync_mode() { auto diff = leader_commited_lsn - my_log_idx; bool resync_mode = (diff > HS_DYNAMIC_CONFIG(consensus.resync_log_idx_threshold)); if (resync_mode) { - RD_LOGD("Raft Channel: Resync mode, leader_commited_lsn={}, my_log_idx={}, diff={}", leader_commited_lsn, - my_log_idx, diff); + RD_LOGD(NO_TRACE_ID, "Raft Channel: Resync mode, leader_commited_lsn={}, my_log_idx={}, diff={}", + leader_commited_lsn, my_log_idx, diff); } return resync_mode; } +void RaftReplDev::report_blk_metrics_if_needed(repl_req_ptr_t rreq) { + auto chunk_id = rreq->local_blkid().chunk_num(); + auto chunk = hs()->device_mgr()->get_chunk(chunk_id); + if (chunk->get_blk_usage() >= chunk->get_blk_usage_report_threshold()) { + auto local_blk_num = rreq->local_blkid().blk_num(); + auto remote_blk_num = rreq->remote_blkid().blkid.blk_num(); + // Focus only on cases where the locally allocated blocks exceed the proposer's allocated blocks, + // as this indicates that the member might encounter NO_SPACE_LEFT before the proposer. + auto blk_diff_with_remote = local_blk_num > remote_blk_num ? local_blk_num - remote_blk_num : 0; + HISTOGRAM_OBSERVE(m_metrics, blk_diff_with_proposer, blk_diff_with_remote); + } +} + +void RaftReplDev::quiesce_reqs() { + // all the block allocation happens in rreq->init. so after we wait for all the pending req has been initialized we + // can make sure + // 1 all the pending reqs has allocated their blocks + // 2 no new pending reqs will be initialized again. 
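// Illustrative standalone sketch, not part of the patch: the quiesce pattern implemented
// below, i.e. an atomic gate plus an in-flight counter. A request bumps the counter first
// and only then checks the gate, so once the gate is set and the counter drains to zero,
// nothing is still inside init and nothing new can slip in. All names are placeholders
// for m_in_quience / m_pending_init_req_num / init_req_counter.
#include <atomic>
#include <cassert>
#include <cstdint>
#include <thread>

std::atomic<bool> g_quiesced{false};
std::atomic<uint64_t> g_inflight{0};

struct InflightGuard {               // RAII counter, mirrors the init_req_counter idea
    InflightGuard() { g_inflight.fetch_add(1, std::memory_order_acq_rel); }
    ~InflightGuard() { g_inflight.fetch_sub(1, std::memory_order_acq_rel); }
};

bool try_init_request() {
    InflightGuard guard;             // count ourselves before checking the gate
    if (g_quiesced.load(std::memory_order_acquire)) { return false; }  // rejected
    // ... allocate blocks / initialize the request here ...
    return true;
}

void quiesce() {
    g_quiesced.store(true, std::memory_order_release);
    while (g_inflight.load(std::memory_order_acquire) != 0) { std::this_thread::yield(); }
}

int main() {
    assert(try_init_request());      // accepted while the gate is open
    quiesce();                       // returns once no request is mid-init
    assert(!try_init_request());     // rejected afterwards
    return 0;
}
// end of sketch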
+ m_in_quience.store(true, std::memory_order_release); + RD_LOGD(NO_TRACE_ID, "enter quience state, waiting for all the pending req to be initialized"); + while (true) { + uint64_t pending_req_num = get_pending_init_req_num(); + if (pending_req_num) { + RD_LOGD(NO_TRACE_ID, "wait for {} pending create_req requests to be completed", pending_req_num); + std::this_thread::sleep_for(std::chrono::microseconds(1)); + } else + break; + } +} + +void RaftReplDev::resume_accepting_reqs() { m_in_quience.store(false, std::memory_order_release); } + +void RaftReplDev::clear_chunk_req(chunk_num_t chunk_id) { + RD_LOGD(NO_TRACE_ID, + "start cleaning all the in-memory rreqs, which has allocated blk on the emergent chunk={} before handling " + "no_space_left error", + chunk_id); + std::vector< folly::Future< folly::Unit > > futs; + for (auto& [key, rreq] : m_repl_key_req_map) { + if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { + auto blkid = rreq->local_blkid(); + if (chunk_id == blkid.chunk_num()) { + // only clean the rreqs which has allocated blks on the emergent chunk + futs.emplace_back( + std::move(data_service().async_free_blk(blkid).thenValue([this, &blkid, &key](auto&& err) { + HS_LOG_ASSERT(!err, "freeing blkid={} upon error failed, potential to cause blk leak", + blkid.to_string()); + RD_LOGD(NO_TRACE_ID, "blkid={} freed successfully for handling no_space_left error", + blkid.to_string()); + m_repl_key_req_map.erase(key); // remove from the req map after freeing the blk + }))); + } + } + } + + folly::collectAllUnsafe(futs) + .thenValue([this](auto&& vf) { + // TODO:: handle the error in freeing blk if necessary in the future. + // for nuobject case, error for freeing blk in the emergent chunk can be ingored + RD_LOGD( + NO_TRACE_ID, + "all the necessary in-memory rreqs which has allocated blks on the emergent chunk have been cleaned up " + "successfully, continue to handle no_space_left error."); + }) + // need to wait for the completion + .wait(); +} + +ReplServiceError RaftReplDev::init_req_ctx(repl_req_ptr_t rreq, repl_key rkey, journal_type_t op_code, bool is_proposer, + sisl::blob const& user_header, sisl::blob const& key, uint32_t data_size, + cshared< ReplDevListener >& listener) { + if (!rreq) { + RD_LOGD(rkey.traceID, "got nullptr for initing req, rkey=[{}]", rkey.to_string()); + return ReplServiceError::CANCELLED; + } + + init_req_counter counter(m_pending_init_req_num); + if (is_in_quience()) { + // In quience state, reject any new requests. + RD_LOGD(rkey.traceID, "Rejecting new request in quience state, rkey=[{}]", rkey.to_string()); + return ReplServiceError::QUIENCE_STATE; + } + + return rreq->init(rkey, op_code, is_proposer, user_header, key, data_size, m_listener); +} + +void RaftReplDev::become_leader_cb() { + auto new_gate = raft_server()->get_last_log_idx(); + repl_lsn_t existing_gate = 0; + if (!m_traffic_ready_lsn.compare_exchange_strong(existing_gate, new_gate)) { + // was a follower, m_traffic_ready_lsn should be zero on follower. 
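// Illustrative standalone sketch, not part of the patch: the traffic-ready gate that
// become_leader_cb() is arming here. On a leader switch the gate is set once, via
// compare-exchange, to the last appended log index, and traffic is allowed only after
// the commit LSN catches up to it; stepping down resets the gate to zero. Placeholder
// names only; the real members are m_traffic_ready_lsn and m_commit_upto_lsn.
#include <atomic>
#include <cassert>
#include <cstdint>

std::atomic<int64_t> g_traffic_ready_lsn{0};

void on_become_leader(int64_t last_log_idx) {
    int64_t expected = 0;
    // Only arm the gate if it was zero (i.e. this node was a follower before).
    g_traffic_ready_lsn.compare_exchange_strong(expected, last_log_idx);
}

void on_become_follower() { g_traffic_ready_lsn.store(0); }

bool ready_for_traffic(int64_t committed_lsn) { return committed_lsn >= g_traffic_ready_lsn.load(); }

int main() {
    on_become_leader(100);           // gate set to the last appended index
    assert(!ready_for_traffic(50));  // still committing entries from the old term
    assert(ready_for_traffic(100));  // caught up, safe to serve reads/writes
    on_become_follower();
    assert(ready_for_traffic(0));    // follower: gate is zero again
    return 0;
}
// end of sketch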
+ RD_REL_ASSERT(!existing_gate, "existing gate should be zero"); + } + RD_LOGD(NO_TRACE_ID, "become_leader_cb: setting traffic_ready_lsn from {} to {}", existing_gate, new_gate); +} + +bool RaftReplDev::is_ready_for_traffic() const { + if (is_stopping()) return false; + auto committed_lsn = m_commit_upto_lsn.load(); + auto gate = m_traffic_ready_lsn.load(); + bool ready = committed_lsn >= gate; + if (!ready) { + RD_LOGD(NO_TRACE_ID, "Not yet ready for traffic, committed to {} but gate is {}", committed_lsn, gate); + } + return ready; +} } // namespace homestore diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 41594b528..abede36bf 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -15,6 +15,10 @@ #include "replication/log_store/repl_log_store.h" namespace homestore { +struct replace_member_ctx_superblk { + replica_id_t replica_out; + replica_id_t replica_in; +}; #pragma pack(1) struct raft_repl_dev_superblk : public repl_dev_superblk { @@ -25,6 +29,8 @@ struct raft_repl_dev_superblk : public repl_dev_superblk { uint8_t is_timeline_consistent; // Flag to indicate whether the recovery of followers need to be timeline consistent uint64_t last_applied_dsn; // Last applied data sequence number uint8_t destroy_pending; // Flag to indicate whether the group is in destroy pending state + repl_lsn_t last_snapshot_lsn; // Last snapshot LSN follower received from leader + replace_member_ctx_superblk replace_member_ctx; // Replace members context, used to track the replace member status uint32_t get_raft_sb_version() const { return raft_sb_version; } }; @@ -35,6 +41,11 @@ using raft_cluster_config_ptr_t = nuraft::ptr< nuraft::cluster_config >; ENUM(repl_dev_stage_t, uint8_t, INIT, ACTIVE, DESTROYING, DESTROYED, PERMANENT_DESTROYED); +struct replace_member_ctx { + replica_member_info replica_out; + replica_member_info replica_in; +}; + class RaftReplDevMetrics : public sisl::MetricsGroup { public: explicit RaftReplDevMetrics(const char* inst_name) : sisl::MetricsGroup("RaftReplDev", inst_name) { @@ -84,6 +95,13 @@ class RaftReplDevMetrics : public sisl::MetricsGroup { REGISTER_HISTOGRAM(rreq_pieces_per_write, "Number of individual pieces per write", HistogramBucketsType(LinearUpto64Buckets)); + // In the identical layout chunk, the blk num of the follower and leader is expected to be the same. + // However, due to the concurrency between the data channel and the raft channel, there might be some + // allocation differences on the same lsn. When a leader switch occurs, these differences could become garbage. + // This metric can partially reflect the potential amount of garbage. 
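// Illustrative standalone sketch, not part of the patch: how the value observed into this
// histogram is derived (see report_blk_metrics_if_needed above). Only the case where the
// local allocation is ahead of the proposer's matters, since that follower would hit
// NO_SPACE_LEFT first, so the difference is clamped at zero; in the real path nothing is
// recorded at all below the usage threshold, here we simply return 0 for that case.
// Plain doubles and integers stand in for the real chunk and blkid types.
#include <cassert>
#include <cstdint>

uint64_t blk_diff_sample(double chunk_usage, double report_threshold,
                         uint64_t local_blk_num, uint64_t remote_blk_num) {
    if (chunk_usage < report_threshold) { return 0; }                    // not sampled yet
    return local_blk_num > remote_blk_num ? local_blk_num - remote_blk_num : 0;
}

int main() {
    assert(blk_diff_sample(0.95, 0.9, 120, 100) == 20);  // local ahead of proposer
    assert(blk_diff_sample(0.95, 0.9, 90, 100) == 0);    // proposer ahead: clamp to 0
    assert(blk_diff_sample(0.50, 0.9, 120, 100) == 0);   // below usage threshold
    return 0;
}
// end of sketch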
+ REGISTER_HISTOGRAM(blk_diff_with_proposer, + "allocated blk num diff on the same lsn with proposer when chunk usage >= 0.9"); + // Raft channel metrics REGISTER_HISTOGRAM(raft_end_of_append_batch_latency_us, "Raft end_of_append_batch latency in us", "raft_logstore_append_latency", {"op", "end_of_append_batch"}); @@ -102,18 +120,70 @@ class RaftReplDevMetrics : public sisl::MetricsGroup { class RaftReplService; class CP; +struct ReplDevCPContext { + repl_lsn_t cp_lsn; + repl_lsn_t compacted_to_lsn; + uint64_t last_applied_dsn; +}; + +class nuraft_snapshot_context : public snapshot_context { +public: + nuraft_snapshot_context(nuraft::snapshot& snp) : snapshot_context(snp.get_last_log_idx()) { + auto snp_buf = snp.serialize(); + snapshot_ = nuraft::snapshot::deserialize(*snp_buf); + } + + nuraft_snapshot_context(sisl::io_blob_safe const& snp_ctx) : snapshot_context(0) { deserialize(snp_ctx); } + + sisl::io_blob_safe serialize() override { + // Dump the context from nuraft buffer to the io blob. + auto snp_buf = snapshot_->serialize(); + sisl::io_blob_safe blob{s_cast< size_t >(snp_buf->size())}; + std::memcpy(blob.bytes(), snp_buf->data_begin(), snp_buf->size()); + return blob; + } + + void deserialize(const sisl::io_blob_safe& snp_ctx) { + // Load the context from the io blob to nuraft buffer. + auto snp_buf = nuraft::buffer::alloc(snp_ctx.size()); + snp_buf->put_raw(snp_ctx.cbytes(), snp_ctx.size()); + snp_buf->pos(0); + snapshot_ = nuraft::snapshot::deserialize(*snp_buf); + lsn_ = snapshot_->get_last_log_idx(); + } + + nuraft::ptr< nuraft::snapshot > nuraft_snapshot() { return snapshot_; } + +private: + nuraft::ptr< nuraft::snapshot > snapshot_; +}; + class RaftReplDev : public ReplDev, public nuraft_mesg::mesg_state_mgr, public std::enable_shared_from_this< RaftReplDev > { +private: + class init_req_counter { + public: + init_req_counter(std::atomic_uint64_t& counter) : my_counter(counter) { + my_counter.fetch_add(1, std::memory_order_acq_rel); + } + + ~init_req_counter() { my_counter.fetch_sub(1, std::memory_order_acq_rel); } + + private: + std::atomic_uint64_t& my_counter; + }; + private: shared< RaftStateMachine > m_state_machine; RaftReplService& m_repl_svc; folly::ConcurrentHashMap< repl_key, repl_req_ptr_t, repl_key::Hasher > m_repl_key_req_map; nuraft_mesg::Manager& m_msg_mgr; - group_id_t m_group_id; // Replication Group id - std::string m_rdev_name; // Short name for the group for easy debugging - replica_id_t m_my_repl_id; // This replica's uuid - int32_t m_raft_server_id; // Server ID used by raft (unique within raft group) + group_id_t m_group_id; // Replication Group id + std::string m_rdev_name; // Short name for the group for easy debugging + std::string m_identify_str; // combination of rdev_name:group_id + replica_id_t m_my_repl_id; // This replica's uuid + int32_t m_raft_server_id; // Server ID used by raft (unique within raft group) shared< ReplLogStore > m_data_journal; shared< HomeLogStore > m_free_blks_journal; sisl::urcu_scoped_ptr< repl_dev_stage_t > m_stage; @@ -124,8 +194,12 @@ class RaftReplDev : public ReplDev, mutable folly::SharedMutexWritePriority m_sb_lock; // Lock to protect staged sb and persisting sb raft_repl_dev_superblk m_sb_in_mem; // Cached version which is used to read and for staging - std::atomic< repl_lsn_t > m_commit_upto_lsn{0}; // LSN which was lastly written, to track flushes + std::atomic< repl_lsn_t > m_commit_upto_lsn{0}; // LSN which was lastly committed, to track flushes std::atomic< repl_lsn_t > m_compact_lsn{0}; // LSN upto which it 
was compacted, it is used to track where to + // The `traffic_ready_lsn` variable holds the Log Sequence Number (LSN) up to which + // the state machine should committed to before accepting traffic. This threshold ensures that + // all potential committed log be committed before handling incoming requests. + std::atomic< repl_lsn_t > m_traffic_ready_lsn{0}; std::mutex m_sb_mtx; // Lock to protect the repl dev superblock @@ -143,53 +217,123 @@ class RaftReplDev : public ReplDev, static std::atomic< uint64_t > s_next_group_ordinal; bool m_log_store_replay_done{false}; + // pending create requests, including both raft and data channel + std::atomic_uint64_t m_pending_init_req_num; + std::atomic< bool > m_in_quience; + public: friend class RaftStateMachine; RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk >&& rd_sb, bool load_existing); virtual ~RaftReplDev() = default; + bool bind_data_service(); bool join_group(); + AsyncReplResult<> start_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, + uint32_t commit_quorum = 0, uint64_t trace_id = 0); + AsyncReplResult<> complete_replace_member(const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum = 0, + uint64_t trace_id = 0); + AsyncReplResult<> flip_learner_flag(const replica_member_info& member, bool target, uint32_t commit_quorum, + bool wait_and_verify = true, uint64_t trace_id = 0); + ReplServiceError do_add_member(const replica_member_info& member, uint64_t trace_id = 0); + ReplServiceError do_remove_member(const replica_member_info& member, uint64_t trace_id = 0); + ReplServiceError do_flip_learner(const replica_member_info& member, bool target, bool wait_and_verify, + uint64_t trace_id = 0); + ReplServiceError set_priority(const replica_id_t& member, int32_t priority, uint64_t trace_id = 0); + nuraft::cmd_result_code retry_when_config_changing(const std::function< nuraft::cmd_result_code() >& func, + uint64_t trace_id = 0); + bool wait_and_check(const std::function< bool() >& check_func, uint32_t timeout_ms, uint32_t interval_ms = 100); + folly::SemiFuture< ReplServiceError > destroy_group(); //////////////// All ReplDev overrides/implementation /////////////////////// + virtual std::error_code alloc_blks(uint32_t size, const blk_alloc_hints& hints, + std::vector< MultiBlkId >& out_blkids) override { + RD_REL_ASSERT(false, "NOT SUPPORTED"); + return std::make_error_code(std::errc::operation_not_supported); + } + virtual folly::Future< std::error_code > async_write(const std::vector< MultiBlkId >& blkids, + sisl::sg_list const& value, bool part_of_batch = false, + trace_id_t tid = 0) override { + RD_REL_ASSERT(false, "NOT SUPPORTED"); + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_not_supported)); + } + + virtual void async_write_journal(const std::vector< MultiBlkId >& blkids, sisl::blob const& header, + sisl::blob const& key, uint32_t data_size, repl_req_ptr_t ctx, + trace_id_t tid = 0) override { + RD_REL_ASSERT(false, "NOT SUPPORTED"); + } + void async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, - repl_req_ptr_t ctx) override; + repl_req_ptr_t ctx, bool part_of_batch = false, trace_id_t tid = 0) override; folly::Future< std::error_code > async_read(MultiBlkId const& blkid, sisl::sg_list& sgs, uint32_t size, - bool part_of_batch = false) override; - void async_free_blks(int64_t lsn, MultiBlkId const& blkid) override; + bool part_of_batch = false, 
trace_id_t tid = 0) override; + folly::Future< std::error_code > async_free_blks(int64_t lsn, MultiBlkId const& blkid, trace_id_t tid = 0) override; AsyncReplResult<> become_leader() override; bool is_leader() const override; replica_id_t get_leader_id() const override; std::vector< peer_info > get_replication_status() const override; + std::set< replica_id_t > get_active_peers() const; group_id_t group_id() const override { return m_group_id; } + void set_custom_rdev_name(std::string const& name) override { + RD_LOGI(NO_TRACE_ID, "Resetting repl dev name from {} to {}", m_rdev_name, name); + m_rdev_name = name; + m_identify_str = name + ":" + group_id_str(); + m_rd_sb->set_rdev_name(m_rdev_name); + } std::string group_id_str() const { return boost::uuids::to_string(m_group_id); } - std::string rdev_name() const { return m_rdev_name; } + std::string rdev_name() const { return m_rd_sb->rdev_name; }; + std::string identify_str() const { return m_identify_str; }; std::string my_replica_id_str() const { return boost::uuids::to_string(m_my_repl_id); } uint32_t get_blk_size() const override; - repl_lsn_t get_last_commit_lsn() const { return m_commit_upto_lsn.load(); } + repl_lsn_t get_last_commit_lsn() const override { return m_commit_upto_lsn.load(); } void set_last_commit_lsn(repl_lsn_t lsn) { m_commit_upto_lsn.store(lsn); } + repl_lsn_t get_last_append_lsn() override { return raft_server()->get_last_log_idx(); } bool is_destroy_pending() const; bool is_destroyed() const; + Clock::time_point destroyed_time() const { return m_destroyed_time; } + bool is_ready_for_traffic() const override; + // purge all resources (e.g., logs in logstore) is a very dangerous operation, it is not supported yet. + void purge() override { RD_REL_ASSERT(false, "NOT SUPPORTED YET"); } + + std::shared_ptr< snapshot_context > deserialize_snapshot_context(sisl::io_blob_safe& snp_ctx) override { + return std::make_shared< nuraft_snapshot_context >(snp_ctx); + } //////////////// Accessor/shortcut methods /////////////////////// nuraft_mesg::repl_service_ctx* group_msg_service(); + nuraft::raft_server* raft_server(); RaftReplDevMetrics& metrics() { return m_metrics; } //////////////// Methods needed for other Raft classes to access ///////////////// void use_config(json_superblk raft_config_sb); void handle_commit(repl_req_ptr_t rreq, bool recovery = false); + void handle_config_commit(const repl_lsn_t lsn, raft_cluster_config_ptr_t& new_conf); + void handle_rollback(repl_req_ptr_t rreq); + void handle_config_rollback(const repl_lsn_t lsn, raft_cluster_config_ptr_t& old_conf); repl_req_ptr_t repl_key_to_req(repl_key const& rkey) const; repl_req_ptr_t applier_create_req(repl_key const& rkey, journal_type_t code, sisl::blob const& user_header, - sisl::blob const& key, uint32_t data_size, bool is_data_channel); + sisl::blob const& key, uint32_t data_size, bool is_data_channel, + int64_t lsn = -1 /*init lsn*/); folly::Future< folly::Unit > notify_after_data_written(std::vector< repl_req_ptr_t >* rreqs); void check_and_fetch_remote_data(std::vector< repl_req_ptr_t > rreqs); - void cp_flush(CP* cp); + void cp_flush(CP* cp, cshared< ReplDevCPContext > ctx); + cshared< ReplDevCPContext > get_cp_ctx(CP* cp); void cp_cleanup(CP* cp); void become_ready(); + void become_leader_cb(); + + void become_follower_cb() { + // m_traffic_ready_lsn should be zero on follower. 
+ m_traffic_ready_lsn.store(0); + RD_LOGD(NO_TRACE_ID, "become_follower_cb setting traffic_ready_lsn to 0"); + } + /// @brief This method is called when the data journal is compacted /// /// @param upto_lsn : LSN upto which the data journal was compacted @@ -207,6 +351,7 @@ class RaftReplDev : public ReplDev, */ void on_create_snapshot(nuraft::snapshot& s, nuraft::async_result< bool >::handler_type& when_done); +#if 0 /** * Truncates the replication log by providing a specified number of reserved entries. * @@ -215,6 +360,7 @@ class RaftReplDev : public ReplDev, void truncate(uint32_t num_reserved_entries) { m_data_journal->truncate(num_reserved_entries, m_compact_lsn.load()); } +#endif void wait_for_logstore_ready() { m_data_journal->wait_for_log_store_ready(); } @@ -225,6 +371,11 @@ class RaftReplDev : public ReplDev, */ void flush_durable_commit_lsn(); + /** + * Check the replace_member status, if the new member is fully synced up and ready to take over, remove the old member. + */ + void check_replace_member_status(); + /** * \brief This method is called during restart to notify the upper layer */ @@ -238,6 +389,22 @@ class RaftReplDev : public ReplDev, */ void force_leave() { leave(); } + /** + * \brief This method is called to check if the given LSN is within the last snapshot LSN received from the leader. + * All logs with LSN less than or equal to the last snapshot LSN are considered as part of the baseline resync, + * which doesn't need any more operations (e.g., replay, commit). + * + * \param lsn The LSN to be checked. + * \return true if the LSN is within the last snapshot LSN, false otherwise. + */ + bool need_skip_processing(const repl_lsn_t lsn) { return lsn <= m_rd_sb->last_snapshot_lsn; } + + void quiesce_reqs(); + void resume_accepting_reqs(); + + // clear reqs that has allocated blks on the given chunk. + void clear_chunk_req(chunk_num_t chunk_id); + protected: //////////////// All nuraft::state_mgr overrides /////////////////////// nuraft::ptr< nuraft::cluster_config > load_config() override; @@ -253,8 +420,8 @@ class RaftReplDev : public ReplDev, std::shared_ptr< nuraft::state_machine > get_state_machine() override; void permanent_destroy() override; void leave() override; - std::pair< bool, nuraft::cb_func::ReturnCode > handle_raft_event(nuraft::cb_func::Type, - nuraft::cb_func::Param*) override; + + nuraft::cb_func::ReturnCode raft_event(nuraft::cb_func::Type, nuraft::cb_func::Param*) override; private: shared< nuraft::log_store > data_journal() { return m_data_journal; } @@ -264,10 +431,32 @@ class RaftReplDev : public ReplDev, void fetch_data_from_remote(std::vector< repl_req_ptr_t > rreqs); void handle_fetch_data_response(sisl::GenericClientResponse response, std::vector< repl_req_ptr_t > rreqs); bool is_resync_mode(); + + /** + * \brief This method handles errors that occur during append entries or data receiving. + * It should not be called after the append entries phase. 
+ */ void handle_error(repl_req_ptr_t const& rreq, ReplServiceError err); - bool wait_for_data_receive(std::vector< repl_req_ptr_t > const& rreqs, uint64_t timeout_ms); + + bool wait_for_data_receive(std::vector< repl_req_ptr_t > const& rreqs, uint64_t timeout_ms, + std::vector< repl_req_ptr_t >* timeout_rreqs = nullptr); void on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx); + void set_log_store_last_durable_lsn(store_lsn_t lsn); void commit_blk(repl_req_ptr_t rreq); + void start_replace_member(repl_req_ptr_t rreq); + void complete_replace_member(repl_req_ptr_t rreq); + void reset_quorum_size(uint32_t commit_quorum, uint64_t trace_id); + void create_snp_resync_data(raft_buf_ptr_t& data_out); + bool save_snp_resync_data(nuraft::buffer& data, nuraft::snapshot& s); + + void report_blk_metrics_if_needed(repl_req_ptr_t rreq); + ReplServiceError init_req_ctx(repl_req_ptr_t rreq, repl_key rkey, journal_type_t op_code, bool is_proposer, + sisl::blob const& user_header, sisl::blob const& key, uint32_t data_size, + cshared< ReplDevListener >& listener); + + bool is_in_quience() { return m_in_quience.load(std::memory_order_acquire); } + + uint64_t get_pending_init_req_num() { return m_pending_init_req_num.load(std::memory_order_acquire); } }; } // namespace homestore diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index 0b932bbe1..c0f910741 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -9,6 +10,7 @@ #include "repl_dev/raft_repl_dev.h" #include #include "common/homestore_config.hpp" +#include "common/crash_simulator.hpp" namespace homestore { @@ -30,7 +32,7 @@ static std::pair< sisl::blob, sisl::blob > header_only_extract(nuraft::buffer& b ReplServiceError RaftStateMachine::propose_to_raft(repl_req_ptr_t rreq) { rreq->create_journal_entry(true /* raft_buf */, m_rd.server_id()); - RD_LOGT("Raft Channel: propose journal_entry=[{}] ", rreq->journal_entry()->to_string()); + RD_LOGT(rreq->traceID(), "Raft Channel: propose journal_entry=[{}] ", rreq->journal_entry()->to_string()); auto* vec = sisl::VectorPool< raft_buf_ptr_t >::alloc(); vec->push_back(rreq->raft_journal_buf()); @@ -39,21 +41,21 @@ ReplServiceError RaftStateMachine::propose_to_raft(repl_req_ptr_t rreq) { sisl::VectorPool< raft_buf_ptr_t >::free(vec); if (append_status && !append_status->get_accepted()) { - RD_LOGE("Raft Channel: Failed to propose rreq=[{}] result_code={}", rreq->to_compact_string(), + RD_LOGE(rreq->traceID(), "Raft Channel: Failed to propose rreq=[{}] result_code={}", rreq->to_compact_string(), append_status->get_result_code()); return RaftReplService::to_repl_error(append_status->get_result_code()); } return ReplServiceError::OK; } -repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entry& lentry) { +repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entry& lentry, int64_t lsn) { // Validate the journal entry and see if it needs to be transformed repl_journal_entry* jentry = r_cast< repl_journal_entry* >(lentry.get_buf().data_begin()); RELEASE_ASSERT_EQ(jentry->major_version, repl_journal_entry::JOURNAL_ENTRY_MAJOR, "Mismatched version of journal entry received from RAFT peer"); - RD_LOGT("Raft Channel: Localizing Raft log_entry: server_id={}, term={}, journal_entry=[{}] ", jentry->server_id, - lentry.get_term(), jentry->to_string()); + 
RD_LOGT(jentry->traceID, "Raft Channel: Localizing Raft log_entry: server_id={}, term={}, journal_entry=[{}] ", + jentry->server_id, lentry.get_term(), jentry->to_string()); auto entry_to_hdr = [](repl_journal_entry* jentry) { return sisl::blob{uintptr_cast(jentry) + sizeof(repl_journal_entry), jentry->user_header_size}; @@ -70,7 +72,8 @@ repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entr jentry->value_size}; }; - repl_key const rkey{.server_id = jentry->server_id, .term = lentry.get_term(), .dsn = jentry->dsn}; + repl_key const rkey{ + .server_id = jentry->server_id, .term = lentry.get_term(), .dsn = jentry->dsn, .traceID = jentry->traceID}; // Create a new rreq (or) Pull rreq from the map given the repl_key, header and key. Any new rreq will // allocate the blks (in case of large data). We will use the new blkid and transform the current journal entry's @@ -80,8 +83,9 @@ repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entr MultiBlkId entry_blkid; entry_blkid.deserialize(entry_to_val(jentry), true /* copy */); - rreq = m_rd.applier_create_req(rkey, jentry->code, entry_to_hdr(jentry), entry_to_key(jentry), - (entry_blkid.blk_count() * m_rd.get_blk_size()), false /* is_data_channel */); + rreq = + m_rd.applier_create_req(rkey, jentry->code, entry_to_hdr(jentry), entry_to_key(jentry), + (entry_blkid.blk_count() * m_rd.get_blk_size()), false /* is_data_channel */, lsn); if (rreq == nullptr) { goto out; } rreq->set_remote_blkid(RemoteBlkId{jentry->server_id, entry_blkid}); @@ -106,7 +110,8 @@ repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entr std::memcpy(blkid_location, rreq->local_blkid().serialize().cbytes(), local_size); } else { rreq = m_rd.applier_create_req(rkey, jentry->code, entry_to_hdr(jentry), entry_to_key(jentry), - jentry->value_size, false /* is_data_channel */); + jentry->value_size, false /* is_data_channel */, lsn); + if (rreq == nullptr) goto out; } // We might have localized the journal entry with new blkid. 
We need to also update the header/key pointers pointing @@ -116,9 +121,9 @@ repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entr out: if (rreq == nullptr) { - RD_LOG(ERROR, - "Failed to localize journal entry rkey={} jentry=[{}], we return error and let Raft resend this req", - rkey.to_string(), jentry->to_string()); + RD_LOGE(rkey.traceID, + "Failed to localize journal entry rkey={} jentry=[{}], we return error and let Raft resend this req", + rkey.to_string(), jentry->to_string()); } return rreq; } @@ -146,11 +151,13 @@ repl_req_ptr_t RaftStateMachine::localize_journal_entry_finish(nuraft::log_entry RELEASE_ASSERT_EQ(jentry->major_version, repl_journal_entry::JOURNAL_ENTRY_MAJOR, "Mismatched version of journal entry received from RAFT peer"); - repl_key rkey{.server_id = jentry->server_id, .term = lentry.get_term(), .dsn = jentry->dsn}; + repl_key rkey{ + .server_id = jentry->server_id, .term = lentry.get_term(), .dsn = jentry->dsn, .traceID = jentry->traceID}; auto rreq = m_rd.repl_key_to_req(rkey); if ((rreq == nullptr) || (rreq->is_localize_pending())) { - rreq = localize_journal_entry_prepare(lentry); + rreq = localize_journal_entry_prepare(lentry, + -1 /* lsn=-1, since this is a finish call and we don't have lsn yet */); if (rreq == nullptr) { RELEASE_ASSERT(rreq != nullptr, "We get an linked data for rkey=[{}], jentry=[{}] not as part of Raft Append but " @@ -176,7 +183,7 @@ raft_buf_ptr_t RaftStateMachine::pre_commit_ext(nuraft::state_machine::ext_op_pa int64_t lsn = s_cast< int64_t >(params.log_idx); repl_req_ptr_t rreq = lsn_to_req(lsn); - RD_LOGD("Raft channel: Precommit rreq=[{}]", rreq->to_compact_string()); + RD_LOGT(rreq->traceID(), "Precommit rreq=[{}]", rreq->to_compact_string()); m_rd.m_listener->on_pre_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq); return m_success_ptr; @@ -184,24 +191,91 @@ raft_buf_ptr_t RaftStateMachine::pre_commit_ext(nuraft::state_machine::ext_op_pa raft_buf_ptr_t RaftStateMachine::commit_ext(nuraft::state_machine::ext_op_params const& params) { int64_t lsn = s_cast< int64_t >(params.log_idx); - RD_LOGD("Raft channel: Received Commit message lsn {} store {} logdev {} size {}", lsn, - m_rd.m_data_journal->logstore_id(), m_rd.m_data_journal->logdev_id(), params.data->size()); repl_req_ptr_t rreq = lsn_to_req(lsn); + if (m_rd.need_skip_processing(lsn)) { + RD_LOGI(rreq->traceID(), "Raft Channel: Log {} is expected to be handled by snapshot. Skipping commit.", lsn); + return m_success_ptr; + } RD_DBG_ASSERT(rreq != nullptr, "Raft channel got null rreq for lsn={}", lsn); - RD_LOGD("Raft channel: Received Commit message rreq=[{}]", rreq->to_string()); + RD_LOGT(rreq->traceID(), "Raft channel: Received Commit message rreq=[{}]", rreq->to_string()); if (rreq->is_proposer()) { // This is the time to ensure flushing of journal happens in the proposer rreq->add_state(repl_req_state_t::LOG_FLUSHED); } - m_rd.handle_commit(rreq); - return m_success_ptr; } void RaftStateMachine::commit_config(const ulong log_idx, raft_cluster_config_ptr_t& new_conf) { - RD_LOGD("Raft channel: Commit new cluster conf , log_idx = {}", log_idx); - // TODO:add more logic here if necessary + // when reaching here, the config change log has already been committed, and the new config has been applied to the + // cluster + if (m_rd.need_skip_processing(s_cast< repl_lsn_t >(log_idx))) { + RD_LOGI(NO_TRACE_ID, "Raft Channel: Config {} is expected to be handled by snapshot. 
Skipping commit.", + log_idx); + return; + } + + RD_LOGD(NO_TRACE_ID, "Raft channel: Commit new cluster conf , log_idx = {}", log_idx); + +#ifdef _PRERELEASE + auto& servers_in_new_conf = new_conf->get_servers(); + std::vector< int32_t > server_ids_in_new_conf; + for (auto& server : servers_in_new_conf) + server_ids_in_new_conf.emplace_back(server->get_id()); + + auto my_id = m_rd.server_id(); + + std::ostringstream oss; + auto it = server_ids_in_new_conf.begin(); + if (it != server_ids_in_new_conf.end()) { + oss << *it; + ++it; + } + for (; it != server_ids_in_new_conf.end(); ++it) { + oss << "," << *it; + } + + RD_LOGI(NO_TRACE_ID, "Raft channel: server ids in new cluster conf : {}, my_id {}, group_id {}", oss.str(), my_id, + m_rd.group_id_str()); +#endif + + m_rd.handle_config_commit(s_cast< repl_lsn_t >(log_idx), new_conf); +} + +void RaftStateMachine::rollback_config(const ulong log_idx, raft_cluster_config_ptr_t& conf) { + RD_LOGD(NO_TRACE_ID, "Raft channel: Rollback cluster conf , log_idx = {}", log_idx); + m_rd.handle_config_rollback(s_cast< repl_lsn_t >(log_idx), conf); +} + +void RaftStateMachine::rollback_ext(const nuraft::state_machine::ext_op_params& params) { + int64_t lsn = s_cast< int64_t >(params.log_idx); + repl_req_ptr_t rreq = lsn_to_req(lsn); + if (rreq == nullptr) { + RD_LOGE(NO_TRACE_ID, "Raft channel: Rollback lsn {} rreq not found", lsn); + return; + } + + RD_LOGD(rreq->traceID(), "Raft channel: Rollback lsn {}, rreq=[{}]", lsn, rreq->to_string()); + m_rd.handle_rollback(rreq); +} + +int64_t RaftStateMachine::get_next_batch_size_hint_in_bytes() { return next_batch_size_hint; } + +int64_t RaftStateMachine::inc_next_batch_size_hint() { + constexpr int64_t next_batch_size_hint_limit = 16; + // set to minimal if previous hint is negative (i.e do not want any log) + if (next_batch_size_hint < 0) { + next_batch_size_hint = 1; + return next_batch_size_hint; + } + // Exponential growth till next_batch_size_hint_limit, set to 0 afterward means leader take control. + next_batch_size_hint = next_batch_size_hint * 2 > next_batch_size_hint_limit ? 0 : next_batch_size_hint * 2; + return next_batch_size_hint; +} + +int64_t RaftStateMachine::reset_next_batch_size_hint(int64_t new_hint) { + next_batch_size_hint = new_hint; + return next_batch_size_hint; } void RaftStateMachine::iterate_repl_reqs(std::function< void(int64_t, repl_req_ptr_t rreq) > const& cb) { @@ -211,18 +285,17 @@ void RaftStateMachine::iterate_repl_reqs(std::function< void(int64_t, repl_req_p } uint64_t RaftStateMachine::last_commit_index() { - RD_LOG(DEBUG, "Raft channel: last_commit_index {}", uint64_cast(m_rd.get_last_commit_lsn())); + RD_LOGD(NO_TRACE_ID, "Raft channel: last_commit_index {}", uint64_cast(m_rd.get_last_commit_lsn())); return uint64_cast(m_rd.get_last_commit_lsn()); } void RaftStateMachine::become_ready() { m_rd.become_ready(); } -void RaftStateMachine::unlink_lsn_to_req(int64_t lsn) { - auto const it = m_lsn_req_map.find(lsn); - if (it != m_lsn_req_map.cend()) { - RD_LOG(DEBUG, "Raft channel: erase lsn {}, rreq {}", lsn, it->second->to_string()); - m_lsn_req_map.erase(lsn); - } +void RaftStateMachine::unlink_lsn_to_req(int64_t lsn, repl_req_ptr_t rreq) { + // it is possible a LSN mapped to different rreq in history + // due to log overwritten. 
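// Illustrative standalone sketch, not part of the patch: the "erase only if the mapping
// still points at the request we expect" check used just below. When a raft log is
// overwritten, the same LSN can be re-linked to a newer rreq, so an unconditional erase
// could drop the wrong entry. std::unordered_map plus shared_ptr stand in for the real
// concurrent map and repl_req_ptr_t here.
#include <cassert>
#include <cstdint>
#include <memory>
#include <unordered_map>

using Req = std::shared_ptr<int>;   // placeholder for repl_req_ptr_t

bool erase_if_equal(std::unordered_map<int64_t, Req>& m, int64_t lsn, const Req& expected) {
    auto it = m.find(lsn);
    if (it == m.end() || it->second != expected) { return false; }  // re-linked or gone
    m.erase(it);
    return true;
}

int main() {
    std::unordered_map<int64_t, Req> lsn_map;
    auto old_req = std::make_shared<int>(1);
    auto new_req = std::make_shared<int>(2);

    lsn_map[10] = old_req;
    lsn_map[10] = new_req;                         // LSN 10 overwritten by a newer request

    assert(!erase_if_equal(lsn_map, 10, old_req)); // stale unlink is a no-op
    assert(lsn_map.count(10) == 1);                // newer mapping survives
    assert(erase_if_equal(lsn_map, 10, new_req));  // matching unlink removes it
    return 0;
}
// end of sketch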
Verify the rreq before removing + auto deleted = m_lsn_req_map.erase_if_equal(lsn, rreq); + if (deleted) { RD_LOGT(rreq->traceID(), "Raft channel: erase lsn {}, rreq {}", lsn, rreq->to_string()); } } void RaftStateMachine::link_lsn_to_req(repl_req_ptr_t rreq, int64_t lsn) { @@ -230,8 +303,12 @@ void RaftStateMachine::link_lsn_to_req(repl_req_ptr_t rreq, int64_t lsn) { rreq->add_state(repl_req_state_t::LOG_RECEIVED); // reset the rreq created_at time to now https://github.com/eBay/HomeStore/issues/506 rreq->set_created_time(); - [[maybe_unused]] auto r = m_lsn_req_map.insert(lsn, std::move(rreq)); - RD_DBG_ASSERT_EQ(r.second, true, "lsn={} already in precommit list", lsn); + auto r = m_lsn_req_map.insert(lsn, std::move(rreq)); + if (!r.second) { + RD_LOGE(rreq->traceID(), "lsn={} already in precommit list, exist_term={}, is_volatile={}", lsn, + r.first->second->term(), r.first->second->is_volatile()); + // TODO: we need to think about the case where volatile is in the map already, is it safe to overwrite it? + } } repl_req_ptr_t RaftStateMachine::lsn_to_req(int64_t lsn) { @@ -253,18 +330,39 @@ void RaftStateMachine::create_snapshot(nuraft::snapshot& s, nuraft::async_result int RaftStateMachine::read_logical_snp_obj(nuraft::snapshot& s, void*& user_ctx, ulong obj_id, raft_buf_ptr_t& data_out, bool& is_last_obj) { + + // Ensure all logs snapshot included are committed to prevent the following scenario: + // If a crash occurs during snapshot creation, the snapshot might be persisted while the rd's sb is not. + // This means the durable_commit_lsn is less than the snapshot's log_idx. Upon restart, the changes in + // uncommitted logs may or may not included in the snapshot data sent by leader, + // depending on the racing of commit vs snapshot read, leading to data inconsistency. + if (s_cast< repl_lsn_t >(s.get_last_log_idx()) > m_rd.get_last_commit_lsn()) { + RD_LOGW(NO_TRACE_ID, + "not ready to read because there are some uncommitted logs in snapshot, " + "let nuraft retry later. snapshot log_idx={}, last_commit_lsn={}", + s.get_last_log_idx(), m_rd.get_last_commit_lsn()); + return -1; + } + + // For Nuraft baseline resync, we separate the process into two layers: HomeStore layer and Application layer. + // We use the highest bit of the obj_id to indicate the message type: 0 is for HS, 1 is for Application. + if (is_hs_snp_obj(obj_id)) { + // This is the preserved msg for homestore to resync data + m_rd.create_snp_resync_data(data_out); + is_last_obj = false; + return 0; + } auto snp_ctx = std::make_shared< nuraft_snapshot_context >(s); - auto snp_data = std::make_shared< snapshot_data >(); + auto snp_data = std::make_shared< snapshot_obj >(); snp_data->user_ctx = user_ctx; snp_data->offset = obj_id; snp_data->is_last_obj = is_last_obj; // Listener will read the snapshot data and we pass through the same. - int ret = m_rd.m_listener->read_snapshot_data(snp_ctx, snp_data); + int ret = m_rd.m_listener->read_snapshot_obj(snp_ctx, snp_data); + user_ctx = snp_data->user_ctx; // Have to pass the user_ctx to NuRaft even if ret<0 to get it freed later if (ret < 0) return ret; - // Update user_ctx and whether is_last_obj - user_ctx = snp_data->user_ctx; is_last_obj = snp_data->is_last_obj; // We are doing a copy here. 
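// Illustrative standalone sketch, not part of the patch: the obj_id tagging scheme used by
// the snapshot read/save paths above. The top bit of the 64-bit snapshot object id selects
// the owning layer (0 = HomeStore-internal resync metadata, 1 = application objects), and
// the remaining bits stay available as the application's own offset. Only the constant and
// the bit test mirror the source; the offset helpers are hypothetical conveniences.
#include <cassert>
#include <cstdint>

static constexpr uint64_t kAppObjBit = 1ULL << 63;   // mirrors snp_obj_id_type_app

bool is_hs_obj(uint64_t obj_id) { return (obj_id & kAppObjBit) == 0; }
uint64_t make_app_obj_id(uint64_t app_offset) { return app_offset | kAppObjBit; }
uint64_t app_offset_of(uint64_t obj_id) { return obj_id & ~kAppObjBit; }

int main() {
    assert(is_hs_obj(0));                         // obj_id 0: HomeStore resync message
    uint64_t first_app = make_app_obj_id(0);      // hand-off to the application layer
    assert(!is_hs_obj(first_app));
    assert(app_offset_of(make_app_obj_id(42)) == 42);
    return 0;
}
// end of sketch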
@@ -276,28 +374,52 @@ int RaftStateMachine::read_logical_snp_obj(nuraft::snapshot& s, void*& user_ctx, void RaftStateMachine::save_logical_snp_obj(nuraft::snapshot& s, ulong& obj_id, nuraft::buffer& data, bool is_first_obj, bool is_last_obj) { + if (is_hs_snp_obj(obj_id)) { + // Homestore preserved msg + if (m_rd.save_snp_resync_data(data, s)) { + obj_id = snp_obj_id_type_app; + LOGDEBUG("save_snp_resync_data success, next obj_id={}", obj_id); + } + return; + } auto snp_ctx = std::make_shared< nuraft_snapshot_context >(s); - auto snp_data = std::make_shared< snapshot_data >(); + auto snp_data = std::make_shared< snapshot_obj >(); snp_data->offset = obj_id; snp_data->is_first_obj = is_first_obj; snp_data->is_last_obj = is_last_obj; // We are doing a copy here. - sisl::io_blob_safe blob{s_cast< size_t >(data.size())}; + sisl::io_blob_safe blob{static_cast< uint32_t >(data.size())}; std::memcpy(blob.bytes(), data.data_begin(), data.size()); snp_data->blob = std::move(blob); - m_rd.m_listener->write_snapshot_data(snp_ctx, snp_data); + m_rd.m_listener->write_snapshot_obj(snp_ctx, snp_data); + if (is_last_obj) { + hs()->cp_mgr().trigger_cp_flush(true).wait(); // ensure DSN is flushed to disk + } // Update the object offset. obj_id = snp_data->offset; + +#ifdef _PRERELEASE + if (iomgr_flip::instance()->test_flip("baseline_resync_restart_new_follower")) { + LOGINFO("Hit flip baseline_resync_restart_new_follower crashing"); + hs()->crash_simulator().crash_now(); + } +#endif } bool RaftStateMachine::apply_snapshot(nuraft::snapshot& s) { + // NOTE: Currently, NuRaft considers the snapshot applied once compaction and truncation are completed, even if a + // crash occurs before apply_snapshot() is called. Therefore, the LSN must be updated here to ensure it is + // persisted AFTER log truncation. m_rd.set_last_commit_lsn(s.get_last_log_idx()); m_rd.m_data_journal->set_last_durable_lsn(s.get_last_log_idx()); + auto snp_ctx = std::make_shared< nuraft_snapshot_context >(s); - return m_rd.m_listener->apply_snapshot(snp_ctx); + auto res = m_rd.m_listener->apply_snapshot(snp_ctx); + hs()->cp_mgr().trigger_cp_flush(true /* force */).get(); + return res; } nuraft::ptr< nuraft::snapshot > RaftStateMachine::last_snapshot() { @@ -308,6 +430,6 @@ nuraft::ptr< nuraft::snapshot > RaftStateMachine::last_snapshot() { void RaftStateMachine::free_user_snp_ctx(void*& user_snp_ctx) { m_rd.m_listener->free_user_snp_ctx(user_snp_ctx); } -std::string RaftStateMachine::rdev_name() const { return m_rd.rdev_name(); } +std::string RaftStateMachine::identify_str() const { return m_rd.identify_str(); } } // namespace homestore diff --git a/src/lib/replication/repl_dev/raft_state_machine.h b/src/lib/replication/repl_dev/raft_state_machine.h index b931e42f4..0de9b2744 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.h +++ b/src/lib/replication/repl_dev/raft_state_machine.h @@ -24,43 +24,35 @@ namespace homestore { class ReplicaSetImpl; class StateMachineStore; -#define RD_LOG(level, msg, ...) \ - LOG##level##MOD_FMT(replication, ([&](fmt::memory_buffer& buf, const char* msgcb, auto&&... 
args) -> bool { \ - fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}:{}] "}, \ - fmt::make_format_args(file_name(__FILE__), __LINE__)); \ - fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}={}] "}, \ - fmt::make_format_args("rd", rdev_name())); \ - fmt::vformat_to(fmt::appender{buf}, fmt::string_view{msgcb}, \ - fmt::make_format_args(std::forward< decltype(args) >(args)...)); \ - return true; \ - }), \ - msg, ##__VA_ARGS__); +#define NO_TRACE_ID "n/a" +#define RD_LOG(level, traceID, msg, ...) \ + LOG##level##MOD(replication, "[traceID={}] [{}] " msg, traceID, identify_str(), ##__VA_ARGS__) #define RD_ASSERT_CMP(assert_type, val1, cmp, val2, ...) \ { \ assert_type##_ASSERT_CMP( \ val1, cmp, val2, \ [&](fmt::memory_buffer& buf, const char* const msgcb, auto&&... args) -> bool { \ - fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}:{}] "}, \ - fmt::make_format_args(file_name(__FILE__), __LINE__)); \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}:{}:{}] "}, \ + fmt::make_format_args(file_name(__FILE__), __LINE__, __FUNCTION__)); \ sisl::logging::default_cmp_assert_formatter(buf, msgcb, std::forward< decltype(args) >(args)...); \ - fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}={}] "}, \ - fmt::make_format_args("rd", rdev_name())); \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}] "}, fmt::make_format_args(identify_str())); \ return true; \ }, \ ##__VA_ARGS__); \ } #define RD_ASSERT(assert_type, cond, ...) \ { \ - assert_type##_ASSERT_FMT(cond, \ - ([&](fmt::memory_buffer& buf, const char* const msgcb, auto&&... args) -> bool { \ - fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}={}] "}, \ - fmt::make_format_args("rd", rdev_name())); \ - fmt::vformat_to(fmt::appender{buf}, fmt::string_view{msgcb}, \ - fmt::make_format_args(std::forward< decltype(args) >(args)...)); \ - return true; \ - }), \ - ##__VA_ARGS__); \ + assert_type##_ASSERT_FMT( \ + cond, ([&](fmt::memory_buffer& buf, const char* const msgcb, auto&&... args) -> bool { \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}:{}:{}] "}, \ + fmt::make_format_args(file_name(__FILE__), __LINE__, __FUNCTION__)); \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}] "}, fmt::make_format_args(identify_str())); \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{msgcb}, \ + fmt::make_format_args(std::forward< decltype(args) >(args)...)); \ + return true; \ + }), \ + ##__VA_ARGS__); \ } #define RD_DBG_ASSERT(cond, ...) RD_ASSERT(DEBUG, cond, ##__VA_ARGS__) @@ -79,12 +71,16 @@ class StateMachineStore; #define RD_REL_ASSERT_GT(val1, val2, ...) RD_ASSERT_CMP(RELEASE, val1, >, val2, ##__VA_ARGS__) #define RD_REL_ASSERT_GE(val1, val2, ...) RD_ASSERT_CMP(RELEASE, val1, >=, val2, ##__VA_ARGS__) -#define RD_LOGT(...) RD_LOG(TRACE, ##__VA_ARGS__) -#define RD_LOGD(...) RD_LOG(DEBUG, ##__VA_ARGS__) -#define RD_LOGI(...) RD_LOG(INFO, ##__VA_ARGS__) -#define RD_LOGW(...) RD_LOG(WARN, ##__VA_ARGS__) -#define RD_LOGE(...) RD_LOG(ERROR, ##__VA_ARGS__) -#define RD_LOGC(...) RD_LOG(CRITICAL, ##__VA_ARGS__) +#define RD_LOGT(traceID, ...) RD_LOG(TRACE, traceID, ##__VA_ARGS__) +#define RD_LOGD(traceID, ...) RD_LOG(DEBUG, traceID, ##__VA_ARGS__) +#define RD_LOGI(traceID, ...) RD_LOG(INFO, traceID, ##__VA_ARGS__) +#define RD_LOGW(traceID, ...) RD_LOG(WARN, traceID, ##__VA_ARGS__) +#define RD_LOGE(traceID, ...) RD_LOG(ERROR, traceID, ##__VA_ARGS__) +#define RD_LOGC(traceID, ...) 
RD_LOG(CRITICAL, traceID, ##__VA_ARGS__) + +// For the logic snapshot obj_id, we use the highest bit to indicate the type of the snapshot message. +// 0 is for HS, 1 is for Application. +static constexpr uint64_t snp_obj_id_type_app = 1ULL << 63; using AsyncNotify = folly::SemiFuture< folly::Unit >; using AsyncNotifier = folly::Promise< folly::Unit >; @@ -97,6 +93,7 @@ class RaftStateMachine : public nuraft::state_machine { nuraft::ptr< nuraft::buffer > m_success_ptr; // Preallocate the success return to raft // iomgr::timer_handle_t m_wait_blkid_write_timer_hdl{iomgr::null_timer_handle}; bool m_resync_mode{false}; + int64_t next_batch_size_hint{0}; public: RaftStateMachine(RaftReplDev& rd); @@ -109,8 +106,10 @@ class RaftStateMachine : public nuraft::state_machine { raft_buf_ptr_t pre_commit_ext(const nuraft::state_machine::ext_op_params& params) override; raft_buf_ptr_t commit_ext(const nuraft::state_machine::ext_op_params& params) override; void commit_config(const ulong log_idx, raft_cluster_config_ptr_t& new_conf) override; - void rollback(uint64_t lsn, nuraft::buffer&) override { LOGCRITICAL("Unimplemented rollback on: [{}]", lsn); } + void rollback_config(const ulong log_idx, raft_cluster_config_ptr_t& conf) override; + void rollback_ext(const nuraft::state_machine::ext_op_params& params) override; void become_ready(); + int64_t get_next_batch_size_hint_in_bytes() override; void create_snapshot(nuraft::snapshot& s, nuraft::async_result< bool >::handler_type& when_done) override; int read_logical_snp_obj(nuraft::snapshot& s, void*& user_ctx, ulong obj_id, raft_buf_ptr_t& data_out, @@ -123,16 +122,20 @@ class RaftStateMachine : public nuraft::state_machine { ////////// APIs outside of nuraft::state_machine requirements //////////////////// ReplServiceError propose_to_raft(repl_req_ptr_t rreq); - repl_req_ptr_t localize_journal_entry_prepare(nuraft::log_entry& lentry); + repl_req_ptr_t localize_journal_entry_prepare(nuraft::log_entry& lentry, int64_t lsn = -1); repl_req_ptr_t localize_journal_entry_finish(nuraft::log_entry& lentry); void link_lsn_to_req(repl_req_ptr_t rreq, int64_t lsn); - void unlink_lsn_to_req(int64_t lsn); + void unlink_lsn_to_req(int64_t lsn, repl_req_ptr_t rreq); repl_req_ptr_t lsn_to_req(int64_t lsn); nuraft_mesg::repl_service_ctx* group_msg_service(); void iterate_repl_reqs(std::function< void(int64_t, repl_req_ptr_t rreq) > const& cb); - std::string rdev_name() const; + std::string identify_str() const; + int64_t reset_next_batch_size_hint(int64_t new_hint); + int64_t inc_next_batch_size_hint(); + + static bool is_hs_snp_obj(uint64_t obj_id) { return (obj_id & snp_obj_id_type_app) == 0; } private: void after_precommit_in_leader(const nuraft::raft_server::req_ext_cb_params& params); diff --git a/src/lib/replication/repl_dev/solo_repl_dev.cpp b/src/lib/replication/repl_dev/solo_repl_dev.cpp index e5e2cb1a5..03b540184 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.cpp +++ b/src/lib/replication/repl_dev/solo_repl_dev.cpp @@ -1,6 +1,7 @@ #include #include "replication/repl_dev/solo_repl_dev.h" #include "replication/repl_dev/common.h" +#include #include #include #include @@ -10,44 +11,56 @@ namespace homestore { SoloReplDev::SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existing) : m_rd_sb{std::move(rd_sb)}, m_group_id{m_rd_sb->group_id} { if (load_existing) { - logstore_service().open_logdev(m_rd_sb->logdev_id); + m_logdev_id = m_rd_sb->logdev_id; + logstore_service().open_logdev(m_rd_sb->logdev_id, flush_mode_t::TIMER); logstore_service() 
.open_log_store(m_rd_sb->logdev_id, m_rd_sb->logstore_id, true /* append_mode */) .thenValue([this](auto log_store) { m_data_journal = std::move(log_store); m_rd_sb->logstore_id = m_data_journal->get_store_id(); m_data_journal->register_log_found_cb(bind_this(SoloReplDev::on_log_found, 3)); + m_is_recovered = true; }); } else { - m_logdev_id = logstore_service().create_new_logdev(); + m_logdev_id = logstore_service().create_new_logdev(flush_mode_t::TIMER); m_data_journal = logstore_service().create_new_log_store(m_logdev_id, true /* append_mode */); m_rd_sb->logstore_id = m_data_journal->get_store_id(); m_rd_sb->logdev_id = m_logdev_id; m_rd_sb.write(); + m_is_recovered = true; } } void SoloReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, - repl_req_ptr_t rreq) { + repl_req_ptr_t rreq, bool part_of_batch, trace_id_t tid) { if (!rreq) { auto rreq = repl_req_ptr_t(new repl_req_ctx{}); } - rreq->init(repl_key{.server_id = 0, .term = 1, .dsn = 1}, - value.size ? journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true, header, key, - value.size); + // incr_pending_request_num(); + auto status = rreq->init(repl_key{.server_id = 0, .term = 1, .dsn = 1, .traceID = tid}, + value.size ? journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true, + header, key, value.size, m_listener); + HS_REL_ASSERT_EQ(status, ReplServiceError::OK, "Error in allocating local blks"); // If it is header only entry, directly write to the journal - if (rreq->has_linked_data()) { - // Step 1: Alloc Blkid - auto const status = rreq->alloc_local_blks(m_listener, value.size); - HS_REL_ASSERT_EQ(status, ReplServiceError::OK, "Error in allocating local blks"); - + if (rreq->has_linked_data() && !rreq->has_state(repl_req_state_t::DATA_WRITTEN)) { // Write the data - data_service().async_write(value, rreq->local_blkid()).thenValue([this, rreq = std::move(rreq)](auto&& err) { + data_service().async_write(value, rreq->local_blkids()).thenValue([this, rreq = std::move(rreq)](auto&& err) { HS_REL_ASSERT(!err, "Error in writing data"); // TODO: Find a way to return error to the Listener write_journal(std::move(rreq)); }); - } else { - write_journal(std::move(rreq)); + } else { write_journal(std::move(rreq)); } +} + +// destroy is only called in worker thread; +void SoloReplDev::destroy() { + HS_REL_ASSERT(iomanager.am_i_worker_reactor(), "Destroy should be called in worker thread"); + while (!m_is_recovered) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); } + + hs()->logstore_service().remove_log_store(m_logdev_id, m_data_journal->get_store_id()); + hs()->logstore_service().destroy_log_dev(m_logdev_id); + + m_rd_sb.destroy(); } void SoloReplDev::write_journal(repl_req_ptr_t rreq) { @@ -62,17 +75,97 @@ void SoloReplDev::write_journal(repl_req_ptr_t rreq) { auto cur_lsn = m_commit_upto.load(); if (cur_lsn < lsn) { m_commit_upto.compare_exchange_strong(cur_lsn, lsn); } - data_service().commit_blk(rreq->local_blkid()); - m_listener->on_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq->local_blkid(), rreq); + for (const auto& blkid : rreq->local_blkids()) { + data_service().commit_blk(blkid); + } + m_listener->on_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq->local_blkids(), rreq); + // decr_pending_request_num(); }); } +std::error_code SoloReplDev::alloc_blks(uint32_t data_size, const blk_alloc_hints& hints, + std::vector< MultiBlkId >& out_blkids) { + // if (is_stopping()) { return 
std::make_error_code(std::errc::operation_canceled); } + + // incr_pending_request_num(); + std::vector< BlkId > blkids; + auto status = + data_service().alloc_blks(sisl::round_up(uint32_cast(data_size), data_service().get_blk_size()), hints, blkids); + if (status != BlkAllocStatus::SUCCESS) { + DEBUG_ASSERT_EQ(status, BlkAllocStatus::SUCCESS, "Unable to allocate blks"); + // decr_pending_request_num(); + return std::make_error_code(std::errc::no_space_on_device); + } + for (auto& blkid : blkids) { + out_blkids.emplace_back(blkid); + } + // decr_pending_request_num(); + return std::error_code{}; +} + +folly::Future< std::error_code > SoloReplDev::async_write(const std::vector< MultiBlkId >& blkids, + sisl::sg_list const& value, bool part_of_batch, + trace_id_t tid) { + /*if (is_stopping()) { + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); + }*/ + + // incr_pending_request_num(); + HS_REL_ASSERT_GT(blkids.size(), 0, "Empty blkid vec"); + std::vector< folly::Future< std::error_code > > futs; + futs.reserve(blkids.size()); + sisl::sg_iterator sg_it{value.iovs}; + + for (const auto& blkid : blkids) { + auto sgs_size = blkid.blk_count() * data_service().get_blk_size(); + const auto iovs = sg_it.next_iovs(sgs_size); + uint32_t total_size = 0; + for (auto& iov : iovs) { + total_size += iov.iov_len; + } + if (total_size != sgs_size) { + LOGINFO("Block size mismatch total_size={} sgs_size={}", total_size, sgs_size); + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::invalid_argument)); + } + sisl::sg_list sgs{sgs_size, iovs}; + futs.emplace_back(data_service().async_write(sgs, blkid, part_of_batch)); + } + + return folly::collectAllUnsafe(futs).thenValue([this](auto&& v_res) { + for (const auto& err_c : v_res) { + if (sisl_unlikely(err_c.value())) { + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::io_error)); + } + } + + // decr_pending_request_num(); + return folly::makeFuture< std::error_code >(std::error_code{}); + }); +} + +void SoloReplDev::async_write_journal(const std::vector< MultiBlkId >& blkids, sisl::blob const& header, + sisl::blob const& key, uint32_t data_size, repl_req_ptr_t rreq, trace_id_t tid) { + // if (is_stopping()) { return; } + // incr_pending_request_num(); + + // We expect clients to provide valid repl req ctx with blocks allocated. + HS_REL_ASSERT(rreq, "Invalid repl req ctx"); + rreq->add_state(repl_req_state_t::BLK_ALLOCATED); + rreq->set_local_blkids(blkids); + auto status = rreq->init(repl_key{.server_id = 0, .term = 1, .dsn = 1, .traceID = tid}, + data_size ? journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true, header, + key, data_size, m_listener); + HS_REL_ASSERT_EQ(status, ReplServiceError::OK, "Error in initializing repl req context."); + + // Write to journal. 
+ write_journal(std::move(rreq)); +} + void SoloReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx) { repl_journal_entry const* entry = r_cast< repl_journal_entry const* >(buf.bytes()); uint32_t remain_size = buf.size() - sizeof(repl_journal_entry); HS_REL_ASSERT_EQ(entry->major_version, repl_journal_entry::JOURNAL_ENTRY_MAJOR, "Mismatched version of journal entry found"); - HS_REL_ASSERT_EQ(entry->code, journal_type_t::HS_DATA_LINKED, "Found a journal entry which is not data"); uint8_t const* raw_ptr = r_cast< uint8_t const* >(entry) + sizeof(repl_journal_entry); sisl::blob header{raw_ptr, entry->user_header_size}; @@ -85,24 +178,44 @@ void SoloReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx raw_ptr += entry->key_size; remain_size -= entry->key_size; - sisl::blob value_blob{raw_ptr, remain_size}; - MultiBlkId blkid; - if (remain_size) { blkid.deserialize(value_blob, true /* copy */); } + std::vector< MultiBlkId > blkids; + while (remain_size > 0) { + MultiBlkId blkid; + sisl::blob value_blob{raw_ptr, sizeof(BlkId)}; + blkid.deserialize(value_blob, true /* copy */); + raw_ptr += sizeof(BlkId); + remain_size -= sizeof(BlkId); + blkids.push_back(blkid); + } m_listener->on_pre_commit(lsn, header, key, nullptr); auto cur_lsn = m_commit_upto.load(); if (cur_lsn < lsn) { m_commit_upto.compare_exchange_strong(cur_lsn, lsn); } - m_listener->on_commit(lsn, header, key, blkid, nullptr); + m_listener->on_commit(lsn, header, key, blkids, nullptr); } folly::Future< std::error_code > SoloReplDev::async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, - bool part_of_batch) { - return data_service().async_read(bid, sgs, size, part_of_batch); + bool part_of_batch, trace_id_t tid) { + /*if (is_stopping()) { + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); + }*/ + // incr_pending_request_num(); + auto result = data_service().async_read(bid, sgs, size, part_of_batch); + // decr_pending_request_num(); + return result; } -void SoloReplDev::async_free_blks(int64_t, MultiBlkId const& bid) { data_service().async_free_blk(bid); } +folly::Future< std::error_code > SoloReplDev::async_free_blks(int64_t, MultiBlkId const& bid, trace_id_t tid) { + /*if (is_stopping()) { + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); + }*/ + // incr_pending_request_num(); + auto result = data_service().async_free_blk(bid); + // decr_pending_request_num(); + return result; +} uint32_t SoloReplDev::get_blk_size() const { return data_service().get_blk_size(); } diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index cddb94856..9cf41dcce 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -30,37 +31,70 @@ class CP; class SoloReplDev : public ReplDev { private: logdev_id_t m_logdev_id; - std::shared_ptr< HomeLogStore > m_data_journal; + std::shared_ptr< HomeLogStore > m_data_journal{nullptr}; superblk< repl_dev_superblk > m_rd_sb; uuid_t m_group_id; std::atomic< logstore_seq_num_t > m_commit_upto{-1}; + std::atomic< bool > m_is_recovered{false}; public: SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existing); virtual ~SoloReplDev() = default; + virtual std::error_code alloc_blks(uint32_t data_size, const blk_alloc_hints& hints, + std::vector< MultiBlkId >& out_blkids) override; + virtual 
folly::Future< std::error_code > async_write(const std::vector< MultiBlkId >& blkids, + sisl::sg_list const& value, bool part_of_batch = false, + trace_id_t tid = 0) override; + virtual void async_write_journal(const std::vector< MultiBlkId >& blkids, sisl::blob const& header, + sisl::blob const& key, uint32_t data_size, repl_req_ptr_t ctx, + trace_id_t tid = 0) override; + void async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, - repl_req_ptr_t ctx) override; + repl_req_ptr_t ctx, bool part_of_batch = false, trace_id_t tid = 0) override; folly::Future< std::error_code > async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, - bool part_of_batch = false) override; + bool part_of_batch = false, trace_id_t tid = 0) override; - void async_free_blks(int64_t lsn, MultiBlkId const& blkid) override; + folly::Future< std::error_code > async_free_blks(int64_t lsn, MultiBlkId const& blkid, trace_id_t tid = 0) override; AsyncReplResult<> become_leader() override { return make_async_error(ReplServiceError::OK); } bool is_leader() const override { return true; } replica_id_t get_leader_id() const override { return m_group_id; } std::vector< peer_info > get_replication_status() const override { - return std::vector< peer_info >{peer_info{.id_ = m_group_id, .replication_idx_ = 0, .last_succ_resp_us_ = 0}}; + return std::vector< peer_info >{ + peer_info{.id_ = m_group_id, .replication_idx_ = 0, .last_succ_resp_us_ = 0, .priority_ = 1}}; + } + bool is_ready_for_traffic() const override { return true; } + void purge() override {} + + std::shared_ptr< snapshot_context > deserialize_snapshot_context(sisl::io_blob_safe& snp_ctx) override { + return nullptr; } uuid_t group_id() const override { return m_group_id; } + void set_custom_rdev_name(std::string const& name) override { + std::strncpy(m_rd_sb->rdev_name, name.c_str(), m_rd_sb->max_name_len - 1); + m_rd_sb->rdev_name[m_rd_sb->max_name_len - 1] = '\0'; + } + + repl_lsn_t get_last_commit_lsn() const override { return 0; } + repl_lsn_t get_last_append_lsn() override { return 0; }; + uint32_t get_blk_size() const override; + void quiesce_reqs() override { return; } + void resume_accepting_reqs() override { return; } + + // clear reqs that has allocated blks on the given chunk. 
+ void clear_chunk_req(chunk_num_t chunk_id) override { return; } + void cp_flush(CP* cp); void cp_cleanup(CP* cp); + void destroy(); + private: void write_journal(repl_req_ptr_t rreq); void on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx); diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index 89800df3f..6f3861d59 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include "common/homestore_assert.hpp" #include "replication/service/generic_repl_svc.h" #include "replication/service/raft_repl_service.h" @@ -87,6 +88,9 @@ void SoloReplService::start() { } m_sb_bufs.clear(); + LOGINFO("Repl devs load completed, calling upper layer on_repl_devs_init_completed"); + m_repl_app->on_repl_devs_init_completed(); + hs()->data_service().start(); hs()->logstore_service().start(hs()->is_first_time_boot()); @@ -95,8 +99,23 @@ void SoloReplService::start() { } void SoloReplService::stop() { - GenericReplService::stop(); + /*start_stopping(); + while (true) { + auto pending_request_num = get_pending_request_num(); + if (!pending_request_num) break; + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + }*/ + + // stop all repl_devs + { + std::unique_lock lg(m_rd_map_mtx); + for (auto it = m_rd_map.begin(); it != m_rd_map.end(); ++it) { + auto rdev = std::dynamic_pointer_cast< SoloReplDev >(it->second); + rdev->stop(); + } + } hs()->logstore_service().stop(); + hs()->data_service().stop(); } AsyncReplResult< shared< ReplDev > > SoloReplService::create_repl_dev(group_id_t group_id, @@ -109,6 +128,7 @@ AsyncReplResult< shared< ReplDev > > SoloReplService::create_repl_dev(group_id_t auto listener = m_repl_app->create_repl_dev_listener(group_id); listener->set_repl_dev(rdev); rdev->attach_listener(std::move(listener)); + // incr_pending_request_num(); { std::unique_lock lg(m_rd_map_mtx); @@ -116,15 +136,42 @@ AsyncReplResult< shared< ReplDev > > SoloReplService::create_repl_dev(group_id_t if (!happened) { // We should never reach here, as we have failed to emplace in map, but couldn't find entry DEBUG_ASSERT(false, "Unable to put the repl_dev in rd map"); + // decr_pending_request_num(); return make_async_error< shared< ReplDev > >(ReplServiceError::SERVER_ALREADY_EXISTS); } } + // decr_pending_request_num(); return make_async_success< shared< ReplDev > >(rdev); } folly::SemiFuture< ReplServiceError > SoloReplService::remove_repl_dev(group_id_t group_id) { - return folly::makeSemiFuture< ReplServiceError >(ReplServiceError::NOT_IMPLEMENTED); + // RD_LOGI("Removing repl dev for group_id={}", boost::uuids::to_string(group_id)); + auto rdev = get_repl_dev(group_id); + if (rdev.hasError()) { return folly::makeSemiFuture(rdev.error()); } + + auto rdev_ptr = rdev.value(); + + // 1. Firstly stop the repl dev which waits for any outstanding requests to finish + rdev_ptr->stop(); + + // 2. Destroy the repl dev which will remove the logstore and free the memory; + dp_cast< SoloReplDev >(rdev_ptr)->destroy(); + + // 3. detaches both ways: + // detach rdev from its listener and listener from rdev; + rdev_ptr->detach_listener(); + { + // 4. remove from rd map which finally call SoloReplDev's destructor because this is the last one holding ref to + // this instance; + std::unique_lock lg(m_rd_map_mtx); + m_rd_map.erase(group_id); + } + + // 5. 
now destroy the upper layer's listener instance; + m_repl_app->destroy_repl_dev_listener(group_id); + + return folly::makeSemiFuture(ReplServiceError::OK); } void SoloReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) { @@ -147,23 +194,31 @@ void SoloReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki } } -AsyncReplResult<> SoloReplService::replace_member(group_id_t group_id, replica_id_t member_out, - replica_id_t member_in) const { +AsyncReplResult<> SoloReplService::replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum, + uint64_t trace_id) const { return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); } -std::unique_ptr< CPContext > SoloReplServiceCPHandler::on_switchover_cp(CP* cur_cp, CP* new_cp) { return nullptr; } +AsyncReplResult<> SoloReplService::flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, + uint32_t commit_quorum, bool wait_and_verify, uint64_t trace_id) const { + return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); +} + +std::unique_ptr< CPContext > SoloReplServiceCPHandler::on_switchover_cp(CP* cur_cp, CP* new_cp) { + return std::make_unique< CPContext >(new_cp); +} folly::Future< bool > SoloReplServiceCPHandler::cp_flush(CP* cp) { repl_service().iterate_repl_devs([cp](cshared< ReplDev >& repl_dev) { - if (repl_dev) { std::dynamic_pointer_cast< SoloReplDev >(repl_dev)->cp_flush(cp); } + if (repl_dev) { dp_cast< SoloReplDev >(repl_dev)->cp_flush(cp); } }); return folly::makeFuture< bool >(true); } void SoloReplServiceCPHandler::cp_cleanup(CP* cp) { repl_service().iterate_repl_devs([cp](cshared< ReplDev >& repl_dev) { - if (repl_dev) { std::dynamic_pointer_cast< SoloReplDev >(repl_dev)->cp_cleanup(cp); } + if (repl_dev) { dp_cast< SoloReplDev >(repl_dev)->cp_cleanup(cp); } }); } diff --git a/src/lib/replication/service/generic_repl_svc.h b/src/lib/replication/service/generic_repl_svc.h index e2d445427..cd63a8866 100644 --- a/src/lib/replication/service/generic_repl_svc.h +++ b/src/lib/replication/service/generic_repl_svc.h @@ -73,8 +73,12 @@ class SoloReplService : public GenericReplService { std::set< replica_id_t > const& members) override; folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) override; void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; - AsyncReplResult<> replace_member(group_id_t group_id, replica_id_t member_out, - replica_id_t member_in) const override; + AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum = 0, + uint64_t trace_id = 0) const override; + AsyncReplResult<> flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, + uint32_t commit_quorum, bool wait_and_verify = true, + uint64_t trace_id = 0) const override; }; class SoloReplServiceCPHandler : public CPCallbacks { diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index 65d928390..8df5d5e6a 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -59,6 +59,17 @@ ReplServiceError RaftReplService::to_repl_error(nuraft::cmd_result_code code) { return ret; } +// NuRaft priority decay coefficient is set to 0.8(currently not configurable). 
For more details, please refer to +// https://github.com/eBay/NuRaft/blob/master/docs/leader_election_priority.md +int32_t RaftReplService::compute_raft_follower_priority() { + auto max_wait_round = std::min(raft_priority_election_round_upper_limit, + HS_DYNAMIC_CONFIG(consensus.max_wait_rounds_of_priority_election)); + if (max_wait_round == 0) { return raft_leader_priority; } + auto priority = 1 + static_cast< int32_t >( + std::ceil(raft_leader_priority * std::pow(raft_priority_decay_coefficient, max_wait_round))); + return priority; +} + RaftReplService::RaftReplService(cshared< ReplApplication >& repl_app) : GenericReplService{repl_app} { m_config_sb_bufs.reserve(100); meta_service().register_handler( @@ -79,12 +90,20 @@ void RaftReplService::start() { .ssl_key_ = ioenvironment.get_ssl_key(), .ssl_cert_ = ioenvironment.get_ssl_cert(), .token_verifier_ = std::dynamic_pointer_cast< sisl::GrpcTokenVerifier >(ioenvironment.get_token_verifier()), - .token_client_ = std::dynamic_pointer_cast< sisl::GrpcTokenClient >(ioenvironment.get_token_client())}; + .token_client_ = std::dynamic_pointer_cast< sisl::GrpcTokenClient >(ioenvironment.get_token_client()), + .max_receive_message_size_ = HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size), + .max_send_message_size_ = HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size)}; m_msg_mgr = nuraft_mesg::init_messaging(params, weak_from_this(), true /* with_data_channel */); LOGINFO("Starting RaftReplService with server_uuid={} port={}", boost::uuids::to_string(params.server_uuid_), params.mesg_port_); + // check if ssl cert files are provided, if yes, monitor the changes + if (!params.ssl_key_.empty() && !params.ssl_cert_.empty()) { + ioenvironment.with_file_watcher(); + monitor_cert_changes(); + } + // Step 2: Register all RAFT parameters. At the end of this step, raft is ready to be created/join group auto r_params = nuraft::raft_params() .with_election_timeout_lower(HS_DYNAMIC_CONFIG(consensus.elect_to_low_ms)) @@ -99,7 +118,13 @@ void RaftReplService::start() { .with_snapshot_enabled(HS_DYNAMIC_CONFIG(consensus.snapshot_freq_distance)) .with_leadership_expiry(HS_DYNAMIC_CONFIG(consensus.leadership_expiry_ms)) .with_reserved_log_items(HS_DYNAMIC_CONFIG(consensus.num_reserved_log_items)) + .with_snapshot_sync_ctx_timeout(HS_DYNAMIC_CONFIG(consensus.snapshot_sync_ctx_timeout_ms)) .with_auto_forwarding(false); + // new_joiner_type fully disabled log pack behavior. + // There is no callback available for handling and localizing the log entries within the pack, which could + // result in data corruption. + r_params.use_new_joiner_type_ = false; + r_params.use_bg_thread_for_snapshot_io_ = HS_DYNAMIC_CONFIG(consensus.use_bg_thread_for_snapshot_io); r_params.return_method_ = nuraft::raft_params::async_handler; m_msg_mgr->register_mgr_type(params.default_group_type_, r_params); @@ -118,23 +143,47 @@ void RaftReplService::start() { // We need to first load the repl_dev with its config and then attach the raft config to that repl dev. for (auto const& [buf, mblk] : m_config_sb_bufs) { auto rdev = raft_group_config_found(buf, voidptr_cast(mblk)); - rdev->on_restart(); + // if repl_dev is in destroy_pending state, it will not be loaded. + if (rdev) rdev->on_restart(); } m_config_sb_bufs.clear(); + LOGINFO("Repl devs load completed, calling upper layer on_repl_devs_init_completed"); + m_repl_app->on_repl_devs_init_completed(); // Step 5: Start the data and logstore service now. 
This step is essential before we can ask Raft to join groups etc - hs()->data_service().start(); + + // It is crucial to start the logstore before enabling the data channel. This is because during log replay, + // the commit_blks() function is called, which interacts with the allocator. + // Starting the data channel before the log replay is complete can lead to a race condition between + // PUSHDATA operations and log replay. + // For example, consider that LSN 100 in the log store is associated with PBA1. After a restart, the allocator + // is only aware of allocations up to the last checkpoint and may consider PBA1 as available. + // If a PUSHDATA request is received during this time, PBA1 could be allocated again to a new request, + // leading to data corruption by overwriting the data associated with LSN 100. + // Now the data channel is started in join_group(). + + LOGINFO("Starting LogStore service, first_boot = {}", hs()->is_first_time_boot()); hs()->logstore_service().start(hs()->is_first_time_boot()); + LOGINFO("Started LogStore service, log replay should already be done by this point"); + // all log stores are replayed, time to start data service. + LOGINFO("Starting DataService"); + hs()->data_service().start(); - // Step 6: Iterate all the repl dev and ask each one of the join the raft group. - for (auto it = m_rd_map.begin(); it != m_rd_map.end();) { - auto rdev = std::dynamic_pointer_cast< RaftReplDev >(it->second); - rdev->wait_for_logstore_ready(); - if (!rdev->join_group()) { - it = m_rd_map.erase(it); - } else { - ++it; - } + // Step 6: Iterate all the repl devs and ask each one of them to join the raft group concurrently. + std::vector< std::future< bool > > join_group_futures; + for (const auto& [_, repl_dev] : m_rd_map) { + join_group_futures.emplace_back(std::async(std::launch::async, [&repl_dev]() { + auto rdev = std::dynamic_pointer_cast< RaftReplDev >(repl_dev); + rdev->wait_for_logstore_ready(); + + // upper layer can register a callback to be notified when log replay is done. + if (auto listener = rdev->get_listener(); listener) listener->on_log_replay_done(rdev->group_id()); + return rdev->join_group(); + })); + } + + for (auto& future : join_group_futures) { + if (!future.get()) HS_REL_ASSERT(false, "FAILED TO JOIN GROUP, PANIC HERE"); } // Step 7: Register to CPManager to ensure we can flush the superblk. @@ -148,12 +197,75 @@ } void RaftReplService::stop() { - stop_reaper_thread(); - GenericReplService::stop(); +#if 0 + start_stopping(); + while (true) { + auto pending_request_num = get_pending_request_num(); + if (!pending_request_num) break; + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } +#endif + + // stop all repl_devs + { + std::unique_lock lg(m_rd_map_mtx); + for (auto it = m_rd_map.begin(); it != m_rd_map.end(); ++it) { + auto rdev = std::dynamic_pointer_cast< RaftReplDev >(it->second); + rdev->stop(); + } + } + + // this will stop and shutdown all the repl_dev and grpc server (data channel). + // for each raft_repl_dev: + // 1 Cancel snapshot requests if any exist. + // 2 Terminate background commit thread. + // 3 Cancel all scheduler tasks. + // after m_msg_mgr is reset, no further data will hit data service and no further log will hit log store.
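// ---------------------------------------------------------------------------------------------------------------
// Editor's note (not part of this patch): an illustrative, self-contained sketch of the ordering constraint
// explained in start() above. The names below (allocated_pbas, pba_of_lsn_100) are hypothetical; the point is only
// that a block referenced by an unreplayed log entry looks free until replay re-marks it as allocated, so the
// data channel (PUSHDATA) must not open before log replay completes.
#include <cassert>
#include <cstdint>
#include <set>

inline void replay_ordering_sketch() {
    std::set< uint64_t > allocated_pbas{1, 2, 3}; // allocator state rebuilt from the last checkpoint
    const uint64_t pba_of_lsn_100 = 7;            // allocated after the checkpoint, recorded only in the log

    // Before replay, PBA 7 looks free; a PUSHDATA served now could re-allocate and overwrite it.
    assert(allocated_pbas.count(pba_of_lsn_100) == 0);

    // Log replay (commit_blks) re-marks it as allocated; only after this is it safe to accept PUSHDATA.
    allocated_pbas.insert(pba_of_lsn_100);
    assert(allocated_pbas.count(pba_of_lsn_100) == 1);
}
// ---------------------------------------------------------------------------------------------------------------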
m_msg_mgr.reset(); hs()->logstore_service().stop(); } +void RaftReplService::monitor_cert_changes() { + auto fw = ioenvironment.get_file_watcher(); + auto cert_change_cb = [this](const std::string filepath, const bool deleted) { + LOGINFO("file change event for {}, deleted? {}", filepath, deleted) + // do not block file_watcher thread + std::thread restart_svc(&RaftReplService::restart_raft_svc, this, filepath, deleted); + restart_svc.detach(); + }; + + // monitor ssl cert file + if (!fw->register_listener(ioenvironment.get_ssl_cert(), "hs_ssl_cert_watcher", cert_change_cb)) { + LOGERROR("Failed to register listener, {} to watch file {}, Not monitoring cert files", "hs_ssl_cert_watcher", + ioenvironment.get_ssl_cert()); + } + // monitor ssl key file + if (!fw->register_listener(ioenvironment.get_ssl_key(), "hs_ssl_key_watcher", cert_change_cb)) { + LOGERROR("Failed to register listener, {} to watch file {}, Not monitoring cert files", "hs_ssl_key_watcher", + ioenvironment.get_ssl_key()); + } +} + +void RaftReplService::restart_raft_svc(const std::string filepath, const bool deleted) { + if (deleted && !wait_for_cert(filepath)) { + LOGINFO("file {} deleted, ", filepath) + // wait for the deleted file to be added again + throw std::runtime_error(fmt::format("file {} not found! Can not start grpc server", filepath)); + } + const std::unique_lock lock(raft_restart_mutex); + m_msg_mgr->restart_server(); + if (deleted) { monitor_cert_changes(); } +} + +bool RaftReplService::wait_for_cert(const std::string& filepath) { + auto attempts = cert_change_timeout / cert_check_sleep; + for (auto i = attempts; i > 0; --i) { + if (std::filesystem::exists(filepath)) { return true; } + std::this_thread::sleep_for(cert_check_sleep); + } + return false; +} + RaftReplDev* RaftReplService::raft_group_config_found(sisl::byte_view const& buf, void* meta_cookie) { json_superblk group_config; auto& js = group_config.load(buf, meta_cookie); @@ -234,14 +346,18 @@ AsyncReplResult< shared< ReplDev > > RaftReplService::create_repl_dev(group_id_t return make_async_error< shared< ReplDev > >(to_repl_error(status.error())); } + auto follower_priority = compute_raft_follower_priority(); + auto my_id = m_repl_app->get_my_repl_id(); for (auto& member : members) { if (member == my_id) { continue; } // Skip myself do { - auto const result = m_msg_mgr->add_member(group_id, member).get(); + auto srv_config = nuraft::srv_config(nuraft_mesg::to_server_id(member), 0, boost::uuids::to_string(member), "", + false, follower_priority); + auto const result = m_msg_mgr->add_member(group_id, srv_config).get(); if (result) { - LOGINFOMOD(replication, "Groupid={}, new member={} added", boost::uuids::to_string(group_id), - boost::uuids::to_string(member)); + LOGINFOMOD(replication, "Groupid={}, new member={} added with priority={}", boost::uuids::to_string(group_id), + boost::uuids::to_string(member), follower_priority); break; } else if (result.error() != nuraft::CONFIG_CHANGING) { LOGWARNMOD(replication, "Groupid={}, add member={} failed with error={}", @@ -293,7 +409,10 @@ folly::SemiFuture< ReplServiceError > RaftReplService::remove_repl_dev(group_id_ auto rdev_result = get_repl_dev(group_id); if (!rdev_result) { return folly::makeSemiFuture< ReplServiceError >(ReplServiceError::SERVER_NOT_FOUND); } - return std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value())->destroy_group(); + auto ret = std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value())->destroy_group(); + + // decr_pending_request_num(); + return ret; } void
RaftReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) { @@ -314,7 +433,22 @@ void RaftReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki } if (rd_sb->destroy_pending == 0x1) { - LOGINFOMOD(replication, "ReplDev group_id={} was destroyed, skipping the load", group_id); + LOGINFOMOD(replication, "ReplDev group_id={} was destroyed, reclaim the stale resource", group_id); + // if we do not add the repl_dev to m_rd_map, it will not be permanently destroyed since gc thread finds the + // pending destroy repl_dev only from m_rd_map. so, we should try to reclaim all the repl_dev stale resources + // here. + + // 1 since we permanently destroy the repl_dev here, it will not join the raft group where raft_server will be + // created. hence, no need to destroy it through nuraft_mesg, where raft_server will be shutdown. + // 2 m_raft_config_sb will be destroyed in raft_group_config_found() method if repl_dev is not found, so + // skip it. + + // 3 logdev will be destroyed in delete_unopened_logdevs() if we don't open it (create repl_dev) here, so skip + // it. + + // 4 destroy the superblk, and after this, the repl_dev will not be loaded and found again. + rd_sb.destroy(); + return; } @@ -325,9 +459,49 @@ void RaftReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki add_repl_dev(group_id, rdev); } -AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, replica_id_t member_out, - replica_id_t member_in) const { - return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); +// replace_member actually has two phases: +// 1. start_replace_member: flip member_out to learner and add member_in. +// 2. complete_replace_member: remove member_out. +// This function only invokes replDev start_replace_member. There is +// a background reaper thread that periodically checks the member_in replication status; after member_in has caught up, +// it will trigger replDev complete_replace_member.
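// ---------------------------------------------------------------------------------------------------------------
// Editor's note (not part of this patch): a hypothetical, self-contained model of the two-phase flow described
// above. The types and functions here are illustrative only; in HomeStore the start phase is
// RaftReplDev::start_replace_member() and completion is driven by the reaper timer via
// check_replace_member_status().
#include <functional>

namespace replace_member_sketch {
enum class Phase { idle, syncing, completed };

struct Flow {
    Phase phase{Phase::idle};

    // Phase 1: flip member_out to a learner and add member_in to the group.
    void start() { phase = Phase::syncing; }

    // Periodic check (reaper timer): once member_in has caught up, remove member_out.
    void check(std::function< bool() > const& member_in_caught_up) {
        if (phase == Phase::syncing && member_in_caught_up()) { phase = Phase::completed; }
    }
};
} // namespace replace_member_sketch
// ---------------------------------------------------------------------------------------------------------------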
+AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum, + uint64_t trace_id) const { + // if (is_stopping()) return make_async_error<>(ReplServiceError::STOPPING); + // incr_pending_request_num(); + auto rdev_result = get_repl_dev(group_id); + if (!rdev_result) { return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); } + + return std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value()) + ->start_replace_member(member_out, member_in, commit_quorum, trace_id) + .via(&folly::InlineExecutor::instance()) + .thenValue([this](auto&& e) mutable { + if (e.hasError()) { + // decr_pending_request_num(); + return make_async_error<>(e.error()); + } + // decr_pending_request_num(); + return make_async_success<>(); + }); +} + +AsyncReplResult<> RaftReplService::flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, uint32_t commit_quorum, + bool wait_and_verify, uint64_t trace_id) const { + // if (is_stopping()) return make_async_error<>(ReplServiceError::STOPPING); + // incr_pending_request_num(); + auto rdev_result = get_repl_dev(group_id); + if (!rdev_result) { + // decr_pending_request_num(); + return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); + } + return std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value()) + ->flip_learner_flag(member, target, commit_quorum, wait_and_verify, trace_id) + .via(&folly::InlineExecutor::instance()) + .thenValue([this](auto&& e) mutable { + if (e.hasError()) { return make_async_error<>(e.error()); } + return make_async_success<>(); + }); } ////////////////////// Reaper Thread related ////////////////////////////////// @@ -344,7 +518,7 @@ void RaftReplService::start_reaper_thread() { m_rdev_gc_timer_hdl = iomanager.schedule_thread_timer( HS_DYNAMIC_CONFIG(generic.repl_dev_cleanup_interval_sec) * 1000 * 1000 * 1000, true /* recurring */, nullptr, [this](void*) { - LOGINFOMOD(replication, "Reaper Thread: Doing GC"); + LOGDEBUGMOD(replication, "Reaper Thread: Doing GC"); gc_repl_reqs(); gc_repl_devs(); }); @@ -361,12 +535,19 @@ void RaftReplService::start_reaper_thread() { HS_DYNAMIC_CONFIG(consensus.flush_durable_commit_interval_ms) * 1000 * 1000, true /* recurring */, nullptr, [this](void*) { flush_durable_commit_lsn(); }); + // Check replace_member sync status to see a new member is fully synced up and ready to remove the old member + m_replace_member_sync_check_timer_hdl = iomanager.schedule_thread_timer( + HS_DYNAMIC_CONFIG(consensus.replace_member_sync_check_interval_ms) * 1000 * 1000, true /* recurring */, + nullptr, [this](void*) { check_replace_member_status(); }); + + p.setValue(); } else { // Cancel all recurring timers started iomanager.cancel_timer(m_rdev_gc_timer_hdl, true /* wait */); iomanager.cancel_timer(m_rdev_fetch_timer_hdl, true /* wait */); iomanager.cancel_timer(m_flush_durable_commit_timer_hdl, true /* wait */); + iomanager.cancel_timer(m_replace_member_sync_check_timer_hdl, true /* wait */); } }); std::move(f).get(); @@ -407,21 +588,43 @@ void RaftReplService::gc_repl_reqs() { } void RaftReplService::gc_repl_devs() { - std::unique_lock lg(m_rd_map_mtx); - for (auto it = m_rd_map.begin(); it != m_rd_map.end();) { - auto rdev = std::dynamic_pointer_cast< RaftReplDev >(it->second); - if (rdev->is_destroy_pending() && - (get_elapsed_time_sec(rdev->destroyed_time()) >= - HS_DYNAMIC_CONFIG(generic.repl_dev_cleanup_interval_sec))) { - LOGINFOMOD(replication, - "ReplDev 
group_id={} was destroyed, shutting down the raft group in delayed fashion now", - rdev->group_id()); - m_msg_mgr->leave_group(rdev->group_id()); - it = m_rd_map.erase(it); - } else { - ++it; + /* incr_pending_request_num(); + // Skip gc when raft repl service is stopping to avoid concurrency issues between repl_dev's stop and destroy ops. + if (is_stopping()) { + LOGINFOMOD(replication, "ReplSvc is stopping, skipping GC"); + decr_pending_request_num(); + return; + } */ + + std::vector< group_id_t > groups_to_leave; + { + std::shared_lock lg(m_rd_map_mtx); + for (auto it = m_rd_map.begin(); it != m_rd_map.end(); ++it) { + auto rdev = std::dynamic_pointer_cast< RaftReplDev >(it->second); + if (rdev->is_destroy_pending() && + (get_elapsed_time_sec(rdev->destroyed_time()) >= + HS_DYNAMIC_CONFIG(generic.repl_dev_cleanup_interval_sec))) { + LOGINFOMOD(replication, + "ReplDev group_id={} was destroyed, shutting down the raft group in delayed fashion now", + rdev->group_id()); + groups_to_leave.push_back(rdev->group_id()); + } + } + } + + // Call leave_group to shut down the raft server and destroy all resources on the repl dev. + // This operation may require acquiring the m_rd_map_mtx lock for some steps (e.g., trigger cp flush). + // Therefore, we perform it outside the lock scope and then remove the group from m_rd_map. + for (const auto& group_id : groups_to_leave) { + m_msg_mgr->leave_group(group_id); + // notify consumer to clean up any resources associated with the listener itself; + m_repl_app->destroy_repl_dev_listener(group_id); + { + std::unique_lock lg(m_rd_map_mtx); + m_rd_map.erase(group_id); } } + // decr_pending_request_num(); } void RaftReplService::flush_durable_commit_lsn() { @@ -433,12 +636,53 @@ } } +void RaftReplService::check_replace_member_status() { + std::unique_lock lg(m_rd_map_mtx); + for (auto& rdev_parent : m_rd_map) { + auto rdev = std::dynamic_pointer_cast< RaftReplDev >(rdev_parent.second); + rdev->check_replace_member_status(); + } +} + ///////////////////// RaftReplService CP Callbacks ///////////////////////////// -std::unique_ptr< CPContext > RaftReplServiceCPHandler::on_switchover_cp(CP* cur_cp, CP* new_cp) { return nullptr; } +int ReplSvcCPContext::add_repl_dev_ctx(ReplDev* dev, cshared< ReplDevCPContext > dev_ctx) { + m_cp_ctx_map.emplace(dev, dev_ctx); + return 0; +} + +cshared< ReplDevCPContext > ReplSvcCPContext::get_repl_dev_ctx(ReplDev* dev) { + if (m_cp_ctx_map.count(dev) == 0) { + // this is possible if a repl dev was added during the cp flush + return std::make_shared< ReplDevCPContext >(); + } + return m_cp_ctx_map[dev]; + +} + +std::unique_ptr< CPContext > RaftReplServiceCPHandler::on_switchover_cp(CP* cur_cp, CP* new_cp) { + // checking if cur_cp == nullptr as on_switchover_cp will be called when registering the cp handler + if (cur_cp != nullptr) { + // Add cp info from all devices to current cp. + // We don't need to take cp_guard as cp_mgr has already taken it in do_trigger_cp_flush + auto cur_cp_ctx = s_cast< ReplSvcCPContext* >(cur_cp->context(cp_consumer_t::REPLICATION_SVC)); + repl_service().iterate_repl_devs([cur_cp, cur_cp_ctx](cshared< ReplDev >& repl_dev) { + // we need to collect the LSN of each repl dev and put it into the current CP. + // There are no dirty buffers accumulated in new_cp yet, as the cp_mgr ensures replication_svc + // is the first one being called during cp switchover.
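// ---------------------------------------------------------------------------------------------------------------
// Editor's note (not part of this patch): a simplified, self-contained sketch of the capture-then-flush pattern
// used above. The Dev/Ctx types are hypothetical stand-ins for ReplDev/ReplDevCPContext; the point is that the
// LSN is snapshotted per device at switchover, and that cp_flush falls back to a default context for devices
// added after the switchover.
#include <map>
#include <memory>
#include <vector>

namespace repl_cp_sketch {
struct Ctx { long captured_lsn{0}; };
struct Dev { long current_lsn{0}; };

struct SvcCpContext {
    std::map< Dev*, std::shared_ptr< Ctx > > ctx_map;

    // switchover: snapshot each device's LSN into the current CP's context
    void capture(std::vector< Dev* > const& devs) {
        for (auto* d : devs) { ctx_map[d] = std::make_shared< Ctx >(Ctx{d->current_lsn}); }
    }

    // flush: use the captured value; a device added after switchover gets a fresh/default context
    std::shared_ptr< Ctx > get(Dev* d) {
        auto it = ctx_map.find(d);
        return (it == ctx_map.end()) ? std::make_shared< Ctx >() : it->second;
    }
};
} // namespace repl_cp_sketch
// ---------------------------------------------------------------------------------------------------------------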
+ auto dev_ctx = std::static_pointer_cast< RaftReplDev >(repl_dev)->get_cp_ctx(cur_cp); + cur_cp_ctx->add_repl_dev_ctx(repl_dev.get(), std::move(dev_ctx)); + }); + } + // create new ctx + auto ctx = std::make_unique< ReplSvcCPContext >(new_cp); + return ctx; +} folly::Future< bool > RaftReplServiceCPHandler::cp_flush(CP* cp) { - repl_service().iterate_repl_devs( - [cp](cshared< ReplDev >& repl_dev) { std::static_pointer_cast< RaftReplDev >(repl_dev)->cp_flush(cp); }); + auto cp_ctx = s_cast< ReplSvcCPContext* >(cp->context(cp_consumer_t::REPLICATION_SVC)); + repl_service().iterate_repl_devs([cp, cp_ctx](cshared< ReplDev >& repl_dev) { + auto dev_ctx = cp_ctx->get_repl_dev_ctx(repl_dev.get()); + std::static_pointer_cast< RaftReplDev >(repl_dev)->cp_flush(cp, dev_ctx); + }); return folly::makeFuture< bool >(true); } diff --git a/src/lib/replication/service/raft_repl_service.h b/src/lib/replication/service/raft_repl_service.h index a38cbbccb..aa9550c4f 100644 --- a/src/lib/replication/service/raft_repl_service.h +++ b/src/lib/replication/service/raft_repl_service.h @@ -31,6 +31,12 @@ namespace homestore { +constexpr auto cert_change_timeout = std::chrono::seconds(1200); +constexpr auto cert_check_sleep = std::chrono::seconds(1); +constexpr int32_t raft_leader_priority = 100; +constexpr double raft_priority_decay_coefficient = 0.8; +constexpr uint32_t raft_priority_election_round_upper_limit = 5; + struct repl_dev_superblk; class RaftReplDev; @@ -46,12 +52,15 @@ class RaftReplService : public GenericReplService, iomgr::timer_handle_t m_rdev_fetch_timer_hdl; iomgr::timer_handle_t m_rdev_gc_timer_hdl; iomgr::timer_handle_t m_flush_durable_commit_timer_hdl; + iomgr::timer_handle_t m_replace_member_sync_check_timer_hdl; iomgr::io_fiber_t m_reaper_fiber; + std::mutex raft_restart_mutex; public: RaftReplService(cshared< ReplApplication >& repl_app); static ReplServiceError to_repl_error(nuraft::cmd_result_code code); + int32_t compute_raft_follower_priority(); ///////////////////// Overrides of nuraft_mesg::MessagingApplication //////////////////// std::string lookup_peer(nuraft_mesg::peer_id_t const&) override; @@ -69,8 +78,13 @@ class RaftReplService : public GenericReplService, std::set< replica_id_t > const& members) override; folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) override; void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; - AsyncReplResult<> replace_member(group_id_t group_id, replica_id_t member_out, - replica_id_t member_in) const override; + AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum = 0, + uint64_t trace_id = 0) const override; + + AsyncReplResult<> flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, + uint32_t commit_quorum, bool wait_and_verify = true, + uint64_t trace_id = 0) const override; private: RaftReplDev* raft_group_config_found(sisl::byte_view const& buf, void* meta_cookie); @@ -80,7 +94,28 @@ class RaftReplService : public GenericReplService, void gc_repl_devs(); void gc_repl_reqs(); void flush_durable_commit_lsn(); + void check_replace_member_status(); + void monitor_cert_changes(); + void restart_raft_svc(const std::string filepath, const bool deleted); + bool wait_for_cert(const std::string& filepath); +}; + +// cp context for repl_dev, repl_dev cp_lsn is critical cursor in the system, +// anything below the cp_lsn we believed is persisted through cp and will not +// 
go through replay. The cp_lsn need to be kept into ctx when switchover_cp, +// and the persist of repl_dev_cp need to be done after all other consumers succeed. +struct ReplDevCPContext; + +class ReplSvcCPContext : public CPContext { + std::shared_mutex m_cp_map_mtx; + std::map< ReplDev*, cshared< ReplDevCPContext > > m_cp_ctx_map; + +public: + ReplSvcCPContext(CP* cp) : CPContext(cp) {}; + virtual ~ReplSvcCPContext() = default; + int add_repl_dev_ctx(ReplDev* dev, cshared< ReplDevCPContext > dev_ctx); + cshared< ReplDevCPContext > get_repl_dev_ctx(ReplDev* dev); }; class RaftReplServiceCPHandler : public CPCallbacks { diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index d922f71cb..dece4b36e 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -102,6 +102,41 @@ if (${io_tests}) target_link_libraries(test_cp_mgr homestore ${COMMON_TEST_DEPS} GTest::gtest) add_test(NAME CPMgr COMMAND test_cp_mgr) + can_build_epoll_io_tests(epoll_tests) + if(${epoll_tests}) + add_test(NAME LogDev-Epoll COMMAND test_log_dev) + add_test(NAME LogStore-Epoll COMMAND test_log_store) + add_test(NAME MetaBlkMgr-Epoll COMMAND test_meta_blk_mgr) + add_test(NAME DataService-Epoll COMMAND test_data_service) + endif() + + can_build_spdk_io_tests(spdk_tests) + if(${spdk_tests}) + add_test(NAME LogStore-Spdk COMMAND test_log_store -- --spdk "true") + add_test(NAME LogDev-Spdk COMMAND test_log_dev -- --spdk "true") + add_test(NAME MetaBlkMgr-Spdk COMMAND test_meta_blk_mgr -- --spdk "true") + add_test(NAME DataSerice-Spdk COMMAND test_data_service -- --spdk "true") + if(${epoll_tests}) + SET_TESTS_PROPERTIES(MetaBlkMgr-Spdk PROPERTIES DEPENDS LogStore-Spdk) + SET_TESTS_PROPERTIES(DataService-Spdk PROPERTIES DEPENDS MetaBlkMgr-Spdk) + endif() + endif() +endif() + +can_build_repl_tests(repl_tests) +if (${repl_tests}) + add_executable(test_repl_service) + target_sources(test_repl_service PRIVATE test_repl_service.cpp) + target_link_libraries(test_repl_service homestore ${COMMON_TEST_DEPS} GTest::gmock) + + add_executable(test_repl_log_store) + target_sources(test_repl_log_store PRIVATE test_repl_log_store.cpp) + target_link_libraries(test_repl_log_store hs_logdev homestore ${COMMON_TEST_DEPS} GTest::gmock) + + add_executable(test_repl_data_service) + target_sources(test_repl_data_service PRIVATE test_repl_data_service.cpp) + target_link_libraries(test_repl_data_service homestore ${COMMON_TEST_DEPS} GTest::gmock) + add_executable(test_solo_repl_dev) target_sources(test_solo_repl_dev PRIVATE test_solo_repl_dev.cpp) target_link_libraries(test_solo_repl_dev homestore ${COMMON_TEST_DEPS} GTest::gmock) @@ -114,30 +149,24 @@ if (${io_tests}) target_sources(test_raft_repl_dev PRIVATE test_raft_repl_dev.cpp) target_link_libraries(test_raft_repl_dev homestore ${COMMON_TEST_DEPS} GTest::gmock) + add_executable(test_raft_repl_dev_dynamic) + target_sources(test_raft_repl_dev_dynamic PRIVATE test_raft_repl_dev_dynamic.cpp) + target_link_libraries(test_raft_repl_dev_dynamic homestore ${COMMON_TEST_DEPS} GTest::gmock) + can_build_epoll_io_tests(epoll_tests) if(${epoll_tests}) - add_test(NAME LogDev-Epoll COMMAND test_log_dev) - add_test(NAME LogStore-Epoll COMMAND test_log_store) - add_test(NAME HomeRaftLogStore-Epoll COMMAND test_home_raft_logstore) - add_test(NAME MetaBlkMgr-Epoll COMMAND test_meta_blk_mgr) - add_test(NAME DataService-Epoll COMMAND test_data_service) add_test(NAME RaftReplDev-Epoll COMMAND test_raft_repl_dev) - # add_test(NAME SoloReplDev-Epoll COMMAND test_solo_repl_dev) + add_test(NAME 
RaftReplDevDynamic-Epoll COMMAND test_raft_repl_dev_dynamic --override_config homestore_config.consensus.replace_member_sync_check_interval_ms=1000) + add_test(NAME SoloReplDev-Epoll COMMAND test_solo_repl_dev) + add_test(NAME HomeRaftLogStore-Epoll COMMAND test_home_raft_logstore) endif() can_build_spdk_io_tests(spdk_tests) if(${spdk_tests}) - add_test(NAME LogStore-Spdk COMMAND test_log_store -- --spdk "true") - add_test(NAME LogDev-Spdk COMMAND test_log_dev -- --spdk "true") - add_test(NAME MetaBlkMgr-Spdk COMMAND test_meta_blk_mgr -- --spdk "true") - add_test(NAME DataSerice-Spdk COMMAND test_data_service -- --spdk "true") - add_test(NAME SoloReplDev-Spdk COMMAND test_solo_repl_dev -- --spdk "true") - add_test(NAME HomeRaftLogStore-Spdk COMMAND test_home_raft_logstore -- --spdk "true") add_test(NAME RaftReplDev-Spdk COMMAND test_raft_repl_dev -- --spdk "true") - if(${epoll_tests}) - SET_TESTS_PROPERTIES(MetaBlkMgr-Spdk PROPERTIES DEPENDS LogStore-Spdk) - SET_TESTS_PROPERTIES(DataService-Spdk PROPERTIES DEPENDS MetaBlkMgr-Spdk) - endif() + add_test(NAME RaftReplDevDynamic-Spdk COMMAND test_raft_repl_dev_dynamic -- --spdk "true" --override_config homestore_config.consensus.replace_member_sync_check_interval_ms=1000) + add_test(NAME SoloReplDev-Spdk COMMAND test_solo_repl_dev -- --spdk "true") + add_test(NAME HomeRaftLogStore-Spdk COMMAND test_home_raft_logstore -- --spdk "true") endif() endif() diff --git a/src/tests/btree_helpers/btree_test_helper.hpp b/src/tests/btree_helpers/btree_test_helper.hpp index a7e14df41..9b2b07c52 100644 --- a/src/tests/btree_helpers/btree_test_helper.hpp +++ b/src/tests/btree_helpers/btree_test_helper.hpp @@ -276,7 +276,7 @@ struct BtreeTestHelper { } void range_remove_existing_random() { - static std::uniform_int_distribution< uint32_t > s_rand_range_generator{2, 5}; + static std::uniform_int_distribution< uint32_t > s_rand_range_generator{2, 50}; auto const [start_k, end_k] = m_shadow_map.pick_random_existing_keys(s_rand_range_generator(m_re)); do_range_remove(start_k, end_k, true /* only_existing */); diff --git a/src/tests/btree_helpers/btree_test_kvs.hpp b/src/tests/btree_helpers/btree_test_kvs.hpp index cac6bc4dc..c1baa8f38 100644 --- a/src/tests/btree_helpers/btree_test_kvs.hpp +++ b/src/tests/btree_helpers/btree_test_kvs.hpp @@ -60,6 +60,17 @@ static std::string gen_random_string(size_t len, uint32_t preamble = std::numeri } return str; } +template < typename T > +static bool willAdditionOverflow(T a, int b) { + static_assert(std::is_integral< T >::value, "Template parameter must be an integral type."); + + if (b > 0) { + return a > std::numeric_limits< T >::max() - b; + } else if (b < 0) { + return a < std::numeric_limits< T >::min() - b; + } + return false; +} using namespace homestore; @@ -310,7 +321,7 @@ class TestIntervalKey : public BtreeIntervalKey { m_offset = other->m_offset; } - std::string to_string() const override { return fmt::format("{}.{}", m_base, m_offset); } + std::string to_string() const override { return fmt::format("{}", key()); } static uint32_t get_max_size() { return sizeof(TestIntervalKey); } @@ -323,9 +334,10 @@ class TestIntervalKey : public BtreeIntervalKey { int distance(BtreeKey const& f) const override { TestIntervalKey const& from = s_cast< TestIntervalKey const& >(f); - DEBUG_ASSERT_EQ(m_base, from.m_base, "Invalid from key for distance"); - DEBUG_ASSERT_GE(m_offset, from.m_offset, "Invalid from key for distance"); - return m_offset - from.m_offset; + uint64_t this_val = (uint64_cast(m_base) << 32) | m_offset; + 
uint64_t from_val = (uint64_cast(from.m_base) << 32) | from.m_offset; + DEBUG_ASSERT_GE(this_val, from_val, "Invalid from key for distance"); + return static_cast< int >(this_val - from_val); } bool is_interval_key() const override { return true; } @@ -519,7 +531,8 @@ class TestIntervalValue : public BtreeIntervalValue { m_offset = other->m_offset; } - std::string to_string() const override { return fmt::format("{}.{}", m_base_val, m_offset); } + std::string to_string() const override { return fmt::format("{}", value()); } + uint64_t value() const { return (uint64_cast(m_base_val) << 16) | m_offset; } friend std::ostream& operator<<(std::ostream& os, const TestIntervalValue& v) { os << v.to_string(); diff --git a/src/tests/btree_helpers/shadow_map.hpp b/src/tests/btree_helpers/shadow_map.hpp index 8aae946d3..7d2070e04 100644 --- a/src/tests/btree_helpers/shadow_map.hpp +++ b/src/tests/btree_helpers/shadow_map.hpp @@ -242,6 +242,7 @@ class ShadowMap { file << key.key() << " " << value << '\n'; } file.close(); + LOGINFO("Saved shadow map to file: {}", filename); } void load(const std::string& filename) { diff --git a/src/tests/log_store_benchmark.cpp b/src/tests/log_store_benchmark.cpp index a80f67b45..c34db76a3 100644 --- a/src/tests/log_store_benchmark.cpp +++ b/src/tests/log_store_benchmark.cpp @@ -55,7 +55,7 @@ class BenchLogStore { public: friend class SampleDB; BenchLogStore() { - m_logdev_id = logstore_service().create_new_logdev(); + m_logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); m_log_store = logstore_service().create_new_log_store(m_logdev_id, true /* append_mode */); m_log_store->register_log_found_cb(bind_this(BenchLogStore::on_log_found, 3)); m_nth_entry.store(0); diff --git a/src/tests/test_blk_read_tracker.cpp b/src/tests/test_blk_read_tracker.cpp index 0c91ea035..8c372cf55 100644 --- a/src/tests/test_blk_read_tracker.cpp +++ b/src/tests/test_blk_read_tracker.cpp @@ -25,8 +25,7 @@ using namespace homestore; - -SISL_OPTIONS_ENABLE(logging, test_blk_read_tracker, nuraft_mesg) +SISL_OPTIONS_ENABLE(logging, test_blk_read_tracker) VENUM(op_type_t, uint8_t, insert = 0, remove = 1, wait_on = 2, max_op = 3); class BlkReadTrackerTest : public testing::Test { diff --git a/src/tests/test_blkid.cpp b/src/tests/test_blkid.cpp index 2555d321d..93a1813b8 100644 --- a/src/tests/test_blkid.cpp +++ b/src/tests/test_blkid.cpp @@ -7,8 +7,7 @@ #include - -SISL_OPTIONS_ENABLE(logging, test_blkid, nuraft_mesg) +SISL_OPTIONS_ENABLE(logging, test_blkid) SISL_OPTION_GROUP(test_blkid, (num_iterations, "", "num_iterations", "number of iterations", diff --git a/src/tests/test_btree_long_running b/src/tests/test_btree_long_running index 2e24d18bf..3c9ff5ffa 100644 --- a/src/tests/test_btree_long_running +++ b/src/tests/test_btree_long_running @@ -39,7 +39,7 @@ SISL_OPTION_GROUP( (num_iters, "", "num_iters", "number of iterations for rand ops", ::cxxopts::value< uint32_t >()->default_value("500"), "number"), (num_entries, "", "num_entries", "number of entries to test with", - ::cxxopts::value< uint32_t >()->default_value("5000"), "number"), + ::cxxopts::value< uint32_t >()->default_value("7000"), "number"), (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), "seconds"), (disable_merge, "", "disable_merge", "disable_merge", ::cxxopts::value< bool >()->default_value("0"), ""), (operation_list, "", "operation_list", "operation list instead of default created following by percentage", @@ -269,6 +269,34 @@ TYPED_TEST(BtreeTest, 
RandomInsert) { this->get_all(); } +TYPED_TEST(BtreeTest, TriggerCacheEviction) { + // restart homestore with smaller cache % + HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { + s.resource_limits.cache_size_percent = 1u; + HS_SETTINGS_FACTORY().save(); + }); + + this->restart_homestore(); + + LOGINFO("TriggerCacheEviction test start"); + const auto num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); + LOGINFO("Step 1: Do insert for {} entries", num_entries); + for (uint32_t i{0}; i < num_entries; ++i) { + this->put(i, btree_put_type::INSERT); + // this->print(); + } + + this->get_all(); + + // reset cache pct + HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { + s.resource_limits.cache_size_percent = 65u; + HS_SETTINGS_FACTORY().save(); + }); + + LOGINFO("TriggerCacheEviction test end"); +} + TYPED_TEST(BtreeTest, SequentialRemove) { LOGINFO("SequentialRemove test start"); // Forward sequential insert @@ -633,6 +661,8 @@ struct BtreeConcurrentTest : public BtreeTestHelper< TestType >, public ::testin this->m_bt->count_keys(this->m_bt->root_node_id())); BtreeTestHelper< TestType >::TearDown(); m_helper.shutdown_homestore(false); + this->m_bt.reset(); + log_obj_life_counter(); } private: @@ -663,6 +693,10 @@ int main(int argc, char* argv[]) { auto seed = SISL_OPTIONS["seed"].as< uint64_t >(); LOGINFO("Using seed {} to sow the random generation", seed); g_re.seed(seed); + } else { + auto seed = std::chrono::system_clock::now().time_since_epoch().count(); + LOGINFO("No seed provided. Using randomly generated seed: {}", seed); + g_re.seed(seed); } auto ret = RUN_ALL_TESTS(); return ret; diff --git a/src/tests/test_btree_node.cpp b/src/tests/test_btree_node.cpp index 1634984f3..8698f5100 100644 --- a/src/tests/test_btree_node.cpp +++ b/src/tests/test_btree_node.cpp @@ -104,7 +104,7 @@ struct NodeTest : public testing::Test { } } - void put_range(uint32_t k, uint32_t count) { + void put_range(uint64_t k, uint32_t count) { btree_put_type put_type; if constexpr (!std::is_same_v< V, TestIntervalValue >) { // For non-interval values we support only update, so we need to first put the value @@ -341,6 +341,41 @@ TYPED_TEST(NodeTest, SequentialInsert) { this->validate_get_any(98, 102); } +TYPED_TEST(NodeTest, SimpleInsert) { + auto oc = this->m_node1->occupied_size(); + this->put(1, btree_put_type::INSERT); + this->put(2, btree_put_type::INSERT); + this->put(3, btree_put_type::INSERT); + this->remove(2); + this->remove(1); + this->remove(3); + auto oc2 = this->m_node1->occupied_size(); + ASSERT_EQ(oc, oc2) << "Occupied size cannot be more than original size"; + this->put(1, btree_put_type::INSERT); + this->put(2, btree_put_type::INSERT); + this->put(3, btree_put_type::INSERT); + this->remove(3); + this->remove(2); + this->remove(1); + ASSERT_EQ(oc, oc2) << "Occupied size must be the same as original size"; + + this->put(2, btree_put_type::INSERT); + this->put(1, btree_put_type::INSERT); + this->put(4, btree_put_type::INSERT); + this->put(3, btree_put_type::INSERT); + for (uint32_t i = 5; i <= 50; ++i) { + this->put(i, btree_put_type::INSERT); + } + LOGDEBUG("Creating a hole with size of 11 for prefix compaction usecase"); + for (uint32_t i = 10; i <= 20; ++i) { + this->remove(i); + } + this->m_node1->move_out_to_right_by_entries(*this->m_node2, 20); + uint32_t copy_idx{0u}; + this->m_node1->append_copy_in_upto_size(*this->m_node2, copy_idx, std::numeric_limits< uint32_t >::max(), + /*copy_only_if_fits=*/false); +} + TYPED_TEST(NodeTest, ReverseInsert) { for (uint32_t i{100}; (i > 0 
&& this->has_room()); --i) { this->put(i - 1, btree_put_type::INSERT); diff --git a/src/tests/test_common/homestore_test_common.hpp b/src/tests/test_common/homestore_test_common.hpp index 4df2a7231..404ba8247 100644 --- a/src/tests/test_common/homestore_test_common.hpp +++ b/src/tests/test_common/homestore_test_common.hpp @@ -32,6 +32,10 @@ #include #include +#ifdef _PRERELEASE +#include "common/crash_simulator.hpp" +#endif + const std::string SPDK_ENV_VAR_STRING{"USER_WANT_SPDK"}; const std::string HTTP_SVC_ENV_VAR_STRING{"USER_WANT_HTTP_OFF"}; const std::string CP_WATCHDOG_TIMER_SEC{"USER_SET_CP_WD_TMR_SEC"}; // used in nightly test; @@ -194,8 +198,8 @@ class HSTestHelper { } homestore::HomeStore::instance()->shutdown(); + iomanager.stop(); // Stop iomanager first in case any fiber is still referencing homestore resources homestore::HomeStore::reset_instance(); - iomanager.stop(); if (cleanup) { remove_files(m_generated_devs); @@ -208,9 +212,14 @@ class HSTestHelper { test_params& params(ServiceType svc) { return m_token.svc_params_[svc]; } #ifdef _PRERELEASE - void wait_for_crash_recovery() { + void wait_for_crash_recovery(bool check_will_crash = false) { + if(check_will_crash && !homestore::HomeStore::instance()->crash_simulator().will_crash()) { + return; + } + LOGDEBUG("Waiting for m_crash_recovered future"); m_crash_recovered.getFuture().get(); m_crash_recovered = folly::Promise< folly::Unit >(); + homestore::HomeStore::instance()->crash_simulator().set_will_crash(false); } #endif @@ -247,6 +256,11 @@ class HSTestHelper { m_fc.inject_delay_flip(flip_name, {null_cond}, freq, delay_usec); LOGDEBUG("Flip {} set", flip_name); } + + void remove_flip(const std::string flip_name) { + m_fc.remove_flip(flip_name); + LOGDEBUG("Flip {} removed", flip_name); + } #endif static void fill_data_buf(uint8_t* buf, uint64_t size, uint64_t pattern = 0) { @@ -335,7 +349,7 @@ class HSTestHelper { auto fut = homestore::hs()->cp_mgr().trigger_cp_flush(true /* force */); auto on_complete = [&](auto success) { HS_REL_ASSERT_EQ(success, true, "CP Flush failed"); - LOGINFO("CP Flush completed"); + LOGDEBUG("CP Flush completed"); }; if (wait) { @@ -458,7 +472,9 @@ class HSTestHelper { } else if ((svc == ServiceType::LOG)) { hsi->with_log_service(); } else if (svc == ServiceType::REPLICATION) { +#ifdef REPLICATION_SUPPORT hsi->with_repl_data_service(tp.repl_app, tp.custom_chunk_selector); +#endif } } #ifdef _PRERELEASE diff --git a/src/tests/test_common/hs_repl_test_common.hpp b/src/tests/test_common/hs_repl_test_common.hpp index 67abe2f8e..c00788127 100644 --- a/src/tests/test_common/hs_repl_test_common.hpp +++ b/src/tests/test_common/hs_repl_test_common.hpp @@ -17,6 +17,8 @@ */ #pragma once +#include "raft_repl_test_base.hpp" + #include #include #include @@ -35,9 +37,13 @@ #include #include "test_common/homestore_test_common.hpp" +#include + SISL_OPTION_GROUP(test_repl_common_setup, (replicas, "", "replicas", "Total number of replicas", ::cxxopts::value< uint32_t >()->default_value("3"), "number"), + (spare_replicas, "", "spare_replicas", "Additional number of spare replicas not part of repldev", + ::cxxopts::value< uint32_t >()->default_value("1"), "number"), (base_port, "", "base_port", "Port number of first replica", ::cxxopts::value< uint16_t >()->default_value("4000"), "number"), (replica_num, "", "replica_num", @@ -113,6 +119,9 @@ class HSReplTestHelper : public HSTestHelper { create_repl_dev_listener(homestore::group_id_t group_id) override { return helper_.get_listener(group_id); } + void 
destroy_repl_dev_listener(homestore::group_id_t) override {} + + void on_repl_devs_init_completed() { LOGINFO("Repl dev init completed CB called"); } std::pair< std::string, uint16_t > lookup_peer(homestore::replica_id_t replica_id) const override { uint16_t port; @@ -134,11 +143,12 @@ class HSReplTestHelper : public HSTestHelper { HSReplTestHelper(std::string const& name, std::vector< std::string > const& args, char** argv) : name_{name}, args_{args}, argv_{argv} {} - void setup() { + void setup(uint32_t num_replicas) { + num_replicas_ = num_replicas; replica_num_ = SISL_OPTIONS["replica_num"].as< uint16_t >(); + sisl::logging::SetLogger(name_ + std::string("_replica_") + std::to_string(replica_num_)); sisl::logging::SetLogPattern("[%D %T%z] [%^%L%$] [%n] [%t] %v"); - auto const num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >(); boost::uuids::string_generator gen; for (uint32_t i{0}; i < num_replicas; ++i) { @@ -226,7 +236,7 @@ class HSReplTestHelper : public HSTestHelper { void reset_setup() { teardown(); - setup(); + setup(num_replicas_); } void restart(uint32_t shutdown_delay_secs = 5u) { @@ -249,6 +259,10 @@ class HSReplTestHelper : public HSTestHelper { start_homestore(); } + void reinit_repl_app() { + m_token.params(HS_SERVICE::REPLICATION).repl_app = std::make_unique< TestReplApplication >(*this); + } + uint16_t replica_num() const { return replica_num_; } homestore::replica_id_t my_replica_id() const { return my_replica_id_; } homestore::replica_id_t replica_id(uint16_t member_id) const { @@ -273,8 +287,12 @@ class HSReplTestHelper : public HSTestHelper { if (replica_num_ == 0) { std::set< homestore::replica_id_t > members; - std::transform(members_.begin(), members_.end(), std::inserter(members, members.end()), - [](auto const& p) { return p.first; }); + // By default we create the repl dev with the number of members equal to the replicas argument. + // We don't add spare replicas to the group by default. 
+ for (auto& m : members_) { + if (m.second < SISL_OPTIONS["replicas"].as< uint32_t >()) { members.insert(m.first); } + } + group_id_t repl_group_id = hs_utils::gen_random_uuid(); { std::unique_lock lg(groups_mtx_); @@ -283,7 +301,21 @@ class HSReplTestHelper : public HSTestHelper { auto v = hs()->repl_service().create_repl_dev(repl_group_id, members).get(); ASSERT_EQ(v.hasValue(), true) - << "Error in creating repl dev for group_id=" << boost::uuids::to_string(repl_group_id).c_str(); + << "Error in creating repl dev for group_id=" << boost::uuids::to_string(repl_group_id).c_str() + << ", err=" << v.error(); + auto& raftService = dynamic_cast< RaftReplService& >(hs()->repl_service()); + auto follower_priority = raftService.compute_raft_follower_priority(); + auto repl_dev = v.value(); + ASSERT_EQ(my_replica_id_, repl_dev->get_leader_id()); + auto peer_info = repl_dev->get_replication_status(); + for (auto pinfo : peer_info) { + LOGINFO("Replica={} has priority={}", boost::uuids::to_string(pinfo.id_), pinfo.priority_); + if (pinfo.id_ == my_replica_id_) { + ASSERT_EQ(raft_leader_priority, pinfo.priority_); + } else { + ASSERT_EQ(follower_priority, pinfo.priority_); + } + } } } @@ -299,6 +331,7 @@ class HSReplTestHelper : public HSTestHelper { auto listener = std::move(pending_listeners_[0]); repl_groups_.insert(std::pair(group_id, listener)); pending_listeners_.erase(pending_listeners_.begin()); + LOGINFO("Got listener for group_id={} replica={}", boost::uuids::to_string(group_id), replica_num_); return listener; } @@ -309,6 +342,11 @@ class HSReplTestHelper : public HSTestHelper { } } + void add_listener(std::shared_ptr< ReplDevListener > listener) { + std::unique_lock lg(groups_mtx_); + pending_listeners_.emplace_back(listener); + } + size_t num_listeners() const { std::unique_lock lg(groups_mtx_); return repl_groups_.size(); @@ -346,6 +384,7 @@ class HSReplTestHelper : public HSTestHelper { std::string name_; std::vector< std::string > args_; char** argv_; + uint32_t num_replicas_; std::vector< homestore::dev_info > dev_list_; diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp new file mode 100644 index 000000000..80eeb1573 --- /dev/null +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -0,0 +1,770 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. 
+ * + *********************************************************************************/ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include "common/homestore_config.hpp" +#include "common/homestore_assert.hpp" +#include "common/homestore_utils.hpp" + +#define private public +#include "test_common/hs_repl_test_common.hpp" +#include "replication/service/raft_repl_service.h" +#include "replication/repl_dev/raft_repl_dev.h" + +using namespace homestore; + +SISL_LOGGING_DEF(test_raft_repl_dev) +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS, nuraft_mesg, nuraft) + +SISL_OPTION_GROUP(test_raft_repl_dev, + (block_size, "", "block_size", "block size to io", + ::cxxopts::value< uint32_t >()->default_value("4096"), "number"), + (num_raft_groups, "", "num_raft_groups", "number of raft groups per test", + ::cxxopts::value< uint32_t >()->default_value("1"), "number"), + // for below replication parameter, their default value always get from dynamic config, only used + // when specified by user + (snapshot_distance, "", "snapshot_distance", "distance between snapshots", + ::cxxopts::value< uint32_t >()->default_value("0"), "number"), + (num_raft_logs_resv, "", "num_raft_logs_resv", "number of raft logs reserved", + ::cxxopts::value< uint32_t >()->default_value("0"), "number"), + (res_mgr_audit_timer_ms, "", "res_mgr_audit_timer_ms", "resource manager audit timer", + ::cxxopts::value< uint32_t >()->default_value("0"), "number")); + +SISL_OPTIONS_ENABLE(logging, test_raft_repl_dev, iomgr, config, test_common_setup, test_repl_common_setup) + +static std::unique_ptr< test_common::HSReplTestHelper > g_helper; +static std::random_device g_rd{}; +static std::default_random_engine g_re{g_rd()}; + +class TestReplicatedDB : public homestore::ReplDevListener { +public: + struct Key { + uint64_t id_; + bool operator<(Key const& other) const { return id_ < other.id_; } + }; + + struct Value { + int64_t lsn_; + uint64_t data_size_; + uint64_t data_pattern_; + MultiBlkId blkid_; + uint64_t id_; + }; + + struct KeyValuePair { + Key key; + Value value; + }; + + struct test_req : public repl_req_ctx { + struct journal_header { + uint64_t data_size; + uint64_t data_pattern; + uint64_t key_id; // put it in header to test duplication in alloc_local_blks + }; + journal_header jheader; + uint64_t key_id; + sisl::sg_list write_sgs; + sisl::sg_list read_sgs; + + sisl::blob header_blob() { return sisl::blob(uintptr_cast(&jheader), sizeof(journal_header)); } + sisl::blob key_blob() { return sisl::blob{uintptr_cast(&key_id), sizeof(uint64_t)}; } + + test_req() { + write_sgs.size = 0; + read_sgs.size = 0; + key_id = (uint64_t)rand() << 32 | rand(); + jheader.key_id = key_id; + } + + ~test_req() { + for (auto const& iov : write_sgs.iovs) { + iomanager.iobuf_free(uintptr_cast(iov.iov_base)); + } + + for (auto const& iov : read_sgs.iovs) { + iomanager.iobuf_free(uintptr_cast(iov.iov_base)); + } + } + }; + + TestReplicatedDB() = default; + virtual ~TestReplicatedDB() = default; + + void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, + std::vector< MultiBlkId > const& blkids, cintrusive< repl_req_ctx >& ctx) override { + ASSERT_EQ(header.size(), sizeof(test_req::journal_header)); + ASSERT_EQ(blkids.size(), 1); + + auto jheader = r_cast< test_req::journal_header const* >(header.cbytes()); + Key k{.id_ = *(r_cast< uint64_t const* >(key.cbytes()))}; 
+ Value v{.lsn_ = lsn, + .data_size_ = jheader->data_size, + .data_pattern_ = jheader->data_pattern, + .blkid_ = blkids[0], + .id_ = k.id_}; + + LOGINFOMOD(replication, "[Replica={}] Received commit on lsn={} dsn={} key={} value[blkid={} pattern={}]", + g_helper->replica_num(), lsn, ctx->dsn(), k.id_, v.blkid_.to_string(), v.data_pattern_); + + { + std::unique_lock lk(db_mtx_); + inmem_db_.insert_or_assign(k, v); + lsn_index_.emplace(lsn, v); + last_committed_lsn = lsn; + ++commit_count_; + } + + if (ctx->is_proposer()) { g_helper->runner().next_task(); } + } + + bool on_pre_commit(int64_t lsn, const sisl::blob& header, const sisl::blob& key, + cintrusive< repl_req_ctx >& ctx) override { + LOGINFOMOD(replication, "[Replica={}] Received pre-commit on lsn={} dsn={}", g_helper->replica_num(), lsn, + ctx->dsn()); + return true; + } + + void on_rollback(int64_t lsn, const sisl::blob& header, const sisl::blob& key, + cintrusive< repl_req_ctx >& ctx) override { + LOGINFOMOD(replication, "[Replica={}] Received rollback on lsn={}", g_helper->replica_num(), lsn); + } + + void on_restart() { + LOGINFOMOD(replication, "restarted repl dev for [Replica={}] Group={}", g_helper->replica_num(), + boost::uuids::to_string(repl_dev()->group_id())); + } + + void on_error(ReplServiceError error, const sisl::blob& header, const sisl::blob& key, + cintrusive< repl_req_ctx >& ctx) override { + LOGINFOMOD(replication, "[Replica={}] Received error={} on key={}", g_helper->replica_num(), enum_name(error), + *(r_cast< uint64_t const* >(key.cbytes()))); + g_helper->runner().comp_promise_.setException(folly::make_exception_wrapper< ReplServiceError >(error)); + } + + void notify_committed_lsn(int64_t lsn) override { + LOGINFOMOD(replication, "[Replica={}] Received notify_committed_lsn={}", g_helper->replica_num(), lsn); + } + + void on_config_rollback(int64_t lsn) override { + LOGINFOMOD(replication, "[Replica={}] Received config rollback at lsn={}", g_helper->replica_num(), lsn); + } + void on_no_space_left(repl_lsn_t lsn, chunk_num_t chunk_id) override { + LOGINFOMOD(replication, "[Replica={}] Received no_space_left at lsn={}, chunk_id={}", g_helper->replica_num(), + lsn, chunk_id); + } + + AsyncReplResult<> create_snapshot(shared< snapshot_context > context) override { + std::lock_guard< std::mutex > lock(m_snapshot_lock); + auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); + LOGINFOMOD(replication, "[Replica={}] Got snapshot callback term={} idx={}", g_helper->replica_num(), + s->get_last_log_term(), s->get_last_log_idx()); + m_last_snapshot = context; + return make_async_success<>(); + } + + static int64_t get_next_lsn(uint64_t& obj_id) { return obj_id & ((1ULL << 63) - 1); } + static void set_resync_msg_type_bit(uint64_t& obj_id) { obj_id |= 1ULL << 63; } + + int read_snapshot_obj(shared< snapshot_context > context, shared< snapshot_obj > snp_data) override { + auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); + if (RaftStateMachine::is_hs_snp_obj(snp_data->offset)) { + LOGERRORMOD(replication, "invalid snapshot offset={}", snp_data->offset); + return -1; + } + if ((snp_data->offset & snp_obj_id_type_app) == 0) { + LOGERRORMOD(replication, "invalid snapshot offset={}", snp_data->offset); + return -1; + } + + int64_t next_lsn = get_next_lsn(snp_data->offset); + if (next_lsn == 0) { + snp_data->is_last_obj = false; + snp_data->blob = sisl::io_blob_safe(sizeof(ulong)); + LOGINFOMOD(replication, + "[Replica={}] Read logical snapshot 
callback first message obj_id={} term={} idx={}", + g_helper->replica_num(), snp_data->offset, s->get_last_log_term(), s->get_last_log_idx()); + return 0; + } + + std::vector< KeyValuePair > kv_snapshot_obj; + // We cannot use find to get the next element, since if the next lsn is a config lsn, it will not be put into + // lsn_index_ and, as a result, find will return the end of the map. So here we use lower_bound to get the + // first element to be read and transferred. + for (auto iter = lsn_index_.lower_bound(next_lsn); iter != lsn_index_.end(); iter++) { + auto& v = iter->second; + kv_snapshot_obj.emplace_back(Key{v.id_}, v); + LOGTRACEMOD(replication, "[Replica={}] Read logical snapshot callback fetching lsn={} size={} pattern={}", + g_helper->replica_num(), v.lsn_, v.data_size_, v.data_pattern_); + if (kv_snapshot_obj.size() >= 10) { break; } + } + + if (kv_snapshot_obj.size() == 0) { + snp_data->is_last_obj = true; + LOGINFOMOD(replication, "Snapshot is_last_obj is true"); + return 0; + } + + int64_t kv_snapshot_obj_size = sizeof(KeyValuePair) * kv_snapshot_obj.size(); + sisl::io_blob_safe blob{static_cast< uint32_t >(kv_snapshot_obj_size)}; + std::memcpy(blob.bytes(), kv_snapshot_obj.data(), kv_snapshot_obj_size); + snp_data->blob = std::move(blob); + snp_data->is_last_obj = false; + LOGINFOMOD(replication, "[Replica={}] Read logical snapshot callback obj_id={} term={} idx={} num_items={}", + g_helper->replica_num(), snp_data->offset, s->get_last_log_term(), s->get_last_log_idx(), + kv_snapshot_obj.size()); + + return 0; + } + + void snapshot_obj_write(uint64_t data_size, uint64_t data_pattern, MultiBlkId& out_blkids) { + auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); + auto write_sgs = test_common::HSTestHelper::create_sgs(data_size, block_size, data_pattern); + auto fut = homestore::data_service().async_alloc_write(write_sgs, blk_alloc_hints{}, out_blkids); + std::move(fut).get(); + for (auto const& iov : write_sgs.iovs) { + iomanager.iobuf_free(uintptr_cast(iov.iov_base)); + } + } + + void write_snapshot_obj(shared< snapshot_context > context, shared< snapshot_obj > snp_data) override { + if (RaftStateMachine::is_hs_snp_obj(snp_data->offset)) { + LOGERRORMOD(replication, "invalid snapshot offset={}", snp_data->offset); + return; + } + int64_t next_lsn = get_next_lsn(snp_data->offset); + auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); + auto last_committed_idx = + std::dynamic_pointer_cast< RaftReplDev >(repl_dev())->raft_server()->get_committed_log_idx(); + if (next_lsn == 0) { + snp_data->offset = last_committed_lsn + 1; + set_resync_msg_type_bit(snp_data->offset); + LOGINFOMOD(replication, "[Replica={}] Save logical snapshot callback return obj_id={}", + g_helper->replica_num(), snp_data->offset); + return; + } + + size_t kv_snapshot_obj_size = snp_data->blob.size(); + if (kv_snapshot_obj_size == 0) return; + + size_t num_items = kv_snapshot_obj_size / sizeof(KeyValuePair); + std::unique_lock lk(db_mtx_); + auto ptr = r_cast< const KeyValuePair* >(snp_data->blob.bytes()); + for (size_t i = 0; i < num_items; i++) { + auto key = ptr->key; + auto value = ptr->value; + LOGTRACEMOD(replication, "[Replica={}] Save logical snapshot got lsn={} data_size={} data_pattern={}", + g_helper->replica_num(), value.lsn_, value.data_size_, value.data_pattern_); + + // Write to data service and inmem map. 
+ MultiBlkId out_blkids; + if (value.data_size_ != 0) { + snapshot_obj_write(value.data_size_, value.data_pattern_, out_blkids); + value.blkid_ = out_blkids; + } + inmem_db_.insert_or_assign(key, value); + last_committed_lsn = value.lsn_; + ++commit_count_; + ptr++; + } + + snp_data->offset = last_committed_lsn + 1; + set_resync_msg_type_bit(snp_data->offset); + LOGINFOMOD(replication, + "[Replica={}] Save logical snapshot callback obj_id={} term={} idx={} is_last={} num_items={}", + g_helper->replica_num(), snp_data->offset, s->get_last_log_term(), s->get_last_log_idx(), + snp_data->is_last_obj, num_items); + } + + bool apply_snapshot(shared< snapshot_context > context) override { + std::lock_guard< std::mutex > lock(m_snapshot_lock); + auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); + LOGINFOMOD(replication, "[Replica={}] Apply snapshot term={} idx={}", g_helper->replica_num(), + s->get_last_log_term(), s->get_last_log_idx()); + m_last_snapshot = context; + return true; + } + + shared< snapshot_context > last_snapshot() override { + std::lock_guard< std::mutex > lock(m_snapshot_lock); + if (!m_last_snapshot) return nullptr; + + auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(m_last_snapshot)->nuraft_snapshot(); + LOGINFOMOD(replication, "[Replica={}] Last snapshot term={} idx={}", g_helper->replica_num(), + s->get_last_log_term(), s->get_last_log_idx()); + return m_last_snapshot; + } + + void free_user_snp_ctx(void*& user_snp_ctx) override {} + + ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size, + cintrusive< homestore::repl_req_ctx >& hs_ctx) override { + auto jheader = r_cast< test_req::journal_header const* >(header.cbytes()); + Key k{.id_ = jheader->key_id}; + auto iter = inmem_db_.find(k); + if (iter != inmem_db_.end()) { + LOGDEBUG("data already exists in mem db, key={}", k.id_); + auto hints = blk_alloc_hints{}; + hints.committed_blk_id = iter->second.blkid_; + return hints; + } + return blk_alloc_hints{}; + } + void on_start_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) override { + LOGINFO("[Replica={}] start replace member out {} in {}", g_helper->replica_num(), + boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); + } + + void on_complete_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) override { + LOGINFO("[Replica={}] complete replace member out {} in {}", g_helper->replica_num(), + boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); + } + + void on_destroy(const group_id_t& group_id) override { + LOGINFOMOD(replication, "[Replica={}] Group={} is being destroyed", g_helper->replica_num(), + boost::uuids::to_string(group_id)); + g_helper->unregister_listener(group_id); + } + + void db_write(uint64_t data_size, uint32_t max_size_per_iov) { + static std::atomic< uint32_t > s_uniq_num{0}; + auto req = intrusive< test_req >(new test_req()); + req->jheader.data_size = data_size; + req->jheader.data_pattern = ((long long)rand() << 32) | ++s_uniq_num; + req->jheader.key_id = req->key_id; + auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); + + LOGINFOMOD(replication, "[Replica={}] Db write key={} data_size={} pattern={} block_size={}", + g_helper->replica_num(), req->key_id, data_size, req->jheader.data_pattern, block_size); + + if (data_size != 0) { + req->write_sgs = + 
test_common::HSTestHelper::create_sgs(data_size, max_size_per_iov, req->jheader.data_pattern); + } + + repl_dev()->async_alloc_write(req->header_blob(), req->key_blob(), req->write_sgs, req, false, s_uniq_num); + } + + void validate_db_data() { + g_helper->runner().set_num_tasks(inmem_db_.size()); + while (!repl_dev()->is_ready_for_traffic()) { + LOGINFO("not yet ready for traffic, waiting"); + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + } + + LOGINFOMOD(replication, "[{}]: Total {} keys committed, validating them", + boost::uuids::to_string(repl_dev()->group_id()), inmem_db_.size()); + auto it = inmem_db_.begin(); + g_helper->runner().set_task([this, &it]() { + Key k; + Value v; + { + std::unique_lock lk(db_mtx_); + std::tie(k, v) = *it; + ++it; + } + + if (v.data_size_ != 0) { + auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); + auto read_sgs = test_common::HSTestHelper::create_sgs(v.data_size_, block_size); + + repl_dev()->async_read(v.blkid_, read_sgs, v.data_size_).thenValue([read_sgs, k, v](auto const ec) { + LOGINFOMOD(replication, "Validating key={} value[blkid={} pattern={}]", k.id_, v.blkid_.to_string(), + v.data_pattern_); + RELEASE_ASSERT(!ec, "Read of blkid={} for key={} error={}", v.blkid_.to_string(), k.id_, + ec.message()); + for (auto const& iov : read_sgs.iovs) { + test_common::HSTestHelper::validate_data_buf(uintptr_cast(iov.iov_base), iov.iov_len, + v.data_pattern_); + iomanager.iobuf_free(uintptr_cast(iov.iov_base)); + } + g_helper->runner().next_task(); + }); + } else { + g_helper->runner().next_task(); + } + }); + g_helper->runner().execute().get(); + } + + uint64_t db_commit_count() const { + std::shared_lock lk(db_mtx_); + return commit_count_; + } + + uint64_t db_size() const { + std::shared_lock lk(db_mtx_); + return inmem_db_.size(); + } + + void create_snapshot() { + auto raft_repl_dev = std::dynamic_pointer_cast< RaftReplDev >(repl_dev()); + ulong snapshot_idx = raft_repl_dev->raft_server()->create_snapshot(); + LOGINFO("Manually create snapshot got index {}", snapshot_idx); + } + + void truncate(int num_reserved_entries) { + auto raft_repl_dev = std::dynamic_pointer_cast< RaftReplDev >(repl_dev()); + // raft_repl_dev->truncate(num_reserved_entries); + LOGINFO("Manually truncated"); + } + + void set_zombie() { zombie_ = true; } + bool is_zombie() { + // Whether a group is a zombie (non-recoverable) + return zombie_; + } + +private: + std::map< Key, Value > inmem_db_; + std::map< int64_t, Value > lsn_index_; + uint64_t commit_count_{0}; + std::shared_mutex db_mtx_; + uint64_t last_committed_lsn{0}; + std::shared_ptr< snapshot_context > m_last_snapshot{nullptr}; + std::mutex m_snapshot_lock; + bool zombie_{false}; +}; + +class RaftReplDevTestBase : public testing::Test { +public: + void SetUp() override { + // By default it will create one db + for (uint32_t i{0}; i < SISL_OPTIONS["num_raft_groups"].as< uint32_t >(); ++i) { + auto db = std::make_shared< TestReplicatedDB >(); + g_helper->register_listener(db); + dbs_.emplace_back(std::move(db)); + } + } + + void TearDown() override { + for (auto const& db : dbs_) { + if (db->is_zombie()) { continue; } + run_on_leader(db, [this, db]() { + auto err = hs()->repl_service().remove_repl_dev(db->repl_dev()->group_id()).get(); + ASSERT_EQ(err, ReplServiceError::OK) << "Error in destroying the group"; + }); + } + + for (auto const& db : dbs_) { + if (db->is_zombie()) { continue; } + auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); + if (!repl_dev) continue; + int i = 
0; + bool force_leave = false; + do { + std::this_thread::sleep_for(std::chrono::seconds(1)); + auto& raft_repl_svc = dynamic_cast< RaftReplService& >(hs()->repl_service()); + raft_repl_svc.gc_repl_devs(); + LOGINFO("Waiting for repl dev to get destroyed"); + + // TODO: if the leader is destroyed but the follower does not receive the notification, it will never be + // destroyed. We need to handle this in raft_repl_dev; revisit here after making changes on the + // raft_repl_dev side to handle this case. This is a workaround to avoid the infinite loop for now. + if (i++ > 10 && !force_leave) { + LOGWARN("has already waited for repl dev to get destroyed for 10 times, so do a force leave"); + repl_dev->force_leave(); + force_leave = true; + } + + } while (!repl_dev->is_destroyed()); + } + } + + void generate_writes(uint64_t data_size, uint32_t max_size_per_iov, shared< TestReplicatedDB > db = nullptr) { + if (db == nullptr) { db = pick_one_db(); } + // LOGINFO("Writing on group_id={}", db->repl_dev()->group_id()); + db->db_write(data_size, max_size_per_iov); + } + + void wait_for_all_commits() { wait_for_commits(written_entries_); } + + void wait_for_commits(uint64_t exp_writes) { + uint64_t total_writes{0}; + while (true) { + total_writes = 0; + for (auto const& db : dbs_) { + total_writes += db->db_commit_count(); + } + + if (total_writes >= exp_writes) { break; } + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + LOGINFO("Replica={} received {} commits but expected {}", g_helper->replica_num(), total_writes, + exp_writes); + } + LOGINFO("Replica={} has received {} commits as expected", g_helper->replica_num(), total_writes); + } + + void validate_data() { + for (auto const& db : dbs_) { + db->validate_db_data(); + } + } + + shared< TestReplicatedDB > pick_one_db() { return dbs_[0]; } + + void assign_leader(uint16_t replica) { + LOGINFO("Switch the leader to replica_num = {}", replica); + if (g_helper->replica_num() == replica) { + for (auto const& db : dbs_) { + do { + auto result = db->repl_dev()->become_leader().get(); + if (result.hasError()) { + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } else { + break; + } + } while (true); + } + } else { + for (auto const& db : dbs_) { + homestore::replica_id_t leader_uuid; + while (true) { + leader_uuid = db->repl_dev()->get_leader_id(); + if (!leader_uuid.is_nil() && (g_helper->member_id(leader_uuid) == replica)) { break; } + + LOGINFO("Waiting for replica={} to become leader", replica); + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + } + } + } + } + + void run_on_leader(std::shared_ptr< TestReplicatedDB > db, auto&& lambda) { + if (!db || !db->repl_dev()) { + // Spares which are not added to the group will not have a repl dev. 
+ return; + } + + do { + auto leader_uuid = db->repl_dev()->get_leader_id(); + + if (leader_uuid.is_nil()) { + LOGINFO("Waiting for leader to be elected for group={}", db->repl_dev()->group_id()); + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + } else if (leader_uuid == g_helper->my_replica_id()) { + lambda(); + break; + } else { + break; + } + } while (true); + } + + void write_on_leader(uint32_t num_entries, bool wait_for_commit = true, shared< TestReplicatedDB > db = nullptr, + uint64_t* data_size = nullptr) { + if (dbs_[0]->repl_dev() == nullptr) return; + + do { + auto repl_dev = dbs_[0]->repl_dev(); + auto leader_uuid = repl_dev->get_leader_id(); + + if (leader_uuid.is_nil()) { + LOGINFO("Waiting for leader to be elected"); + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + } else if (leader_uuid == g_helper->my_replica_id()) { + LOGINFO("Writing {} entries since I am the leader my_uuid={}", num_entries, + boost::uuids::to_string(g_helper->my_replica_id())); + if (!repl_dev->is_ready_for_traffic()) { + LOGINFO("leader is not yet ready for traffic, waiting"); + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + } + auto const block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); + g_helper->runner().set_num_tasks(num_entries); + + LOGINFO("Run on worker threads to schedule append on repldev for {} Bytes.", block_size); + g_helper->runner().set_task([this, block_size, db, data_size]() { + static std::normal_distribution<> num_blks_gen{3.0, 2.0}; + uint64_t size = + data_size == nullptr ? std::abs(std::lround(num_blks_gen(g_re))) * block_size : *data_size; + this->generate_writes(size, block_size, db); + }); + if (wait_for_commit) { g_helper->runner().execute().get(); } + break; + } else { + LOGINFO("{} entries were written on the leader_uuid={} my_uuid={}", num_entries, + boost::uuids::to_string(leader_uuid), boost::uuids::to_string(g_helper->my_replica_id())); + break; + } + } while (true); + + written_entries_ += num_entries; + if (wait_for_commit) { this->wait_for_all_commits(); } + } + replica_id_t wait_and_get_leader_id() { + do { + auto leader_uuid = dbs_[0]->repl_dev()->get_leader_id(); + if (leader_uuid.is_nil()) { + LOGINFO("Waiting for leader to be elected"); + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + } else { + return leader_uuid; + } + } while (true); + } + + ReplServiceError write_with_id(uint64_t id, bool wait_for_commit = true, shared< TestReplicatedDB > db = nullptr) { + if (dbs_[0]->repl_dev() == nullptr) return ReplServiceError::FAILED; + if (db == nullptr) { db = pick_one_db(); } + LOGINFO("Writing data {} since I am the leader my_uuid={}", id, + boost::uuids::to_string(g_helper->my_replica_id())); + auto const block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); + + LOGINFO("Run on worker threads to schedule append on repldev for {} Bytes.", block_size); + g_helper->runner().set_num_tasks(1); + g_helper->runner().set_task([this, block_size, db, id]() { + static std::normal_distribution<> num_blks_gen{3.0, 1.0}; + auto data_size = std::max(1L, std::abs(std::lround(num_blks_gen(g_re)))) * block_size; + ASSERT_GT(data_size, 0); + LOGINFO("data_size larger than 0, go ahead, data_size= {}.", data_size); + static std::atomic< uint32_t > s_uniq_num{0}; + auto req = intrusive(new TestReplicatedDB::test_req()); + req->jheader.data_size = data_size; + req->jheader.data_pattern = ((long long)rand() << 32) | ++s_uniq_num; + // overwrite the key_id with the id passed in + req->jheader.key_id = id; + 
req->key_id = id; + + LOGINFOMOD(replication, "[Replica={}] Db write key={} data_size={} pattern={} block_size={}", + g_helper->replica_num(), req->key_id, data_size, req->jheader.data_pattern, block_size); + + if (data_size != 0) { + req->write_sgs = + test_common::HSTestHelper::create_sgs(data_size, block_size, req->jheader.data_pattern); + } + + db->repl_dev()->async_alloc_write(req->header_blob(), req->key_blob(), req->write_sgs, req); + }); + + if (!wait_for_commit) { return ReplServiceError::OK; } + try { + g_helper->runner().execute().get(); + LOGDEBUG("write data task complete, id={}", id) + } catch (const ReplServiceError& e) { + LOGERRORMOD(replication, "[Replica={}] Error in writing data: id={}, error={}", g_helper->replica_num(), id, + enum_name(e)); + return e; + } + + written_entries_ += 1; + LOGINFO("wait_for_commit={}", written_entries_); + this->wait_for_all_commits(); + return ReplServiceError::OK; + } + + void remove_db(std::shared_ptr< TestReplicatedDB > db, bool wait_for_removal) { + this->run_on_leader(db, [this, db]() { + auto err = hs()->repl_service().remove_repl_dev(db->repl_dev()->group_id()).get(); + ASSERT_EQ(err, ReplServiceError::OK) << "Error in destroying the group"; + }); + + // Remove the db from the dbs_ list and check if count matches with repl_device + for (auto it = dbs_.begin(); it != dbs_.end(); ++it) { + if (*it == db) { + dbs_.erase(it); + break; + } + } + + if (wait_for_removal) { wait_for_listener_destroy(dbs_.size()); } + } + + void wait_for_listener_destroy(uint64_t exp_listeners) { + while (true) { + auto total_listeners = g_helper->num_listeners(); + if (total_listeners == exp_listeners) { break; } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + } + + void restart_replica(uint16_t replica, uint32_t shutdown_delay_sec = 5u) { + if (g_helper->replica_num() == replica) { + LOGINFO("Restart homestore: replica_num = {}", replica); + g_helper->restart(shutdown_delay_sec); + // g_helper->sync_for_test_start(); + } else { + LOGINFO("Wait for replica={} to completely go down and removed from alive raft-groups", replica); + std::this_thread::sleep_for(std::chrono::seconds{5}); + } + } + + void shutdown_replica(uint16_t replica) { + if (g_helper->replica_num() == replica) { + LOGINFO("Shutdown homestore: replica_num = {}", replica); + g_helper->shutdown(); + } else { + LOGINFO("Wait for replica={} to completely go down and removed from alive raft-groups", replica); + std::this_thread::sleep_for(std::chrono::seconds{5}); + } + } + + void start_replica(uint16_t replica) { + if (g_helper->replica_num() == replica) { + LOGINFO("Start homestore: replica_num = {}", replica); + g_helper->start(); + } + } + + void create_snapshot() { dbs_[0]->create_snapshot(); } + void truncate(int num_reserved_entries) { dbs_[0]->truncate(num_reserved_entries); } + + void replace_member(std::shared_ptr< TestReplicatedDB > db, replica_id_t member_out, replica_id_t member_in, + uint32_t commit_quorum = 0, ReplServiceError error = ReplServiceError::OK) { + this->run_on_leader(db, [this, error, db, member_out, member_in, commit_quorum]() { + LOGINFO("Start replace member out={} in={}", boost::uuids::to_string(member_out), + boost::uuids::to_string(member_in)); + + replica_member_info out{member_out, ""}; + replica_member_info in{member_in, ""}; + auto result = hs()->repl_service().replace_member(db->repl_dev()->group_id(), out, in, commit_quorum).get(); + if (error == ReplServiceError::OK) { + ASSERT_EQ(result.hasError(), false) << "Error in replacing 
member, err=" << result.error(); + } else { + ASSERT_EQ(result.hasError(), true); + ASSERT_EQ(result.error(), error) << "Error in replacing member, err=" << result.error(); + } + }); + } + +protected: + std::vector< std::shared_ptr< TestReplicatedDB > > dbs_; + uint32_t written_entries_{0}; + +#ifdef _PRERELEASE + flip::FlipClient m_fc{iomgr_flip::instance()}; +#endif +}; diff --git a/src/tests/test_data_service.cpp b/src/tests/test_data_service.cpp index 0974ca431..e6c47e211 100644 --- a/src/tests/test_data_service.cpp +++ b/src/tests/test_data_service.cpp @@ -445,7 +445,7 @@ class BlkDataServiceTest : public testing::Test { void read_io(uint32_t io_size) { auto remaining_io_size = io_size; while (remaining_io_size > 0) { - auto const bid = get_rand_blkid_to_read(io_size); + auto const bid = get_rand_blkid_to_read(remaining_io_size); if (!bid.is_valid()) { // didn't find any block to read, either write blk map is empty or // all blks are pending on free. @@ -455,6 +455,7 @@ class BlkDataServiceTest : public testing::Test { // every piece in bid is a single block, e.g. nblks = 1 auto const nbids = bid.num_pieces(); auto sub_io_size = nbids * inst().get_blk_size(); + HS_REL_ASSERT_LE(sub_io_size, remaining_io_size, "not expecting sub_io_size to exceed remaining_io_size"); // we pass crc from lambda becaues if there is any async_free_blk, the written blks in the blkcrc map will // be removed by the time read thenVlue is called; @@ -581,7 +582,7 @@ class BlkDataServiceTest : public testing::Test { auto nbids = io_size / inst().get_blk_size(); // number of blks to read; // nbids should not exceed max pieces that MultiBlkId can hold; - nbids = std::max(nbids, MultiBlkId::max_addln_pieces); + nbids = std::min(nbids, MultiBlkId::max_addln_pieces); // make sure skip + nbids are in the range of m_blk_crc_map; if (skip_nbids + nbids > m_blk_crc_map.size()) { skip_nbids = m_blk_crc_map.size() - nbids; } diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp index 77fdfb651..35b44eeaf 100644 --- a/src/tests/test_index_crash_recovery.cpp +++ b/src/tests/test_index_crash_recovery.cpp @@ -33,25 +33,32 @@ SISL_OPTIONS_ENABLE(logging, test_index_crash_recovery, iomgr, test_common_setup // TODO Add tests to do write,remove after recovery. // TODO Test with var len key with io mgr page size is 512. 
-SISL_OPTION_GROUP(test_index_crash_recovery, - (num_iters, "", "num_iters", "number of iterations for rand ops", - ::cxxopts::value< uint32_t >()->default_value("500"), "number"), - (num_entries, "", "num_entries", "number of entries to test with", - ::cxxopts::value< uint32_t >()->default_value("5000"), "number"), - (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), - "seconds"), - (max_keys_in_node, "", "max_keys_in_node", "max_keys_in_node", - ::cxxopts::value< uint32_t >()->default_value("0"), ""), - (operation_list, "", "operation_list", - "operation list instead of default created following by percentage", - ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), - (preload_size, "", "preload_size", "number of entries to preload tree with", - ::cxxopts::value< uint32_t >()->default_value("1000"), "number"), - (init_device, "", "init_device", "init device", ::cxxopts::value< bool >()->default_value("1"), ""), - (cleanup_after_shutdown, "", "cleanup_after_shutdown", "cleanup after shutdown", - ::cxxopts::value< bool >()->default_value("1"), ""), - (seed, "", "seed", "random engine seed, use random if not defined", - ::cxxopts::value< uint64_t >()->default_value("0"), "number")) +SISL_OPTION_GROUP( + test_index_crash_recovery, + (num_iters, "", "num_iters", "number of iterations for rand ops", + ::cxxopts::value< uint32_t >()->default_value("500"), "number"), + (num_entries, "", "num_entries", "number of entries to test with", + ::cxxopts::value< uint32_t >()->default_value("5000"), "number"), + (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), "seconds"), + (num_rounds, "", "num_rounds", "number of rounds to test with", + ::cxxopts::value< uint32_t >()->default_value("100"), "number"), + (num_entries_per_rounds, "", "num_entries_per_rounds", "number of entries per rounds", + ::cxxopts::value< uint32_t >()->default_value("40"), "number"), + (max_keys_in_node, "", "max_keys_in_node", "max_keys_in_node", ::cxxopts::value< uint32_t >()->default_value("20"), + ""), + (min_keys_in_node, "", "min_keys_in_node", "min_keys_in_node", ::cxxopts::value< uint32_t >()->default_value("6"), + ""), + (operation_list, "", "operation_list", "operation list instead of default created following by percentage", + ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), + (preload_size, "", "preload_size", "number of entries to preload tree with", + ::cxxopts::value< uint32_t >()->default_value("1000"), "number"), + (init_device, "", "init_device", "init device", ::cxxopts::value< bool >()->default_value("1"), ""), + (load_from_file, "", "load_from_file", "load from file", ::cxxopts::value< bool >()->default_value("0"), ""), + (save_to_file, "", "save_to_file", "save to file", ::cxxopts::value< bool >()->default_value("0"), ""), + (cleanup_after_shutdown, "", "cleanup_after_shutdown", "cleanup after shutdown", + ::cxxopts::value< bool >()->default_value("1"), ""), + (seed, "", "seed", "random engine seed, use random if not defined", + ::cxxopts::value< uint64_t >()->default_value("0"), "number")) void log_obj_life_counter() { std::string str; @@ -73,8 +80,6 @@ class SequenceGenerator { public: SequenceGenerator(int putFreq, int removeFreq, uint64_t start_range, uint64_t end_range) : putFreq_(putFreq), removeFreq_(removeFreq), start_range_(start_range), end_range_(end_range) { - std::random_device rd; - gen_ = std::mt19937(rd()); keyDist_ = 
std::uniform_int_distribution<>(start_range_, end_range_); updateOperationTypeDistribution(); } @@ -95,27 +100,48 @@ class SequenceGenerator { keyDist_ = std::uniform_int_distribution<>(start_range_, end_range_); } + void fillRange(uint64_t start, uint64_t end) { + for (uint64_t i = start; i <= end; ++i) { + keyStates[i] = true; + } + } + OperationList generateOperations(size_t numOperations, bool reset = false) { std::vector< Operation > operations; if (reset) { this->reset(); } - for (size_t i = 0; i < numOperations; ++i) { - uint32_t key = keyDist_(gen_); + if (putFreq_ == 100 && end_range_ - start_range_ + 1 - in_use_key_cnt_.load() < numOperations) { + LOGDEBUG("All keys are in use, skipping operation generation. end_range_ {} start_range_ {} " + "in_use_key_cnt_ {}, numOperations {}", + end_range_, start_range_, in_use_key_cnt_.load(), numOperations); + return operations; + } + if (removeFreq_ == 100 && in_use_key_cnt_.load() < numOperations) { + LOGDEBUG("Not enough keys are in use, skipping operation generation. in_use_key_cnt_ {} numOperations {}", + in_use_key_cnt_.load(), numOperations); + return operations; + } + + while (operations.size() < numOperations) { + uint32_t key = keyDist_(g_re); auto [it, inserted] = keyStates.try_emplace(key, false); auto& inUse = it->second; - OperationType operation = static_cast< OperationType >(opTypeDist_(gen_)); + OperationType operation = static_cast< OperationType >(opTypeDist_(g_re)); if (operation == OperationType::Put && !inUse) { operations.emplace_back(key, OperationType::Put); inUse = true; + in_use_key_cnt_.fetch_add(1); } else if (operation == OperationType::Remove && inUse) { operations.emplace_back(key, OperationType::Remove); inUse = false; + in_use_key_cnt_.fetch_sub(1); } } return operations; } + __attribute__((noinline)) std::string showKeyState(uint64_t key) const { auto it = keyStates.find(key); if (it != keyStates.end()) { return it->second ? "Put" : "Remove"; } @@ -130,15 +156,18 @@ class SequenceGenerator { } return occurrences; } - __attribute__((noinline)) std::string printOperations(const OperationList& operations) const { + + __attribute__((noinline)) static std::string printOperations(const OperationList& operations) { std::ostringstream oss; + auto count = 1; for (const auto& [key, opType] : operations) { std::string opTypeStr = (opType == OperationType::Put) ? "Put" : "Remove"; - oss << "{" << key << ", " << opTypeStr << "}\n"; + oss << count++ << "- {" << key << ", " << opTypeStr << "}\n"; } return oss.str(); } - __attribute__((noinline)) std::string printKeysOccurrences(const OperationList& operations) const { + + __attribute__((noinline)) static std::string printKeysOccurrences(const OperationList& operations) { std::set< uint64_t > keys = collectUniqueKeys(operations); std::ostringstream oss; for (auto key : keys) { @@ -151,16 +180,52 @@ class SequenceGenerator { } return oss.str(); } - __attribute__((noinline)) std::string printKeyOccurrences(const OperationList& operations, uint64_t key ) const { + + __attribute__((noinline)) static std::string printKeyOccurrences(const OperationList& operations, uint64_t key) { std::ostringstream oss; auto keyOccurrences = inspect(operations, key); oss << "Occurrences of key " << key << ":\n"; for (const auto& [index, operation] : keyOccurrences) { std::string opTypeStr = (operation == OperationType::Put) ? 
"Put" : "Remove"; - oss << "Index: " << index << ", Operation: " << opTypeStr << "\n"; + oss << "Index: " << index << ", Operation: " << opTypeStr << "\n"; } return oss.str(); } + + static std::set< uint64_t > collectUniqueKeys(const OperationList& operations) { + std::set< uint64_t > keys; + for (const auto& [key, _] : operations) { + keys.insert(key); + } + return keys; + } + static void save_to_file(std::string filename, const OperationList& operations) { + std::ofstream file(filename); + if (file.is_open()) { + for (const auto& [key, opType] : operations) { + file << key << " " << static_cast< int >(opType) << "\n"; + } + file.close(); + } + } + + static OperationList load_from_file(std::string filename) { + std::ifstream file(filename); + OperationList operations; + if (file.is_open()) { + std::string line; + while (std::getline(file, line)) { + std::istringstream iss(line); + uint64_t key; + int opType; + iss >> key >> opType; + operations.emplace_back(key, static_cast< OperationType >(opType)); + } + file.close(); + } + return operations; + } + void reset() { keyStates.clear(); } private: @@ -168,25 +233,31 @@ class SequenceGenerator { int removeFreq_; uint64_t start_range_; uint64_t end_range_; - std::mt19937 gen_; std::uniform_int_distribution<> keyDist_; std::discrete_distribution<> opTypeDist_; std::map< uint64_t, bool > keyStates; + std::atomic< uint64_t > in_use_key_cnt_{0}; void updateOperationTypeDistribution() { opTypeDist_ = std::discrete_distribution<>({static_cast< double >(putFreq_), static_cast< double >(removeFreq_)}); } - - std::set< uint64_t > collectUniqueKeys(const OperationList& operations) const { - std::set< uint64_t > keys; - for (const auto& [key, _] : operations) { - keys.insert(key); - } - return keys; - } }; + #ifdef _PRERELEASE + +struct long_running_crash_options { + uint32_t put_freq; + std::vector< std::string > put_flips{}; + std::vector< std::string > remove_flips{}; + uint32_t num_entries{SISL_OPTIONS["num_entries"].as< uint32_t >()}; + uint32_t preload_size{SISL_OPTIONS["preload_size"].as< uint32_t >()}; + uint32_t rounds{SISL_OPTIONS["num_rounds"].as< uint32_t >()}; + uint32_t num_entries_per_rounds{SISL_OPTIONS["num_entries_per_rounds"].as< uint32_t >()}; + bool load_mode{SISL_OPTIONS.count("load_from_file") > 0}; + bool save_mode{SISL_OPTIONS.count("save_to_file") > 0}; +}; + template < typename TestType > struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestType >, public ::testing::Test { using T = TestType; @@ -197,12 +268,15 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT TestIndexServiceCallbacks(IndexCrashTest* test) : m_test(test) {} std::shared_ptr< IndexTableBase > on_index_table_found(superblk< index_table_sb >&& sb) override { - LOGINFO("Index table recovered, root bnode_id {} version {}", sb->root_node, sb->root_link_version); + LOGINFO("Index table recovered, root bnode_id {} uuid {} ordinal {} version {}", + static_cast< uint64_t >(sb->root_node), boost::uuids::to_string(sb->uuid), sb->ordinal, + sb->root_link_version); m_test->m_cfg = BtreeConfig(hs()->index_service().node_size()); m_test->m_cfg.m_leaf_node_type = T::leaf_node_type; m_test->m_cfg.m_int_node_type = T::interior_node_type; m_test->m_cfg.m_max_keys_in_node = SISL_OPTIONS["max_keys_in_node"].as< uint32_t >(); + m_test->m_cfg.m_min_keys_in_node = SISL_OPTIONS["min_keys_in_node"].as< uint32_t >(); m_test->m_bt = std::make_shared< typename T::BtreeType >(std::move(sb), m_test->m_cfg); return m_test->m_bt; } @@ 
-228,9 +302,11 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT {HS_SERVICE::INDEX, {.size_pct = 70.0, .index_svc_cbs = new TestIndexServiceCallbacks(this)}}}, nullptr, {}, SISL_OPTIONS["init_device"].as< bool >()); - LOGINFO("Node size {} ", hs()->index_service().node_size()); this->m_cfg = BtreeConfig(hs()->index_service().node_size()); this->m_cfg.m_max_keys_in_node = SISL_OPTIONS["max_keys_in_node"].as< uint32_t >(); + this->m_cfg.m_min_keys_in_node = SISL_OPTIONS["min_keys_in_node"].as< uint32_t >(); + LOGINFO("Node size {}, max_keys_in_node {}, min_keys_in_node {}", this->m_cfg.node_size(), + this->m_cfg.m_max_keys_in_node, this->m_cfg.m_min_keys_in_node); auto uuid = boost::uuids::random_generator()(); auto parent_uuid = boost::uuids::random_generator()(); @@ -240,28 +316,44 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT BtreeTestHelper< TestType >::SetUp(); if (this->m_bt == nullptr || SISL_OPTIONS["init_device"].as< bool >()) { this->m_bt = std::make_shared< typename T::BtreeType >(uuid, parent_uuid, 0, this->m_cfg); + auto num_keys = this->m_bt->count_keys(this->m_bt->root_node_id()); + // LOGINFO("Creating new index table with uuid {} - init_device:{:s} bt: {} root id {}, num of + // keys {}", boost::uuids::to_string(uuid), SISL_OPTIONS["init_device"].as< bool >(), + // this->m_bt, this->m_bt->root_node_id(), num_keys); + LOGINFO("Creating new index table with uuid {} - root id {}, num of keys {}", boost::uuids::to_string(uuid), + this->m_bt->root_node_id(), num_keys); + } else { populate_shadow_map(); } hs()->index_service().add_index_table(this->m_bt); - LOGINFO("Added index table to index service"); + LOGINFO("Added index table to index service with uuid {} - total tables in the system is currently {}", + boost::uuids::to_string(uuid), hs()->index_service().num_tables()); } void populate_shadow_map() { + LOGINFO("Populating shadow map"); this->m_shadow_map.load(m_shadow_filename); - ASSERT_EQ(this->m_shadow_map.size(), this->m_bt->count_keys(this->m_bt->root_node_id())) - << "shadow map size and tree size mismatch"; + auto num_keys = this->m_bt->count_keys(this->m_bt->root_node_id()); + LOGINFO("Shadow map size {} - btree keys {} - root id {}", this->m_shadow_map.size(), num_keys, + this->m_bt->root_node_id()); + ASSERT_EQ(this->m_shadow_map.size(), num_keys) << "shadow map size and tree size mismatch"; this->get_all(); } void reset_btree() { + hs()->index_service().remove_index_table(this->m_bt); this->m_bt->destroy(); + this->trigger_cp(true); + auto uuid = boost::uuids::random_generator()(); auto parent_uuid = boost::uuids::random_generator()(); this->m_bt = std::make_shared< typename T::BtreeType >(uuid, parent_uuid, 0, this->m_cfg); hs()->index_service().add_index_table(this->m_bt); this->m_shadow_map.range_erase(0, SISL_OPTIONS["num_entries"].as< uint32_t >() - 1); + this->m_shadow_map.save(m_shadow_filename); + LOGINFO("Reset btree with uuid {} - erase shadow map {}", boost::uuids::to_string(uuid), m_shadow_filename); } void restart_homestore(uint32_t shutdown_delay_sec = 3) override { @@ -273,7 +365,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT void reapply_after_crash() { ShadowMap< K, V > snapshot_map{this->m_shadow_map.max_keys()}; snapshot_map.load(m_shadow_filename); - LOGDEBUG("\tSnapshot before crash\n{}", snapshot_map.to_string()); + // LOGINFO("\tSnapshot before crash\n{}", snapshot_map.to_string()); auto diff = 
this->m_shadow_map.diff(snapshot_map); // visualize tree after crash @@ -281,20 +373,28 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT // this->visualize_keys(recovered_tree_filename); // LOGINFO(" tree after recovered stored in {}", recovered_tree_filename); - std::string dif_str = "KEY \tADDITION\n"; - for (const auto& [k, addition] : diff) { - dif_str += fmt::format(" {} \t{}\n", k.key(), addition); + std::string dif_str = "Keys["; + for (const auto& [k, _] : diff) { + dif_str += fmt::format("{} ", k.key()); } - LOGDEBUG("Diff between shadow map and snapshot map\n{}\n", dif_str); + dif_str += "]"; + LOGINFO("Diff between shadow map and snapshot map\n{}\n", dif_str); for (const auto& [k, addition] : diff) { // this->print_keys(fmt::format("reapply: before inserting key {}", k.key())); // this->visualize_keys(recovered_tree_filename); - if (addition) { this->force_upsert(k.key()); } + if (addition) { + LOGDEBUG("Reapply: Inserting key {}", k.key()); + this->force_upsert(k.key()); + } else { + LOGDEBUG("Reapply: Removing key {}", k.key()); + this->remove_one(k.key(), false); + } } - test_common::HSTestHelper::trigger_cp(true); + trigger_cp(true); this->m_shadow_map.save(m_shadow_filename); } + void reapply_after_crash(OperationList& operations) { for (const auto& [key, opType] : operations) { switch (opType) { @@ -308,7 +408,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT break; } } - test_common::HSTestHelper::trigger_cp(true); + trigger_cp(true); } void TearDown() override { @@ -323,60 +423,321 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT LOGINFO("Error: failed to remove {}", m_shadow_filename); } } - LOGINFO("Teardown with Root bnode_id {} tree size: {}", this->m_bt->root_node_id(), - this->m_bt->count_keys(this->m_bt->root_node_id())); + LOGINFO("Teardown with Root bnode_id {} tree size: {}", this->m_bt->root_node_id(), this->tree_key_count()); BtreeTestHelper< TestType >::TearDown(); this->shutdown_homestore(false); } void crash_and_recover(uint32_t s_key, uint32_t e_key) { - this->print_keys("Btree prior to CP and susbsequent simulated crash: "); - test_common::HSTestHelper::trigger_cp(false); - this->wait_for_crash_recovery(); + // this->print_keys("Btree prior to CP and susbsequent simulated crash: "); + trigger_cp(false); + this->wait_for_crash_recovery(true); // this->visualize_keys("tree_after_crash_" + std::to_string(s_key) + "_" + std::to_string(e_key) + ".dot"); - this->print_keys("Post crash and recovery, btree structure: "); + // this->print_keys("Post crash and recovery, btree structure: "); this->reapply_after_crash(); + // this->print_keys("Post reapply, btree structure: "); + this->get_all(); LOGINFO("Expect to have [{},{}) in tree and it is actually{} ", s_key, e_key, tree_key_count()); - ASSERT_EQ(this->m_shadow_map.size(), this->m_bt->count_keys(this->m_bt->root_node_id())) - << "shadow map size and tree size mismatch"; + ASSERT_EQ(this->m_shadow_map.size(), this->tree_key_count()) << "shadow map size and tree size mismatch"; + } + + void sanity_check(OperationList& operations) const { + std::set< uint64_t > new_keys; + std::transform(operations.begin(), operations.end(), std::inserter(new_keys, new_keys.end()), + [](const Operation& operation) { return operation.first; }); + uint32_t count = 0; + this->m_shadow_map.foreach ([this, new_keys, &count](K key, V value) { + // discard the new keys to check + if (new_keys.find(key.key()) != new_keys.end()) { 
return; }
+            count++;
+            auto copy_key = std::make_unique< K >();
+            *copy_key = key;
+            auto out_v = std::make_unique< V >();
+            auto req = BtreeSingleGetRequest{copy_key.get(), out_v.get()};
+            req.enable_route_tracing();
+            const auto ret = this->m_bt->get(req);
+            if (ret != btree_status_t::success) {
+                this->print_keys(fmt::format("Sanity check: key {}", key.key()));
+                this->dump_to_file("sanity_fail.txt");
+            }
+            ASSERT_EQ(ret, btree_status_t::success) << "Missing key " << key << " in btree but present in shadow map";
+        });
+        LOGINFO("Sanity check passed for {} keys!", count);
     }

-    void crash_and_recover(OperationList& operations, std::string filename = "") {
-        // this->print_keys("Btree prior to CP and susbsequent simulated crash: ");
-        test_common::HSTestHelper::trigger_cp(false);
-        this->wait_for_crash_recovery();
-        // this->print_keys("Post crash and recovery, btree structure:");
+    void crash_and_recover_common(OperationList& operations, std::string filename = "") {
+        // this->print_keys("Btree prior to CP and subsequent simulated crash: ");
+        LOGINFO("Before Crash: {} keys in shadow map and actually {} keys in tree - operations size {}",
+                this->m_shadow_map.size(), tree_key_count(), operations.size());
         if (!filename.empty()) {
-            LOGINFO("Visualize the tree file {}", filename);
-            this->visualize_keys(filename);
+            std::string b_filename = filename + "_before_crash.dot";
+            LOGINFO("Visualize the tree before crash file {}", b_filename);
+            this->visualize_keys(b_filename);
         }
-        this->reapply_after_crash(operations);
+        trigger_cp(false);
+        LOGINFO("Waiting for crash recovery");
+        this->wait_for_crash_recovery(true);

-        // this->print_keys("\n\nafter reapply keys");
         if (!filename.empty()) {
-            LOGINFO("Visualize the tree file after_reapply__{}", filename);
-            this->visualize_keys("after_reapply__" + filename);
+            std::string rec_filename = filename + "_after_recovery.dot";
+            LOGINFO("Visualize the tree file after recovery: {}", rec_filename);
+            this->visualize_keys(rec_filename);
+        }
+        // this->print_keys("Post crash and recovery, btree structure: ");
+        sanity_check(operations);
+        // Added to the index service right after recovery. Not needed here
+        // test_common::HSTestHelper::trigger_cp(true);
+        LOGINFO("Before Reapply: {} keys in shadow map and actually {} in tree - operations size {}",
+                this->m_shadow_map.size(), tree_key_count(), operations.size());
+        this->reapply_after_crash(operations);
+        if (!filename.empty()) {
+            std::string re_filename = filename + "_after_reapply.dot";
+            LOGINFO("Visualize the tree after reapply {}", re_filename);
+            this->visualize_keys(re_filename);
+        }
+        // this->print_keys("Post reapply, btree structure: ");
         this->get_all();
+        LOGINFO("After reapply: {} keys in shadow map and actually {} in tree", this->m_shadow_map.size(),
+                tree_key_count());
+        ASSERT_EQ(this->m_shadow_map.size(), this->m_bt->count_keys(this->m_bt->root_node_id()))
+            << "shadow map size and tree size mismatch";
+    }
+
+    void crash_and_recover(std::string& flip, OperationList& operations, std::string filename = "") {
+        this->remove_flip(flip);
+        this->crash_and_recover_common(operations, filename);
+    }
+
+    void crash_and_recover(std::vector< std::string >& flips, OperationList& operations, std::string filename = "") {
+        for (auto const& flip : flips) {
+            this->remove_flip(flip);
+        }
+        this->crash_and_recover_common(operations, filename);
     }

     uint32_t tree_key_count() { return this->m_bt->count_keys(this->m_bt->root_node_id()); }

+    void long_running_crash(long_running_crash_options const& crash_test_options) {
+        // set putFreq 100 for the initial load
+        SequenceGenerator generator(100 /*putFreq*/, 0 /* removeFreq*/, 0 /*start_range*/,
+                                    crash_test_options.num_entries - 1 /*end_range*/);
+
+        std::vector< std::string > flips;
+        OperationList operations;
+        auto m_start_time = Clock::now();
+        auto time_to_stop = [this, m_start_time]() { return (get_elapsed_time_sec(m_start_time) > this->m_run_time); };
+        double elapsed_time, progress_percent, last_progress_time = 0;
+        bool renew_btree_after_crash = false;
+        auto cur_put_flip_idx = 0;
+        auto cur_remove_flip_idx = 0;
+        std::uniform_int_distribution<> dis(1, 100);
+        int flip_percentage = 90; // Set the desired percentage here
+        bool normal_execution = true;
+        bool clean_shutdown = true;
+        // in save mode, delete all previously saved operation and flip files
+        if (crash_test_options.save_mode) {
+            std::filesystem::remove_all("/tmp/operations_*.txt");
+            std::filesystem::remove_all("/tmp/flips_history.txt");
+        }
+        // init tree
+        LOGINFO("Step 0: Fill up the tree with {} entries", crash_test_options.preload_size);
+        if (crash_test_options.load_mode) {
+            operations = SequenceGenerator::load_from_file(fmt::format("/tmp/operations_0.txt"));
+        } else {
+            operations = generator.generateOperations(crash_test_options.preload_size, true /* reset */);
+            if (crash_test_options.save_mode) {
+                SequenceGenerator::save_to_file(fmt::format("/tmp/operations_0.txt"), operations);
+            }
+        }
+
+        LOGDEBUG("Operations before crash:\n{}", SequenceGenerator::printOperations(operations));
+        uint32_t num_keys{0};
+
+        for (auto [k, _] : operations) {
+            this->put(k, btree_put_type::INSERT, true /* expect_success */);
+            num_keys++;
+        }
+
+        generator.setPutFrequency(crash_test_options.put_freq);
+        generator.setRemoveFrequency(100 - crash_test_options.put_freq);
+
+        // Trigger a cp to make sure the preloaded entries are persisted
+        LOGINFO("Step 0-1: Flush all the entries so far");
+        test_common::HSTestHelper::trigger_cp(true);
+        this->get_all();
+        this->m_shadow_map.save(this->m_shadow_filename);
+        // this->print_keys("reapply: after preload");
+        this->visualize_keys("tree_after_preload.dot");
+
+        for (uint32_t round
<= crash_test_options.rounds && !time_to_stop(); round++) { + LOGINFO("\n\n\n\n\n\nRound {} of {}\n\n\n\n\n\n", round, crash_test_options.rounds); + bool print_time = false; + elapsed_time = get_elapsed_time_sec(m_start_time); + + if (crash_test_options.load_mode) { + operations = SequenceGenerator::load_from_file(fmt::format("/tmp/operations_{}.txt", round)); + } else { + operations = generator.generateOperations(crash_test_options.num_entries_per_rounds, + renew_btree_after_crash /* reset */); + if (crash_test_options.save_mode) { + SequenceGenerator::save_to_file(fmt::format("/tmp/operations_{}.txt", round), operations); + } + } + if (operations.empty()) { + LOGDEBUG("No operations generated, skipping round {}", round); + continue; + } + + flips.clear(); + if (crash_test_options.load_mode) { + std::ifstream file("/tmp/flips_history.txt"); + std::string line; + bool found = false; + for (uint32_t i = 0; i < round && std::getline(file, line); i++) { + if (i == round - 1) { + found = true; + break; + } + } + if (found && !line.empty()) { + if (line == "normal") { + normal_execution = true; + } else { + normal_execution = false; + std::istringstream iss(line); + std::string flip; + while (iss >> flip) { + flips.emplace_back(flip); + } + auto log_str = fmt::format("Step 1-{}: Set flag", round); + for (auto const& f : flips) { + log_str += fmt::format(" {}", f); + this->set_basic_flip(f, 1, 100); + } + LOGINFO("{}", log_str); + } + } + file.close(); + } else { + if (dis(g_re) <= flip_percentage) { + if (!crash_test_options.put_flips.empty()) { + flips.emplace_back( + crash_test_options.put_flips[cur_put_flip_idx++ % crash_test_options.put_flips.size()]); + } + if (!crash_test_options.remove_flips.empty()) { + flips.emplace_back(crash_test_options.remove_flips[cur_remove_flip_idx++ % + crash_test_options.remove_flips.size()]); + } + auto log_str = fmt::format("Step 1-{}: Set flag", round); + for (auto const& f : flips) { + log_str += fmt::format(" {}", f); + this->set_basic_flip(f, 1, 100); + } + LOGINFO("{}", log_str); + normal_execution = false; + } else { + normal_execution = true; + LOGINFO("Step 1-{}: No flip set", round); + } + if (crash_test_options.save_mode) { + // save the filp name to a file for later use + std::ofstream file("/tmp/flips_history.txt", std::ios::app); + if (file.is_open()) { + std::string out_line{"normal"}; + if (!normal_execution) { + out_line = flips[0]; + for (size_t i = 1; i < flips.size(); i++) { + out_line += " " + flips[i]; + } + } + file << out_line << "\n"; + } + file.close(); + } + } + + LOGDEBUG("Lets before crash print operations\n{}", SequenceGenerator::printOperations(operations)); + + for (auto [k, op] : operations) { + if (op == OperationType::Remove) { + if (num_keys < 1) { + // remove flips and continue + for (auto const& flip : flips) { + this->remove_flip(flip); + } + continue; + } + LOGDEBUG("Removing key {}", k); + this->remove_one(k, true /* expect_success */); + num_keys--; + } else { + if (num_keys >= crash_test_options.num_entries) { + // remove flips and continue + for (auto const& flip : flips) { + this->remove_flip(flip); + } + continue; + } + LOGDEBUG("Inserting key {}", k); + this->put(k, btree_put_type::INSERT, true /* expect_success */); + num_keys++; + } + if (!time_to_stop()) { + static bool print_alert = false; + if (print_alert) { + LOGINFO("It is time to stop but let's finish this round and then stop!"); + print_alert = false; + } + } + } + if (normal_execution) { + if (clean_shutdown) { + 
this->m_shadow_map.save(this->m_shadow_filename); + this->restart_homestore(); + } else { + test_common::HSTestHelper::trigger_cp(true); + this->get_all(); + } + } else { + // remove the flips so that they do not get triggered erroneously + this->crash_and_recover(flips, operations, fmt::format("long_tree_{}", round)); + } + if (elapsed_time - last_progress_time > 30) { + last_progress_time = elapsed_time; + print_time = true; + } + if (print_time) { + LOGINFO( + "\n\n\n\t\t\tProgress: {} rounds of total {} ({:.2f}%) completed - Elapsed time: {:.0f} seconds of " + "total {} ({:.2f}%) - {} keys of maximum {} keys ({:.2f}%) inserted\n\n\n", + round, crash_test_options.rounds, round * 100.0 / crash_test_options.rounds, elapsed_time, + this->m_run_time, elapsed_time * 100.0 / this->m_run_time, this->tree_key_count(), + crash_test_options.num_entries, this->tree_key_count() * 100.0 / crash_test_options.num_entries); + } + // this->print_keys(fmt::format("reapply: after round {}", round)); + if (renew_btree_after_crash) { this->reset_btree(); }; + } + this->destroy_btree(); + log_obj_life_counter(); + } + protected: const std::string m_shadow_filename = "/tmp/shadow_map_index_recovery.txt"; }; // Crash recovery can test one simple btree, since focus is not on btree test itself, but index recovery -using BtreeTypes = testing::Types< FixedLenBtree >; +using BtreeTypes = testing::Types< FixedLenBtree, PrefixIntervalBtree >; TYPED_TEST_SUITE(IndexCrashTest, BtreeTypes); TYPED_TEST(IndexCrashTest, CrashBeforeFirstCp) { + this->m_shadow_map.range_erase(0, SISL_OPTIONS["num_entries"].as< uint32_t >() - 1); + this->m_shadow_map.save(this->m_shadow_filename); // Simulate the crash even before first cp this->set_basic_flip("crash_flush_on_root"); @@ -385,13 +746,15 @@ TYPED_TEST(IndexCrashTest, CrashBeforeFirstCp) { // Trigger a cp, which should induce the crash and wait for hs to recover test_common::HSTestHelper::trigger_cp(false); - this->wait_for_crash_recovery(); + this->wait_for_crash_recovery(true); // Post crash, load the shadow_map into a new instance and compute the diff. 
Redo the operation this->reapply_after_crash(); } TYPED_TEST(IndexCrashTest, SplitOnLeftEdge) { + this->m_shadow_map.range_erase(0, SISL_OPTIONS["num_entries"].as< uint32_t >() - 1); + this->m_shadow_map.save(this->m_shadow_filename); // Insert into 4 phases, first fill up the last part, since we need to test split on left edge LOGINFO("Step 1: Fill up the last quarter of the tree"); auto const num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); @@ -441,82 +804,6 @@ TYPED_TEST(IndexCrashTest, SplitOnLeftEdge) { this->query_all_paginate(80); } -/* -TYPED_TEST(IndexCrashTest, ManualMergeCrash){ - // Define the lambda function - const uint32_t num_entries = 30; - - auto initTree = [this, num_entries]() { - for (uint64_t k = 0u; k < num_entries; ++k) { - this->force_upsert(k); - } - test_common::HSTestHelper::trigger_cp(true); - this->m_shadow_map.save(this->m_shadow_filename); - }; - - std::vector< OperationList > removing_scenarios = { - {{29, OperationType::Remove}, - {28, OperationType::Remove}, - {27, OperationType::Remove}, - {26, OperationType::Remove}, - {25, OperationType::Remove}, - {24, OperationType::Remove}} - }; - - auto scenario = removing_scenarios[0]; - - LOGINFO("Step 1-1: Populate some keys and flush"); - initTree(); - this->visualize_keys("tree_init.dot"); - LOGINFO("Step 2-1: Set crash flag, remove some keys in reverse order"); - this->set_basic_flip("crash_flush_on_merge_at_parent"); - - for (auto [k, _] : scenario) { - LOGINFO("\n\n\t\t\t\t\t\t\t\t\t\t\t\t\tRemoving entry {}", k); - this->remove_one(k); - } - this->visualize_keys("tree_before_crash.dot"); - - LOGINFO("Step 3-1: Trigger cp to crash"); - this->crash_and_recover(scenario, "recover_tree_crash_1.dot"); - test_common::HSTestHelper::trigger_cp(true); - this->get_all(); - - LOGINFO("Step 1-2: Populate some keys and flush"); - initTree(); - this->visualize_keys("tree_init_02.dot"); - LOGINFO("Step 2-2: Set crash flag, remove some keys in reverse order"); - this->set_basic_flip("crash_flush_on_merge_at_left_child"); - for (auto [k, _] : scenario) { - LOGINFO("\n\n\t\t\t\t\t\t\t\t\t\t\t\t\tRemoving entry {}", k); - this->remove_one(k); - } - this->visualize_keys("tree_before_crash_2.dot"); - - LOGINFO("Step 3-2: Trigger cp to crash"); - this->crash_and_recover(scenario, "recover_tree_crash_2.dot"); - test_common::HSTestHelper::trigger_cp(true); - this->get_all(); - - LOGINFO("Step 1-3: Populate some keys and flush"); - initTree(); - this->visualize_keys("tree_init_03.dot"); - LOGINFO("Step 2-3: Set crash flag, remove some keys in reverse order"); - this->set_basic_flip("crash_flush_on_freed_child"); - for (auto [k, _] : scenario) { - LOGINFO("\n\n\t\t\t\t\t\t\t\t\t\t\t\t\tRemoving entry {}", k); - this->remove_one(k); - } - LOGINFO("Step 2-3: Set crash flag, remove some keys in reverse order"); - this->visualize_keys("tree_before_crash_3.dot"); - - LOGINFO("Step 3-3: Trigger cp to crash"); - this->crash_and_recover(scenario, "recover_tree_crash_3.dot"); - test_common::HSTestHelper::trigger_cp(true); - this->get_all(); -} -*/ - TYPED_TEST(IndexCrashTest, SplitCrash1) { // Define the lambda function auto const num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); @@ -524,11 +811,11 @@ TYPED_TEST(IndexCrashTest, SplitCrash1) { vector< std::string > flips = {"crash_flush_on_split_at_parent", "crash_flush_on_split_at_left_child", "crash_flush_on_split_at_right_child"}; OperationList operations; + bool renew_btree_after_crash = true; for (size_t i = 0; i < flips.size(); ++i) { - this->reset_btree(); 
LOGINFO("Step 1-{}: Set flag {}", i + 1, flips[i]); this->set_basic_flip(flips[i]); - operations = generator.generateOperations(num_entries -1 , true /* reset */); + operations = generator.generateOperations(num_entries - 1, renew_btree_after_crash /* reset */); // LOGINFO("Batch {} Operations:\n {} \n ", i + 1, generator.printOperations(operations)); // LOGINFO("Detailed Key Occurrences for Batch {}:\n {} \n ", i + 1, // generator.printKeyOccurrences(operations)); @@ -536,52 +823,266 @@ TYPED_TEST(IndexCrashTest, SplitCrash1) { // LOGINFO("\t\t\t\t\t\t\t\t\t\t\t\t\tupserting entry {}", k); this->put(k, btree_put_type::INSERT, true /* expect_success */); } - this->crash_and_recover(operations, fmt::format("recover_tree_crash_{}.dot", i + 1)); + this->crash_and_recover(flips[i], operations, fmt::format("recover_tree_crash_{}.dot", i + 1)); + if (renew_btree_after_crash) { this->reset_btree(); }; } } TYPED_TEST(IndexCrashTest, long_running_put_crash) { - // Define the lambda function - auto const num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); - SequenceGenerator generator(100 /*putFreq*/, 0 /* removeFreq*/, 0 /*start_range*/, num_entries - 1 /*end_range*/); - vector< std::string > flips = {"crash_flush_on_split_at_parent", "crash_flush_on_split_at_left_child", - "crash_flush_on_split_at_right_child"}; - OperationList operations; - auto m_start_time = Clock::now(); - auto time_to_stop = [this, m_start_time]() { return (get_elapsed_time_sec(m_start_time) > this->m_run_time); }; - double elapsed_time, progress_percent, last_progress_time = 0; - for (size_t i = 0; !time_to_stop(); ++i) { - bool print_time = false; - elapsed_time = get_elapsed_time_sec(m_start_time); + long_running_crash_options crash_test_options{ + .put_freq = 100, + .put_flips = {"crash_flush_on_split_at_parent", "crash_flush_on_split_at_left_child", + "crash_flush_on_split_at_right_child"}, + }; + this->long_running_crash(crash_test_options); +} + +TYPED_TEST(IndexCrashTest, long_running_remove_crash) { + long_running_crash_options crash_test_options{ + .put_freq = 0, + .remove_flips = {"crash_flush_on_merge_at_parent", "crash_flush_on_merge_at_left_child" + /*, "crash_flush_on_freed_child"*/}, + .preload_size = SISL_OPTIONS["num_entries"].as< uint32_t >(), + }; + this->long_running_crash(crash_test_options); +} + +TYPED_TEST(IndexCrashTest, long_running_put_remove_crash) { + long_running_crash_options crash_test_options{ + .put_freq = 50, + .put_flips = {"crash_flush_on_split_at_parent", "crash_flush_on_split_at_left_child", + "crash_flush_on_split_at_right_child"}, + .remove_flips = {"crash_flush_on_merge_at_parent", "crash_flush_on_merge_at_left_child" + /*, "crash_flush_on_freed_child"*/}, + }; + this->long_running_crash(crash_test_options); +} + +// Basic reverse and forward order remove with different flip points +TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { + vector< std::string > flip_points = { + "crash_flush_on_merge_at_parent", "crash_flush_on_merge_at_left_child", + // "crash_flush_on_freed_child", + }; + for (size_t i = 0; i < flip_points.size(); ++i) { this->reset_btree(); - auto flip = flips[i % flips.size()]; - LOGINFO("Step 1-{}: Set flag {}", i + 1, flip); - this->set_basic_flip(flip, 1, 10); - operations = generator.generateOperations(num_entries -1, true /* reset */); - // operations = generator.generateOperations(num_entries/10, false /* reset */); - // LOGINFO("Batch {} Operations:\n {} \n ", i + 1, generator.printOperations(operations)); - // LOGINFO("Detailed Key Occurrences for Batch 
{}:\n {} \n ", i + 1, - // generator.printKeyOccurrences(operations)); - for (auto [k, _] : operations) { - // LOGINFO("\t\t\t\t\t\t\t\t\t\t\t\t\tupserting entry {}", k); + auto& flip_point = flip_points[i]; + LOGINFO("=== Testing flip point: {} - {} ===", i + 1, flip_point); + + // Populate some keys [1,num_entries) and trigger cp to persist + LOGINFO("Step {}-1: Populate some keys and flush", i + 1); + auto const num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); + for (auto k = 0u; k < num_entries; ++k) { this->put(k, btree_put_type::INSERT, true /* expect_success */); } - this->crash_and_recover(operations/*, fmt::format("recover_tree_crash_{}.dot", i + 1)*/); - if (elapsed_time - last_progress_time > 30) { - last_progress_time = elapsed_time; - print_time = true; + test_common::HSTestHelper::trigger_cp(true); + this->m_shadow_map.save(this->m_shadow_filename); + + this->visualize_keys("tree_merge_full.dot"); + + // Split keys into batches and remove the last one in reverse order + LOGINFO("Step {}-2: Set crash flag, remove some keys in reverse order", i + 1); + int batch_num = 4; + { + int n = batch_num; + auto r = num_entries * n / batch_num - 1; + auto l = num_entries * (n - 1) / batch_num; + OperationList ops; + for (auto k = r; k >= l; --k) { + ops.emplace_back(k, OperationType::Remove); + } + LOGINFO("Step {}-2-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, r, l); + + this->set_basic_flip(flip_point); + for (auto [k, _] : ops) { + this->remove_one(k, true); + } + this->visualize_keys("tree_merge_before_first_crash.dot"); + + LOGINFO("Step {}-2-2: Trigger cp to crash", i + 1); + this->crash_and_recover(ops); } - if (print_time) { - LOGINFO("\n\n\n\t\t\tProgress: {} iterations completed - Elapsed time: {:.0f} seconds of total " - "{} ({:.2f}%)\n\n\n", - i, elapsed_time, this->m_run_time, elapsed_time * 100.0 / this->m_run_time); + + // Remove the next batch of keys in forward order + LOGINFO("Step {}-3: Remove another batch in ascending order", i + 1) { + int n = batch_num - 1; + auto r = num_entries * n / batch_num - 1; + auto l = num_entries * (n - 1) / batch_num; + OperationList ops; + for (auto k = l; k <= r; ++k) { + ops.emplace_back(k, OperationType::Remove); + } + LOGINFO("Step {}-3-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, l, r); + + this->set_basic_flip(flip_point); + for (auto [k, _] : ops) { + this->remove_one(k, true); + } + this->visualize_keys("tree_merge_before_second_crash.dot"); + + LOGINFO("Step {}-3-2: Trigger cp to crash", i + 1); + this->crash_and_recover(ops); } - this->print_keys(fmt::format("reapply: after iteration {}", i)); + // Remove the next batch of keys in random order + LOGINFO("Step {}-4: Remove another batch in random order", i + 1) { + int n = batch_num - 2; + auto r = num_entries * n / batch_num - 1; + auto l = num_entries * (n - 1) / batch_num; + SequenceGenerator generator(0, 100, l, r); + generator.fillRange(l, r); + OperationList ops = generator.generateOperations(r - l + 1, false); + + LOGINFO("Step {}-4-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, l, r); + + this->set_basic_flip(flip_point); + for (auto [k, _] : ops) { + this->remove_one(k, true); + } + this->visualize_keys("tree_merge_before_third_crash.dot"); + + LOGINFO("Step {}-4-2: Trigger cp to crash", i + 1); + this->crash_and_recover(ops); + } + + LOGINFO("Step {}-5: Cleanup the tree", i + 1); + for (auto k = 0u; k < num_entries; ++k) { + this->remove_one(k, false); + } + test_common::HSTestHelper::trigger_cp(true); 
+ this->get_all(); } } + +// +// TYPED_TEST(IndexCrashTest, MergeCrash1) { +// auto const num_entries = SISL_OPTIONS["num_entries"].as(); +// vector flips = { +// "crash_flush_on_merge_at_parent", "crash_flush_on_merge_at_left_child", +// }; +// SequenceGenerator generator(0 /*putFreq*/, 100 /* removeFreq*/, 0 /*start_range*/, num_entries - 1 +// /*end_range*/); OperationList operations; for (size_t i = 0; i < flips.size(); ++i) { +// this->reset_btree(); +// LOGINFO("Step {}-1: Init btree", i + 1); +// for (auto k = 0u; k < num_entries; ++k) { +// this->put(k, btree_put_type::INSERT, true /* expect_success */); +// } +// test_common::HSTestHelper::trigger_cp(true); +// this->print_keys("Inited tree"); +// +// LOGINFO("Step {}-2: Set flag {}", i + 1, flips[i]); +// this->set_basic_flip(flips[i], 1, 10); +// generator.reset(); +// generator.fillRange(0, num_entries - 1); +// +// // Randomly remove some keys +// std::random_device rd; +// std::mt19937 gen(rd()); +// std::uniform_int_distribution<> dis(num_entries / 4, num_entries / 2); +// auto num_keys_to_remove = dis(gen); +// LOGINFO("Removing {} keys before crash", num_keys_to_remove); +// operations = generator.generateOperations(num_keys_to_remove, false /* reset */); +// for (auto [k, _]: operations) { +// LOGINFO("Removing key {}", k); +// this->remove_one(k, true); +// } +// +// LOGINFO("Step {}-3: Simulate crash and recover", i + 1); +// this->crash_and_recover(operations, fmt::format("recover_tree_crash_{}.dot", i + 1)); +// } +// } +// +// TYPED_TEST(IndexCrashTest, MergeManualCrash) { +// std::vector flip_points = { +// "crash_flush_on_merge_at_parent", +// "crash_flush_on_merge_at_left_child", +// }; +// +// constexpr uint32_t num_entries = 28; // with max=5 & min=3 +// +// auto initTree = [this, num_entries]() { +// for (auto k = 0u; k < num_entries; ++k) { +// this->put(k, btree_put_type::INSERT, true /* expect_success */); +// } +// test_common::HSTestHelper::trigger_cp(true); +// this->m_shadow_map.save(this->m_shadow_filename); +// }; +// +// std::vector removing_scenarios = { +// { +// {27, OperationType::Remove}, +// {26, OperationType::Remove}, +// {25, OperationType::Remove}, +// {24, OperationType::Remove}, +// {23, OperationType::Remove}, +// {22, OperationType::Remove}, +// }, // Merge 2 rightmost leaf nodes in 1 action +// { +// {27, OperationType::Remove}, +// {26, OperationType::Remove}, +// {25, OperationType::Remove}, +// {24, OperationType::Remove}, +// {23, OperationType::Remove}, +// {20, OperationType::Remove}, +// {19, OperationType::Remove}, +// }, // Merge 3 rightmost leaf nodes in 1 action +// { +// {27, OperationType::Remove}, +// {26, OperationType::Remove}, +// {25, OperationType::Remove}, +// {24, OperationType::Remove}, +// {23, OperationType::Remove}, +// {22, OperationType::Remove}, +// {21, OperationType::Remove}, +// {20, OperationType::Remove}, +// {19, OperationType::Remove}, +// }, // Merge 3 rightmost leaf nodes in 2 actions +// { +// {23, OperationType::Remove}, +// {22, OperationType::Remove}, +// {11, OperationType::Remove}, +// {10, OperationType::Remove}, +// {13, OperationType::Remove}, +// }, // Merge from level=0 then level=1 +// // { +// // {16, OperationType::Remove}, +// // }, // Merge from level=1 then level=0 - need to set min=4 +// }; +// +// for (int i = 0; i < static_cast(removing_scenarios.size()); i++) { +// auto scenario = removing_scenarios[i]; +// auto s_idx = i + 1; +// LOGINFO("\n\tTesting scenario {}", s_idx); +// for (int j = 0; j < static_cast(flip_points.size()); 
j++) { +// const auto &flip_point = flip_points[j]; +// auto f_idx = j + 1; +// LOGINFO("\n\t\t\t\tTesting flip point: {}", flip_point); +// +// LOGINFO("Step {}-{}-1: Populate keys and flush", s_idx, f_idx); +// initTree(); +// this->visualize_keys(fmt::format("tree_init.{}_{}.dot", s_idx, f_idx)); +// +// LOGINFO("Step {}-{}-2: Set crash flag, remove keys in reverse order", s_idx, f_idx); +// this->set_basic_flip(flip_point); +// for (auto k: scenario) { +// LOGINFO("Removing entry {}", k.first); +// this->remove_one(k.first); +// } +// this->visualize_keys(fmt::format("tree_before_first_crash.{}_{}.dot", s_idx, f_idx)); +// this->remove_flip(flip_point); +// +// LOGINFO("Step {}-{}-3: Trigger cp to crash", s_idx, f_idx); +// this->crash_and_recover(scenario); +// test_common::HSTestHelper::trigger_cp(true); +// this->get_all(); +// +// this->reset_btree(); +// test_common::HSTestHelper::trigger_cp(true); +// } +// } +// } #endif int main(int argc, char* argv[]) { diff --git a/src/tests/test_log_dev.cpp b/src/tests/test_log_dev.cpp index cbe8ff760..871eafdaf 100644 --- a/src/tests/test_log_dev.cpp +++ b/src/tests/test_log_dev.cpp @@ -157,6 +157,32 @@ class LogDevTest : public ::testing::Test { } } + void insert_batch_sync(std::shared_ptr< HomeLogStore > log_store, logstore_seq_num_t& lsn, int64_t batch, + uint32_t fixed_size = 0) { + bool io_memory{false}; + std::vector< test_log_data* > data_vector; + + for (int64_t i = 0; i < batch; ++i) { + auto* d = prepare_data(lsn + i, io_memory, fixed_size); + data_vector.push_back(d); // Store the pointer in the vector + log_store->write_async(lsn + i, {uintptr_cast(d), d->total_size(), false}, nullptr, nullptr); + LOGINFO("Written async data for LSN -> {}:{}", log_store->get_store_id(), lsn + i); + } + + log_store->flush(); + LOGINFO("Flush data from {} to {}", lsn, lsn + batch); + lsn += batch; + + // Free all the allocated memory after the batch insert + for (auto* d : data_vector) { + if (io_memory) { + iomanager.iobuf_free(uintptr_cast(d)); + } else { + std::free(voidptr_cast(d)); + } + } + } + void kickstart_inserts(std::shared_ptr< HomeLogStore > log_store, logstore_seq_num_t& cur_lsn, int64_t batch, uint32_t fixed_size = 0) { auto last = cur_lsn + batch; @@ -200,8 +226,13 @@ class LogDevTest : public ::testing::Test { read_all_verify(log_store); } - void truncate_validate(std::shared_ptr< HomeLogStore > log_store) { + void truncate_validate(std::shared_ptr< HomeLogStore > log_store, logstore_seq_num_t* trunc_lsn = nullptr) { auto upto = log_store->get_contiguous_completed_seq_num(-1); + if (trunc_lsn && *trunc_lsn != upto) { + LOGWARN("Truncate issued upto {} but real upto lsn in log store is {}", *trunc_lsn, upto); + upto = *trunc_lsn; + } + LOGINFO("truncate_validate upto {}", upto); log_store->truncate(upto); read_all_verify(log_store); @@ -212,6 +243,20 @@ class LogDevTest : public ::testing::Test { auto actual_count = log_store->get_logdev()->log_dev_meta().num_rollback_records(log_store->get_store_id()); ASSERT_EQ(actual_count, expected_count); } + + logid_t get_last_truncate_idx(logdev_id_t logdev_id) { + auto status = logstore_service().get_logdev(logdev_id)->get_status(0); + if (status.contains("last_truncate_log_idx")) { return s_cast< logid_t >(status["last_truncate_log_idx"]); } + LOGERROR("Failed to get last_truncate_log_idx from logdev status for logdev_id {}", logdev_id); + return static_cast< logid_t >(-1); + } + + logid_t get_current_log_idx(logdev_id_t logdev_id) { + auto status = 
logstore_service().get_logdev(logdev_id)->get_status(0); + if (status.contains("current_log_idx")) { return s_cast< logid_t >(status["current_log_idx"]); } + LOGERROR("Failed to get current_log_idx from logdev status for logdev_id {}", logdev_id); + return static_cast< logid_t >(-1); + } }; TEST_F(LogDevTest, WriteSyncThenRead) { @@ -219,7 +264,7 @@ TEST_F(LogDevTest, WriteSyncThenRead) { for (uint32_t iteration{0}; iteration < iterations; ++iteration) { LOGINFO("Iteration {}", iteration); - auto logdev_id = logstore_service().create_new_logdev(); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); auto log_store = logstore_service().create_new_log_store(logdev_id, false); const auto store_id = log_store->get_store_id(); @@ -239,7 +284,7 @@ TEST_F(LogDevTest, WriteSyncThenRead) { TEST_F(LogDevTest, Rollback) { LOGINFO("Step 1: Create a single logstore to start rollback test"); - auto logdev_id = logstore_service().create_new_logdev(); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); auto log_store = logstore_service().create_new_log_store(logdev_id, false); auto store_id = log_store->get_store_id(); @@ -247,7 +292,7 @@ TEST_F(LogDevTest, Rollback) { auto restart = [&]() { std::promise< bool > p; auto starting_cb = [&]() { - logstore_service().open_logdev(logdev_id); + logstore_service().open_logdev(logdev_id, flush_mode_t::EXPLICIT); logstore_service().open_log_store(logdev_id, store_id, false /* append_mode */).thenValue([&](auto store) { log_store = store; p.set_value(true); @@ -304,6 +349,351 @@ TEST_F(LogDevTest, Rollback) { rollback_records_validate(log_store, 0 /* expected_count */); } +TEST_F(LogDevTest, ReTruncate) { + LOGINFO("Step 1: Create a single logstore to start re-truncate test"); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); + s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); + auto log_store = logstore_service().create_new_log_store(logdev_id, false); + + LOGINFO("Step 2: Issue sequential inserts with q depth of 10"); + logstore_seq_num_t cur_lsn = 0; + kickstart_inserts(log_store, cur_lsn, 500); + + LOGINFO("Step 3: Truncate all entries"); + logstore_seq_num_t ls_last_lsn = 499; + truncate_validate(log_store, &ls_last_lsn); + ASSERT_EQ(log_store->start_lsn(), ls_last_lsn + 1); + ASSERT_EQ(log_store->tail_lsn(), ls_last_lsn); + ASSERT_EQ(log_store->truncated_upto(), ls_last_lsn); + + LOGINFO("Step 4: Truncate again"); + truncate_validate(log_store, &ls_last_lsn); + ASSERT_EQ(log_store->start_lsn(), ls_last_lsn + 1); + ASSERT_EQ(log_store->tail_lsn(), ls_last_lsn); + ASSERT_EQ(log_store->truncated_upto(), ls_last_lsn); + + LOGINFO("Step 5: Read and verify all entries again"); + read_all_verify(log_store); +} + +TEST_F(LogDevTest, TruncateWithExceedingLSN) { + LOGINFO("Step 1: Create a single logstore to start truncate with exceeding LSN test"); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); + s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); + auto log_store = logstore_service().create_new_log_store(logdev_id, false); + + LOGINFO("Step 2: Insert 500 entries"); + logstore_seq_num_t cur_lsn = 0; + kickstart_inserts(log_store, cur_lsn, 500); + + LOGINFO("Step 3: Read and verify all 
entries"); + read_all_verify(log_store); + + LOGINFO("Step 4: Truncate 100 entries"); + logstore_seq_num_t trunc_lsn = 99; + truncate_validate(log_store, &trunc_lsn); + ASSERT_EQ(log_store->start_lsn(), trunc_lsn + 1); + ASSERT_EQ(log_store->tail_lsn(), 499); + ASSERT_EQ(log_store->next_lsn(), 500); + ASSERT_EQ(log_store->truncated_upto(), trunc_lsn); + + LOGINFO("Step 5: Read and verify all entries"); + read_all_verify(log_store); + + LOGINFO("Step 6: Truncate all with exceeding lsn"); + trunc_lsn = 1999999; + truncate_validate(log_store, &trunc_lsn); + ASSERT_EQ(log_store->start_lsn(), trunc_lsn + 1); + ASSERT_EQ(log_store->tail_lsn(), trunc_lsn); + ASSERT_EQ(log_store->next_lsn(), 2000000); + ASSERT_EQ(log_store->truncated_upto(), trunc_lsn); + + LOGINFO("Step 7 Read and verify all entries"); + read_all_verify(log_store); + + LOGINFO("Step 8: Append 500 entries"); + cur_lsn = log_store->next_lsn(); + kickstart_inserts(log_store, cur_lsn, 500); + ASSERT_EQ(log_store->next_lsn(), 2000500); + + LOGINFO("Step 9: Read and verify all entries"); + read_all_verify(log_store); +} + +TEST_F(LogDevTest, TruncateAfterRestart) { + LOGINFO("Step 1: Create a single logstore to start truncate with overlapping LSN test"); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); + s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); + auto log_store = logstore_service().create_new_log_store(logdev_id, false); + auto store_id = log_store->get_store_id(); + + auto restart = [&]() { + std::promise< bool > p; + auto starting_cb = [&]() { + logstore_service().open_logdev(logdev_id, flush_mode_t::EXPLICIT); + logstore_service().open_log_store(logdev_id, store_id, false /* append_mode */).thenValue([&](auto store) { + log_store = store; + p.set_value(true); + }); + }; + start_homestore(true /* restart */, starting_cb); + p.get_future().get(); + }; + + LOGINFO("Step 2: Insert 500 entries"); + logstore_seq_num_t cur_lsn = 0; + kickstart_inserts(log_store, cur_lsn, 500); + + LOGINFO("Step 3: Read and verify all entries"); + read_all_verify(log_store); + + LOGINFO("Step 4: Truncate 100 entries"); + logstore_seq_num_t trunc_lsn = 99; + truncate_validate(log_store, &trunc_lsn); + ASSERT_EQ(log_store->start_lsn(), trunc_lsn + 1); + ASSERT_EQ(log_store->tail_lsn(), 499); + ASSERT_EQ(log_store->next_lsn(), 500); + ASSERT_EQ(log_store->truncated_upto(), trunc_lsn); + + LOGINFO("Step 5: Read and verify all entries"); + read_all_verify(log_store); + + LOGINFO("Step 6: Restart and verify all entries"); + restart(); + read_all_verify(log_store); + auto const [last_trunc_lsn, trunc_ld_key, tail_lsn] = log_store->truncate_info(); + ASSERT_EQ(last_trunc_lsn, trunc_lsn); + ASSERT_EQ(trunc_ld_key.idx, 0); + ASSERT_EQ(tail_lsn, log_store->tail_lsn()); + + LOGINFO("Step 7: call log dev truncate again and read verify") + logstore_service().device_truncate(); + read_all_verify(log_store); +} + +TEST_F(LogDevTest, TruncateAcrossMultipleStores) { + LOGINFO("Step 1: Create 3 log stores to start truncate across multiple stores test"); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); + s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); + auto store1 = logstore_service().create_new_log_store(logdev_id, false); + auto store2 = logstore_service().create_new_log_store(logdev_id, false); + auto store3 = logstore_service().create_new_log_store(logdev_id, false); + + LOGINFO("Step 2: Insert 100 entries to store 
{}", store1->get_store_id()); + logstore_seq_num_t cur_lsn = 0; + kickstart_inserts(store1, cur_lsn, 100); + ASSERT_EQ(get_current_log_idx(logdev_id), 100); + + LOGINFO("Step 3: Insert 200 entries to store {}", store2->get_store_id()); + cur_lsn = 0; + kickstart_inserts(store2, cur_lsn, 200); + ASSERT_EQ(get_current_log_idx(logdev_id), 300); + + LOGINFO("Step 4: Insert 200 entries to store {}", store3->get_store_id()); + cur_lsn = 0; + kickstart_inserts(store3, cur_lsn, 200); + ASSERT_EQ(get_current_log_idx(logdev_id), 500); + + LOGINFO("Step 5: Read and verify all stores"); + read_all_verify(store1); + ASSERT_EQ(store1->start_lsn(), 0); + ASSERT_EQ(store1->tail_lsn(), 99); + ASSERT_EQ(store1->truncated_upto(), -1); + read_all_verify(store2); + ASSERT_EQ(store2->start_lsn(), 0); + ASSERT_EQ(store2->tail_lsn(), 199); + ASSERT_EQ(store2->truncated_upto(), -1); + read_all_verify(store3); + ASSERT_EQ(store3->start_lsn(), 0); + ASSERT_EQ(store3->tail_lsn(), 199); + ASSERT_EQ(store3->truncated_upto(), -1); + // log dev should not truncate any logs due to no truncate in log stores happened + ASSERT_EQ(get_last_truncate_idx(logdev_id), -1); + + LOGINFO("Step 6: Truncate 100 entries in store {}", store2->get_store_id()); + logstore_seq_num_t trunc_lsn = 99; + truncate_validate(store2, &trunc_lsn); + + LOGINFO("Step 7: Read and verify all stores"); + read_all_verify(store1); + ASSERT_EQ(store1->start_lsn(), 0); + ASSERT_EQ(store1->tail_lsn(), 99); + ASSERT_EQ(store1->truncated_upto(), -1); + read_all_verify(store2); + ASSERT_EQ(store2->start_lsn(), 100); + ASSERT_EQ(store2->tail_lsn(), 199); + ASSERT_EQ(store2->truncated_upto(), 99); + read_all_verify(store3); + ASSERT_EQ(store3->start_lsn(), 0); + ASSERT_EQ(store3->tail_lsn(), 199); + ASSERT_EQ(store3->truncated_upto(), -1); + // log dev should not truncate any logs due to store1 has valid logs + ASSERT_EQ(get_last_truncate_idx(logdev_id), -1); + + LOGINFO("Step 8: Truncate 500 entries in store {}", store3->get_store_id()); + trunc_lsn = 499; + truncate_validate(store3, &trunc_lsn); + + LOGINFO("Step 9: Read and verify all stores"); + read_all_verify(store1); + ASSERT_EQ(store1->start_lsn(), 0); + ASSERT_EQ(store1->tail_lsn(), 99); + ASSERT_EQ(store1->truncated_upto(), -1); + read_all_verify(store2); + ASSERT_EQ(store2->start_lsn(), 100); + ASSERT_EQ(store2->tail_lsn(), 199); + ASSERT_EQ(store2->truncated_upto(), 99); + read_all_verify(store3); + ASSERT_EQ(store3->start_lsn(), 500); + ASSERT_EQ(store3->tail_lsn(), 499); + ASSERT_EQ(store3->truncated_upto(), 499); + + // log dev should truncate not truncate any logs due to store1 has valid logs + ASSERT_EQ(get_last_truncate_idx(logdev_id), -1); + + LOGINFO("Step 10: Truncate 100 entries in store {}", store1->get_store_id()); + trunc_lsn = 99; + truncate_validate(store1, &trunc_lsn); + + LOGINFO("Step 11: Read and verify all stores"); + read_all_verify(store1); + ASSERT_EQ(store1->start_lsn(), 100); + ASSERT_EQ(store1->tail_lsn(), 99); + ASSERT_EQ(store1->truncated_upto(), 99); + read_all_verify(store2); + ASSERT_EQ(store2->start_lsn(), 100); + ASSERT_EQ(store2->tail_lsn(), 199); + ASSERT_EQ(store2->truncated_upto(), 99); + read_all_verify(store3); + ASSERT_EQ(store3->start_lsn(), 500); + ASSERT_EQ(store3->tail_lsn(), 499); + ASSERT_EQ(store3->truncated_upto(), 499); + + // log dev should truncate logs upto 199, as store2 has valid logs + ASSERT_EQ(get_last_truncate_idx(logdev_id), 199); + + LOGINFO("Step 12: Truncate 300 entries in store {}", store2->get_store_id()); + trunc_lsn = 299; + 
truncate_validate(store2, &trunc_lsn); + + LOGINFO("Step 13: Read and verify all stores"); + read_all_verify(store1); + ASSERT_EQ(store1->start_lsn(), 100); + ASSERT_EQ(store1->tail_lsn(), 99); + ASSERT_EQ(store1->truncated_upto(), 99); + read_all_verify(store2); + ASSERT_EQ(store2->start_lsn(), 300); + ASSERT_EQ(store2->tail_lsn(), 299); + ASSERT_EQ(store2->truncated_upto(), 299); + read_all_verify(store3); + ASSERT_EQ(store3->start_lsn(), 500); + ASSERT_EQ(store3->tail_lsn(), 499); + ASSERT_EQ(store3->truncated_upto(), 499); + + // log dev should truncate all logs as all stores are empty + ASSERT_EQ(get_last_truncate_idx(logdev_id), 499); + + LOGINFO("Step 14: Insert 100 entries in store {}", store1->get_store_id()); + cur_lsn = 100; + kickstart_inserts(store1, cur_lsn, 100); + ASSERT_EQ(get_current_log_idx(logdev_id), 600); + + LOGINFO("Step 15: Read and verify all stores"); + read_all_verify(store1); + ASSERT_EQ(store1->start_lsn(), 100); + ASSERT_EQ(store1->tail_lsn(), 199); + ASSERT_EQ(store1->truncated_upto(), 99); + read_all_verify(store2); + ASSERT_EQ(store2->start_lsn(), 300); + ASSERT_EQ(store2->tail_lsn(), 299); + ASSERT_EQ(store2->truncated_upto(), 299); + read_all_verify(store3); + ASSERT_EQ(store3->start_lsn(), 500); + ASSERT_EQ(store3->tail_lsn(), 499); + ASSERT_EQ(store3->truncated_upto(), 499); + + // log dev should not truncate since no new truncate happened + ASSERT_EQ(get_last_truncate_idx(logdev_id), 499); + + LOGINFO("Step 16: Truncate 500 entries in store {}", store1->get_store_id()); + trunc_lsn = 499; + truncate_validate(store1, &trunc_lsn); + + LOGINFO("Step 17: Read and verify all stores"); + read_all_verify(store1); + ASSERT_EQ(store1->start_lsn(), 500); + ASSERT_EQ(store1->tail_lsn(), 499); + ASSERT_EQ(store1->truncated_upto(), 499); + read_all_verify(store2); + ASSERT_EQ(store2->start_lsn(), 300); + ASSERT_EQ(store2->tail_lsn(), 299); + ASSERT_EQ(store2->truncated_upto(), 299); + read_all_verify(store3); + ASSERT_EQ(store3->start_lsn(), 500); + ASSERT_EQ(store3->tail_lsn(), 499); + ASSERT_EQ(store3->truncated_upto(), 499); + + // make sure new logs can truncate successfully when there are empty log stores + ASSERT_EQ(get_last_truncate_idx(logdev_id), 599); +} + +TEST_F(LogDevTest, TruncateLogsAfterFlushAndRestart) { + LOGINFO("Step 1: Create a single logstore to start truncate-logs-after-flush-and-restart test"); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); + s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); + auto log_store = logstore_service().create_new_log_store(logdev_id, false); + auto store_id = log_store->get_store_id(); + + auto restart = [&]() { + std::promise< bool > p; + auto starting_cb = [&]() { + logstore_service().open_logdev(logdev_id, flush_mode_t::EXPLICIT); + logstore_service().open_log_store(logdev_id, store_id, false /* append_mode */).thenValue([&](auto store) { + log_store = store; + p.set_value(true); + }); + }; + start_homestore(true /* restart */, starting_cb); + p.get_future().get(); + }; + + LOGINFO("Step 2: Insert 100 entries"); + logstore_seq_num_t cur_lsn = 0; + insert_batch_sync(log_store, cur_lsn, 100, 0); + + LOGINFO("Step 3: Read and verify all entries"); + read_all_verify(log_store); + ASSERT_EQ(log_store->get_contiguous_issued_seq_num(-1), 99); + + LOGINFO("Step 4: Append 100 entries"); + insert_batch_sync(log_store, cur_lsn, 100, 0); + ASSERT_EQ(log_store->get_contiguous_issued_seq_num(-1), 199); + + LOGINFO("Step 5: Read and verify all 
entries"); + read_all_verify(log_store); + + LOGINFO("Step 6: restart and verify"); + restart(); + read_all_verify(log_store); + ASSERT_EQ(log_store->get_contiguous_issued_seq_num(-1), 199); + + LOGINFO("Step 7: Truncate 50 entries"); + logstore_seq_num_t trunc_lsn = 49; + truncate_validate(log_store, &trunc_lsn); + ASSERT_EQ(log_store->start_lsn(), trunc_lsn + 1); + ASSERT_EQ(log_store->tail_lsn(), 199); + ASSERT_EQ(log_store->truncated_upto(), trunc_lsn); + ASSERT_EQ(log_store->get_contiguous_issued_seq_num(-1), 199); + + LOGINFO("Step 8: restart and verify"); + restart(); + read_all_verify(log_store); + ASSERT_EQ(log_store->start_lsn(), trunc_lsn + 1); + ASSERT_EQ(log_store->tail_lsn(), 199); + ASSERT_EQ(log_store->truncated_upto(), trunc_lsn); + ASSERT_EQ(log_store->get_contiguous_issued_seq_num(-1), 199); +} + TEST_F(LogDevTest, CreateRemoveLogDev) { auto num_logdev = SISL_OPTIONS["num_logdevs"].as< uint32_t >(); std::vector< std::shared_ptr< HomeLogStore > > log_stores; @@ -317,7 +707,7 @@ TEST_F(LogDevTest, CreateRemoveLogDev) { ASSERT_EQ(vdev->num_descriptors(), 0); for (uint32_t i{0}; i < num_logdev; ++i) { - auto id = logstore_service().create_new_logdev(); + auto id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); s_max_flush_multiple = logstore_service().get_logdev(id)->get_flush_size_multiple(); auto store = logstore_service().create_new_log_store(id, false); log_stores.push_back(store); @@ -365,7 +755,7 @@ TEST_F(LogDevTest, DeleteUnopenedLogDev) { // Test deletion of unopened logdev. std::set< logdev_id_t > id_set, unopened_id_set; for (uint32_t i{0}; i < num_logdev; ++i) { - auto id = logstore_service().create_new_logdev(); + auto id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); id_set.insert(id); if (i >= num_logdev / 2) { unopened_id_set.insert(id); } s_max_flush_multiple = logstore_service().get_logdev(id)->get_flush_size_multiple(); @@ -389,7 +779,7 @@ TEST_F(LogDevTest, DeleteUnopenedLogDev) { auto starting_cb = [&]() { auto it = id_set.begin(); for (uint32_t i{0}; i < id_set.size() / 2; i++, it++) { - logstore_service().open_logdev(*it); + logstore_service().open_logdev(*it, flush_mode_t::EXPLICIT); } }; start_homestore(true /* restart */, starting_cb); diff --git a/src/tests/test_log_store.cpp b/src/tests/test_log_store.cpp index 43e57ff7c..1aa580bba 100644 --- a/src/tests/test_log_store.cpp +++ b/src/tests/test_log_store.cpp @@ -455,7 +455,7 @@ class SampleDB { for (uint32_t i{0}; i < n_log_stores; ++i) { SampleLogStoreClient* client = m_log_store_clients[i].get(); - logstore_service().open_logdev(client->m_logdev_id); + logstore_service().open_logdev(client->m_logdev_id, flush_mode_t::EXPLICIT); logstore_service() .open_log_store(client->m_logdev_id, client->m_store_id, false /* append_mode */) .thenValue([i, this, client](auto log_store) { client->set_log_store(log_store); }); @@ -479,7 +479,7 @@ class SampleDB { std::vector< logdev_id_t > logdev_id_vec; for (uint32_t i{0}; i < n_log_devs; ++i) { - logdev_id_vec.push_back(logstore_service().create_new_logdev()); + logdev_id_vec.push_back(logstore_service().create_new_logdev(flush_mode_t::EXPLICIT)); } for (uint32_t i{0}; i < n_log_stores; ++i) { @@ -1225,7 +1225,7 @@ TEST_F(LogStoreTest, WriteSyncThenRead) { for (uint32_t iteration{0}; iteration < iterations; ++iteration) { LOGINFO("Iteration {}", iteration); - auto logdev_id = logstore_service().create_new_logdev(); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); auto tmp_log_store = 
logstore_service().create_new_log_store(logdev_id, false); const auto store_id = tmp_log_store->get_store_id(); LOGINFO("Created new log store -> id {}", store_id); diff --git a/src/tests/test_log_store_long_run.cpp b/src/tests/test_log_store_long_run.cpp index e9808da65..507e51633 100644 --- a/src/tests/test_log_store_long_run.cpp +++ b/src/tests/test_log_store_long_run.cpp @@ -294,7 +294,7 @@ class LogStoreLongRun : public ::testing::Test { HS_SETTINGS_FACTORY().save(); for (uint32_t i{0}; i < n_log_stores; ++i) { SampleLogStoreClient* client = m_log_store_clients[i].get(); - logstore_service().open_logdev(client->m_logdev_id); + logstore_service().open_logdev(client->m_logdev_id, flush_mode_t::EXPLICIT); logstore_service() .open_log_store(client->m_logdev_id, client->m_store_id, false /* append_mode */) .thenValue([i, this, client](auto log_store) { client->set_log_store(log_store); }); @@ -318,7 +318,7 @@ class LogStoreLongRun : public ::testing::Test { std::vector< logdev_id_t > logdev_id_vec; for (uint32_t i{0}; i < n_log_devs; ++i) - logdev_id_vec.push_back(logstore_service().create_new_logdev()); + logdev_id_vec.push_back(logstore_service().create_new_logdev(flush_mode_t::EXPLICIT)); for (uint32_t i{0}; i < n_log_stores; ++i) m_log_store_clients.push_back(std::make_unique< SampleLogStoreClient >( @@ -466,7 +466,7 @@ class LogStoreLongRun : public ::testing::Test { validate_num_stores(); // Create a new logstore. - auto logdev_id = logstore_service().create_new_logdev(); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); m_log_store_clients.push_back(std::make_unique< SampleLogStoreClient >( logdev_id, bind_this(LogStoreLongRun::on_log_insert_completion, 3))); validate_num_stores(); diff --git a/src/tests/test_mem_btree.cpp b/src/tests/test_mem_btree.cpp index 45681f412..83330422d 100644 --- a/src/tests/test_mem_btree.cpp +++ b/src/tests/test_mem_btree.cpp @@ -46,6 +46,8 @@ SISL_OPTION_GROUP( ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), (preload_size, "", "preload_size", "number of entries to preload tree with", ::cxxopts::value< uint32_t >()->default_value("1000"), "number"), + (max_keys_in_node, "", "max_keys_in_node", "max_keys_in_node", ::cxxopts::value< uint32_t >()->default_value("0"), + ""), (seed, "", "seed", "random engine seed, use random if not defined", ::cxxopts::value< uint64_t >()->default_value("0"), "number"), (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), "seconds")) @@ -330,6 +332,10 @@ int main(int argc, char* argv[]) { auto seed = SISL_OPTIONS["seed"].as< uint64_t >(); LOGINFO("Using seed {} to sow the random generation", seed); g_re.seed(seed); + } else { + auto seed = std::chrono::system_clock::now().time_since_epoch().count(); + LOGINFO("No seed provided. 
Using randomly generated seed: {}", seed); + g_re.seed(seed); } auto ret = RUN_ALL_TESTS(); return ret; diff --git a/src/tests/test_meta_blk_mgr.cpp b/src/tests/test_meta_blk_mgr.cpp index 870dd5191..d3c5401e9 100644 --- a/src/tests/test_meta_blk_mgr.cpp +++ b/src/tests/test_meta_blk_mgr.cpp @@ -185,7 +185,7 @@ class VMetaBlkMgrTest : public ::testing::Test { uint64_t total_size_written(const void* cookie) { return m_mbm->meta_size(cookie); } void do_write_to_full() { - static constexpr uint64_t blkstore_overhead = 4 * 1024ul * 1024ul; // 4MB + static constexpr uint64_t blkstore_overhead = 256 * 1024ul * 1024ul; // 256MB ssize_t free_size = uint64_cast(m_mbm->total_size() - m_mbm->used_size() - blkstore_overhead); HS_REL_ASSERT_GT(free_size, 0); @@ -193,7 +193,10 @@ class VMetaBlkMgrTest : public ::testing::Test { uint64_t size_written{0}; while (free_size > 0) { - if (free_size >= gp.max_wrt_sz) { + LOGDEBUG("free size: {}, total size: {}, used size: {}, available blks: {}", free_size, m_mbm->total_size(), + m_mbm->used_size(), m_mbm->available_blks()); + // if it is overflow, 2 extra blocks are needed for ovf blk header and meta blk; + if (free_size - 2 * m_mbm->block_size() >= gp.max_wrt_sz) { size_written = do_sb_write(do_overflow(), 0); } else { size_written = do_sb_write(false, m_mbm->meta_blk_context_sz()); diff --git a/src/tests/test_raft_repl_dev.cpp b/src/tests/test_raft_repl_dev.cpp index f8aa06c5c..f6d458943 100644 --- a/src/tests/test_raft_repl_dev.cpp +++ b/src/tests/test_raft_repl_dev.cpp @@ -12,606 +12,57 @@ * specific language governing permissions and limitations under the License. * *********************************************************************************/ -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include "common/homestore_config.hpp" -#include "common/homestore_assert.hpp" -#include "common/homestore_utils.hpp" - -#define private public -#include "test_common/hs_repl_test_common.hpp" -#include "replication/service/raft_repl_service.h" -#include "replication/repl_dev/raft_repl_dev.h" - -using namespace homestore; - -SISL_OPTION_GROUP(test_raft_repl_dev, - (block_size, "", "block_size", "block size to io", - ::cxxopts::value< uint32_t >()->default_value("4096"), "number"), - (num_raft_groups, "", "num_raft_groups", "number of raft groups per test", - ::cxxopts::value< uint32_t >()->default_value("1"), "number"), - // for below replication parameter, their default value always get from dynamic config, only used - // when specified by user - (snapshot_distance, "", "snapshot_distance", "distance between snapshots", - ::cxxopts::value< uint32_t >()->default_value("0"), "number"), - (num_raft_logs_resv, "", "num_raft_logs_resv", "number of raft logs reserved", - ::cxxopts::value< uint32_t >()->default_value("0"), "number"), - (res_mgr_audit_timer_ms, "", "res_mgr_audit_timer_ms", "resource manager audit timer", - ::cxxopts::value< uint32_t >()->default_value("0"), "number")); - -SISL_OPTIONS_ENABLE(logging, test_raft_repl_dev, iomgr, config, test_common_setup, test_repl_common_setup) - -static std::unique_ptr< test_common::HSReplTestHelper > g_helper; -static std::random_device g_rd{}; -static std::default_random_engine g_re{g_rd()}; - -class TestReplicatedDB : public homestore::ReplDevListener { -public: - struct Key { - uint64_t id_; - bool operator<(Key const& other) const { return id_ < other.id_; } - 
}; - - struct Value { - int64_t lsn_; - uint64_t data_size_; - uint64_t data_pattern_; - MultiBlkId blkid_; - uint64_t id_; - }; - - struct KeyValuePair { - Key key; - Value value; - }; - - struct test_req : public repl_req_ctx { - struct journal_header { - uint64_t data_size; - uint64_t data_pattern; - }; - - journal_header jheader; - uint64_t key_id; - sisl::sg_list write_sgs; - sisl::sg_list read_sgs; - - sisl::blob header_blob() { return sisl::blob(uintptr_cast(&jheader), sizeof(journal_header)); } - sisl::blob key_blob() { return sisl::blob{uintptr_cast(&key_id), sizeof(uint64_t)}; } - - test_req() { - write_sgs.size = 0; - read_sgs.size = 0; - key_id = (uint64_t)rand() << 32 | rand(); - } - - ~test_req() { - for (auto const& iov : write_sgs.iovs) { - iomanager.iobuf_free(uintptr_cast(iov.iov_base)); - } - - for (auto const& iov : read_sgs.iovs) { - iomanager.iobuf_free(uintptr_cast(iov.iov_base)); - } - } - }; - - TestReplicatedDB() = default; - virtual ~TestReplicatedDB() = default; - - void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, MultiBlkId const& blkids, - cintrusive< repl_req_ctx >& ctx) override { - ASSERT_EQ(header.size(), sizeof(test_req::journal_header)); - - auto jheader = r_cast< test_req::journal_header const* >(header.cbytes()); - Key k{.id_ = *(r_cast< uint64_t const* >(key.cbytes()))}; - Value v{.lsn_ = lsn, - .data_size_ = jheader->data_size, - .data_pattern_ = jheader->data_pattern, - .blkid_ = blkids, - .id_ = k.id_}; - - LOGINFOMOD(replication, "[Replica={}] Received commit on lsn={} dsn={} key={} value[blkid={} pattern={}]", - g_helper->replica_num(), lsn, ctx->dsn(), k.id_, v.blkid_.to_string(), v.data_pattern_); - - { - std::unique_lock lk(db_mtx_); - inmem_db_.insert_or_assign(k, v); - lsn_index_.emplace(lsn, v); - last_data_committed_lsn = lsn; - ++commit_count_; - } - - if (ctx->is_proposer()) { g_helper->runner().next_task(); } - } - - bool on_pre_commit(int64_t lsn, const sisl::blob& header, const sisl::blob& key, - cintrusive< repl_req_ctx >& ctx) override { - LOGINFOMOD(replication, "[Replica={}] Received pre-commit on lsn={} dsn={}", g_helper->replica_num(), lsn, - ctx->dsn()); - return true; - } - - void on_rollback(int64_t lsn, const sisl::blob& header, const sisl::blob& key, - cintrusive< repl_req_ctx >& ctx) override { - LOGINFOMOD(replication, "[Replica={}] Received rollback on lsn={}", g_helper->replica_num(), lsn); - } - - void on_restart() { - LOGINFOMOD(replication, "restarted repl dev for [Replica={}] Group={}", g_helper->replica_num(), - boost::uuids::to_string(repl_dev()->group_id())); - } - - void on_error(ReplServiceError error, const sisl::blob& header, const sisl::blob& key, - cintrusive< repl_req_ctx >& ctx) override { - LOGINFOMOD(replication, "[Replica={}] Received error={} on key={}", g_helper->replica_num(), enum_name(error), - *(r_cast< uint64_t const* >(key.cbytes()))); - } - - AsyncReplResult<> create_snapshot(shared< snapshot_context > context) override { - std::lock_guard< std::mutex > lock(m_snapshot_lock); - auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); - LOGINFOMOD(replication, "[Replica={}] Got snapshot callback term={} idx={}", g_helper->replica_num(), - s->get_last_log_term(), s->get_last_log_idx()); - m_last_snapshot = context; - return make_async_success<>(); - } - - int read_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data) override { - auto s = std::dynamic_pointer_cast< nuraft_snapshot_context 
>(context)->nuraft_snapshot(); - - if (snp_data->offset == 0) { - snp_data->is_last_obj = false; - snp_data->blob = sisl::io_blob_safe(sizeof(ulong)); - LOGINFOMOD(replication, "[Replica={}] Read logical snapshot callback obj_id={} term={} idx={}", - g_helper->replica_num(), snp_data->offset, s->get_last_log_term(), s->get_last_log_idx()); - return 0; - } - - int64_t next_lsn = snp_data->offset; - std::vector< KeyValuePair > kv_snapshot_data; - // we can not use find to get the next element, since if the next lsn is a config lsn , it will not be put into - // lsn_index_ and as a result, the find will return the end of the map. so here we use lower_bound to get the - // first element to be read and transfered. - for (auto iter = lsn_index_.lower_bound(next_lsn); iter != lsn_index_.end(); iter++) { - auto& v = iter->second; - kv_snapshot_data.emplace_back(Key{v.id_}, v); - LOGTRACEMOD(replication, "[Replica={}] Read logical snapshot callback fetching lsn={} size={} pattern={}", - g_helper->replica_num(), v.lsn_, v.data_size_, v.data_pattern_); - if (kv_snapshot_data.size() >= 1000) { break; } - } - - if (kv_snapshot_data.size() == 0) { - snp_data->is_last_obj = true; - LOGINFOMOD(replication, "Snapshot is_last_obj is true"); - return 0; - } - - int64_t kv_snapshot_data_size = sizeof(KeyValuePair) * kv_snapshot_data.size(); - sisl::io_blob_safe blob{static_cast< uint32_t >(kv_snapshot_data_size)}; - std::memcpy(blob.bytes(), kv_snapshot_data.data(), kv_snapshot_data_size); - snp_data->blob = std::move(blob); - snp_data->is_last_obj = false; - LOGINFOMOD(replication, "[Replica={}] Read logical snapshot callback obj_id={} term={} idx={} num_items={}", - g_helper->replica_num(), snp_data->offset, s->get_last_log_term(), s->get_last_log_idx(), - kv_snapshot_data.size()); - - return 0; - } - - void snapshot_data_write(uint64_t data_size, uint64_t data_pattern, MultiBlkId& out_blkids) { - auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); - auto write_sgs = test_common::HSTestHelper::create_sgs(data_size, block_size, data_pattern); - auto fut = homestore::data_service().async_alloc_write(write_sgs, blk_alloc_hints{}, out_blkids); - std::move(fut).get(); - for (auto const& iov : write_sgs.iovs) { - iomanager.iobuf_free(uintptr_cast(iov.iov_base)); - } - } - - void write_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data) override { - auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); - if (snp_data->offset == 0) { - snp_data->offset = last_data_committed_lsn + 1; - LOGINFOMOD(replication, "[Replica={}] Save logical snapshot callback return obj_id={}", - g_helper->replica_num(), snp_data->offset); - return; - } - - size_t kv_snapshot_data_size = snp_data->blob.size(); - if (kv_snapshot_data_size == 0) return; - - size_t num_items = kv_snapshot_data_size / sizeof(KeyValuePair); - std::unique_lock lk(db_mtx_); - auto ptr = r_cast< const KeyValuePair* >(snp_data->blob.bytes()); - for (size_t i = 0; i < num_items; i++) { - auto key = ptr->key; - auto value = ptr->value; - LOGTRACEMOD(replication, "[Replica={}] Save logical snapshot got lsn={} data_size={} data_pattern={}", - g_helper->replica_num(), value.lsn_, value.data_size_, value.data_pattern_); - - // Write to data service and inmem map. 
- MultiBlkId out_blkids; - if (value.data_size_ != 0) { - snapshot_data_write(value.data_size_, value.data_pattern_, out_blkids); - value.blkid_ = out_blkids; - } - last_data_committed_lsn = value.lsn_; - inmem_db_.insert_or_assign(key, value); - ++commit_count_; - ptr++; - } - - LOGINFOMOD(replication, - "[Replica={}] Save logical snapshot callback obj_id={} term={} idx={} is_last={} num_items={}", - g_helper->replica_num(), snp_data->offset, s->get_last_log_term(), s->get_last_log_idx(), - snp_data->is_last_obj, num_items); - - // before we finish install snapshot, raft_server()->get_committed_log_idx() will always be the same. so we need - // last_data_committed_lsn to notify leader to transfer new data to follower. - snp_data->offset = last_data_committed_lsn + 1; - } - - bool apply_snapshot(shared< snapshot_context > context) override { - std::lock_guard< std::mutex > lock(m_snapshot_lock); - auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); - LOGINFOMOD(replication, "[Replica={}] Apply snapshot term={} idx={}", g_helper->replica_num(), - s->get_last_log_term(), s->get_last_log_idx()); - m_last_snapshot = context; - return true; - } - - shared< snapshot_context > last_snapshot() override { - std::lock_guard< std::mutex > lock(m_snapshot_lock); - if (!m_last_snapshot) return nullptr; - - auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(m_last_snapshot)->nuraft_snapshot(); - LOGINFOMOD(replication, "[Replica={}] Last snapshot term={} idx={}", g_helper->replica_num(), - s->get_last_log_term(), s->get_last_log_idx()); - return m_last_snapshot; - } - - void free_user_snp_ctx(void*& user_snp_ctx) override {} - - ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size) override { - return blk_alloc_hints{}; - } - - void on_destroy() override { - LOGINFOMOD(replication, "[Replica={}] Group={} is being destroyed", g_helper->replica_num(), - boost::uuids::to_string(repl_dev()->group_id())); - g_helper->unregister_listener(repl_dev()->group_id()); - } - - void db_write(uint64_t data_size, uint32_t max_size_per_iov) { - static std::atomic< uint32_t > s_uniq_num{0}; - auto req = intrusive< test_req >(new test_req()); - req->jheader.data_size = data_size; - req->jheader.data_pattern = ((long long)rand() << 32) | ++s_uniq_num; - auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); +#include "test_common/raft_repl_test_base.hpp" - LOGINFOMOD(replication, "[Replica={}] Db write key={} data_size={} pattern={} block_size={}", - g_helper->replica_num(), req->key_id, data_size, req->jheader.data_pattern, block_size); - - if (data_size != 0) { - req->write_sgs = - test_common::HSTestHelper::create_sgs(data_size, max_size_per_iov, req->jheader.data_pattern); - } - - repl_dev()->async_alloc_write(req->header_blob(), req->key_blob(), req->write_sgs, req); - } - - void validate_db_data() { - g_helper->runner().set_num_tasks(inmem_db_.size()); - - LOGINFOMOD(replication, "[{}]: Total {} keys committed, validating them", - boost::uuids::to_string(repl_dev()->group_id()), inmem_db_.size()); - auto it = inmem_db_.begin(); - g_helper->runner().set_task([this, &it]() { - Key k; - Value v; - { - std::unique_lock lk(db_mtx_); - std::tie(k, v) = *it; - ++it; - } - - if (v.data_size_ != 0) { - auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); - auto read_sgs = test_common::HSTestHelper::create_sgs(v.data_size_, block_size); - - repl_dev()->async_read(v.blkid_, read_sgs, v.data_size_).thenValue([read_sgs, 
k, v](auto const ec) { - LOGINFOMOD(replication, "Validating key={} value[blkid={} pattern={}]", k.id_, v.blkid_.to_string(), - v.data_pattern_); - RELEASE_ASSERT(!ec, "Read of blkid={} for key={} error={}", v.blkid_.to_string(), k.id_, - ec.message()); - for (auto const& iov : read_sgs.iovs) { - test_common::HSTestHelper::validate_data_buf(uintptr_cast(iov.iov_base), iov.iov_len, - v.data_pattern_); - iomanager.iobuf_free(uintptr_cast(iov.iov_base)); - } - g_helper->runner().next_task(); - }); - } else { - g_helper->runner().next_task(); - } - }); - g_helper->runner().execute().get(); - } - - uint64_t db_commit_count() const { - std::shared_lock lk(db_mtx_); - return commit_count_; - } - - uint64_t db_size() const { - std::shared_lock lk(db_mtx_); - return inmem_db_.size(); - } - - void create_snapshot() { - auto raft_repl_dev = std::dynamic_pointer_cast< RaftReplDev >(repl_dev()); - ulong snapshot_idx = raft_repl_dev->raft_server()->create_snapshot(); - LOGINFO("Manually create snapshot got index {}", snapshot_idx); - } - - void truncate(int num_reserved_entries) { - auto raft_repl_dev = std::dynamic_pointer_cast< RaftReplDev >(repl_dev()); - raft_repl_dev->truncate(num_reserved_entries); - LOGINFO("Manually truncated"); - } - - void set_zombie() { zombie_ = true; } - bool is_zombie() { - // Wether a group is zombie(non recoverable) - return zombie_; - } - -private: - std::map< Key, Value > inmem_db_; - std::map< int64_t, Value > lsn_index_; - uint64_t commit_count_{0}; - // this is the last lsn for data, might not be the same with the real last committed lsn - // which should be get by raft_server()->get_committed_log_idx() - uint64_t last_data_committed_lsn{0}; - std::shared_mutex db_mtx_; - std::shared_ptr< snapshot_context > m_last_snapshot{nullptr}; - std::mutex m_snapshot_lock; - bool zombie_{false}; -}; - -class RaftReplDevTest : public testing::Test { -public: - void SetUp() override { - // By default it will create one db - for (uint32_t i{0}; i < SISL_OPTIONS["num_raft_groups"].as< uint32_t >(); ++i) { - auto db = std::make_shared< TestReplicatedDB >(); - g_helper->register_listener(db); - dbs_.emplace_back(std::move(db)); - } - } - - void TearDown() override { - for (auto const& db : dbs_) { - if (db->is_zombie()) { continue; } - run_on_leader(db, [this, db]() { - auto err = hs()->repl_service().remove_repl_dev(db->repl_dev()->group_id()).get(); - ASSERT_EQ(err, ReplServiceError::OK) << "Error in destroying the group"; - }); - } - - for (auto const& db : dbs_) { - if (db->is_zombie()) { continue; } - auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); - int i = 0; - bool force_leave = false; - do { - std::this_thread::sleep_for(std::chrono::seconds(1)); - auto& raft_repl_svc = dynamic_cast< RaftReplService& >(hs()->repl_service()); - raft_repl_svc.gc_repl_devs(); - LOGINFO("Waiting for repl dev to get destroyed"); - - // TODO: if leader is destroyed, but the follower does not receive the notification, it will not be - // destroyed for ever. we need handle this in raft_repl_dev. revisit here after making changes at - // raft_repl_dev side to hanle this case. this is a workaround to avoid the infinite loop for now. 
- if (i++ > 10 && !force_leave) { - LOGWARN("has already waited for repl dev to get destroyed for 10 times, so do a force leave"); - repl_dev->force_leave(); - force_leave = true; - } - - } while (!repl_dev->is_destroyed()); - } - } - - void generate_writes(uint64_t data_size, uint32_t max_size_per_iov, shared< TestReplicatedDB > db = nullptr) { - if (db == nullptr) { db = pick_one_db(); } - // LOGINFO("Writing on group_id={}", db->repl_dev()->group_id()); - db->db_write(data_size, max_size_per_iov); - } - - void wait_for_all_commits() { wait_for_commits(written_entries_); } - - void wait_for_commits(uint64_t exp_writes) { - uint64_t total_writes{0}; - while (true) { - total_writes = 0; - for (auto const& db : dbs_) { - total_writes += db->db_commit_count(); - } - - if (total_writes >= exp_writes) { break; } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } - LOGINFO("Replica={} has received {} commits as expected", g_helper->replica_num(), total_writes); - } - - void validate_data() { - for (auto const& db : dbs_) { - db->validate_db_data(); - } - } - - shared< TestReplicatedDB > pick_one_db() { return dbs_[0]; } - - void assign_leader(uint16_t replica) { - LOGINFO("Switch the leader to replica_num = {}", replica); - if (g_helper->replica_num() == replica) { - for (auto const& db : dbs_) { - do { - auto result = db->repl_dev()->become_leader().get(); - if (result.hasError()) { - std::this_thread::sleep_for(std::chrono::milliseconds(1000)); - } else { - break; - } - } while (true); - } - } else { - for (auto const& db : dbs_) { - homestore::replica_id_t leader_uuid; - while (true) { - leader_uuid = db->repl_dev()->get_leader_id(); - if (!leader_uuid.is_nil() && (g_helper->member_id(leader_uuid) == replica)) { break; } - - LOGINFO("Waiting for replica={} to become leader", replica); - std::this_thread::sleep_for(std::chrono::milliseconds{500}); - } - } - } - } - - void run_on_leader(std::shared_ptr< TestReplicatedDB > db, auto&& lambda) { - do { - auto leader_uuid = db->repl_dev()->get_leader_id(); - - if (leader_uuid.is_nil()) { - LOGINFO("Waiting for leader to be elected for group={}", db->repl_dev()->group_id()); - std::this_thread::sleep_for(std::chrono::milliseconds{500}); - } else if (leader_uuid == g_helper->my_replica_id()) { - lambda(); - break; - } else { - break; - } - } while (true); - } - - void write_on_leader(uint32_t num_entries, bool wait_for_commit = true, shared< TestReplicatedDB > db = nullptr) { - do { - auto leader_uuid = dbs_[0]->repl_dev()->get_leader_id(); - - if (leader_uuid.is_nil()) { - LOGINFO("Waiting for leader to be elected"); - std::this_thread::sleep_for(std::chrono::milliseconds{500}); - } else if (leader_uuid == g_helper->my_replica_id()) { - LOGINFO("Writing {} entries since I am the leader my_uuid={}", num_entries, - boost::uuids::to_string(g_helper->my_replica_id())); - auto const block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); - g_helper->runner().set_num_tasks(num_entries); - - LOGINFO("Run on worker threads to schedule append on repldev for {} Bytes.", block_size); - g_helper->runner().set_task([this, block_size, db]() { - static std::normal_distribution<> num_blks_gen{3.0, 2.0}; - this->generate_writes(std::abs(std::lround(num_blks_gen(g_re))) * block_size, block_size, db); - }); - if (wait_for_commit) { g_helper->runner().execute().get(); } - break; - } else { - LOGINFO("{} entries were written on the leader_uuid={} my_uuid={}", num_entries, - boost::uuids::to_string(leader_uuid), 
boost::uuids::to_string(g_helper->my_replica_id())); - break; - } - } while (true); - - written_entries_ += num_entries; - if (wait_for_commit) { this->wait_for_all_commits(); } - } - - void remove_db(std::shared_ptr< TestReplicatedDB > db, bool wait_for_removal) { - this->run_on_leader(db, [this, db]() { - auto err = hs()->repl_service().remove_repl_dev(db->repl_dev()->group_id()).get(); - ASSERT_EQ(err, ReplServiceError::OK) << "Error in destroying the group"; - }); - - // Remove the db from the dbs_ list and check if count matches with repl_device - for (auto it = dbs_.begin(); it != dbs_.end(); ++it) { - if (*it == db) { - dbs_.erase(it); - break; - } - } - - if (wait_for_removal) { wait_for_listener_destroy(dbs_.size()); } - } - - void wait_for_listener_destroy(uint64_t exp_listeners) { - while (true) { - auto total_listeners = g_helper->num_listeners(); - if (total_listeners == exp_listeners) { break; } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } - } - - void restart_replica(uint16_t replica, uint32_t shutdown_delay_sec = 5u) { - if (g_helper->replica_num() == replica) { - LOGINFO("Restart homestore: replica_num = {}", replica); - g_helper->restart(shutdown_delay_sec); - // g_helper->sync_for_test_start(); - } else { - LOGINFO("Wait for replica={} to completely go down and removed from alive raft-groups", replica); - std::this_thread::sleep_for(std::chrono::seconds{5}); - } - } - - void shutdown_replica(uint16_t replica) { - if (g_helper->replica_num() == replica) { - LOGINFO("Shutdown homestore: replica_num = {}", replica); - g_helper->shutdown(); - } else { - LOGINFO("Wait for replica={} to completely go down and removed from alive raft-groups", replica); - std::this_thread::sleep_for(std::chrono::seconds{5}); - } +class RaftReplDevTest : public RaftReplDevTestBase {}; +TEST_F(RaftReplDevTest, Write_Duplicated_Data) { + uint64_t total_writes = 1; + g_helper->runner().qdepth_ = total_writes; + g_helper->runner().total_tasks_ = total_writes; + LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); + g_helper->sync_for_test_start(); + auto leader_uuid = wait_and_get_leader_id(); + + uint64_t id; + TestReplicatedDB::Key stored_key; + TestReplicatedDB::Value stored_val; + if (leader_uuid == g_helper->my_replica_id()) { + id = (uint64_t)rand() << 32 | rand(); + LOGINFO("going to write data with id={}", id); + this->write_with_id(id, true /* wait_for_commit */); + stored_key = dbs_[0]->inmem_db_.cbegin()->first; + ASSERT_EQ(id, stored_key.id_); + } else { + LOGINFO("I am not leader, leader_uuid={} my_uuid={}, do nothing", boost::uuids::to_string(leader_uuid), + boost::uuids::to_string(g_helper->my_replica_id())); } + wait_for_commits(total_writes); - void start_replica(uint16_t replica) { - if (g_helper->replica_num() == replica) { - LOGINFO("Start homestore: replica_num = {}", replica); - g_helper->start(); - } + g_helper->sync_for_verify_start(); + LOGINFO("Validate all data written so far by reading them"); + this->validate_data(); + /* test duplication + if duplication found in leader proposal, reject it; + if duplication found in the followers, skip it. + */ + // 1. write the same data again on leader, should fail + if (leader_uuid == g_helper->my_replica_id()) { + auto err = this->write_with_id(id, true /* wait_for_commit */); + ASSERT_EQ(ReplServiceError::DATA_DUPLICATED, err); + + // 2. 
delete it from the db to simulate duplication in followers(skip the duplication check in leader side) + dbs_[0]->inmem_db_.erase(stored_key); + LOGINFO("data with id={} has been deleted from db", id); + err = this->write_with_id(id, true /* wait_for_commit */); + ASSERT_EQ(ReplServiceError::OK, err); + } + if (leader_uuid != g_helper->my_replica_id()) { + wait_for_commits(total_writes + 1); + ASSERT_EQ(dbs_[0]->inmem_db_.size(), total_writes); } - void create_snapshot() { dbs_[0]->create_snapshot(); } - void truncate(int num_reserved_entries) { dbs_[0]->truncate(num_reserved_entries); } - -protected: - std::vector< std::shared_ptr< TestReplicatedDB > > dbs_; - uint32_t written_entries_{0}; - -#ifdef _PRERELEASE - flip::FlipClient m_fc{iomgr_flip::instance()}; -#endif -}; + g_helper->sync_for_cleanup_start(); +} TEST_F(RaftReplDevTest, Write_Restart_Write) { LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); @@ -658,6 +109,41 @@ TEST_F(RaftReplDevTest, Follower_Fetch_OnActive_ReplicaGroup) { g_helper->sync_for_cleanup_start(); } + +TEST_F(RaftReplDevTest, Write_With_Diabled_Leader_Push_Data) { + g_helper->set_basic_flip("disable_leader_push_data", std::numeric_limits< int >::max(), 100); + LOGINFO("Homestore replica={} setup completed, all the push_data from leader are disabled", + g_helper->replica_num()); + LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); + g_helper->sync_for_test_start(); + + this->write_on_leader(20, true /* wait_for_commit */); + + g_helper->sync_for_verify_start(); + + LOGINFO("Validate all data written so far by reading them"); + this->validate_data(); + + g_helper->sync_for_cleanup_start(); + g_helper->remove_flip("disable_leader_push_data"); +} + +TEST_F(RaftReplDevTest, Write_With_Handling_No_Space_Left) { + g_helper->set_basic_flip("simulate_no_space_left", std::numeric_limits< int >::max(), 50); + LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); + g_helper->sync_for_test_start(); + + this->write_on_leader(20, true /* wait_for_commit */); + + g_helper->sync_for_verify_start(); + + LOGINFO("Validate all data written so far by reading them"); + this->validate_data(); + + g_helper->sync_for_cleanup_start(); + g_helper->remove_flip("simulate_no_space_left"); +} + #endif // do some io before restart; @@ -749,6 +235,7 @@ TEST_F(RaftReplDevTest, Resync_From_Non_Originator) { } #if 0 + TEST_F(RaftReplDevTest, Leader_Restart) { LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); g_helper->sync_for_test_start(); @@ -773,7 +260,6 @@ TEST_F(RaftReplDevTest, Leader_Restart) { g_helper->sync_for_cleanup_start(); } - TEST_F(RaftReplDevTest, Drop_Raft_Entry_Switch_Leader) { LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); g_helper->sync_for_test_start(); @@ -958,7 +444,7 @@ TEST_F(RaftReplDevTest, BaselineTest) { // Leader does manual snapshot and truncate LOGINFO("Leader create snapshot and truncate"); this->create_snapshot(); - this->truncate(0); + // this->truncate(0); } } @@ -982,6 +468,96 @@ TEST_F(RaftReplDevTest, BaselineTest) { LOGINFO("BaselineTest done"); } +TEST_F(RaftReplDevTest, LargeDataWrite) { + LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); + g_helper->sync_for_test_start(); + + // TODO: Increase the data size (e.g., to 16MB) for testing. + // For now, use 4MB to ensure the test passes since there are issues with larger IO sizes on the uring drive. 
+ uint64_t entries_per_attempt = SISL_OPTIONS["num_io"].as< uint64_t >(); + uint64_t data_size = 4 * 1024 * 1024; + this->write_on_leader(entries_per_attempt, true /* wait_for_commit */, nullptr, &data_size); + + g_helper->sync_for_verify_start(); + LOGINFO("Validate all data written so far by reading them"); + this->validate_data(); + g_helper->sync_for_cleanup_start(); +} + +TEST_F(RaftReplDevTest, PriorityLeaderElection) { + LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); + g_helper->sync_for_test_start(); + uint64_t entries_per_attempt = SISL_OPTIONS["num_io"].as< uint64_t >(); + if (g_helper->replica_num() == 0) { + auto leader = this->wait_and_get_leader_id(); + ASSERT_EQ(leader, g_helper->my_replica_id()); + } + this->write_on_leader(entries_per_attempt, true /* wait_for_commit */); + + g_helper->sync_for_verify_start(); + LOGINFO("Validate all data written so far by reading them"); + this->validate_data(); + g_helper->sync_for_cleanup_start(); + + LOGINFO("Restart leader"); + if (g_helper->replica_num() == 0) { g_helper->restart_homestore(); } + g_helper->sync_for_test_start(); + + LOGINFO("Validate leader switched"); + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + auto leader = this->wait_and_get_leader_id(); + if (g_helper->replica_num() == 0) { ASSERT_NE(leader, g_helper->my_replica_id()); } + g_helper->sync_for_verify_start(); + + if (leader == g_helper->my_replica_id()) { + LOGINFO("Resign and trigger a priority leader election"); + // resign and trigger a priority leader election + g_helper->restart_homestore(); + } + g_helper->sync_for_test_start(); + + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + leader = this->wait_and_get_leader_id(); + LOGINFO("Validate leader switched back to initial replica"); + if (g_helper->replica_num() == 0) { ASSERT_EQ(leader, g_helper->my_replica_id()); } + g_helper->sync_for_verify_start(); + + LOGINFO("Post restart write the data again on the leader"); + this->write_on_leader(entries_per_attempt, true /* wait_for_commit */); + + LOGINFO("Validate all data written (including pre-restart data) by reading them"); + this->validate_data(); + g_helper->sync_for_cleanup_start(); +} + +TEST_F(RaftReplDevTest, ComputePriority) { + g_helper->sync_for_test_start(); + auto& raftService = dynamic_cast< RaftReplService& >(hs()->repl_service()); + + HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { s.consensus.max_wait_rounds_of_priority_election = 0; }); + HS_SETTINGS_FACTORY().save(); + ASSERT_EQ(raftService.compute_raft_follower_priority(), raft_leader_priority); + + for (auto i = 1; i <= int(raft_priority_election_round_upper_limit); i++) { + HS_SETTINGS_FACTORY().modifiable_settings( + [i](auto& s) { s.consensus.max_wait_rounds_of_priority_election = i; }); + HS_SETTINGS_FACTORY().save(); + auto follower_priority = raftService.compute_raft_follower_priority(); + // Simulate nuraft algorithm + auto decayed_priority = raft_leader_priority; + for (auto j = 1; j <= i; j++) { + int gap = std::max((int)10, decayed_priority / 5); + decayed_priority = std::max(1, decayed_priority - gap); + } + LOGINFO("Follower priority={} decayed_priority={}", follower_priority, decayed_priority); + ASSERT_TRUE(follower_priority >= decayed_priority); + } + // Set back to default value + HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { s.consensus.max_wait_rounds_of_priority_election = 2; }); + HS_SETTINGS_FACTORY().save(); + g_helper->sync_for_cleanup_start(); +} + int main(int argc, char* argv[]) { int 
parsed_argc = argc; char** orig_argv = argv; @@ -1012,7 +588,6 @@ int main(int argc, char* argv[]) { // Snapshot and truncation tests needs num reserved to be 0 and distance 10. s.consensus.num_reserved_log_items = 0; - s.consensus.snapshot_freq_distance = 10; s.resource_limits.resource_audit_timer_ms = 0; // only reset when user specified the value for test; @@ -1030,7 +605,8 @@ int main(int argc, char* argv[]) { FLAGS_folly_global_cpu_executor_threads = 4; g_helper = std::make_unique< test_common::HSReplTestHelper >("test_raft_repl_dev", args, orig_argv); - g_helper->setup(); + // No spare replica's are created. Test cases in this file expects fixed number of replica's. + g_helper->setup(SISL_OPTIONS["replicas"].as< uint32_t >()); auto ret = RUN_ALL_TESTS(); g_helper->teardown(); diff --git a/src/tests/test_raft_repl_dev_dynamic.cpp b/src/tests/test_raft_repl_dev_dynamic.cpp new file mode 100644 index 000000000..4ae56a9c3 --- /dev/null +++ b/src/tests/test_raft_repl_dev_dynamic.cpp @@ -0,0 +1,460 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + * + *********************************************************************************/ +#include "test_common/raft_repl_test_base.hpp" +#include +#include "common/homestore_config.hpp" + +// Dynamic tests spawn spare replica's also which can be used to add and remove from a repl dev. +class ReplDevDynamicTest : public RaftReplDevTestBase { +private: + bool is_replica_num_in(const std::set< uint32_t >& replicas) { + // Check if the current replica process is in this set. + return replicas.count(g_helper->replica_num()) != 0 ? true : false; + } +}; + +TEST_F(ReplDevDynamicTest, ReplaceMember) { + LOGINFO("ReplaceMember test started replica={}", g_helper->replica_num()); + // Write some IO's, replace a member, validate all members data except which is out. + auto db = dbs_.back(); + auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >(); + auto num_members = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >(); + uint64_t num_io_entries = SISL_OPTIONS["num_io"].as< uint64_t >(); + + // Replace the last member in the group with index(num_replicas - 1) with a spare + // replica with index (num_replica). Member id's are 0,...,num_replicas-1, num_replicas,...,N + uint32_t member_out = num_replicas - 1; + uint32_t member_in = num_replicas; + + g_helper->sync_for_test_start(num_members); + if (g_helper->replica_num() < num_replicas) { + // With existing raft repl dev group, write IO's, validate and call replace_member on leader. 
+ LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); + this->write_on_leader(num_io_entries, true /* wait_for_commit */); + + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + std::this_thread::sleep_for(std::chrono::seconds(3)); + } else if (g_helper->replica_num() == member_in) { + LOGINFO("Wait for commits replica={}", g_helper->replica_num()); + wait_for_commits(num_io_entries); + } + + g_helper->sync_for_verify_start(num_members); + LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num()); + if (is_replica_num_in({0, 1, member_in})) { + // Skip the member which is going to be replaced. Validate data on all other replica's. + LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num()); + this->validate_data(); + } + g_helper->sync_for_verify_start(num_members); + LOGINFO("data synced, sync_for_verify_state replica={} ", g_helper->replica_num()); + + //wait for background reaper thread to trigger complete_replace_member + if (g_helper->replica_num() == member_out) { + // The out member will have the repl dev destroyed. + auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); + while (repl_dev && !repl_dev->is_destroyed()) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + auto& raft_repl_svc = dynamic_cast< RaftReplService& >(hs()->repl_service()); + raft_repl_svc.gc_repl_devs(); + LOGINFO("Waiting for repl dev to get destroyed on out member replica={}", g_helper->replica_num()); + } + LOGINFO("Repl dev destroyed on out member replica={}", g_helper->replica_num()); + } + + g_helper->sync_for_cleanup_start(num_members); + LOGINFO("ReplaceMember test done replica={}", g_helper->replica_num()); +} + +TEST_F(ReplDevDynamicTest, TwoMemberDown) { + LOGINFO("TwoMemberDown test started replica={}", g_helper->replica_num()); + + // Make two members down in a group and leader cant reach a quorum. + // We set the custom quorum size to 1 and call replace member. + // Leader should do some writes to validate it has reach quorum size. + LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); + auto db = dbs_.back(); + auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >(); + auto num_members = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >(); + + uint64_t num_io_entries = SISL_OPTIONS["num_io"].as< uint64_t >(); + + // Replace the last member in the group with index(num_replicas - 1) with a spare + // replica with index (num_replica). Member id's are 0,...,num_replicas-1, num_replicas,...,N + uint32_t member_out = num_replicas - 1; + uint32_t member_in = num_replicas; + + g_helper->sync_for_test_start(num_members); + + // Shutdown replica 1 and replica 2 to simulate two member down. + if (g_helper->replica_num() == 1) { + this->shutdown_replica(1); + LOGINFO("Shutdown replica 1"); + } + + if (g_helper->replica_num() == 2) { + this->shutdown_replica(2); + LOGINFO("Shutdown replica 2"); + } + + if (g_helper->replica_num() == 0) { + // Replace down replica 2 with spare replica 3 with commit quorum 1 + // so that leader can go ahead with replacing member. 
+ LOGINFO("Replace member started"); + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 1 /* commit quorum*/); + this->write_on_leader(num_io_entries, true /* wait_for_commit */); + LOGINFO("Leader completed num_io={}", num_io_entries); + } + + if (g_helper->replica_num() == member_in) { + wait_for_commits(num_io_entries); + LOGINFO("Member in got all commits"); + } + + if (is_replica_num_in({0, member_in})) { + // Validate data on leader replica 0 and replica 3 + LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num()); + this->validate_data(); + } + + if (g_helper->replica_num() == 1) { + LOGINFO("Start replica 1"); + db->set_zombie(); + this->start_replica(1); + } + if (g_helper->replica_num() == 2) { + LOGINFO("Start replica 2"); + db->set_zombie(); + this->start_replica(2); + } + + g_helper->sync_for_cleanup_start(num_members); + LOGINFO("TwoMemberDown test done replica={}", g_helper->replica_num()); +} + +TEST_F(ReplDevDynamicTest, OutMemberDown) { + // replica0(leader) and replica1 up, replica2 is down. Replace replica2 with replica3. + // replica0 should be able to baseline resync to replica4(new member). + // Write some IO's, replace a member, validate all members data except which is out. + LOGINFO("OutMemberDown test started replica={}", g_helper->replica_num()); + auto db = dbs_.back(); + auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >(); + auto num_members = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >(); + uint64_t num_io_entries = SISL_OPTIONS["num_io"].as< uint64_t >(); + + // Replace the last member in the group with index(num_replicas - 1) with a spare + // replica with index (num_replica). Member id's are 0,...,num_replicas-1, num_replicas,...,N + uint32_t member_out = num_replicas - 1; + uint32_t member_in = num_replicas; + + g_helper->sync_for_test_start(num_members); + + std::this_thread::sleep_for(std::chrono::seconds(3)); + if (g_helper->replica_num() == 0) { + // With existing raft repl dev group, write IO's, validate and call replace_member on leader. + LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); + this->write_on_leader(num_io_entries, true /* wait_for_commit */); + } + //shut down before replace member + this->shutdown_replica(2); + LOGINFO("Shutdown replica 2"); + + if (g_helper->replica_num() == 0) { + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + std::this_thread::sleep_for(std::chrono::seconds(3)); + } else if (g_helper->replica_num() == member_in) { + LOGINFO("Wait for commits replica={}", g_helper->replica_num()); + wait_for_commits(num_io_entries); + } + + g_helper->sync_for_verify_start(num_members); + LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num()); + if (is_replica_num_in({0, 1, member_in})) { + // Skip the member which is going to be replaced. Validate data on all other replica's. + LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num()); + this->validate_data(); + } + + // shutdown after becoming learner, in this case, the member_out won't remove replDev after restart. 
+    // this->shutdown_replica(2);
+    // LOGINFO("Shutdown replica 2");
+    // std::this_thread::sleep_for(std::chrono::seconds(2));
+
+    // data synced, waiting for the learner to be removed
+    LOGINFO("data synced, sync for completing replace member, replica={}", g_helper->replica_num());
+    g_helper->sync_for_verify_start(num_members);
+    // Since the out_member stopped, it cannot respond to the remove_srv request, so the first attempt will get a
+    // CANCELLED error; hence the wait here is longer than in other tests.
+    if (g_helper->replica_num() == 2) {
+        LOGINFO("Start replica 2");
+        this->start_replica(2);
+        // The out member will have the repl dev destroyed.
+        auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev());
+        while (repl_dev && !repl_dev->is_destroyed()) {
+            std::this_thread::sleep_for(std::chrono::seconds(1));
+            auto& raft_repl_svc = dynamic_cast< RaftReplService& >(hs()->repl_service());
+            raft_repl_svc.gc_repl_devs();
+            LOGINFO("Waiting for repl dev to get destroyed on out member replica={}", g_helper->replica_num());
+        }
+        LOGINFO("Repl dev destroyed on out member replica={}", g_helper->replica_num());
+        db->set_zombie();
+    }
+
+    g_helper->sync_for_cleanup_start(num_members);
+    LOGINFO("OutMemberDown test done replica={}", g_helper->replica_num());
+}
+
+TEST_F(ReplDevDynamicTest, LeaderReplace) {
+    // replica0(leader), replica1 and replica2 are up. Replace replica0(leader) with replica3.
+    // replica0 will yield leadership, another replica will become leader, and that leader
+    // will do a baseline resync to the new member.
+    // Write some IOs, replace a member, and validate data on all members except the one that is out.
+    LOGINFO("LeaderReplace test started replica={}", g_helper->replica_num());
+    auto db = dbs_.back();
+    auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >();
+    auto num_members = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >();
+    uint64_t num_io_entries = SISL_OPTIONS["num_io"].as< uint64_t >();
+
+    // Replace the leader in the group with index(0) with a spare
+    // replica with index (num_replica). Member ids are 0,...,num_replicas-1, num_replicas,...,N
+    uint32_t member_out = 0;
+    uint32_t member_in = num_replicas;
+
+    g_helper->sync_for_test_start(num_members);
+
+    if (g_helper->replica_num() == member_out) {
+        LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num());
+        // With existing raft repl dev group, write IOs, validate and call replace_member on leader.
+        this->write_on_leader(num_io_entries, true /* wait_for_commit */);
+
+        // Leader will return error NOT_LEADER and yield leadership, sleep and connect again
+        // to the new leader.
+        LOGINFO("Replace old leader");
+        replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 0,
+                       ReplServiceError::NOT_LEADER);
+        LOGINFO("Replace member leader yield done");
+    }
+    std::this_thread::sleep_for(std::chrono::seconds(3));
+    if (g_helper->replica_num() != member_in) {
+        replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in));
+        LOGINFO("Replace member old leader done");
+    }
+
+    if (g_helper->replica_num() == member_in) {
+        LOGINFO("Wait for commits replica={}", g_helper->replica_num());
+        wait_for_commits(num_io_entries);
+    }
+
+    g_helper->sync_for_verify_start(num_members);
+    if (is_replica_num_in({0, 1, member_in})) {
+        // Skip the member which is going to be replaced. Validate data on all other replicas.
+ LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num()); + this->validate_data(); + } + + g_helper->sync_for_verify_start(num_members); + LOGINFO("data synced, sync_for_verify_state replica={} ", g_helper->replica_num()); + if (g_helper->replica_num() == member_out) { + // The out member will have the repl dev destroyed. + auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); + while (repl_dev && !repl_dev->is_destroyed()) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + auto& raft_repl_svc = dynamic_cast< RaftReplService& >(hs()->repl_service()); + raft_repl_svc.gc_repl_devs(); + LOGINFO("Waiting for repl dev to get destroyed on out member replica={}", g_helper->replica_num()); + } + LOGINFO("Repl dev destroyed on out member replica={}", g_helper->replica_num()); + db->set_zombie(); + } + + g_helper->sync_for_cleanup_start(num_members); + LOGINFO("LeaderReplace test done replica={}", g_helper->replica_num()); +} + +TEST_F(ReplDevDynamicTest, OneMemberRestart) { + // replica0(leader) is up and replica1 is restated, replica2 is down. Replace replica2 with replica3. + // replica0 should be able to baseline resync to replica4(new member). + // Write some IO's, replace a member, validate all members data except which is out. + LOGINFO("OneMemberRestart test started replica={}", g_helper->replica_num()); + auto db = dbs_.back(); + auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >(); + auto num_members = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >(); + uint64_t num_io_entries = SISL_OPTIONS["num_io"].as< uint64_t >(); + + // Replace the last member in the group with index(num_replicas - 1) with a spare + // replica with index (num_replica). Member id's are 0,...,num_replicas-1, num_replicas,...,N + uint32_t member_out = num_replicas - 1; + uint32_t member_in = num_replicas; + + g_helper->sync_for_test_start(num_members); + if (g_helper->replica_num() == 1) { + LOGINFO("Restart replica 1, "); + this->restart_replica(15); + } + + if (g_helper->replica_num() == 0) { + // With existing raft repl dev group, write IO's, validate and call replace_member on leader. + LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); + this->write_on_leader(num_io_entries, true /* wait_for_commit */); + + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + std::this_thread::sleep_for(std::chrono::seconds(3)); + } else if (g_helper->replica_num() == member_in) { + LOGINFO("Wait for commits replica={}", g_helper->replica_num()); + wait_for_commits(num_io_entries); + } + + g_helper->sync_for_verify_start(num_members); + LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num()); + if (is_replica_num_in({0, 1, member_in})) { + // Skip the member which is going to be replaced. Validate data on all other replica's. + LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num()); + this->validate_data(); + } + + g_helper->sync_for_verify_start(num_members); + LOGINFO("data synced, sync_for_verify_state replica={} ", g_helper->replica_num()); + if (g_helper->replica_num() == member_out) { + // The out member will have the repl dev destroyed. 
+ auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); + while (repl_dev && !repl_dev->is_destroyed()) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + auto& raft_repl_svc = dynamic_cast< RaftReplService& >(hs()->repl_service()); + raft_repl_svc.gc_repl_devs(); + LOGINFO("Waiting for repl dev to get destroyed on out member replica={}", g_helper->replica_num()); + } + LOGINFO("Repl dev destroyed on out member replica={}", g_helper->replica_num()); + } + + g_helper->sync_for_cleanup_start(num_members); + LOGINFO("OneMemberRestart test done replica={}", g_helper->replica_num()); +} + +TEST_F(ReplDevDynamicTest, ValidateRequest) { + LOGINFO("ValidateRequest test started replica={}", g_helper->replica_num()); + HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { + s.consensus.laggy_threshold = 0; + LOGINFO("setup consensus.laggy_threshold to {}", 0); + HS_SETTINGS_FACTORY().save(); + }); + + auto db = dbs_.back(); + auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >(); + auto num_members = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >(); + uint64_t num_io_entries = SISL_OPTIONS["num_io"].as< uint64_t >(); + + // Replace the last member in the group with index(num_replicas - 1) with a spare + // replica with index (num_replica). Member id's are 0,...,num_replicas-1, num_replicas,...,N + uint32_t member_out = num_replicas - 1; + uint32_t member_in = num_replicas; + + g_helper->sync_for_test_start(num_members); + + //shut down before replace member + this->shutdown_replica(1); + LOGINFO("Shutdown replica 1"); + + //wait for shutdown + std::this_thread::sleep_for(std::chrono::seconds(3)); + g_helper->sync_for_verify_start(num_members); + if (g_helper->replica_num() == 0) { + // With existing raft repl dev group, write IO's, validate and call replace_member on leader. + LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); + this->write_on_leader(num_io_entries, true /* wait_for_commit */); + } + g_helper->sync_for_verify_start(num_members); + if (g_helper->replica_num() == 0) { + // generate uuid + replica_id_t fake_member_out = boost::uuids::random_generator()(); + replica_id_t fake_member_in = boost::uuids::random_generator()(); + LOGINFO("test SERVER_NOT_FOUND"); + replace_member(db, fake_member_out, fake_member_in, 0, ReplServiceError::SERVER_NOT_FOUND); + LOGINFO("test replace_member already complete"); + replace_member(db, fake_member_out, g_helper->replica_id(0)); + LOGINFO("test QUORUM_NOT_MET", num_io_entries, g_helper->replica_num()); + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 0, + ReplServiceError::QUORUM_NOT_MET); + } + + if (g_helper->replica_num() == 1) { + LOGINFO("Start replica 1"); + this->start_replica(1); + } + g_helper->sync_for_cleanup_start(num_members); + LOGINFO("ValidateRequest test done replica={}", g_helper->replica_num()); +} + +int main(int argc, char* argv[]) { + int parsed_argc = argc; + char** orig_argv = argv; + + // Save the args for replica use + std::vector< std::string > args; + for (int i = 0; i < argc; ++i) { + args.emplace_back(argv[i]); + } + + ::testing::InitGoogleTest(&parsed_argc, argv); + + SISL_OPTIONS_LOAD(parsed_argc, argv, logging, config, test_raft_repl_dev, iomgr, test_common_setup, + test_repl_common_setup); + + // + // Entire test suite assumes that once a replica takes over as leader, it stays until it is explicitly yielded. 
+ // Otherwise it is very hard to control or accurately test behavior. Hence we forcibly override the + // leadership_expiry time. + // + HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { + s.generic.repl_dev_cleanup_interval_sec = 1; + + // Disable implicit flush and timer. + s.logstore.flush_threshold_size = 0; + s.logstore.flush_timer_frequency_us = 0; + + // Snapshot and truncation tests needs num reserved to be 0 and distance 10. + s.consensus.num_reserved_log_items = 0; + s.resource_limits.resource_audit_timer_ms = 0; + + // only reset when user specified the value for test; + if (SISL_OPTIONS.count("snapshot_distance")) { + s.consensus.snapshot_freq_distance = SISL_OPTIONS["snapshot_distance"].as< uint32_t >(); + } + if (SISL_OPTIONS.count("num_raft_logs_resv")) { + s.resource_limits.raft_logstore_reserve_threshold = SISL_OPTIONS["num_raft_logs_resv"].as< uint32_t >(); + } + if (SISL_OPTIONS.count("res_mgr_audit_timer_ms")) { + s.resource_limits.resource_audit_timer_ms = SISL_OPTIONS["res_mgr_audit_timer_ms"].as< uint32_t >(); + } + }); + HS_SETTINGS_FACTORY().save(); + + FLAGS_folly_global_cpu_executor_threads = 4; + g_helper = std::make_unique< test_common::HSReplTestHelper >("test_raft_repl_dev_dynamic", args, orig_argv); + + // We spawn spare replica's also for dynamic repl dev tests. + auto total_replicas = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >(); + g_helper->setup(total_replicas); + + auto ret = RUN_ALL_TESTS(); + g_helper->teardown(); + + std::string str; + sisl::ObjCounterRegistry::foreach ([&str](const std::string& name, int64_t created, int64_t alive) { + fmt::format_to(std::back_inserter(str), "{}: created={} alive={}\n", name, created, alive); + }); + LOGINFO("Object Life Counter\n:{}", str); + + return ret; +} diff --git a/src/tests/test_scripts/CMakeLists.txt b/src/tests/test_scripts/CMakeLists.txt index e1b5ff78c..4bb54bad5 100644 --- a/src/tests/test_scripts/CMakeLists.txt +++ b/src/tests/test_scripts/CMakeLists.txt @@ -1,15 +1,4 @@ -file(COPY vol_test.py DESTINATION ${CMAKE_BINARY_DIR}/bin/scripts) -file(COPY home_blk_flip.py DESTINATION ${CMAKE_BINARY_DIR}/bin/scripts) -file(COPY home_blk_test.py DESTINATION ${CMAKE_BINARY_DIR}/bin/scripts) -file(COPY index_test.py DESTINATION ${CMAKE_BINARY_DIR}/bin/scripts) -file(COPY log_meta_test.py DESTINATION ${CMAKE_BINARY_DIR}/bin/scripts) -file(COPY data_test.py DESTINATION ${CMAKE_BINARY_DIR}/bin/scripts) -file(COPY long_running.py DESTINATION ${CMAKE_BINARY_DIR}/bin/scripts) - -#add_test(NAME TestVolRecovery COMMAND ${CMAKE_BINARY_DIR}/bin/scripts/vol_test.py --test_suits=recovery --dirpath=${CMAKE_BINARY_DIR}/bin/) -#SET_TESTS_PROPERTIES(TestVolRecovery PROPERTIES DEPENDS TestVol) - -#add_test(NAME PerfTestVol COMMAND perf_test_volume) -#add_test(NAME RecoveryVol COMMAND python vol_test.py) -#add_test(NAME CheckBtree COMMAND check_btree) - +file(COPY index_test.py DESTINATION ../test_scripts) +file(COPY log_meta_test.py DESTINATION ../test_scripts) +file(COPY data_test.py DESTINATION ../test_scripts) +file(COPY long_running.py DESTINATION ../test_scripts) diff --git a/src/tests/test_scripts/index_test.py b/src/tests/test_scripts/index_test.py index 4e4814ccb..b9e55a15e 100755 --- a/src/tests/test_scripts/index_test.py +++ b/src/tests/test_scripts/index_test.py @@ -20,11 +20,13 @@ def run_test(options, type): raise TestFailedError(f"Test failed for type {type}") print("Test completed") + def run_crash_test(options): - cmd_opts = 
f"--gtest_filter=IndexCrashTest/0.long_running_put_crash --gtest_break_on_failure --max_keys_in_node={options['max_keys_in_node']} --init_device={options['init_device']} {options['log_mods']} --run_time={options['run_time']} --num_entries={options['num_entries']} {options['dev_list']}" + cmd_opts = f"--gtest_filter=IndexCrashTest/0.long_running_put_crash --gtest_break_on_failure --log_mods=wbcache:trace --max_keys_in_node={options['max_keys_in_node']} --num_entries_per_rounds={options['num_entries_per_rounds']} --init_device={options['init_device']} {options['log_mods']} --run_time={options['run_time']} --num_entries={options['num_entries']} --num_rounds={options['num_rounds']} {options['dev_list']} " # print(f"Running test with options: {cmd_opts}") try: - subprocess.check_call(f"{options['dirpath']}test_index_crash_recovery {cmd_opts}", stderr=subprocess.STDOUT, shell=True) + subprocess.check_call(f"{options['dirpath']}test_index_crash_recovery {cmd_opts}", stderr=subprocess.STDOUT, + shell=True) except subprocess.CalledProcessError as e: print(f"Test failed: {e}") raise TestFailedError(f"Test failed for type {type}") @@ -49,7 +51,10 @@ def parse_arguments(): parser.add_argument('--dev_list', help='Device list', default='') parser.add_argument('--cleanup_after_shutdown', help='Cleanup after shutdown', type=bool, default=False) parser.add_argument('--init_device', help='Initialize device', type=bool, default=True) - parser.add_argument('--max_keys_in_node', help='Maximum num of keys in btree nodes', type=int, default=20) + parser.add_argument('--max_keys_in_node', help='Maximum num of keys in btree nodes', type=int, default=10) + parser.add_argument('--min_keys_in_node', help='Minimum num of keys in btree nodes', type=int, default=2) + parser.add_argument('--num_rounds', help='number of rounds for crash test', type=int, default=1000) + parser.add_argument('--num_entries_per_rounds', help='number of rounds for crash test', type=int, default=100) # Parse the known arguments and ignore any unknown arguments args, unknown = parser.parse_known_args() @@ -73,8 +78,7 @@ def long_runnig_index(options, type=0): def long_running_clean_shutdown(options, type=0): print("Long running clean shutdown started") - options['run_time'] = int(options['run_time']) // 10 # 20 minutes - + options['run_time'] = options['run_time'] // 10 try: run_test(options, type) options['init_device'] = False @@ -87,14 +91,42 @@ def long_running_clean_shutdown(options, type=0): raise print("Long running clean shutdown completed") + def long_running_crash_put(options): print("Long running crash put started") - options['num_entries'] = 20480 # 20K + options['num_entries'] = 1310720 # 1280K options['init_device'] = True + options['run_time'] = 14400 # 4 hours + options['preload_size'] = 1024 print(f"options: {options}") - run_crash_test(options) + run_crash_test(options, 'put', 0) print("Long running crash put completed") +def long_running_crash_remove(options): + print("Long running crash remove started") + options['num_entries'] = 1000 + options['init_device'] = True + options['run_time'] = 14400 # 4 hours + options['num_entries_per_rounds'] = 100 + options['min_keys_in_node'] = 2 + options['max_keys_in_node'] = 10 + print(f"options: {options}") + run_crash_test(options, 'remove', 0) + print("Long running crash put completed") + +def long_running_crash_put_remove(options): + print("Long running crash put_remove started") + options['num_entries'] = 2000 # 1280K + options['init_device'] = True + options['run_time'] = 14400 # 4 
hours + options['preload_size'] = 1024 + options['min_keys_in_node'] = 3 + options['max_keys_in_node'] = 10 + print(f"options: {options}") + run_crash_test(options, 'put_remove', 0) + print("Long running crash put_remove completed") + + def main(): options = parse_arguments() test_suite_name = options['test_suits'] @@ -112,6 +144,19 @@ def main(): def long_running(*args): options = parse_arguments() + long_runnig_index(options, 0) + long_running_clean_shutdown(options, 0) + long_runnig_index(options, 1) + long_running_clean_shutdown(options, 1) + for i in range(20): + print(f"Iteration {i + 1}") + long_running_crash_put_remove(options) + for i in range(50): + print(f"Iteration {i + 1}") + long_running_crash_remove(options) + for i in range(5): + print(f"Iteration {i + 1}") + long_running_crash_put(options) long_runnig_index(options) long_running_clean_shutdown(options) long_running_crash_put(options) diff --git a/src/tests/test_scripts/log_meta_test.py b/src/tests/test_scripts/log_meta_test.py index 5ffda0018..83c8f994f 100755 --- a/src/tests/test_scripts/log_meta_test.py +++ b/src/tests/test_scripts/log_meta_test.py @@ -85,7 +85,7 @@ def meta_nightly(options, addln_opts): subprocess.check_call(options.dirpath + "test_meta_blk_mgr " + cmd_opts + addln_opts, stderr=subprocess.STDOUT, shell=True) - cmd_opts = "--gtest_filter=VMetaBlkMgrTest.random_load_test --gtest_filter=VMetaBlkMgrTest.write_to_full_test --use_file=true" # write to file instead of real disk to save time; + cmd_opts = "--gtest_filter=VMetaBlkMgrTest.write_to_full_test --use_file=true" # write to file instead of real disk to save time; subprocess.check_call(options.dirpath + "test_meta_blk_mgr " + cmd_opts + addln_opts, stderr=subprocess.STDOUT, shell=True) diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index c2b2460b5..57247dad7 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -62,22 +62,15 @@ struct test_repl_req : public repl_req_ctx { sisl::byte_array header; sisl::byte_array key; sisl::sg_list write_sgs; - sisl::sg_list read_sgs; - MultiBlkId written_blkids; + std::vector< MultiBlkId > written_blkids; - test_repl_req() { - write_sgs.size = 0; - read_sgs.size = 0; - } + test_repl_req() { write_sgs.size = 0; } ~test_repl_req() { for (auto const& iov : write_sgs.iovs) { iomanager.iobuf_free(uintptr_cast(iov.iov_base)); } - - for (auto const& iov : read_sgs.iovs) { - iomanager.iobuf_free(uintptr_cast(iov.iov_base)); - } } + struct journal_header { uint32_t key_size; uint64_t key_pattern; @@ -96,8 +89,9 @@ class SoloReplDevTest : public testing::Test { Listener(SoloReplDevTest& test) : m_test{test} {} virtual ~Listener() = default; - void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, MultiBlkId const& blkids, - cintrusive< repl_req_ctx >& ctx) override { + void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, + std::vector< MultiBlkId > const& blkids, cintrusive< repl_req_ctx >& ctx) override { + LOGINFO("Received on_commit lsn={}", lsn); if (ctx == nullptr) { m_test.validate_replay(*repl_dev(), lsn, header, key, blkids); } else { @@ -110,10 +104,10 @@ class SoloReplDevTest : public testing::Test { AsyncReplResult<> create_snapshot(shared< snapshot_context > context) override { return make_async_success<>(); } - int read_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data) override { + int read_snapshot_obj(shared< snapshot_context > context, shared< snapshot_obj > snp_data) 
override { return 0; }
-        void write_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data) override {}
+        void write_snapshot_obj(shared< snapshot_context > context, shared< snapshot_obj > snp_data) override {}
         bool apply_snapshot(shared< snapshot_context > context) override { return true; }
         shared< snapshot_context > last_snapshot() override { return nullptr; }
         void free_user_snp_ctx(void*& user_snp_ctx) override {}
@@ -125,7 +119,8 @@ class SoloReplDevTest : public testing::Test {
         void on_rollback(int64_t lsn, const sisl::blob& header, const sisl::blob& key,
                          cintrusive< repl_req_ctx >& ctx) override {}
 
-        ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size) override {
+        ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size,
+                                                          cintrusive< homestore::repl_req_ctx >& hs_ctx) override {
             return blk_alloc_hints{};
         }
 
@@ -135,7 +130,12 @@ class SoloReplDevTest : public testing::Test {
                       cintrusive< repl_req_ctx >& ctx) override {
             LOGINFO("Received error={} on repl_dev", enum_name(error));
         }
-        void on_destroy() override {}
+        void on_start_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) override {}
+        void on_complete_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) override {}
+        void on_destroy(const group_id_t& group_id) override {}
+        void notify_committed_lsn(int64_t lsn) override {}
+        void on_config_rollback(int64_t lsn) override {}
+        void on_no_space_left(repl_lsn_t lsn, chunk_num_t chunk_id) override {}
     };
 
     class Application : public ReplApplication {
@@ -151,6 +151,8 @@ class SoloReplDevTest : public testing::Test {
         shared< ReplDevListener > create_repl_dev_listener(uuid_t) override {
             return std::make_shared< Listener >(m_test);
         }
+        void destroy_repl_dev_listener(uuid_t) override {}
+        void on_repl_devs_init_completed() { LOGINFO("Repl dev init completed CB called"); }
         std::pair< std::string, uint16_t > lookup_peer(uuid_t uuid) const override { return std::make_pair("", 0u); }
         replica_id_t get_my_repl_id() const override { return hs_utils::gen_random_uuid(); }
     };
@@ -221,60 +223,116 @@ class SoloReplDevTest : public testing::Test {
         rdev->async_alloc_write(*req->header, req->key ? *req->key : sisl::blob{}, req->write_sgs, req);
     }
 
+    void async_write_data_and_journal(uint32_t key_size, uint64_t data_size, uint32_t max_size_per_iov) {
+        data_size = data_size == 0 ? g_block_size : data_size;
+        auto req = intrusive< test_repl_req >(new test_repl_req());
+        req->header = sisl::make_byte_array(sizeof(test_repl_req::journal_header));
+        auto hdr = r_cast< test_repl_req::journal_header* >(req->header->bytes());
+        hdr->key_size = key_size;
+        hdr->key_pattern = ((long long)rand() << 32) | rand();
+        hdr->data_size = data_size;
+        hdr->data_pattern = ((long long)rand() << 32) | rand();
+
+        if (key_size != 0) {
+            req->key = sisl::make_byte_array(key_size);
+            HSTestHelper::fill_data_buf(req->key->bytes(), key_size, hdr->key_pattern);
+        }
+
+        req->write_sgs = HSTestHelper::create_sgs(data_size, max_size_per_iov, hdr->data_pattern);
+
+        auto& rdev = (rand() % 2) ? m_repl_dev1 : m_repl_dev2;
+
+        auto const cap = hs()->repl_service().get_cap_stats();
+        LOGDEBUG("Before write, cap stats: used={} total={}", cap.used_capacity, cap.total_capacity);
+
+        std::vector< MultiBlkId > blkids;
+        blk_alloc_hints hints;
+        auto err = rdev->alloc_blks(data_size, hints, blkids);
+        RELEASE_ASSERT(!err, "Error during alloc_blks");
+        RELEASE_ASSERT(!blkids.empty(), "Empty blkids");
+
+        rdev->async_write(blkids, req->write_sgs).thenValue([this, rdev, blkids, data_size, req](auto&& err) {
+            RELEASE_ASSERT(!err, "Error during async_write");
+            rdev->async_write_journal(blkids, *req->header, req->key ? *req->key : sisl::blob{}, data_size, req);
+        });
+    }
+
     void validate_replay(ReplDev& rdev, int64_t lsn, sisl::blob const& header, sisl::blob const& key,
-                         MultiBlkId const& blkids) {
+                         std::vector< MultiBlkId > const& blkids) {
+        if (blkids.empty()) {
+            m_task_waiter.one_complete();
+            return;
+        }
+
         auto const jhdr = r_cast< test_repl_req::journal_header const* >(header.cbytes());
         HSTestHelper::validate_data_buf(key.cbytes(), key.size(), jhdr->key_pattern);
-
-        uint32_t size = blkids.blk_count() * g_block_size;
-        if (size) {
-            auto read_sgs = HSTestHelper::create_sgs(size, size);
-            LOGDEBUG("[{}] Validating replay of lsn={} blkid = {}", boost::uuids::to_string(rdev.group_id()), lsn,
-                     blkids.to_string());
-            rdev.async_read(blkids, read_sgs, size)
-                .thenValue([this, hdr = *jhdr, read_sgs, lsn, blkids, &rdev](auto&& err) {
-                    RELEASE_ASSERT(!err, "Error during async_read");
-                    HS_REL_ASSERT_EQ(hdr.data_size, read_sgs.size, "journal hdr data size mismatch with actual size");
-
-                    for (auto const& iov : read_sgs.iovs) {
-                        HSTestHelper::validate_data_buf(uintptr_cast(iov.iov_base), iov.iov_len, hdr.data_pattern);
-                        iomanager.iobuf_free(uintptr_cast(iov.iov_base));
-                    }
-                    LOGDEBUG("[{}] Replay of lsn={} blkid={} validated successfully",
-                             boost::uuids::to_string(rdev.group_id()), lsn, blkids.to_string());
-                    m_task_waiter.one_complete();
-                });
-        } else {
-            m_task_waiter.one_complete();
+        uint64_t total_io = blkids.size();
+        auto io_count = std::make_shared< std::atomic< uint64_t > >(0);
+        for (const auto& blkid : blkids) {
+            uint32_t size = blkid.blk_count() * g_block_size;
+            if (size) {
+                auto read_sgs = HSTestHelper::create_sgs(size, size);
+                LOGDEBUG("[{}] Validating replay of lsn={} blkid = {}", boost::uuids::to_string(rdev.group_id()), lsn,
+                         blkid.to_string());
+                rdev.async_read(blkid, read_sgs, size)
+                    .thenValue([this, io_count, total_io, hdr = *jhdr, read_sgs, lsn, blkid, &rdev](auto&& err) {
+                        RELEASE_ASSERT(!err, "Error during async_read");
+                        // HS_REL_ASSERT_EQ(hdr.data_size, read_sgs.size,
+                        //                  "journal hdr data size mismatch with actual size");
+
+                        for (auto const& iov : read_sgs.iovs) {
+                            HSTestHelper::validate_data_buf(uintptr_cast(iov.iov_base), iov.iov_len, hdr.data_pattern);
+                            iomanager.iobuf_free(uintptr_cast(iov.iov_base));
+                        }
+                        LOGDEBUG("[{}] Replay of lsn={} blkid={} validated successfully",
+                                 boost::uuids::to_string(rdev.group_id()), lsn, blkid.to_string());
+
+                        io_count->fetch_add(1);
+                        if (*io_count == total_io) { m_task_waiter.one_complete(); }
+                    });
+            } else {
+                m_task_waiter.one_complete();
+            }
         }
     }
 
     void on_write_complete(ReplDev& rdev, intrusive< test_repl_req > req) {
-        // If we did send some data to the repl_dev, validate it by doing async_read
-        if (req->write_sgs.size != 0) {
-            req->read_sgs = HSTestHelper::create_sgs(req->write_sgs.size, req->write_sgs.size);
-
-            auto const cap = hs()->repl_service().get_cap_stats();
-            LOGDEBUG("Write complete with cap stats: used={} total={}", cap.used_capacity, cap.total_capacity);
-
-            rdev.async_read(req->written_blkids, req->read_sgs, req->read_sgs.size)
-                .thenValue([this, &rdev, req](auto&& err) {
-                    RELEASE_ASSERT(!err, "Error during async_read");
-
-                    LOGDEBUG("[{}] Write complete with lsn={} for size={} blkids={}",
-                             boost::uuids::to_string(rdev.group_id()), req->lsn(), req->write_sgs.size,
-                             req->written_blkids.to_string());
-                    auto hdr = r_cast< test_repl_req::journal_header* >(req->header->bytes());
-                    HS_REL_ASSERT_EQ(hdr->data_size, req->read_sgs.size,
-                                     "journal hdr data size mismatch with actual size");
-
-                    for (auto const& iov : req->read_sgs.iovs) {
-                        HSTestHelper::validate_data_buf(uintptr_cast(iov.iov_base), iov.iov_len, hdr->data_pattern);
-                    }
-                    m_io_runner.next_task();
-                });
-        } else {
+        if (req->written_blkids.empty()) {
             m_io_runner.next_task();
+            return;
+        }
+
+        // If we did send some data to the repl_dev, validate it by doing async_read
+        auto io_count = std::make_shared< std::atomic< uint64_t > >(0);
+        for (const auto blkid : req->written_blkids) {
+            if (req->write_sgs.size != 0) {
+                auto const cap = hs()->repl_service().get_cap_stats();
+                LOGDEBUG("Write complete with cap stats: used={} total={}", cap.used_capacity, cap.total_capacity);
+
+                auto sgs_size = blkid.blk_count() * g_block_size;
+                auto read_sgs = HSTestHelper::create_sgs(sgs_size, sgs_size);
+                rdev.async_read(blkid, read_sgs, read_sgs.size)
+                    .thenValue([this, io_count, blkid, &rdev, sgs_size, read_sgs, req](auto&& err) {
+                        RELEASE_ASSERT(!err, "Error during async_read");
+
+                        LOGINFO("[{}] Write complete with lsn={} for size={} blkid={}",
+                                boost::uuids::to_string(rdev.group_id()), req->lsn(), sgs_size, blkid.to_string());
+                        auto hdr = r_cast< test_repl_req::journal_header* >(req->header->bytes());
+                        // HS_REL_ASSERT_EQ(hdr->data_size, read_sgs.size,
+                        //                  "journal hdr data size mismatch with actual size");
+
+                        for (auto const& iov : read_sgs.iovs) {
+                            LOGDEBUG("Read data blkid={} len={} data={}", blkid.to_integer(), iov.iov_len,
+                                     *(uint64_t*)iov.iov_base);
+                            HSTestHelper::validate_data_buf(uintptr_cast(iov.iov_base), iov.iov_len, hdr->data_pattern);
+                            iomanager.iobuf_free(uintptr_cast(iov.iov_base));
+                        }
+                        io_count->fetch_add(1);
+                        if (*io_count == req->written_blkids.size()) { m_io_runner.next_task(); }
+                    });
+            } else {
+                m_io_runner.next_task();
+            }
         }
     }
 };
@@ -295,7 +353,9 @@ TEST_F(SoloReplDevTest, TestRandomSizedDataBlock) {
         uint32_t key_size = rand() % 512 + 8;
         this->write_io(key_size, nblks * g_block_size, g_block_size);
     });
+
     this->m_io_runner.execute().get();
+    LOGINFO("Step 2: Restart homestore and validate replay data.", g_block_size);
     this->m_task_waiter.start([this]() { this->restart(); }).get();
 }
 
@@ -303,6 +363,20 @@ TEST_F(SoloReplDevTest, TestHeaderOnly) {
     LOGINFO("Step 1: run on worker threads to schedule write");
     this->m_io_runner.set_task([this]() { this->write_io(0u, 0u, g_block_size); });
     this->m_io_runner.execute().get();
+    LOGINFO("Step 2: Restart homestore and validate replay data.", g_block_size);
+    this->m_task_waiter.start([this]() { this->restart(); }).get();
+}
+
+TEST_F(SoloReplDevTest, TestAsyncWriteJournal) {
+    LOGINFO("Step 1: run on worker threads to schedule write for random bytes ranging {}-{}.", 0, 1 * Mi);
+    this->m_io_runner.set_task([this]() {
+        uint32_t nblks = rand() % ((1 * Mi) / g_block_size);
+        uint32_t key_size = rand() % 512 + 8;
+        this->async_write_data_and_journal(key_size, nblks * g_block_size, g_block_size);
+    });
+
+    this->m_io_runner.execute().get();
+    LOGINFO("Step 2: Restart homestore and validate replay data.", g_block_size);
     this->m_task_waiter.start([this]() { this->restart(); }).get();
 }