diff --git a/.github/workflows/build_dependencies.yml b/.github/workflows/build_dependencies.yml index 4c92419ec..12d2093de 100644 --- a/.github/workflows/build_dependencies.yml +++ b/.github/workflows/build_dependencies.yml @@ -211,6 +211,8 @@ jobs: - name: Code Coverage Run run: | + du -sh ~/.conan2/p/* + df -h conan build \ -o "sisl/*:prerelease=${{ inputs.prerelease }}" \ -o "sisl/*:malloc_impl=${{ inputs.malloc-impl }}" \ diff --git a/.jenkins/Dockerfile b/.jenkins/Dockerfile index 20c4489b0..dcfdd9d65 100644 --- a/.jenkins/Dockerfile +++ b/.jenkins/Dockerfile @@ -1,5 +1,5 @@ # ########## ####### ############ -FROM hub.tess.io/sds/sds_develop:4.x-latest +FROM hub.tess.io/sds/sds_develop:7.x-latest LABEL description="Automated HomeStore compilation" WORKDIR /output diff --git a/.jenkins/jenkinsfile_nightly b/.jenkins/jenkinsfile_nightly index 7efd9b935..8083c816b 100644 --- a/.jenkins/jenkinsfile_nightly +++ b/.jenkins/jenkinsfile_nightly @@ -1,5 +1,5 @@ pipeline { - agent { label 'sds-builder-2204' } + agent { label 'sds-builder-v5' } triggers { cron('TZ=US/Pacific\nH H(0-2) * * *') } @@ -8,7 +8,7 @@ pipeline { ORG = 'sds' ECR_URL = 'hub.tess.io' ARTIFACTORY_PASS = credentials('ARTIFACTORY_PASS') - CONAN_USER = 'sds' + CONAN_USER = 'oss' failed_stage = "" } stages { @@ -26,6 +26,7 @@ pipeline { VER = sh(script: "grep -m 1 ' version =' conanfile.py | awk '{print \$3}' | tr -d '\n' | tr -d '\"'", returnStdout: true) NIGHTLY_TAG = "master-nightly-debug-4.0" ECR_PATH = "${ECR_URL}/${ORG}/${PROJECT}" + CONAN_FLAGS="--name ${PROJECT} --user ${CONAN_USER} --channel ${NIGHTLY_TAG}" failed_stage = "" } } @@ -40,20 +41,25 @@ pipeline { } stage("Build") { steps { - sh "conan create --build missing -o homestore:sanitize=True -pr debug . ${PROJECT}/${VER}@" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_index_btree' -exec cp {} .jenkins/test_index_btree \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_index_crash_recovery' -exec cp {} .jenkins/test_index_crash_recovery \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_meta_blk_mgr' -exec cp {} .jenkins/test_meta_blk_mgr \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_log_store' -exec cp {} .jenkins/test_log_store \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_home_raft_logstore' -exec cp {} .jenkins/test_home_raft_logstore \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_log_store_long_run' -exec cp {} .jenkins/test_log_store_long_run \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_data_service' -exec cp {} .jenkins/test_data_service \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_raft_repl_dev' -exec cp {} .jenkins/test_raft_repl_dev \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*tests/test_solo_repl_dev' -exec cp {} .jenkins/test_solo_repl_dev \\;" - sh "find ${CONAN_USER_HOME} -type f -wholename '*bin/scripts/index_test.py' -exec install -Dm755 {} .jenkins/index_test.py \\; " - sh "find ${CONAN_USER_HOME} -type f -wholename '*bin/scripts/log_meta_test.py' -exec install -Dm755 {} .jenkins/log_meta_test.py \\; " - sh "find ${CONAN_USER_HOME} -type f -wholename '*bin/scripts/data_test.py' -exec install -Dm755 {} .jenkins/data_test.py \\; " - sh "find ${CONAN_USER_HOME} -type f -wholename '*bin/scripts/long_running.py' -exec install -Dm755 {} .jenkins/long_running.py \\; " + sh ''' + hostname + echo $NODE_NAME + conan create --build missing -s:h build_type=Debug -o ${PROJECT}/*:sanitize=True 
${CONAN_FLAGS} . + + find /home/jenkins -type f -wholename '*/test_index_btree' -exec cp {} .jenkins/test_index_btree \\; + find /home/jenkins -type f -wholename '*/test_index_crash_recovery' -exec cp {} .jenkins/test_index_crash_recovery \\; + find /home/jenkins -type f -wholename '*/test_meta_blk_mgr' -exec cp {} .jenkins/test_meta_blk_mgr \\; + find /home/jenkins -type f -wholename '*/test_log_store' -exec cp {} .jenkins/test_log_store \\; + find /home/jenkins -type f -wholename '*/test_home_raft_logstore' -exec cp {} .jenkins/test_home_raft_logstore \\; + find /home/jenkins -type f -wholename '*/test_log_store_long_run' -exec cp {} .jenkins/test_log_store_long_run \\; + find /home/jenkins -type f -wholename '*/test_data_service' -exec cp {} .jenkins/test_data_service \\; + find /home/jenkins -type f -wholename '*/test_raft_repl_dev' -exec cp {} .jenkins/test_raft_repl_dev \\; + find /home/jenkins -type f -wholename '*/test_solo_repl_dev' -exec cp {} .jenkins/test_solo_repl_dev \\; + find /home/jenkins -type f -wholename '*/test_scripts/index_test.py' -exec install -Dm755 {} .jenkins/index_test.py \\; + find /home/jenkins -type f -wholename '*/test_scripts/log_meta_test.py' -exec install -Dm755 {} .jenkins/log_meta_test.py \\; + find /home/jenkins -type f -wholename '*/test_scripts/data_test.py' -exec install -Dm755 {} .jenkins/data_test.py \\; + find /home/jenkins -type f -wholename '*/test_scripts/long_running.py' -exec install -Dm755 {} .jenkins/long_running.py \\; + ''' } post { failure { diff --git a/CMakeLists.txt b/CMakeLists.txt index 2e90a498b..728a2bdbc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -87,6 +87,18 @@ endif () add_flags("-DPACKAGE_NAME=\\\"${PROJECT_NAME}\\\"") add_flags("-DPACKAGE_VERSION=\\\"${PACKAGE_REVISION}\\\"") +# add replication flag +if (DEFINED REPLICATION) + if (${REPLICATION} STREQUAL "ON") + add_flags("-DREPLICATION_SUPPORT") + message(STATUS "Building with REPLICATION enabled") + else() + message(STATUS "Building with REPLICATION disabled") + endif() +else() + message(STATUS "Building with REPLICATION disabled") +endif() + if(UNIX) # enable proper pread/pwrite and large file add_flags("-D_POSIX_C_SOURCE=200809L -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE") diff --git a/cmake/test_mode.cmake b/cmake/test_mode.cmake index 486186bd5..4195a68b1 100644 --- a/cmake/test_mode.cmake +++ b/cmake/test_mode.cmake @@ -39,6 +39,9 @@ if (DEFINED TEST_TARGET) set(${ret} true) endif() endmacro() + macro(can_build_repl_tests ret) + set(${ret} false) + endmacro() else() macro(can_build_io_tests ret) set(${ret} false) @@ -55,4 +58,7 @@ else() macro(can_build_epoll_io_tests ret) set(${ret} false) endmacro() + macro(can_build_repl_tests ret) + set(${ret} false) + endmacro() endif() diff --git a/conanfile.py b/conanfile.py index 445bd4e0a..fab1039da 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "5.2.2" + version = "5.3.1" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" @@ -25,6 +25,7 @@ class HomestoreConan(ConanFile): "coverage": ['True', 'False'], "sanitize": ['True', 'False'], "testing" : ['full', 'min', 'off', 'epoll_mode', 'spdk_mode'], + "replication" : ['off', 'on'], } default_options = { 'shared': False, @@ -32,6 +33,7 @@ class HomestoreConan(ConanFile): 'coverage': False, 'sanitize': False, 'testing': 'epoll_mode', + 'replication': 'off', } exports_sources = "cmake/*", "src/*", "CMakeLists.txt", "test_wrap.sh", "LICENSE" @@ -54,18 +56,27 
@@ def build_requirements(self): def requirements(self): self.requires("iomgr/[^12.1]@oss/master", transitive_headers=True) self.requires("sisl/[^13.3]@oss/master", transitive_headers=True) - self.requires("nuraft_mesg/[^4.1]@oss/main", transitive_headers=True) + if str(self.options.replication) == "on": + self.requires("nuraft_mesg/[^4.1]@oss/main", transitive_headers=True) self.requires("farmhash/cci.20190513@", transitive_headers=True) if self.settings.arch in ['x86', 'x86_64']: self.requires("isa-l/2.30.0", transitive_headers=True) + # Tests require OpenSSL 3.x + self.requires("openssl/[^3.1]", override=True) + def imports(self): self.copy(root_package="sisl", pattern="*", dst="bin/scripts/python/flip/", src="bindings/flip/python/", keep_path=False) def layout(self): self.folders.source = "." - self.folders.build = join("build", str(self.settings.build_type)) + if self.options.get_safe("sanitize"): + self.folders.build = join("build", "Sanitized") + elif self.options.get_safe("coverage"): + self.folders.build = join("build", "Coverage") + else: + self.folders.build = join("build", str(self.settings.build_type)) self.folders.generators = join(self.folders.build, "generators") self.cpp.source.includedirs = ["src/include"] @@ -94,6 +105,12 @@ def generate(self): tc.variables['BUILD_COVERAGE'] = 'ON' elif self.options.get_safe("sanitize"): tc.variables['MEMORY_SANITIZER_ON'] = 'ON' + tc.variables["CONAN_PACKAGE_NAME"] = self.name + tc.variables["CONAN_PACKAGE_VERSION"] = self.version + if str(self.options.replication) == "on": + tc.variables["REPLICATION"] = "ON" + else: + tc.variables["REPLICATION"] = "OFF" tc.generate() # This generates "boost-config.cmake" and "grpc-config.cmake" etc in self.generators_folder diff --git a/docs/imgs/HomeStore_Disk_Layout2.png b/docs/imgs/HomeStore_Disk_Layout2.png new file mode 100644 index 000000000..8775927ee Binary files /dev/null and b/docs/imgs/HomeStore_Disk_Layout2.png differ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c032ed95d..7b33a68e8 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -8,15 +8,27 @@ find_package(isa-l QUIET) find_package(iomgr QUIET REQUIRED) find_package(farmhash QUIET REQUIRED) find_package(GTest QUIET REQUIRED) -find_package(NuraftMesg QUIET REQUIRED) +if (DEFINED REPLICATION) + if (${REPLICATION} STREQUAL "ON") + find_package(NuraftMesg QUIET REQUIRED) + endif() +endif() list(APPEND COMMON_DEPS iomgr::iomgr farmhash::farmhash - nuraft_mesg::proto - nuraft::nuraft sisl::sisl ) + +if (DEFINED REPLICATION) + if (${REPLICATION} STREQUAL "ON") + list(APPEND COMMON_DEPS + nuraft_mesg::proto + nuraft::nuraft + ) + endif() +endif() + if (${isa-l_FOUND}) list(APPEND COMMON_DEPS isa-l::isa-l) else () @@ -42,7 +54,11 @@ add_subdirectory(lib/logstore) add_subdirectory(lib/meta) add_subdirectory(lib/index) add_subdirectory(lib/blkdata_svc/) -add_subdirectory(lib/replication/) +if (DEFINED REPLICATION) + if (${REPLICATION} STREQUAL "ON") + add_subdirectory(lib/replication/) + endif() +endif() if(NOT DEFINED BUILD_TESTING OR BUILD_TESTING) add_subdirectory(tests) @@ -59,20 +75,19 @@ set(HOMESTORE_OBJECTS $ $ $ - $ lib/homestore.cpp lib/crc.cpp ) -#target_link_libraries(homestore_objs ${COMMON_DEPS}) -if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") - add_library(homestore STATIC - ${HOMESTORE_OBJECTS} - ) -else() - add_library(homestore STATIC - ${HOMESTORE_OBJECTS} - ) + +if (DEFINED REPLICATION) + if (${REPLICATION} STREQUAL "ON") + list(APPEND HOMESTORE_OBJECTS $) + endif() endif() 
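Reviewer note: the effect of the new conan "replication" option is that -DREPLICATION_SUPPORT is only defined when it is set to on, so downstream code has to guard its replication usage. Below is a minimal, hedged sketch of such gating; the HomeStore::instance()-style accessor and the repl_app object are illustrative assumptions, not part of this change.

```cpp
#include <homestore/homestore.hpp>

// Hypothetical bring-up path; only the #ifdef gating reflects this change.
void start_homestore(std::shared_ptr< homestore::ReplApplication > const& repl_app) {
    auto& hs = *homestore::HomeStore::instance(); // accessor name is an assumption for the sketch
    hs.with_log_service();
#ifdef REPLICATION_SUPPORT
    // Compiled in only when the package is built with -o homestore/*:replication=on,
    // which turns on the new REPLICATION CMake flag and defines REPLICATION_SUPPORT.
    hs.with_repl_data_service(repl_app);
#endif
}
```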
+#target_link_libraries(homestore_objs ${COMMON_DEPS}) +add_library(homestore STATIC + ${HOMESTORE_OBJECTS} +) target_compile_definitions (homestore PRIVATE LOG_MODS_V2_SUPPORT) target_link_libraries(homestore PRIVATE ${COMMON_DEPS}) diff --git a/src/include/homestore/blk.h b/src/include/homestore/blk.h index b9e22740c..a3e0a7768 100644 --- a/src/include/homestore/blk.h +++ b/src/include/homestore/blk.h @@ -251,9 +251,14 @@ VENUM(BlkAllocStatus, uint32_t, struct blk_alloc_hints { blk_temp_t desired_temp{0}; // Temperature hint for the device - std::optional< uint32_t > pdev_id_hint{std::nullopt}; // which physical device to pick (hint if any) - std::optional< chunk_num_t > chunk_id_hint{std::nullopt}; // any specific chunk id to pick for this allocation + std::optional< uint32_t > reserved_blks{std::nullopt}; // Reserved blks in a chunk + std::optional< uint32_t > pdev_id_hint{std::nullopt}; // which physical device to pick (hint if any) + std::optional< chunk_num_t > chunk_id_hint{std::nullopt}; // any specific chunk id to pick for this allocation + std::optional< MultiBlkId > committed_blk_id{ + std::nullopt}; // blk id indicates the blk was already allocated and committed, don't allocate and commit again std::optional< stream_id_t > stream_id_hint{std::nullopt}; // any specific stream to pick + std::optional< uint64_t > application_hint{ + std::nullopt}; // hints in uint64 what will be passed opaque to select_chunk bool can_look_for_other_chunk{true}; // If alloc on device not available can I pick other device bool is_contiguous{true}; // Should the entire allocation be one contiguous block bool partial_alloc_ok{false}; // ok to allocate only portion of nblks? Mutually exclusive with is_contiguous diff --git a/src/include/homestore/blkdata_service.hpp b/src/include/homestore/blkdata_service.hpp index b82ec886b..69b2f2ee4 100644 --- a/src/include/homestore/blkdata_service.hpp +++ b/src/include/homestore/blkdata_service.hpp @@ -56,17 +56,19 @@ class BlkDataService { /** * @brief Creates a new virtual device with the specified size and block size, using the specified - * block allocator and chunk selector types. The virtual device will be composed of the specified - * number of chunks. + * block allocator and chunk selector types. The virtual device will be composed of a number of chunks. + * Either `num_chunks` or `chunk_size` must be specified. + * Prioritize `num_chunks` over `chunk_size` if both are provided. * * @param size The size of the virtual device, in bytes. * @param blk_size The size of each block in the virtual device, in bytes. * @param alloc_type The type of block allocator to use for the virtual device. * @param chunk_sel_type The type of chunk selector to use for the virtual device. * @param num_chunks The number of chunks to use for the virtual device. + * @param chunk_size The size of chunks to use for the virtual device, in bytes. */ void create_vdev(uint64_t size, HSDevType devType, uint32_t blk_size, blk_allocator_type_t alloc_type, - chunk_selector_type_t chunk_sel_type, uint32_t num_chunks); + chunk_selector_type_t chunk_sel_type, uint32_t num_chunks, uint32_t chunk_size); /** * @brief Opens a virtual device with the specified virtual device information. 
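To make the num_chunks/chunk_size contract above concrete, a small hedged sketch of a caller follows. The allocator and chunk-selector enum values are illustrative, and the assumption that passing 0 leaves chunk_size unspecified is mine, not stated by the header.

```cpp
#include <homestore/blkdata_service.hpp>

void format_data_vdev(homestore::HSDevType dev_type) {
    using namespace homestore;
    // Either num_chunks or chunk_size must be provided; num_chunks takes priority if both are set.
    data_service().create_vdev(100ull * 1024 * 1024 * 1024,        // size: 100 GiB
                               dev_type,                           // device tier to place the vdev on
                               4096,                               // blk_size in bytes
                               blk_allocator_type_t::append,       // allocator type (illustrative)
                               chunk_selector_type_t::ROUND_ROBIN, // chunk selector (illustrative)
                               64,                                 // num_chunks
                               0 /* chunk_size, assumed 0 == unspecified */);
}
```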
@@ -112,6 +114,18 @@ class BlkDataService { folly::Future< std::error_code > async_write(sisl::sg_list const& sgs, MultiBlkId const& in_blkids, bool part_of_batch = false); + /** + * @brief : Asynchronous write to the given input block ids; + * + * @param sgs : the data buffer that needs to be written + * @param in_blkids : input block ids that this write should be written to; + * @param part_of_batch : is this write part of a batch; + */ + folly::Future< std::error_code > async_write(sisl::sg_list const& sgs, std::vector< MultiBlkId > const& in_blkids, + bool part_of_batch = false); + /** * @brief Asynchronously reads data from the specified block ID into the provided buffer. * @@ -137,6 +151,13 @@ class BlkDataService { folly::Future< std::error_code > async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, bool part_of_batch = false); + /** + * @brief Submit the io batch. This is mandatory to call if reads/writes were issued with part_of_batch + * set to true; without this method, those IOs might never be issued. No-op if the previous io requests + * were not part of a batch. + * */ + void submit_io_batch(); + /** * @brief Commits the block with the given MultiBlkId. * @@ -145,7 +166,8 @@ class BlkDataService { BlkAllocStatus commit_blk(MultiBlkId const& bid); /** - * @brief Allocates a contiguous block of disk space of the given size. + * @brief Allocates a contiguous block of disk space of the given size. This API should be called when the consumer + * expects all blks to be allocated on the same chunk. * * @param size The size of the block to allocate, in bytes. * @param hints Hints for how to allocate the block. @@ -154,6 +176,17 @@ class BlkDataService { */ BlkAllocStatus alloc_blks(uint32_t size, blk_alloc_hints const& hints, MultiBlkId& out_blkids); + /** + * @brief Allocates blocks of disk space of the given size. This API should be called when the consumer can accept + * blk allocation spread across different chunks. + * + * @param size The size of the block to allocate, in bytes. + * @param hints Hints for how to allocate the block. + * @param out_blkids Output parameter that will be filled with the IDs of the allocated blocks. + * @return The status of the block allocation attempt. + */ + BlkAllocStatus alloc_blks(uint32_t size, blk_alloc_hints const& hints, std::vector< BlkId >& out_blkids); + /** * @brief Asynchronously frees the specified block IDs. * It is asynchronous because it might need to wait for pending read to complete if same block is being read and not @@ -194,10 +227,35 @@ class BlkDataService { */ void start(); + /** + * @brief Gets the total capacity of the block data service. + * + * This function returns the total capacity of the block data service, in bytes. + * + * @return The total capacity of the block data service, in bytes. + */ uint64_t get_total_capacity() const; + /** + * @brief Gets the used capacity of the block data service. + * + * This function returns the used capacity of the block data service, in bytes. + * + * @return The used capacity of the block data service, in bytes. + */ uint64_t get_used_capacity() const; + /** + * @brief Gets the drive type of the data service. + * + * Data Service doesn't support mixed drive types. + * + * @return The drive type of the data service, HDD or NVME.
+ */ + HSDevType get_dev_type() const; + + void stop(); + private: /** * @brief Initializes the block data service. diff --git a/src/include/homestore/btree/btree.hpp b/src/include/homestore/btree/btree.hpp index ee65a8d0d..c159dc2f3 100644 --- a/src/include/homestore/btree/btree.hpp +++ b/src/include/homestore/btree/btree.hpp @@ -261,8 +261,10 @@ class Btree : public BtreeBase { btree_status_t do_sweep_query(BtreeNodePtr& my_node, BtreeQueryRequest< K >& qreq, std::vector< std::pair< K, V > >& out_values); + btree_status_t do_traversal_query(const BtreeNodePtr& my_node, BtreeQueryRequest< K >& qreq, std::vector< std::pair< K, V > >& out_values); + #ifdef SERIALIZABLE_QUERY_IMPLEMENTATION btree_status_t do_serialzable_query(const BtreeNodePtr& my_node, BtreeSerializableQueryRequest& qreq, std::vector< std::pair< K, V > >& out_values); diff --git a/src/include/homestore/btree/detail/btree_mutate_impl.ipp b/src/include/homestore/btree/detail/btree_mutate_impl.ipp index 0df733575..0a8f57686 100644 --- a/src/include/homestore/btree/detail/btree_mutate_impl.ipp +++ b/src/include/homestore/btree/detail/btree_mutate_impl.ipp @@ -357,6 +357,11 @@ btree_status_t Btree< K, V >::split_node(const BtreeNodePtr& parent_node, const child_node1->inc_link_version(); // Update the existing parent node entry to point to second child ptr. + // Don't change the order: first update the parent node, then insert the new key. This is important for the case + // where the split key is the last key in the parent node. If we insert the split key first, the last key in the + // parent node will be lost, which leads to an inconsistency in the tree. In case of an empty parent (i.e., a new + // root) or when updating the edge, this order ensures that the edge is updated. parent_node->update(parent_ind, child_node2->link_info()); parent_node->insert(parent_ind, *out_split_key, child_node1->link_info()); diff --git a/src/include/homestore/btree/detail/btree_node.hpp b/src/include/homestore/btree/detail/btree_node.hpp index b58174dc3..8bf83966c 100644 --- a/src/include/homestore/btree/detail/btree_node.hpp +++ b/src/include/homestore/btree/detail/btree_node.hpp @@ -364,6 +364,7 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { template < typename K > K get_first_key() const { + if (total_entries() == 0) { return K{}; } return get_nth_key< K >(0, true); } @@ -463,6 +464,12 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { } fmt::format_to(std::back_inserter(str), "]"); } + + // Should not happen + if (this->is_node_deleted()) { + fmt::format_to(std::back_inserter(str), " **DELETED** "); + } + return str; } diff --git a/src/include/homestore/btree/detail/btree_remove_impl.ipp b/src/include/homestore/btree/detail/btree_remove_impl.ipp index 67acd7d5a..04e483377 100644 --- a/src/include/homestore/btree/detail/btree_remove_impl.ipp +++ b/src/include/homestore/btree/detail/btree_remove_impl.ipp @@ -469,6 +469,9 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const BT_NODE_LOG_ASSERT_EQ(child->is_node_deleted(), false, child); old_nodes.push_back(child); + // Todo: need a more precise calculation that considers the compacted size for prefix nodes, because when a merge + // happens compaction will occur for both the leftmost and the new nodes. The current calculation can leave the + // available size unbalanced between the leftmost node and the new nodes.
total_size += child->occupied_size(); } @@ -506,6 +509,13 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const auto const nentries = old_nodes[i]->num_entries_by_size(0, available_size); if ((old_nodes[i]->total_entries() - nentries) == 0) { // Entire node goes in available_size -= old_nodes[i]->occupied_size(); + // For prefix nodes, compaction will make the size smaller, so we can compact saving to available size; + // hence it cannot get negative. + if (old_nodes[i]->get_node_type() == btree_node_type::PREFIX) { + auto cur_node = static_cast< FixedPrefixNode< K, V >* >(old_nodes[i].get()); + available_size += cur_node->compact_saving(); + } + BT_NODE_DBG_ASSERT_EQ(available_size >= 0, true, leftmost_node, "negative available size"); if (i >= old_nodes.size() - 1) { src_cursor.ith_node = i + 1; src_cursor.nth_entry = std::numeric_limits< uint32_t >::max(); diff --git a/src/include/homestore/btree/node_variant/prefix_node.hpp b/src/include/homestore/btree/node_variant/prefix_node.hpp index 2892aec63..7db486f88 100644 --- a/src/include/homestore/btree/node_variant/prefix_node.hpp +++ b/src/include/homestore/btree/node_variant/prefix_node.hpp @@ -316,6 +316,7 @@ class FixedPrefixNode : public VariantNode< K, V > { ///////////////////////////// All overrides of BtreeNode /////////////////////////////////// void get_nth_key_internal(uint32_t idx, BtreeKey& out_key, bool) const override { + DEBUG_ASSERT_LT(idx, this->total_entries(), "node={}", to_string()); suffix_entry const* sentry = get_suffix_entry_c(idx); prefix_entry const* pentry = get_prefix_entry_c(sentry->prefix_slot); DEBUG_ASSERT(prefix_bitset_.is_bit_set(cbitset_blob(), sentry->prefix_slot), @@ -337,10 +338,16 @@ class FixedPrefixNode : public VariantNode< K, V > { } } + uint16_t get_nth_suffix_slot_num(uint32_t idx) const { return get_suffix_entry_c(idx)->prefix_slot; } + + uint16_t get_nth_prefix_ref_count(uint32_t idx) const { + return get_prefix_entry_c(get_suffix_entry_c(idx)->prefix_slot)->ref_count; + } + uint32_t available_size() const override { auto num_holes = num_prefix_holes(); if (num_holes > prefix_node_header::min_holes_to_compact) { - return available_size_without_compaction() + (num_holes * prefix_entry::size()); + return available_size_with_compaction(); } else { return available_size_without_compaction(); } @@ -424,7 +431,6 @@ class FixedPrefixNode : public VariantNode< K, V > { // part of Step 1, except generation count this->inc_gen(); dst_node.inc_gen(); - auto new_phdr = dst_node.prefix_header(); if (!this->is_leaf() && (dst_node.total_entries() != 0)) { // Incase this node is an edge node, move the stick to the right hand side node @@ -660,10 +666,10 @@ class FixedPrefixNode : public VariantNode< K, V > { } std::string to_string(bool print_friendly = false) const override { - auto str = fmt::format("{}id={} level={} nEntries={} {} next_node={} ", + auto str = fmt::format("{}id={} level={} nEntries={} {} next_node={} available_size={} ", (print_friendly ? "------------------------------------------------------------\n" : ""), this->node_id(), this->level(), this->total_entries(), - (this->is_leaf() ? "LEAF" : "INTERIOR"), this->next_bnode()); + (this->is_leaf() ? 
"LEAF" : "INTERIOR"), this->next_bnode(), this->available_size()); if (!this->is_leaf() && (this->has_valid_edge())) { fmt::format_to(std::back_inserter(str), "edge_id={}.{}", this->edge_info().m_bnodeid, this->edge_info().m_link_version); @@ -674,9 +680,10 @@ class FixedPrefixNode : public VariantNode< K, V > { prefix_bitset_.to_string(cbitset_blob())); for (uint32_t i{0}; i < this->total_entries(); ++i) { - fmt::format_to(std::back_inserter(str), "{}Entry{} [Key={} Val={}]", (print_friendly ? "\n\t" : " "), i + 1, - BtreeNode::get_nth_key< K >(i, false).to_string(), - this->get_nth_value(i, false).to_string()); + fmt::format_to(std::back_inserter(str), "{}Entry{} [Key={} Val={} slot#={} ref_count={}]", + (print_friendly ? "\n\t" : " "), i + 1, BtreeNode::get_nth_key< K >(i, false).to_string(), + this->get_nth_value(i, false).to_string(), this->get_nth_suffix_slot_num(i), + this->get_nth_prefix_ref_count(i)); } return str; } @@ -705,7 +712,10 @@ class FixedPrefixNode : public VariantNode< K, V > { auto phdr = prefix_header(); ++phdr->used_slots; - if (slot_num > phdr->tail_slot) { phdr->tail_slot = slot_num; } + if (s_cast< uint16_t >(slot_num) >= phdr->tail_slot) { phdr->tail_slot = slot_num + 1; } + + DEBUG_ASSERT_LE(phdr->used_slots, phdr->tail_slot, "Prefix slot number {} is not less than tail slot number {}", + slot_num, phdr->tail_slot); return slot_num; } @@ -720,9 +730,9 @@ class FixedPrefixNode : public VariantNode< K, V > { if (--pentry->ref_count == 0) { --phdr->used_slots; prefix_bitset_.reset_bit(sisl::blob{bitset_area(), uint32_cast(bitset_size())}, slot_num); - if ((slot_num != 0) && (slot_num == phdr->tail_slot)) { + if ((slot_num == phdr->tail_slot - 1)) { uint16_t prev_slot = prefix_bitset_.get_prev_set_bit(cbitset_blob(), slot_num); - if (prev_slot != std::numeric_limits< uint16_t >::max()) { phdr->tail_slot = prev_slot; } + phdr->tail_slot = prev_slot + 1u; } } } @@ -736,12 +746,14 @@ class FixedPrefixNode : public VariantNode< K, V > { uint32_t available_size_without_compaction() const { uint8_t const* suffix = r_cast< uint8_t const* >(get_suffix_entry_c(this->total_entries())); - uint8_t const* prefix = r_cast< uint8_t const* >(get_prefix_entry_c(cprefix_header()->tail_slot)); + uint8_t const* prefix = + r_cast< uint8_t const* >(get_prefix_entry_c(cprefix_header()->tail_slot)) + prefix_entry::size(); if (suffix <= prefix) { return prefix - suffix; } else { - DEBUG_ASSERT(false, "Node data is corrupted, suffix area is overlapping prefix area"); + DEBUG_ASSERT(false, "Node data is corrupted, suffix area is overlapping prefix area {}", + int64_t(suffix - prefix)); return 0; } } @@ -760,7 +772,8 @@ class FixedPrefixNode : public VariantNode< K, V > { uint32_t num_prefix_holes() const { auto phdr = cprefix_header(); - return (phdr->tail_slot + 1 - phdr->used_slots); + DEBUG_ASSERT_LE(phdr->used_slots, phdr->tail_slot, "Prefix slot number is not less than tail slot number"); + return (phdr->tail_slot - phdr->used_slots); } bool is_compaction_suggested() const { return (num_prefix_holes() > prefix_node_header::min_holes_to_compact); } @@ -803,6 +816,9 @@ class FixedPrefixNode : public VariantNode< K, V > { // Finally adjust the tail offset to the compacted area. 
auto phdr = prefix_header(); phdr->tail_slot = phdr->used_slots; + DEBUG_ASSERT_EQ(phdr->tail_slot, prefix_bitset_.get_next_reset_bit(cbitset_blob(), 0u), + "Tail slot is not equal to the next reset bit, not expected"); + DEBUG_ASSERT_EQ(this->num_prefix_holes(), 0, "Shouldn't be any hole after compression, not expected"); } #ifndef NDEBUG @@ -843,13 +859,15 @@ class FixedPrefixNode : public VariantNode< K, V > { uint8_t const* csuffix_kv_area() const { return cbitset_area() + bitset_size(); } prefix_entry* get_prefix_entry(uint16_t slot_num) { - return r_cast< prefix_entry* >(this->node_data_area() + - (this->node_data_size() - ((slot_num + 1) * prefix_entry::size()))); + return r_cast< prefix_entry* >( + this->node_data_area() + + (this->node_data_size() - (s_cast< uint16_t >(slot_num + 1) * prefix_entry::size()))); } prefix_entry const* get_prefix_entry_c(uint16_t slot_num) const { - return r_cast< prefix_entry const* >(this->node_data_area_const() + - (this->node_data_size() - ((slot_num + 1) * prefix_entry::size()))); + return r_cast< prefix_entry const* >( + this->node_data_area_const() + + (this->node_data_size() - (static_cast< uint16_t >(slot_num + 1) * prefix_entry::size()))); } suffix_entry* get_suffix_entry(uint16_t idx) { diff --git a/src/include/homestore/homestore.hpp b/src/include/homestore/homestore.hpp index 099cef8ac..1bdb86ab3 100644 --- a/src/include/homestore/homestore.hpp +++ b/src/include/homestore/homestore.hpp @@ -62,11 +62,6 @@ using HomeStoreSafePtr = std::shared_ptr< HomeStore >; using hs_before_services_starting_cb_t = std::function< void(void) >; -struct hs_stats { - uint64_t total_capacity{0ul}; - uint64_t used_capacity{0ul}; -}; - ENUM(ServiceType, uint32_t, // List of all services we support META = 0, // Meta Service LOG = 1, // Log Service @@ -83,6 +78,13 @@ ENUM(ServiceSubType, uint32_t, // All sub types within services. 
At this po INDEX_BTREE_MEMORY = 3, // Memory based index ); +using hs_before_services_starting_cb_t = std::function< void(void) >; + +struct hs_stats { + uint64_t total_capacity{0ul}; + uint64_t used_capacity{0ul}; +}; + VENUM(hs_vdev_type_t, uint32_t, DATA_VDEV = 1, INDEX_VDEV = 2, META_VDEV = 3, LOGDEV_VDEV = 4); #pragma pack(1) @@ -131,7 +133,9 @@ class HomeStore { std::unique_ptr< MetaBlkService > m_meta_service; std::unique_ptr< LogStoreService > m_log_service; std::unique_ptr< IndexService > m_index_service; +#ifdef REPLICATION_SUPPORT std::shared_ptr< ReplicationService > m_repl_service; +#endif std::unique_ptr< DeviceManager > m_dev_mgr; shared< sisl::logging::logger_t > m_periodic_logger; @@ -163,8 +167,10 @@ class HomeStore { HomeStore& with_log_service(); HomeStore& with_index_service(std::unique_ptr< IndexServiceCallbacks > cbs, std::vector< ServiceSubType > sub_types); +#ifdef REPLICATION_SUPPORT HomeStore& with_repl_data_service(cshared< ReplApplication >& repl_app, cshared< ChunkSelector >& custom_chunk_selector = nullptr); +#endif bool start(const hs_input_params& input, hs_before_services_starting_cb_t svcs_starting_cb = nullptr); void format_and_start(std::map< ServiceId, hs_format_params >&& format_opts); @@ -189,7 +195,9 @@ class HomeStore { if (!m_index_service) { throw std::runtime_error("index_service is nullptr"); } return *m_index_service; } +#ifdef REPLICATION_SUPPORT ReplicationService& repl_service() { return *m_repl_service; } +#endif DeviceManager* device_mgr() { return m_dev_mgr.get(); } ResourceMgr& resource_mgr() { return *m_resource_mgr.get(); } CPManager& cp_mgr() { return *m_cp_mgr.get(); } diff --git a/src/include/homestore/homestore_decl.hpp b/src/include/homestore/homestore_decl.hpp index 3d1f75135..859b4c59c 100644 --- a/src/include/homestore/homestore_decl.hpp +++ b/src/include/homestore/homestore_decl.hpp @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -169,6 +170,8 @@ struct hs_input_params { uint64_t app_mem_size{static_cast< uint64_t >(1024) * static_cast< uint64_t >(1024) * static_cast< uint64_t >(1024)}; // memory available for the app (including cache) uint64_t hugepage_size{0}; // memory available for the hugepage + int max_data_size{0}; // max data size in byte on the data plane + int max_snapshot_batch_size{0}; // max snapshot batch size in byte for the raft state machine bool is_read_only{false}; // Is read only bool auto_recovery{true}; // Recovery of data is automatic or controlled by the caller diff --git a/src/include/homestore/logstore/log_store.hpp b/src/include/homestore/logstore/log_store.hpp index a2091f114..91735be79 100644 --- a/src/include/homestore/logstore/log_store.hpp +++ b/src/include/homestore/logstore/log_store.hpp @@ -173,6 +173,15 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { logdev_key get_trunc_ld_key() const { return m_trunc_ld_key; } + /** + * @brief Get the truncation information for this log store. It is called during log device truncation + * + * @return tuple of (start_lsn, trunc_ld_key, tail_lsn) If the log store is empty, it will return + * an out_of_bound_ld_key as trunc_ld_key. + * + * @note ensure that no new logs are flushed between calling this function and completing the truncation, + * as this could result in an inaccurate out_of_bound_ld_key. 
+ * */ std::tuple< logstore_seq_num_t, logdev_key, logstore_seq_num_t > truncate_info() const; sisl::StreamTracker< logstore_record >& log_records() { return m_records; } @@ -231,6 +240,8 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { bool rollback(logstore_seq_num_t to_lsn); auto start_lsn() const { return m_start_lsn.load(std::memory_order_acquire); } + auto tail_lsn() const { return m_tail_lsn.load(std::memory_order_acquire); } + auto next_lsn() const { return m_next_lsn.load(std::memory_order_acquire); } nlohmann::json dump_log_store(const log_dump_req& dump_req = log_dump_req()); diff --git a/src/include/homestore/logstore/log_store_internal.hpp b/src/include/homestore/logstore/log_store_internal.hpp index 551f15ea8..7768086ee 100644 --- a/src/include/homestore/logstore/log_store_internal.hpp +++ b/src/include/homestore/logstore/log_store_internal.hpp @@ -52,6 +52,12 @@ typedef std::function< void(std::shared_ptr< HomeLogStore >, logstore_seq_num_t) typedef int64_t logid_t; +VENUM(flush_mode_t, uint32_t, // Various flush modes (can be or'ed together) + INLINE = 1 << 0, // Allow flush inline with the append + TIMER = 1 << 1, // Allow timer based automatic flush + EXPLICIT = 1 << 2, // Allow explicit flush calls from the user +); + struct logdev_key { logid_t idx; off_t dev_offset; @@ -85,7 +91,8 @@ struct logdev_key { std::string to_string() const { return fmt::format("Logid={} devoffset={}", idx, dev_offset); } static const logdev_key& out_of_bound_ld_key() { - static constexpr logdev_key s_out_of_bound_ld_key{std::numeric_limits< logid_t >::max(), 0}; + static constexpr logdev_key s_out_of_bound_ld_key{std::numeric_limits< logid_t >::max(), + std::numeric_limits< off_t >::max()}; return s_out_of_bound_ld_key; } }; @@ -171,4 +178,5 @@ struct logstore_superblk { logstore_seq_num_t m_first_seq_num{0}; }; #pragma pack() + } // namespace homestore \ No newline at end of file diff --git a/src/include/homestore/logstore_service.hpp b/src/include/homestore/logstore_service.hpp index 44ba1ab53..039e14114 100644 --- a/src/include/homestore/logstore_service.hpp +++ b/src/include/homestore/logstore_service.hpp @@ -93,7 +93,7 @@ class LogStoreService { * chunks. Logdev can start with zero chunks and dynamically add chunks based on write request. * @return Newly created log dev id. */ - logdev_id_t create_new_logdev(); + logdev_id_t create_new_logdev(flush_mode_t flush_mode); /** * @brief Open a log dev. * * @param logdev_id: Logdev ID * @return Newly created log dev id. */ - void open_logdev(logdev_id_t logdev_id); + void open_logdev(logdev_id_t logdev_id, flush_mode_t flush_mode); /** * @brief Destroy a log dev.
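Since flush_mode_t is documented above as OR-able, here is a short sketch of how the new create_new_logdev()/open_logdev() parameters might be used. The logstore_service() accessor mirrors how other services (e.g. data_service()) are reached elsewhere in HomeStore; treat the exact call site as an assumption.

```cpp
#include <homestore/logstore_service.hpp>

void setup_logdev() {
    using namespace homestore;
    // Flush inline with appends and also via the periodic timer.
    auto const mode = static_cast< flush_mode_t >(static_cast< uint32_t >(flush_mode_t::INLINE) |
                                                  static_cast< uint32_t >(flush_mode_t::TIMER));
    logdev_id_t const id = logstore_service().create_new_logdev(mode);

    // On a later restart the same policy is supplied when re-opening it.
    logstore_service().open_logdev(id, mode);
}
```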
@@ -132,7 +132,8 @@ class LogStoreService { * @return std::shared_ptr< HomeLogStore > */ folly::Future< shared< HomeLogStore > > open_log_store(logdev_id_t logdev_id, logstore_id_t store_id, - bool append_mode); + bool append_mode, log_found_cb_t log_found_cb = nullptr, + log_replay_done_cb_t log_replay_done_cb = nullptr); /** * @brief Close the log store instance and free-up the resources @@ -176,7 +177,7 @@ class LogStoreService { void delete_unopened_logdevs(); private: - std::shared_ptr< LogDev > create_new_logdev_internal(logdev_id_t logdev_id); + std::shared_ptr< LogDev > create_new_logdev_internal(logdev_id_t logdev_id, flush_mode_t flush_mode); void on_meta_blk_found(const sisl::byte_view& buf, void* meta_cookie); logdev_id_t get_next_logdev_id(); void logdev_super_blk_found(const sisl::byte_view& buf, void* meta_cookie); diff --git a/src/include/homestore/replication/repl_decls.h b/src/include/homestore/replication/repl_decls.h index 994da7d97..88a928aa3 100644 --- a/src/include/homestore/replication/repl_decls.h +++ b/src/include/homestore/replication/repl_decls.h @@ -15,20 +15,24 @@ namespace homestore { VENUM(ReplServiceError, int32_t, OK = 0, // Everything OK CANCELLED = -1, // Request was cancelled - TIMEOUT = -2, - NOT_LEADER = -3, - BAD_REQUEST = -4, - SERVER_ALREADY_EXISTS = -5, + TIMEOUT = -2, + NOT_LEADER = -3, + BAD_REQUEST = -4, + SERVER_ALREADY_EXISTS = -5, CONFIG_CHANGING = -6, - SERVER_IS_JOINING = -7, - SERVER_NOT_FOUND = -8, - CANNOT_REMOVE_LEADER = -9, + SERVER_IS_JOINING = -7, + SERVER_NOT_FOUND = -8, + CANNOT_REMOVE_LEADER = -9, SERVER_IS_LEAVING = -10, - TERM_MISMATCH = -11, - RESULT_NOT_EXIST_YET = -10000, + TERM_MISMATCH = -11, + RETRY_REQUEST = -12, + RESULT_NOT_EXIST_YET = -10000, NOT_IMPLEMENTED = -10001, NO_SPACE_LEFT = -20000, DRIVE_WRITE_ERROR = -20001, + DATA_DUPLICATED = -20002, + QUIENCE_STATE = -20003, + QUORUM_NOT_MET = -20004, FAILED = -32768); // clang-format on @@ -68,9 +72,20 @@ struct peer_info { // Peer ID. replica_id_t id_; // The last replication index that the peer has, from this server's point of view. - uint64_t replication_idx_; + uint64_t replication_idx_ = 0; // The elapsed time since the last successful response from this peer, set to 0 on leader - uint64_t last_succ_resp_us_; + uint64_t last_succ_resp_us_ = 0; + // The priority for leader election + uint32_t priority_ = 0; + // Whether the peer can vote. If a peer is learner, this will be false. Hide the raft details. 
+ bool can_vote = true; +}; + +struct replica_member_info { + static constexpr uint64_t max_name_len = 128; + replica_id_t id; + char name[max_name_len]; + int32_t priority{0}; }; } // namespace homestore diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index 9965ada5d..45e2488c6 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -11,6 +11,7 @@ #include #include #include +#include #include namespace nuraft { @@ -28,6 +29,7 @@ struct repl_req_ctx; using raft_buf_ptr_t = nuraft::ptr< nuraft::buffer >; using raft_cluster_config_ptr_t = nuraft::ptr< nuraft::cluster_config >; using repl_req_ptr_t = boost::intrusive_ptr< repl_req_ctx >; +using trace_id_t = u_int64_t; VENUM(repl_req_state_t, uint32_t, INIT = 0, // Initial state @@ -36,19 +38,27 @@ VENUM(repl_req_state_t, uint32_t, DATA_WRITTEN = 1 << 2, // Data has been written to the storage LOG_RECEIVED = 1 << 3, // Log is received and waiting for data LOG_FLUSHED = 1 << 4, // Log has been flushed - ERRORED = 1 << 5 // Error has happened and cleaned up + ERRORED = 1 << 5, // Error has happened and cleaned up + DATA_COMMITTED = 1 << 6 // Data has already been committed, used in duplication handling, will skip commit_blk ) VENUM(journal_type_t, uint16_t, HS_DATA_LINKED = 0, // Linked data where each entry will store physical blkid where data reside HS_DATA_INLINED = 1, // Data is inlined in the header of journal entry - HS_CTRL_DESTROY = 2 // Control message to destroy the repl_dev + HS_CTRL_DESTROY = 2, // Control message to destroy the repl_dev + HS_CTRL_START_REPLACE = 3, // Control message to start replace a member + HS_CTRL_COMPLETE_REPLACE = 4, // Control message to complete replace a member ) +// magic num comes from the first 8 bytes of 'echo homestore_resync_data | md5sum' +static constexpr uint64_t HOMESTORE_RESYNC_DATA_MAGIC = 0xa65dbd27c213f327; +static constexpr uint32_t HOMESTORE_RESYNC_DATA_PROTOCOL_VERSION_V1 = 0x01; + struct repl_key { - int32_t server_id{0}; // Server Id which this req is originated from - uint64_t term; // RAFT term number - uint64_t dsn{0}; // Data sequence number to tie the data with the raft journal entry + int32_t server_id{0}; // Server Id which this req is originated from + uint64_t term; // RAFT term number + uint64_t dsn{0}; // Data sequence number to tie the data with the raft journal entry + trace_id_t traceID{0}; // tracing ID provided by application that connects logs. struct Hasher { size_t operator()(repl_key const& rk) const { @@ -66,14 +76,13 @@ struct repl_key { using repl_snapshot = nuraft::snapshot; using repl_snapshot_ptr = nuraft::ptr< nuraft::snapshot >; -// Consumers of the ReplDevListener dont have to know what underlying -// snapshot implementation is used. Consumers can export and save the state -// of the snapshot using serialize and load the state using deserialize. +// Consumers of ReplDevListener don't have to know what underlying snapshot context implementation is used by the +// ReplDev. The state of the snapshot can be exported with serialize() and loaded with +// repl_dev.deserialize_snapshot_context(). 
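To illustrate the comment above (sketch only; the persistence helpers are placeholders, not HomeStore APIs): the consumer treats the snapshot state as opaque bytes from serialize() and, on the way back, lets the ReplDev pick the concrete snapshot_context implementation via deserialize_snapshot_context().

```cpp
#include <homestore/replication/repl_dev.h>

// Placeholder persistence hooks for the example only.
void save_to_meta_blk(sisl::io_blob_safe const& blob);
sisl::io_blob_safe load_from_meta_blk();

void persist_snapshot_state(std::shared_ptr< homestore::snapshot_context > const& ctx) {
    sisl::io_blob_safe blob = ctx->serialize(); // opaque to the consumer
    save_to_meta_blk(blob);
}

std::shared_ptr< homestore::snapshot_context > restore_snapshot_state(homestore::ReplDev& rdev) {
    sisl::io_blob_safe blob = load_from_meta_blk();
    return rdev.deserialize_snapshot_context(blob); // ReplDev owns the concrete type
}
```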
class snapshot_context { public: snapshot_context(int64_t lsn) : lsn_(lsn) {} virtual ~snapshot_context() = default; - virtual void deserialize(const sisl::io_blob_safe& snp_ctx) = 0; virtual sisl::io_blob_safe serialize() = 0; int64_t get_lsn() { return lsn_; } @@ -81,74 +90,69 @@ class snapshot_context { int64_t lsn_; }; -class nuraft_snapshot_context : public snapshot_context { -public: - nuraft_snapshot_context(nuraft::snapshot& snp) : snapshot_context(snp.get_last_log_idx()) { - auto snp_buf = snp.serialize(); - snapshot_ = nuraft::snapshot::deserialize(*snp_buf); - } - - void deserialize(const sisl::io_blob_safe& snp_ctx) override { - // Load the context from the io blob to nuraft buffer. - auto snp_buf = nuraft::buffer::alloc(snp_ctx.size()); - nuraft::buffer_serializer bs(snp_buf); - bs.put_raw(snp_ctx.cbytes(), snp_ctx.size()); - snapshot_ = nuraft::snapshot::deserialize(bs); - lsn_ = snapshot_->get_last_log_idx(); - } - - sisl::io_blob_safe serialize() override { - // Dump the context from nuraft buffer to the io blob. - auto snp_buf = snapshot_->serialize(); - sisl::io_blob_safe blob{s_cast< size_t >(snp_buf->size())}; - std::memcpy(blob.bytes(), snp_buf->data_begin(), snp_buf->size()); - return blob; - } - - nuraft::ptr< nuraft::snapshot > nuraft_snapshot() { return snapshot_; } - -private: - nuraft::ptr< nuraft::snapshot > snapshot_; -}; - -struct snapshot_data { +struct snapshot_obj { void* user_ctx{nullptr}; - int64_t offset{0}; + uint64_t offset{0}; sisl::io_blob_safe blob; bool is_first_obj{false}; bool is_last_obj{false}; }; +// HomeStore has some meta information to be transmitted during the baseline resync, +// Although now only dsn needs to be synced, this structure is defined as a general message, and we can easily add data +// if needed in the future. +struct snp_repl_dev_data { + uint64_t magic_num{HOMESTORE_RESYNC_DATA_MAGIC}; + uint32_t protocol_version{HOMESTORE_RESYNC_DATA_PROTOCOL_VERSION_V1}; + uint32_t crc{0}; + uint64_t dsn{0}; +}; + struct repl_journal_entry; struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost::thread_safe_counter >, sisl::ObjLifeCounter< repl_req_ctx > { friend class SoloReplDev; public: - repl_req_ctx() {} + repl_req_ctx() { m_start_time = Clock::now(); } virtual ~repl_req_ctx(); - void init(repl_key rkey, journal_type_t op_code, bool is_proposer, sisl::blob const& user_header, - sisl::blob const& key, uint32_t data_size); + ReplServiceError init(repl_key rkey, journal_type_t op_code, bool is_proposer, sisl::blob const& user_header, + sisl::blob const& key, uint32_t data_size, cshared< ReplDevListener >& listener); /////////////////////// All getters /////////////////////// repl_key const& rkey() const { return m_rkey; } uint64_t dsn() const { return m_rkey.dsn; } uint64_t term() const { return m_rkey.term; } + trace_id_t traceID() const { return m_rkey.traceID; } int64_t lsn() const { return m_lsn; } bool is_proposer() const { return m_is_proposer; } journal_type_t op_code() const { return m_op_code; } + bool is_volatile() const { return m_is_volatile.load(); } sisl::blob const& header() const { return m_header; } sisl::blob const& key() const { return m_key; } - MultiBlkId const& local_blkid() const { return m_local_blkid; } + MultiBlkId const& local_blkid() const { + // Currently used by raft repl dev only where a single blob is expected. + // Code checks if its a valid blkid so return a dummy blkid. 
+ if (!m_local_blkids.empty()) + return m_local_blkids[0]; + else + return dummy_blkid; + } + + std::vector< MultiBlkId >& local_blkids() { return m_local_blkids; } RemoteBlkId const& remote_blkid() const { return m_remote_blkid; } - const char* data() const { return r_cast< const char* >(m_data); } + const char* data() const { + DEBUG_ASSERT(m_data != nullptr, + "m_data is nullptr, use before save_pushed/fetched_data or after release_data()"); + return r_cast< const char* >(m_data); + } repl_req_state_t state() const { return repl_req_state_t(m_state.load()); } bool has_state(repl_req_state_t s) const { return m_state.load() & uint32_cast(s); } repl_journal_entry const* journal_entry() const { return m_journal_entry; } uint32_t journal_entry_size() const; + uint32_t blkids_serialized_size() const; bool is_localize_pending() const { return m_is_jentry_localize_pending; } - bool is_data_inlined() const { return (m_op_code == journal_type_t::HS_DATA_INLINED); } bool has_linked_data() const { return (m_op_code == journal_type_t::HS_DATA_LINKED); } raft_buf_ptr_t& raft_journal_buf(); @@ -156,6 +160,7 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: /////////////////////// Non modifiers methods ////////////////// std::string to_string() const; std::string to_compact_string() const; + std::string blkids_to_string() const; Clock::time_point created_time() const { return m_start_time; } void set_created_time() { m_start_time = Clock::now(); } bool is_expired() const; @@ -202,12 +207,14 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: bool save_fetched_data(sisl::GenericClientResponse const& fetched_data, uint8_t const* data, uint32_t data_size); void set_remote_blkid(RemoteBlkId const& rbid) { m_remote_blkid = rbid; } - void set_local_blkid(MultiBlkId const& lbid) { m_local_blkid = lbid; } // Only used during recovery + void set_local_blkids(std::vector< MultiBlkId > const& lbids) { m_local_blkids = std::move(lbids); } + void set_is_volatile(bool is_volatile) { m_is_volatile.store(is_volatile); } void set_lsn(int64_t lsn); void add_state(repl_req_state_t s); bool add_state_if_not_already(repl_req_state_t s); void set_lentry(nuraft::ptr< nuraft::log_entry > const& lentry) { m_lentry = lentry; } void clear(); + void release_data(); flatbuffers::FlatBufferBuilder& create_fb_builder() { return m_fb_builder; } void release_fb_builder() { m_fb_builder.Release(); } @@ -228,11 +235,13 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: bool m_is_proposer{false}; // Is the repl_req proposed by this node Clock::time_point m_start_time; // Start time of the request journal_type_t m_op_code{journal_type_t::HS_DATA_INLINED}; // Operation code for this request + std::atomic< bool > m_is_volatile{true}; // Is the log still in memory and not flushed to disk yet /////////////// Data related section ///////////////// - MultiBlkId m_local_blkid; // Local BlkId for the data - RemoteBlkId m_remote_blkid; // Corresponding remote blkid for the data - uint8_t const* m_data; // Raw data pointer containing the actual data + static inline MultiBlkId dummy_blkid; + std::vector< MultiBlkId > m_local_blkids; // Local BlkId for the data + RemoteBlkId m_remote_blkid; // Corresponding remote blkid for the data + uint8_t const* m_data; // Raw data pointer containing the actual data /////////////// Journal/Buf related section ///////////////// std::variant< std::unique_ptr< uint8_t[] >, raft_buf_ptr_t > m_journal_buf; // Buf for 
the journal entry @@ -268,11 +277,19 @@ class ReplDevListener { /// @param lsn - The log sequence number /// @param header - Header originally passed with replica_set::write() api /// @param key - Key originally passed with replica_set::write() api - /// @param blkids - List of blkids where data is written to the storage engine. + /// @param blkids - List of independent blkids where data is written to the storage engine. /// @param ctx - Context passed as part of the replica_set::write() api /// - virtual void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, MultiBlkId const& blkids, - cintrusive< repl_req_ctx >& ctx) = 0; + virtual void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, + std::vector< MultiBlkId > const& blkids, cintrusive< repl_req_ctx >& ctx) = 0; + + /// @brief Periodically called to notify the latest committed lsn to the listener. + /// NOTE: this callback blocks the thread that flushes the latest committed lsn into the repl_dev superblk as DC_LSN, + /// so please take care if there is any heavy or blocking operation in this callback. + /// + /// @param lsn - The latest committed log sequence number so far + /// + virtual void notify_committed_lsn(int64_t lsn) = 0; /// @brief Called when the log entry has been received by the replica dev. /// @@ -311,6 +328,10 @@ class ReplDevListener { virtual void on_rollback(int64_t lsn, const sisl::blob& header, const sisl::blob& key, cintrusive< repl_req_ctx >& ctx) = 0; + /// @brief Called when the config log entry has been rolled back. + /// @param lsn - The log sequence number getting rolled back + virtual void on_config_rollback(int64_t lsn) = 0; + /// @brief Called when the replDev is created after restart. The consumer is expected to recover all the modules /// necessary to replay/commit the logs. virtual void on_restart() = 0; @@ -339,12 +360,21 @@ class ReplDevListener { /// @return Expected to return blk_alloc_hints for this write. If the hints are not available, then return the /// error. It is to be noted this method should return error only in very abnornal cases as in some code flow, an /// error would result in a crash or stall of the entire commit thread. - virtual ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size) = 0; + virtual ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size, + cintrusive< homestore::repl_req_ctx >& hs_ctx) = 0; /// @brief Called when the repl_dev is being destroyed. The consumer is expected to clean up any related resources. /// However, it is expected that this call be idempotent. It is possible in rare scenarios that this can be called /// after restart in case crash happened during the destroy. - virtual void on_destroy() = 0; + virtual void on_destroy(const group_id_t& group_id) = 0; + + /// @brief Called when a replace member operation starts. + virtual void on_start_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, + trace_id_t tid) = 0; + + /// @brief Called when a replace member operation completes. + virtual void on_complete_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, + trace_id_t tid) = 0; /// @brief Called when the snapshot is being created by nuraft virtual AsyncReplResult<> create_snapshot(shared< snapshot_context > context) = 0; @@ -359,18 +389,37 @@ class ReplDevListener { /// uses offset given by the follower to the know the current state of the follower. /// Leader sends the snapshot data to the follower in batch. This callback is called multiple /// times on the leader till all the data is transferred to the follower. is_last_obj in - /// snapshot_data will be true once all the data has been trasnferred. After this the raft on + /// snapshot_obj will be true once all the data has been transferred. After this the raft on /// the follower side can do the incremental resync. - virtual int read_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data) = 0; + virtual int read_snapshot_obj(shared< snapshot_context > context, shared< snapshot_obj > snp_obj) = 0; /// @brief Called on the follower when the leader sends the data during the baseline resyc. - /// is_last_obj in in snapshot_data will be true once all the data has been transfered. + /// is_last_obj in snapshot_obj will be true once all the data has been transferred. /// After this the raft on the follower side can do the incremental resync. - virtual void write_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data) = 0; + virtual void write_snapshot_obj(shared< snapshot_context > context, shared< snapshot_obj > snp_obj) = 0; - /// @brief Free up user-defined context inside the snapshot_data that is allocated during read_snapshot_data. + /// @brief Free up user-defined context inside the snapshot_obj that is allocated during read_snapshot_obj. virtual void free_user_snp_ctx(void*& user_snp_ctx) = 0; + /// @brief Ask the upper layer to decide which data should be returned. + // @param header - header of the log entry. + // @param blkid - original blkid of the log entry + // @param sgs - sgs to be filled with data + // @param lsn - lsn of the log entry + virtual folly::Future< std::error_code > on_fetch_data(const int64_t lsn, const sisl::blob& header, + const MultiBlkId& blkid, sisl::sg_list& sgs) { + // default implementation is reading by blkid directly + return data_service().async_read(blkid, sgs, sgs.size); + } + + /// @brief Ask the upper layer to handle a no_space_left event + // @param lsn - on which repl_lsn no_space_left happened + // @param chunk_id - on which chunk no_space_left happened + virtual void on_no_space_left(repl_lsn_t lsn, chunk_num_t chunk_id) = 0; + + /// @brief On restart, after all the logs are replayed and before joining the raft group, notify the upper layer + virtual void on_log_replay_done(const group_id_t& group_id) {}; + private: std::weak_ptr< ReplDev > m_repl_dev; }; @@ -380,6 +429,39 @@ class ReplDev { ReplDev() = default; virtual ~ReplDev() { detach_listener(); } + /// @brief Allocates blkids from the storage engine to write the value into. Storage + /// engine returns a blkid_list in cases where single contiguous blocks are not + /// available. + /// + /// @param data_size - Size of the data. + /// @param hints - Specify block allocation hints. + /// @param out_blkids - List of blkid's which may not be contiguous. + virtual std::error_code alloc_blks(uint32_t data_size, const blk_alloc_hints& hints, + std::vector< MultiBlkId >& out_blkids) = 0; + + /// @brief Write data locally using the specified blkid's. Data is split across the blkids. + /// @param blkids - List of blkid's where data will be written. + /// @param value - vector of io buffers that contain value for the key. + /// @param part_of_batch - Is this write part of a batch.
If part of the batch, then submit_batch needs to be called + /// at the end + /// @return A Future with std::error_code to notify if it has successfully write the data or any error code in case + /// of failure + virtual folly::Future< std::error_code > async_write(const std::vector< MultiBlkId >& blkids, + sisl::sg_list const& value, bool part_of_batch = false, + trace_id_t tid = 0) = 0; + + /// @brief Creates a log/journal entry with and calls the on_commit listener callback. + /// @param blkids - List of blkid's where data was written. + /// @param header - Blob representing the header (it is opaque and will be copied + /// as-is to the journal entry) + /// @param key - Blob representing the key (it is opaque and will be copied as-is to + /// the journal entry). + /// @param data_size - Size of the data. + /// @param ctx - User supplied context which will be passed to listener callbacks + virtual void async_write_journal(const std::vector< MultiBlkId >& blkids, sisl::blob const& header, + sisl::blob const& key, uint32_t data_size, repl_req_ptr_t ctx, + trace_id_t tid = 0) = 0; + /// @brief Replicate the data to the replica set. This method goes through the /// following steps: /// Step 1: Allocates blkid from the storage engine to write the value into. Storage @@ -397,10 +479,11 @@ class ReplDev { /// cases /// @param value - vector of io buffers that contain value for the key. It is an optional field and if the value /// list size is 0, then only key is written to replicadev without data. - /// @param ctx - User supplied context which will be passed to listener - /// callbacks + /// @param ctx - User supplied context which will be passed to listener callbacks + /// @param part_of_batch Is write is part of a batch. If part of the batch, then submit_batch needs to be called at + /// the end virtual void async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, - repl_req_ptr_t ctx) = 0; + repl_req_ptr_t ctx, bool part_of_batch = false, trace_id_t tid = 0) = 0; /// @brief Reads the data and returns a future to continue on /// @param bid Block id to read @@ -411,13 +494,14 @@ class ReplDev { /// @return A Future with std::error_code to notify if it has successfully read the data or any error code in case /// of failure virtual folly::Future< std::error_code > async_read(MultiBlkId const& blkid, sisl::sg_list& sgs, uint32_t size, - bool part_of_batch = false) = 0; + bool part_of_batch = false, trace_id_t tid = 0) = 0; /// @brief After data is replicated and on_commit to the listener is called. the blkids can be freed. /// /// @param lsn - LSN of the old blkids that is being freed /// @param blkids - blkids to be freed. - virtual void async_free_blks(int64_t lsn, MultiBlkId const& blkid) = 0; + virtual folly::Future< std::error_code > async_free_blks(int64_t lsn, MultiBlkId const& blkid, + trace_id_t tid = 0) = 0; /// @brief Try to switch the current replica where this method called to become a leader. /// @return True if it is successful, false otherwise. @@ -438,10 +522,30 @@ class ReplDev { /// @return group_id virtual group_id_t group_id() const = 0; + /// @brief Sets a custom name for the repldev. Users can assign a meaningful name to the repldev for easy debugging. 
+ virtual void set_custom_rdev_name(std::string const& name) = 0; + /// @brief Gets the block size with which IO will happen on this device /// @return Block size virtual uint32_t get_blk_size() const = 0; + /// @brief Gets the last commit lsn of this repldev + /// @return last_commit_lsn + virtual repl_lsn_t get_last_commit_lsn() const = 0; + + /// @brief Gets the repl lsn of the last log in log store + /// @return last_append_repl_lsn + virtual repl_lsn_t get_last_append_lsn() = 0; + + /// @brief if this replica is ready for accepting client IO. + /// @return true if ready, false otherwise + virtual bool is_ready_for_traffic() const = 0; + + /// @brief Clean up resources on this repl dev. + virtual void purge() = 0; + + virtual std::shared_ptr< snapshot_context > deserialize_snapshot_context(sisl::io_blob_safe& snp_ctx) = 0; + virtual void attach_listener(shared< ReplDevListener > listener) { m_listener = std::move(listener); } virtual void detach_listener() { @@ -451,6 +555,30 @@ class ReplDev { } } + virtual shared< ReplDevListener > get_listener() { return m_listener; } + + // we have no shutdown for repl_dev, since shutdown repl_dev is done by repl_service + void stop() { +#if 0 + start_stopping(); + while (true) { + auto pending_request_num = get_pending_request_num(); + if (!pending_request_num) break; + + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } +#endif + } + + // complete all the requests that are in progress and start refusing new reqs + virtual void quiesce_reqs() = 0; + + // start accepting new reqs + virtual void resume_accepting_reqs() = 0; + + // clear reqs that has allocated blks on the given chunk. + virtual void clear_chunk_req(chunk_num_t chunk_id) = 0; + protected: shared< ReplDevListener > m_listener; }; diff --git a/src/include/homestore/replication_service.hpp b/src/include/homestore/replication_service.hpp index 8f535b855..f28704546 100644 --- a/src/include/homestore/replication_service.hpp +++ b/src/include/homestore/replication_service.hpp @@ -41,9 +41,18 @@ class ReplicationService { /// @return A Future which gets called after schedule to release (before garbage collection is kicked in) virtual folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) = 0; - virtual AsyncReplResult<> replace_member(group_id_t group_id, replica_id_t member_out, - replica_id_t member_in) const = 0; - + /// @brief Replace one of the members with a new one. + /// @param group_id Group where the replace member happens + /// @param member_out The member which is going to be replaced + /// @param member_in The member which is going to be added in place of member_out + /// @param commit_quorum Commit quorum to be used for this operation. If 0, it will use the default commit quorum. 
+ /// @return A Future on replace the member accepted or Future ReplServiceError upon error + virtual AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum = 0, + uint64_t trace_id = 0) const = 0; + + virtual AsyncReplResult<> flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, uint32_t commit_quorum, + bool wait_and_verify = true, uint64_t trace_id = 0) const = 0; /// @brief Get the repl dev for a given group id if it is already created or opened /// @param group_id Group id interested in /// @return ReplDev is opened or ReplServiceError::SERVER_NOT_FOUND if it doesn't exist @@ -74,6 +83,14 @@ class ReplApplication { // Listener corresponding to the ReplDev which will be used to perform the precommit/commit/rollback. virtual shared< ReplDevListener > create_repl_dev_listener(group_id_t group_id) = 0; + // Called when the repl dev is destroyed. This interface provides the application a chance to cleanup any resources + // assocated with this listener; + virtual void destroy_repl_dev_listener(group_id_t group_id) = 0; + + // Called after all the repl devs are found upon restart of the homestore instance. + // it is a nice place for upper layer to recovery anything depends on repl_devs + virtual void on_repl_devs_init_completed() = 0; + // Given the uuid of the peer, get their address and port virtual std::pair< std::string, uint16_t > lookup_peer(replica_id_t uuid) const = 0; diff --git a/src/include/homestore/vchunk.h b/src/include/homestore/vchunk.h index b52832faa..4b69b1332 100644 --- a/src/include/homestore/vchunk.h +++ b/src/include/homestore/vchunk.h @@ -35,6 +35,8 @@ class VChunk { uint32_t get_pdev_id() const; uint16_t get_chunk_id() const; cshared< Chunk > get_internal_chunk() const; + uint64_t size() const; + void reset(); private: shared< Chunk > m_internal_chunk; diff --git a/src/lib/blkalloc/append_blk_allocator.cpp b/src/lib/blkalloc/append_blk_allocator.cpp index 4a4c7fd18..2f6cec25c 100644 --- a/src/lib/blkalloc/append_blk_allocator.cpp +++ b/src/lib/blkalloc/append_blk_allocator.cpp @@ -67,14 +67,23 @@ BlkAllocStatus AppendBlkAllocator::alloc_contiguous(BlkId& bid) { return alloc(1 // If we want to change above design, we can open this api for vector allocation; // BlkAllocStatus AppendBlkAllocator::alloc(blk_count_t nblks, const blk_alloc_hints& hint, BlkId& out_bid) { - if (available_blks() < nblks) { + auto avail_blks = available_blks(); + if (hint.reserved_blks) { + avail_blks = avail_blks > hint.reserved_blks.value() ? avail_blks - hint.reserved_blks.value() : 0; + } + if (avail_blks < nblks) { // COUNTER_INCREMENT(m_metrics, num_alloc_failure, 1); - LOGERROR("No space left to serve request nblks: {}, available_blks: {}", nblks, available_blks()); + LOGERROR("No space left to serve request nblks: {}, available_blks: {}, actual available_blks(exclude reserved " + "blks): {}", + nblks, available_blks(), avail_blks); + // the caller can know in which chunk no_space_left happened; + out_bid = BlkId{0, 0, m_chunk_id}; return BlkAllocStatus::SPACE_FULL; } else if (nblks > max_blks_per_blkid()) { // consumer(vdev) already handles this case. 
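The SPACE_FULL path above now tells the caller which chunk ran out (out_bid carries m_chunk_id), which is what makes the per-chunk recovery hooks added to ReplDev (quiesce_reqs, clear_chunk_req, resume_accepting_reqs) and the on_no_space_left listener callback usable together. Below is a rough sketch of the sequence a consumer might run; the reclaim step is entirely hypothetical and the exact ordering is an assumption, not something this patch prescribes.

// Example arithmetic for the clamping above: available_blks() = 100, reserved_blks = 30 -> usable = 70,
// so a request for 80 blks returns SPACE_FULL while a request for 60 still succeeds.
void handle_no_space_left(std::shared_ptr< homestore::ReplDev > rd, homestore::chunk_num_t chunk_id) {
    rd->quiesce_reqs();              // finish in-flight requests and refuse new ones
    rd->clear_chunk_req(chunk_id);   // drop requests that already allocated blocks on the full chunk
    reclaim_chunk(chunk_id);         // hypothetical: move/expire data, then e.g. VChunk::reset() the chunk
    rd->resume_accepting_reqs();     // start taking traffic again
}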
// COUNTER_INCREMENT(m_metrics, num_alloc_failure, 1); LOGERROR("Can't serve request nblks: {} larger than max_blks_in_op: {}", nblks, max_blks_per_blkid()); + out_bid = BlkId{0, 0, m_chunk_id}; return BlkAllocStatus::FAILED; } @@ -127,33 +136,9 @@ void AppendBlkAllocator::cp_flush(CP* cp) { } } -// -// free operation does: -// 1. book keeping "total freeable" space -// 2. if the blk being freed happens to be last block, move last_append_offset backwards accordingly; -// +// free operation books keeping "total freeable" space void AppendBlkAllocator::free(const BlkId& bid) { - // If we are freeing the last block, just move the offset back - blk_num_t cur_last_offset = m_last_append_offset.load(); - auto const input_last_offset = bid.blk_num() + bid.blk_count(); - blk_num_t new_last_offset; - bool freeing_in_middle{false}; - do { - if (input_last_offset == cur_last_offset) { - new_last_offset = bid.blk_num(); - freeing_in_middle = false; - } else { - new_last_offset = cur_last_offset; - freeing_in_middle = true; - } - } while (!m_last_append_offset.compare_exchange_weak(cur_last_offset, new_last_offset)); - - if (freeing_in_middle) { - // Freeing something in the middle, increment the count - m_freeable_nblks.fetch_add(bid.blk_count()); - } else { - m_commit_offset.store(m_last_append_offset.load()); - } + m_freeable_nblks.fetch_add(bid.blk_count()); m_is_dirty.store(true); } @@ -162,6 +147,13 @@ bool AppendBlkAllocator::is_blk_alloced(const BlkId& in_bid, bool) const { return in_bid.blk_num() < get_used_blks(); } +void AppendBlkAllocator::reset() { + m_last_append_offset.store(0); + m_freeable_nblks.store(0); + m_commit_offset.store(0); + m_is_dirty.store(true); +} + bool AppendBlkAllocator::is_blk_alloced_on_disk(BlkId const& bid, bool) const { return bid.blk_num() < m_sb->commit_offset; } diff --git a/src/lib/blkalloc/append_blk_allocator.h b/src/lib/blkalloc/append_blk_allocator.h index 384a4936b..5e745c33a 100644 --- a/src/lib/blkalloc/append_blk_allocator.h +++ b/src/lib/blkalloc/append_blk_allocator.h @@ -38,21 +38,21 @@ struct append_blk_sb_t { }; #pragma pack() -//class AppendBlkAllocMetrics : public sisl::MetricsGroup { -//public: -// explicit AppendBlkAllocMetrics(const char* inst_name) : sisl::MetricsGroup("AppendBlkAlloc", inst_name) { -// REGISTER_COUNTER(num_alloc, "Number of blks alloc attempts"); -// REGISTER_COUNTER(num_alloc_failure, "Number of blk alloc failures"); +// class AppendBlkAllocMetrics : public sisl::MetricsGroup { +// public: +// explicit AppendBlkAllocMetrics(const char* inst_name) : sisl::MetricsGroup("AppendBlkAlloc", inst_name) { +// REGISTER_COUNTER(num_alloc, "Number of blks alloc attempts"); +// REGISTER_COUNTER(num_alloc_failure, "Number of blk alloc failures"); // -// register_me_to_farm(); -// } +// register_me_to_farm(); +// } // -// AppendBlkAllocMetrics(const AppendBlkAllocMetrics&) = delete; -// AppendBlkAllocMetrics(AppendBlkAllocMetrics&&) noexcept = delete; -// AppendBlkAllocMetrics& operator=(const AppendBlkAllocMetrics&) = delete; -// AppendBlkAllocMetrics& operator=(AppendBlkAllocMetrics&&) noexcept = delete; -// ~AppendBlkAllocMetrics() { deregister_me_from_farm(); } -//}; +// AppendBlkAllocMetrics(const AppendBlkAllocMetrics&) = delete; +// AppendBlkAllocMetrics(AppendBlkAllocMetrics&&) noexcept = delete; +// AppendBlkAllocMetrics& operator=(const AppendBlkAllocMetrics&) = delete; +// AppendBlkAllocMetrics& operator=(AppendBlkAllocMetrics&&) noexcept = delete; +// ~AppendBlkAllocMetrics() { deregister_me_from_farm(); } +// }; // // The 
assumption for AppendBlkAllocator: @@ -108,6 +108,11 @@ class AppendBlkAllocator : public BlkAllocator { std::string to_string() const override; + /** + * @brief : reset the allocator to initial state, so all the blks in this chunk are free. + */ + void reset() override; + void cp_flush(CP* cp) override; void recovery_completed() override {} nlohmann::json get_status(int log_level) const override; @@ -121,7 +126,7 @@ class AppendBlkAllocator : public BlkAllocator { std::atomic< blk_num_t > m_freeable_nblks{0}; // count of blks fragmentedly freed (both on-disk and in-memory) std::atomic< blk_num_t > m_commit_offset{0}; // offset in on-disk version std::atomic< bool > m_is_dirty{false}; - //AppendBlkAllocMetrics m_metrics; + // AppendBlkAllocMetrics m_metrics; superblk< append_blk_sb_t > m_sb; // only cp will be writing to this disk }; diff --git a/src/lib/blkalloc/bitmap_blk_allocator.h b/src/lib/blkalloc/bitmap_blk_allocator.h index 381767bef..a86e08757 100644 --- a/src/lib/blkalloc/bitmap_blk_allocator.h +++ b/src/lib/blkalloc/bitmap_blk_allocator.h @@ -77,6 +77,7 @@ class BitmapBlkAllocator : public BlkAllocator { void cp_flush(CP* cp) override; void recovery_completed() override {} + void reset() override {} blk_num_t get_num_portions() const { return (m_num_blks - 1) / m_blks_per_portion + 1; } blk_num_t get_blks_per_portion() const { return m_blks_per_portion; } diff --git a/src/lib/blkalloc/blk_allocator.h b/src/lib/blkalloc/blk_allocator.h index b381f71c5..8c64fc8e5 100644 --- a/src/lib/blkalloc/blk_allocator.h +++ b/src/lib/blkalloc/blk_allocator.h @@ -158,6 +158,7 @@ class BlkAllocator { virtual bool is_blk_alloced(BlkId const& b, bool use_lock = false) const = 0; virtual bool is_blk_alloced_on_disk(BlkId const& b, bool use_lock = false) const = 0; virtual void recovery_completed() = 0; + virtual void reset() = 0; virtual std::string to_string() const = 0; virtual void cp_flush(CP* cp) = 0; diff --git a/src/lib/blkalloc/fixed_blk_allocator.h b/src/lib/blkalloc/fixed_blk_allocator.h index fa28681f2..01f1e1138 100644 --- a/src/lib/blkalloc/fixed_blk_allocator.h +++ b/src/lib/blkalloc/fixed_blk_allocator.h @@ -41,6 +41,7 @@ class FixedBlkAllocator : public BitmapBlkAllocator { blk_num_t available_blks() const override; blk_num_t get_used_blks() const override; blk_num_t get_defrag_nblks() const override; + void reset() override{}; bool is_blk_alloced(BlkId const& in_bid, bool use_lock = false) const override; std::string to_string() const override; diff --git a/src/lib/blkalloc/varsize_blk_allocator.h b/src/lib/blkalloc/varsize_blk_allocator.h index 1a90de8da..03a507b03 100644 --- a/src/lib/blkalloc/varsize_blk_allocator.h +++ b/src/lib/blkalloc/varsize_blk_allocator.h @@ -222,6 +222,7 @@ class VarsizeBlkAllocator : public BitmapBlkAllocator { blk_num_t get_used_blks() const override; bool is_blk_alloced(BlkId const& in_bid, bool use_lock = false) const override; std::string to_string() const override; + void reset() override{}; nlohmann::json get_metrics_in_json(); private: diff --git a/src/lib/blkdata_svc/blkdata_service.cpp b/src/lib/blkdata_svc/blkdata_service.cpp index 4acd3d846..b17fc0a61 100644 --- a/src/lib/blkdata_svc/blkdata_service.cpp +++ b/src/lib/blkdata_svc/blkdata_service.cpp @@ -34,11 +34,12 @@ BlkDataService::BlkDataService(shared< ChunkSelector > chunk_selector) : m_custom_chunk_selector{std::move(chunk_selector)} { m_blk_read_tracker = std::make_unique< BlkReadTracker >(); } + BlkDataService::~BlkDataService() = default; // first-time boot path void 
BlkDataService::create_vdev(uint64_t size, HSDevType devType, uint32_t blk_size, blk_allocator_type_t alloc_type, - chunk_selector_type_t chunk_sel_type, uint32_t num_chunks) { + chunk_selector_type_t chunk_sel_type, uint32_t num_chunks, uint32_t chunk_size) { hs_vdev_context vdev_ctx; vdev_ctx.type = hs_vdev_type_t::DATA_VDEV; @@ -48,6 +49,7 @@ void BlkDataService::create_vdev(uint64_t size, HSDevType devType, uint32_t blk_ .vdev_size = size, .num_chunks = num_chunks, .blk_size = blk_size, + .chunk_size = chunk_size, .dev_type = devType, .alloc_type = alloc_type, .chunk_sel_type = chunk_sel_type, @@ -188,8 +190,28 @@ folly::Future< std::error_code > BlkDataService::async_write(sisl::sg_list const } } +folly::Future< std::error_code > +BlkDataService::async_write(sisl::sg_list const& sgs, std::vector< MultiBlkId > const& blkids, bool part_of_batch) { + static thread_local std::vector< folly::Future< std::error_code > > s_futs; + s_futs.clear(); + for (const auto& blkid : blkids) { + s_futs.emplace_back(async_write(sgs, blkid, part_of_batch)); + } + return collect_all_futures(s_futs); +} + +void BlkDataService::submit_io_batch() { m_vdev->submit_batch(); } + BlkAllocStatus BlkDataService::alloc_blks(uint32_t size, const blk_alloc_hints& hints, MultiBlkId& out_blkids) { - HS_DBG_ASSERT_EQ(size % m_blk_size, 0, "Non aligned size requested"); + HS_DBG_ASSERT_EQ(size % m_blk_size, 0, "Non aligned size requested size={} blk_size={}", size, m_blk_size); + blk_count_t nblks = static_cast< blk_count_t >(size / m_blk_size); + + return m_vdev->alloc_blks(nblks, hints, out_blkids); +} + +BlkAllocStatus BlkDataService::alloc_blks(uint32_t size, const blk_alloc_hints& hints, + std::vector< BlkId >& out_blkids) { + HS_DBG_ASSERT_EQ(size % m_blk_size, 0, "Non aligned size requested size={} blk_size={}", size, m_blk_size); blk_count_t nblks = static_cast< blk_count_t >(size / m_blk_size); return m_vdev->alloc_blks(nblks, hints, out_blkids); @@ -234,10 +256,14 @@ void BlkDataService::start() { std::move(std::make_unique< DataSvcCPCallbacks >(m_vdev))); } +void BlkDataService::stop() {} + uint64_t BlkDataService::get_total_capacity() const { return m_vdev->size(); } uint64_t BlkDataService::get_used_capacity() const { return m_vdev->used_size(); } +HSDevType BlkDataService::get_dev_type() const { return static_cast< HSDevType >(m_vdev->get_dev_type()); } + uint32_t BlkDataService::get_align_size() const { return m_vdev->align_size(); } } // namespace homestore diff --git a/src/lib/checkpoint/cp_mgr.cpp b/src/lib/checkpoint/cp_mgr.cpp index 33d22090a..a387b5da5 100644 --- a/src/lib/checkpoint/cp_mgr.cpp +++ b/src/lib/checkpoint/cp_mgr.cpp @@ -235,7 +235,8 @@ void CPManager::cp_start_flush(CP* cp) { for (size_t svcid = 0; svcid < (size_t)cp_consumer_t::SENTINEL; svcid++) { if (svcid == (size_t)cp_consumer_t::REPLICATION_SVC) { continue; } auto& consumer = m_cp_cb_table[svcid]; - if (consumer) { futs.emplace_back(std::move(consumer->cp_flush(cp))); } + bool participated = (cp->m_contexts[svcid] != nullptr); + if (consumer && participated) { futs.emplace_back(std::move(consumer->cp_flush(cp))); } } folly::collectAllUnsafe(futs).thenValue([this, cp](auto) { @@ -314,8 +315,12 @@ void CPManager::start_cp_thread() { }; auto ctx = std::make_shared< Context >(); - // Start a reactor with 9 fibers (8 for sync io) - iomanager.create_reactor("cp_io", iomgr::INTERRUPT_LOOP, 8u, [this, ctx](bool is_started) { + // Start a reactor with 2 fibers (1 for sync io) + // Prevent deadlock with sync_io fibers. 
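The fiber-count reduction above (its rationale continues in the comment just below) is easier to see with a concrete failure shape. The following is only an illustration of the hazard, not code from the patch:

#include <mutex>

std::mutex mtx; // a thread-level mutex shared by two fibers on the same reactor thread
// fiber A: takes mtx, issues a synchronous io_uring IO and is suspended while still holding mtx.
void fiber_a() { std::lock_guard lg(mtx); /* sync IO here; the fiber yields, the lock stays held */ }
// fiber B: gets scheduled on the same thread, tries to take mtx and blocks the whole thread,
// so fiber A can never resume to unlock it -- the reactor is wedged. With a single sync-io
// fiber per cp_io reactor there is no second contender.
void fiber_b() { std::lock_guard lg(mtx); }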
+ // Multiple sync_io fibers may acquire a thread-level mutex and perform synchronous I/O using io_uring. + // This can block the fiber and allow other fibers to be scheduled. + // If another fiber tries to acquire the same mutex, a deadlock can occur. + iomanager.create_reactor("cp_io", iomgr::INTERRUPT_LOOP, 2u, [this, ctx](bool is_started) { if (is_started) { { std::unique_lock< std::mutex > lk{ctx->mtx}; diff --git a/src/lib/common/crash_simulator.hpp b/src/lib/common/crash_simulator.hpp index 788de1eac..e8826b61d 100644 --- a/src/lib/common/crash_simulator.hpp +++ b/src/lib/common/crash_simulator.hpp @@ -42,8 +42,12 @@ class CrashSimulator { } } + bool will_crash() const { return m_will_crash.load(); } + void set_will_crash(bool crash) { m_will_crash.store(crash); } + private: std::function< void(void) > m_restart_cb{nullptr}; + std::atomic m_will_crash{false}; sisl::urcu_scoped_ptr< bool > m_crashed; }; } // namespace homestore diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs index cd8858863..4a7f9bd8b 100644 --- a/src/lib/common/homestore_config.fbs +++ b/src/lib/common/homestore_config.fbs @@ -143,7 +143,11 @@ table Generic { cache_max_throttle_cnt : uint32 = 4; // writeback cache max q depth - cache_min_throttle_cnt : uint32 = 4; // writeback cache min q deoth + cache_min_throttle_cnt : uint32 = 4; // writeback cache min q depth + + cache_hashmap_nbuckets : uint32 = 1000000; // num buckets for sisl::SimpleHashmap used in wbcache + + cache_evictor_npartitions: uint32 = 1000; // num partitions for lru evictor in the cache // if this value is set to 0, no sanity check will be run; sanity_check_level: uint32 = 1 (hotswap); @@ -255,6 +259,10 @@ table Consensus { // Max append batch size max_append_batch_size: int32 = 64; + // Max grpc message size, use 64M (max data size on data channel) + 128M (max snasphot batch size) + 1M + // Please adjust it if data_fetch_max_size_kb is increased as well + max_grpc_message_size: int32 = 202375168; + // Threshold of log gap from leader to consider a replica as stale stale_log_gap_hi_threshold: int32 = 200; @@ -262,7 +270,8 @@ table Consensus { stale_log_gap_lo_threshold: int32 = 30; // Minimum log gap a replica has to be from leader before joining the replica set. - min_log_gap_to_join: int32 = 30; + // 0 indicates the new member will join in cluster immediately. + min_log_gap_to_join: int32 = 0; // amount of time in millis to wait on data write before fetch data from remote; wait_data_write_timer_ms: uint64 = 1500 (hotswap); @@ -279,11 +288,38 @@ table Consensus { // ReplDev Reqs timeout in seconds. repl_req_timeout_sec: uint32 = 300; + // Timeout for snapshot sync context in ms. If the follower doesn't response + // within this timeout during snapshot resync, the leader will release snapshot sync context. + snapshot_sync_ctx_timeout_ms: int32 = 60000; + // Frequency to flush durable commit LSN in millis flush_durable_commit_interval_ms: uint64 = 500; // Log difference to determine if the follower is in resync mode resync_log_idx_threshold: int64 = 100; + + // Log difference from leader's point of view, to determine if the + // follower is laggy and if so, leader will stop pushing data until it drops under this threshold. 
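For the max_grpc_message_size default above, the magic number is just the sum of the budgets named in the comment; a small compile-time check (not part of the patch) makes the relationship explicit:

// 64 MiB data channel + 128 MiB snapshot batch + 1 MiB headroom = 202375168 bytes
constexpr int64_t kMaxDataSize = 64ll * 1024 * 1024;       // 67108864
constexpr int64_t kMaxSnapshotBatch = 128ll * 1024 * 1024; // 134217728
constexpr int64_t kHeadroom = 1ll * 1024 * 1024;           // 1048576
static_assert(kMaxDataSize + kMaxSnapshotBatch + kHeadroom == 202375168, "default must cover both channels");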
+ laggy_threshold: int64 = 2000; + + // Reading snapshot objects will be done by a background thread asynchronously + // instead of synchronous read by Raft worker threads + use_bg_thread_for_snapshot_io: bool = true; + + // Maximum number of election timeout rounds to wait during a prioritized leader election process. + // Every election timeout will compare its priority with the target_priority(max priority of the peers initially) + // then decay the target_priority and wait again until its priority >= target_priority. This setting helps us to set proper priority for peers. + // 0 means all members have the same priority. + max_wait_rounds_of_priority_election: uint32 = 2; + + // Maximum number of retries when raft is undergoing config changing + config_changing_error_retries: int32 = 3; + + // The time to wait for config change to be applied in ms + wait_for_config_change_ms: uint32 = 500; + + // The interval in ms to check if the new member in replace_member is fully synced and ready to take over + replace_member_sync_check_interval_ms: uint64 = 60000; } table HomeStoreSettings { diff --git a/src/lib/common/homestore_utils.hpp b/src/lib/common/homestore_utils.hpp index 2ee51b03d..b6989ff48 100644 --- a/src/lib/common/homestore_utils.hpp +++ b/src/lib/common/homestore_utils.hpp @@ -53,4 +53,8 @@ class hs_utils { static bool topological_sort(std::unordered_map< std::string, std::vector< std::string > >& DAG, std::vector< std::string >& ordered_entries); }; + +static bool wait_and_check(const std::function< bool() >& check_func, uint32_t timeout_ms, + uint32_t interval_ms = 100); + } // namespace homestore diff --git a/src/lib/common/resource_mgr.cpp b/src/lib/common/resource_mgr.cpp index 8719089b9..8440d6f68 100644 --- a/src/lib/common/resource_mgr.cpp +++ b/src/lib/common/resource_mgr.cpp @@ -15,13 +15,15 @@ *********************************************************************************/ #include #include -#include #include #include #include "resource_mgr.hpp" #include "homestore_assert.hpp" -#include "replication/repl_dev/raft_repl_dev.h" +#ifdef REPLICATION_SUPPORT +#include +#include "replication/repl_dev/raft_repl_dev.h" +#endif namespace homestore { ResourceMgr& resource_mgr() { return hs()->resource_mgr(); } @@ -48,14 +50,16 @@ void ResourceMgr::stop() { // void ResourceMgr::trigger_truncate() { if (hs()->has_repl_data_service()) { - // first make sure all repl dev's underlying raft log store make corresponding reservation during - // truncate -- set the safe truncate boundary for each raft log store; - hs()->repl_service().iterate_repl_devs([](cshared< ReplDev >& rd) { - // lock is already taken by repl service layer; - std::dynamic_pointer_cast< RaftReplDev >(rd)->truncate( - HS_DYNAMIC_CONFIG(resource_limits.raft_logstore_reserve_threshold)); - }); - + /* + * DO NOT NEED : raft will truncate logs. 
+ * // first make sure all repl dev's underlying raft log store make corresponding reservation during + * // truncate -- set the safe truncate boundary for each raft log store; + * hs()->repl_service().iterate_repl_devs([](cshared< ReplDev >& rd) { + * // lock is already taken by repl service layer; + * std::dynamic_pointer_cast< RaftReplDev >(rd)->truncate( + * HS_DYNAMIC_CONFIG(resource_limits.raft_logstore_reserve_threshold)); + * }); + */ // next do device truncate which go through all logdevs and truncate them; hs()->logstore_service().device_truncate(); } diff --git a/src/lib/device/README.md b/src/lib/device/README.md new file mode 100644 index 000000000..99f83ecc0 --- /dev/null +++ b/src/lib/device/README.md @@ -0,0 +1,7 @@ +# Device Manager + +## HomeStore 4.x Disk Layout +1. max_num_chunks is decided by device size and min_chunk_size which is configurable by HomeStore consumer +2. Super SuperBlk (SSB) is the first meta blk to load Meta Service. All other System Meta Blks are chained together by loading the SSB + +![HomeStore_Disk_Layout](../../../docs/imgs/HomeStore_Disk_Layout2.png) diff --git a/src/lib/device/chunk.cpp b/src/lib/device/chunk.cpp index 9eb8563de..4962be386 100644 --- a/src/lib/device/chunk.cpp +++ b/src/lib/device/chunk.cpp @@ -29,6 +29,10 @@ std::string Chunk::to_string() const { vdev_ordinal(), stream_id()); } +float Chunk::get_blk_usage() const { + return s_cast(m_blk_allocator->get_used_blks()) / s_cast(m_blk_allocator->get_total_blks()); +} + void Chunk::set_user_private(const sisl::blob& data) { std::unique_lock lg{m_mgmt_mutex}; m_chunk_info.set_user_private(data); diff --git a/src/lib/device/chunk.h b/src/lib/device/chunk.h index 77b275e4b..b9d84abdb 100644 --- a/src/lib/device/chunk.h +++ b/src/lib/device/chunk.h @@ -27,6 +27,7 @@ class Chunk { const uint32_t m_stream_id; uint32_t m_vdev_ordinal{0}; shared< BlkAllocator > m_blk_allocator; + float blk_usage_report_threshold{0.9}; public: static constexpr auto MAX_CHUNK_SIZE = std::numeric_limits< uint32_t >::max(); @@ -66,6 +67,8 @@ class Chunk { nlohmann::json get_status([[maybe_unused]] int log_level) const; const BlkAllocator* blk_allocator() const { return m_blk_allocator.get(); } BlkAllocator* blk_allocator_mutable() { return m_blk_allocator.get(); } + float get_blk_usage_report_threshold() const { return blk_usage_report_threshold; } + float get_blk_usage() const; ////////////// Setters ///////////////////// void set_user_private(const sisl::blob& data); diff --git a/src/lib/device/device.h b/src/lib/device/device.h index beefdfc7f..1c3843534 100644 --- a/src/lib/device/device.h +++ b/src/lib/device/device.h @@ -36,6 +36,7 @@ VENUM(vdev_multi_pdev_opts_t, uint8_t, // Indicates the style of vdev when multi struct vdev_info { static constexpr size_t size = 512; static constexpr size_t user_private_size = 256; + static constexpr size_t max_name_len = 64; uint64_t vdev_size{0}; // 0: Size of the vdev uint32_t vdev_id{0}; // 8: Id for this vdev. 
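Going back to the chunk.h/chunk.cpp hunks above: get_blk_usage together with the 0.9 blk_usage_report_threshold looks intended for a periodic usage check. A hypothetical consumer-side sketch, assuming the usual const getters; the reporting callback is invented for illustration:

// Illustration only: report_high_usage is a hypothetical hook, not a homestore API.
void maybe_report_usage(const homestore::Chunk& chunk) {
    float const usage = chunk.get_blk_usage(); // used_blks / total_blks
    if (usage >= chunk.get_blk_usage_report_threshold()) { // 0.9 by default per the hunk above
        report_high_usage(chunk, usage);
    }
}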
It is unique per homestore instance @@ -48,7 +49,7 @@ struct vdev_info { uint8_t failed{0}; // 30: set to true if disk is replaced uint8_t hs_dev_type{0}; // 31: PDev dev type (as in fast or data) uint8_t multi_pdev_choice{0}; // 32: Choice when multiple pdevs are present (vdev_multi_pdev_opts_t) - char name[64]; // 33: Name of the vdev + char name[max_name_len]; // 33: Name of the vdev uint16_t checksum{0}; // 97: Checksum of this entire Block uint8_t alloc_type; // 98: Allocator type of this vdev uint8_t chunk_sel_type; // 99: Chunk Selector type of this vdev_id @@ -59,7 +60,10 @@ struct vdev_info { uint32_t get_vdev_id() const { return vdev_id; } uint64_t get_size() const { return vdev_size; } - void set_name(const std::string& n) { std::strncpy(charptr_cast(name), n.c_str(), 63); } + void set_name(const std::string& n) { + std::strncpy(charptr_cast(name), n.c_str(), max_name_len - 1); + name[max_name_len - 1] = '\0'; + } std::string get_name() const { return std::string{c_charptr_cast(name)}; } void set_allocated() { slot_allocated = s_cast< uint8_t >(0x01); }; diff --git a/src/lib/device/device_manager.cpp b/src/lib/device/device_manager.cpp index cac91237f..28eb37e33 100644 --- a/src/lib/device/device_manager.cpp +++ b/src/lib/device/device_manager.cpp @@ -99,7 +99,8 @@ void DeviceManager::format_devices() { ++m_first_blk_hdr.gen_number; m_first_blk_hdr.version = first_block_header::CURRENT_SUPERBLOCK_VERSION; std::strncpy(m_first_blk_hdr.product_name, first_block_header::PRODUCT_NAME, - first_block_header::s_product_name_size); + first_block_header::s_product_name_size - 1); + m_first_blk_hdr.product_name[first_block_header::s_product_name_size - 1] = '\0'; m_first_blk_hdr.num_pdevs = uint32_cast(m_dev_infos.size()); m_first_blk_hdr.max_vdevs = hs_super_blk::MAX_VDEVS_IN_SYSTEM; m_first_blk_hdr.max_system_chunks = hs_super_blk::MAX_CHUNKS_IN_SYSTEM; diff --git a/src/lib/device/hs_super_blk.h b/src/lib/device/hs_super_blk.h index a539c1e56..9d0a3140d 100644 --- a/src/lib/device/hs_super_blk.h +++ b/src/lib/device/hs_super_blk.h @@ -75,7 +75,7 @@ struct disk_attr { }; struct first_block_header { - static constexpr const char* PRODUCT_NAME{"OmStore"}; + static constexpr const char* PRODUCT_NAME{"HomeStore4x"}; static constexpr size_t s_product_name_size{64}; static constexpr uint32_t CURRENT_SUPERBLOCK_VERSION{4}; @@ -128,7 +128,7 @@ struct first_block { static constexpr uint32_t s_atomic_fb_size{512}; // increase 512 to actual size if in the future first_block // can be larger; static constexpr uint32_t s_io_fb_size{4096}; // This is the size we do IO on, with padding - static constexpr uint32_t HOMESTORE_MAGIC{0xCEEDDEEB}; // Magic written as first bytes on each device + static constexpr uint32_t HOMESTORE_MAGIC{0xABBECDCD}; // Magic written as first bytes on each device public: uint64_t magic{0}; // Header magic expected to be at the top of block diff --git a/src/lib/device/journal_vdev.cpp b/src/lib/device/journal_vdev.cpp index 5c3e5b34f..6ca2678fc 100644 --- a/src/lib/device/journal_vdev.cpp +++ b/src/lib/device/journal_vdev.cpp @@ -24,8 +24,6 @@ #include #include #include -#include -#include "replication/repl_dev/raft_repl_dev.h" #include "device/chunk.h" #include "device/device.h" #include "device/physical_dev.hpp" diff --git a/src/lib/device/physical_dev.cpp b/src/lib/device/physical_dev.cpp index 1b6914cf5..ba52ba2f2 100644 --- a/src/lib/device/physical_dev.cpp +++ b/src/lib/device/physical_dev.cpp @@ -35,6 +35,8 @@ namespace homestore { static std::mutex s_cached_dev_mtx; 
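Both strncpy call sites touched above (vdev_info::set_name and the product_name copy in format_devices) get the same fix: copy at most len-1 bytes and force a terminator, because strncpy does not null-terminate when the source fills the destination. The general pattern as a standalone sketch:

#include <cstring>

// strncpy leaves dst unterminated when strlen(src) >= cap; cap the copy and terminate explicitly.
void safe_copy_name(char* dst, size_t cap, const char* src) {
    std::strncpy(dst, src, cap - 1);
    dst[cap - 1] = '\0';
}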
static std::unordered_map< std::string, iomgr::io_device_ptr > s_cached_opened_devs; +__attribute__((no_sanitize_address)) static auto get_current_time() { return Clock::now(); } + iomgr::io_device_ptr open_and_cache_dev(const std::string& devname, int oflags) { std::unique_lock lg(s_cached_dev_mtx); @@ -136,26 +138,50 @@ void PhysicalDev::close_device() { close_and_uncache_dev(m_devname, m_iodev); } folly::Future< std::error_code > PhysicalDev::async_write(const char* data, uint32_t size, uint64_t offset, bool part_of_batch) { - HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); - return m_drive_iface->async_write(m_iodev.get(), data, size, offset, part_of_batch); + auto const start_time = get_current_time(); + return m_drive_iface->async_write(m_iodev.get(), data, size, offset, part_of_batch) + .thenValue([this, start_time, size](std::error_code ec) { + HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); + HISTOGRAM_OBSERVE(m_metrics, drive_write_latency, get_elapsed_time_us(start_time)); + COUNTER_INCREMENT(m_metrics, drive_async_write_count, 1); + return ec; + }); } folly::Future< std::error_code > PhysicalDev::async_writev(const iovec* iov, int iovcnt, uint32_t size, uint64_t offset, bool part_of_batch) { - HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); - return m_drive_iface->async_writev(m_iodev.get(), iov, iovcnt, size, offset, part_of_batch); + auto const start_time = get_current_time(); + return m_drive_iface->async_writev(m_iodev.get(), iov, iovcnt, size, offset, part_of_batch) + .thenValue([this, start_time, size](std::error_code ec) { + HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); + HISTOGRAM_OBSERVE(m_metrics, drive_write_latency, get_elapsed_time_us(start_time)); + COUNTER_INCREMENT(m_metrics, drive_async_write_count, 1); + return ec; + }); } folly::Future< std::error_code > PhysicalDev::async_read(char* data, uint32_t size, uint64_t offset, bool part_of_batch) { - HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); - return m_drive_iface->async_read(m_iodev.get(), data, size, offset, part_of_batch); + auto const start_time = get_current_time(); + return m_drive_iface->async_read(m_iodev.get(), data, size, offset, part_of_batch) + .thenValue([this, start_time, size](std::error_code ec) { + HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); + HISTOGRAM_OBSERVE(m_metrics, drive_read_latency, get_elapsed_time_us(start_time)); + COUNTER_INCREMENT(m_metrics, drive_async_read_count, 1); + return ec; + }); } folly::Future< std::error_code > PhysicalDev::async_readv(iovec* iov, int iovcnt, uint32_t size, uint64_t offset, bool part_of_batch) { - HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); - return m_drive_iface->async_readv(m_iodev.get(), iov, iovcnt, size, offset, part_of_batch); + auto const start_time = get_current_time(); + return m_drive_iface->async_readv(m_iodev.get(), iov, iovcnt, size, offset, part_of_batch) + .thenValue([this, start_time, size](std::error_code ec) { + HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); + HISTOGRAM_OBSERVE(m_metrics, drive_read_latency, get_elapsed_time_us(start_time)); + COUNTER_INCREMENT(m_metrics, drive_async_read_count, 1); + return ec; + }); } folly::Future< std::error_code > PhysicalDev::async_write_zero(uint64_t size, uint64_t offset) { @@ -174,46 +200,50 @@ folly::Future< std::error_code > PhysicalDev::async_write_zero(uint64_t size, ui folly::Future< 
std::error_code > PhysicalDev::queue_fsync() { return m_drive_iface->queue_fsync(m_iodev.get()); } -__attribute__((no_sanitize_address)) static auto get_current_time() { return Clock::now(); } - std::error_code PhysicalDev::sync_write(const char* data, uint32_t size, uint64_t offset) { - HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); - COUNTER_INCREMENT(m_metrics, drive_sync_write_count, 1); auto const start_time = get_current_time(); auto const ret = m_drive_iface->sync_write(m_iodev.get(), data, size, offset); HISTOGRAM_OBSERVE(m_metrics, drive_write_latency, get_elapsed_time_us(start_time)); + HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); + COUNTER_INCREMENT(m_metrics, drive_sync_write_count, 1); return ret; } std::error_code PhysicalDev::sync_writev(const iovec* iov, int iovcnt, uint32_t size, uint64_t offset) { - HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); - COUNTER_INCREMENT(m_metrics, drive_sync_write_count, 1); auto const start_time = Clock::now(); auto const ret = m_drive_iface->sync_writev(m_iodev.get(), iov, iovcnt, size, offset); HISTOGRAM_OBSERVE(m_metrics, drive_write_latency, get_elapsed_time_us(start_time)); + HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); + COUNTER_INCREMENT(m_metrics, drive_sync_write_count, 1); + return ret; } std::error_code PhysicalDev::sync_read(char* data, uint32_t size, uint64_t offset) { - HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); - COUNTER_INCREMENT(m_metrics, drive_sync_read_count, 1); auto const start_time = Clock::now(); auto const ret = m_drive_iface->sync_read(m_iodev.get(), data, size, offset); HISTOGRAM_OBSERVE(m_metrics, drive_read_latency, get_elapsed_time_us(start_time)); + HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); + COUNTER_INCREMENT(m_metrics, drive_sync_read_count, 1); return ret; } std::error_code PhysicalDev::sync_readv(iovec* iov, int iovcnt, uint32_t size, uint64_t offset) { - HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); - COUNTER_INCREMENT(m_metrics, drive_sync_read_count, 1); auto const start_time = Clock::now(); auto const ret = m_drive_iface->sync_readv(m_iodev.get(), iov, iovcnt, size, offset); HISTOGRAM_OBSERVE(m_metrics, drive_read_latency, get_elapsed_time_us(start_time)); + HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); + COUNTER_INCREMENT(m_metrics, drive_sync_read_count, 1); return ret; } std::error_code PhysicalDev::sync_write_zero(uint64_t size, uint64_t offset) { - return m_drive_iface->sync_write_zero(m_iodev.get(), size, offset); + auto const start_time = Clock::now(); + auto const ret = m_drive_iface->sync_write_zero(m_iodev.get(), size, offset); + HISTOGRAM_OBSERVE(m_metrics, drive_write_latency, get_elapsed_time_us(start_time)); + HISTOGRAM_OBSERVE(m_metrics, wirte_io_size, (((size - 1) / 1024) + 1)); + COUNTER_INCREMENT(m_metrics, drive_sync_write_count, 1); + return ret; } void PhysicalDev::submit_batch() { m_drive_iface->submit_batch(); } diff --git a/src/lib/device/vchunk.cpp b/src/lib/device/vchunk.cpp index 1a7aaeac5..a809450d1 100644 --- a/src/lib/device/vchunk.cpp +++ b/src/lib/device/vchunk.cpp @@ -25,6 +25,8 @@ const uint8_t* VChunk::get_user_private() const { return m_internal_chunk->user_ blk_num_t VChunk::get_total_blks() const { return m_internal_chunk->blk_allocator()->get_total_blks(); } +void VChunk::reset() { m_internal_chunk->blk_allocator_mutable()->reset(); } + blk_num_t 
VChunk::available_blks() const { return m_internal_chunk->blk_allocator()->available_blks(); } blk_num_t VChunk::get_defrag_nblks() const { return m_internal_chunk->blk_allocator()->get_defrag_nblks(); } @@ -33,5 +35,7 @@ uint32_t VChunk::get_pdev_id() const { return m_internal_chunk->physical_dev()-> uint16_t VChunk::get_chunk_id() const { return m_internal_chunk->chunk_id(); } +uint64_t VChunk::size() const { return m_internal_chunk->size(); } + cshared< Chunk > VChunk::get_internal_chunk() const { return m_internal_chunk; } } // namespace homestore diff --git a/src/lib/device/virtual_dev.cpp b/src/lib/device/virtual_dev.cpp index 591540995..a3f060e4a 100644 --- a/src/lib/device/virtual_dev.cpp +++ b/src/lib/device/virtual_dev.cpp @@ -431,6 +431,8 @@ std::error_code VirtualDev::sync_write(const char* buf, uint32_t size, BlkId con Chunk* chunk; uint64_t const dev_offset = to_dev_offset(bid, &chunk); + HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", chunk->physical_dev_mutable()->pdev_id(), + dev_offset); if (sisl_unlikely(dev_offset == INVALID_DEV_OFFSET)) { return std::make_error_code(std::errc::resource_unavailable_try_again); } @@ -443,6 +445,9 @@ std::error_code VirtualDev::sync_write(const char* buf, uint32_t size, cshared< if (hs()->crash_simulator().is_in_crashing_phase()) { return std::error_code{}; } #endif + HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", chunk->physical_dev_mutable()->pdev_id(), + chunk->start_offset() + offset_in_chunk); + if (sisl_unlikely(!is_chunk_available(chunk))) { return std::make_error_code(std::errc::resource_unavailable_try_again); } @@ -464,6 +469,8 @@ std::error_code VirtualDev::sync_writev(const iovec* iov, int iovcnt, BlkId cons auto const size = get_len(iov, iovcnt); auto* pdev = chunk->physical_dev_mutable(); + HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", pdev->pdev_id(), dev_offset); + COUNTER_INCREMENT(m_metrics, vdev_write_count, 1); if (sisl_unlikely(!hs_utils::mod_aligned_sz(dev_offset, pdev->align_size()))) { COUNTER_INCREMENT(m_metrics, unalign_writes, 1); @@ -486,6 +493,8 @@ std::error_code VirtualDev::sync_writev(const iovec* iov, int iovcnt, cshared< C auto const size = get_len(iov, iovcnt); auto* pdev = chunk->physical_dev_mutable(); + HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", pdev->pdev_id(), dev_offset); + COUNTER_INCREMENT(m_metrics, vdev_write_count, 1); if (sisl_unlikely(!hs_utils::mod_aligned_sz(dev_offset, pdev->align_size()))) { COUNTER_INCREMENT(m_metrics, unalign_writes, 1); diff --git a/src/lib/device/virtual_dev.hpp b/src/lib/device/virtual_dev.hpp index 36032954e..eb6b63192 100644 --- a/src/lib/device/virtual_dev.hpp +++ b/src/lib/device/virtual_dev.hpp @@ -292,6 +292,7 @@ class VirtualDev { virtual nlohmann::json get_status(int log_level) const; virtual uint64_t get_total_chunk_num() const { return m_total_chunk_num; } + uint8_t get_dev_type() const { return m_vdev_info.hs_dev_type; } uint32_t align_size() const; uint32_t optimal_page_size() const; uint32_t atomic_page_size() const; diff --git a/src/lib/homestore.cpp b/src/lib/homestore.cpp index e2bbcbc21..f7e4f9019 100644 --- a/src/lib/homestore.cpp +++ b/src/lib/homestore.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -39,7 +40,9 @@ #include "device/virtual_dev.hpp" #include "common/resource_mgr.hpp" #include "meta/meta_sb.hpp" +#ifdef REPLICATION_SUPPORT #include "replication/service/generic_repl_svc.h" +#endif #include "common/crash_simulator.hpp" /* @@ 
-57,6 +60,7 @@ HomeStoreSafePtr HomeStore::s_instance{nullptr}; static std::unique_ptr< IndexServiceCallbacks > s_index_cbs; static shared< ChunkSelector > s_custom_chunk_selector{nullptr}; static shared< ReplApplication > s_repl_app{nullptr}; +std::string version = PACKAGE_VERSION; HomeStore* HomeStore::instance() { if (s_instance == nullptr) { s_instance = std::make_shared< HomeStore >(); } @@ -92,6 +96,7 @@ HomeStore& HomeStore::with_log_service() { return *this; } +#ifdef REPLICATION_SUPPORT HomeStore& HomeStore::with_repl_data_service(cshared< ReplApplication >& repl_app, cshared< ChunkSelector >& custom_chunk_selector) { m_services[uint32_cast(ServiceType::REPLICATION)] = std::vector< ServiceSubType >{1u, ServiceSubType::DEFAULT}; @@ -101,6 +106,7 @@ HomeStore& HomeStore::with_repl_data_service(cshared< ReplApplication >& repl_ap s_custom_chunk_selector = std::move(custom_chunk_selector); return *this; } +#endif #ifdef _PRERELEASE HomeStore& HomeStore::with_crash_simulator(std::function< void(void) > cb) { @@ -149,6 +155,12 @@ bool HomeStore::start(const hs_input_params& input, hs_before_services_starting_ static std::once_flag flag1; std::call_once(flag1, [this]() { +#ifndef NDEBUG + LOGINFO("HomeStore DEBUG version: {}", version); +#else + LOGINFO("HomeStore RELEASE version: {}", version); +#endif + sisl::VersionMgr::addVersion(PACKAGE_NAME, version::Semver200_version(PACKAGE_VERSION)); m_periodic_logger = sisl::logging::CreateCustomLogger("homestore", "_periodic", false, true /* tee_to_stdout_stderr */); sisl::logging::SetLogPattern("[%D %T.%f] [%^%L%$] [%t] %v", m_periodic_logger); @@ -156,6 +168,19 @@ bool HomeStore::start(const hs_input_params& input, hs_before_services_starting_ HomeStoreDynamicConfig::init_settings_default(); + // Check if the max_grpc_message_size is large enough to hold the data and snapshot batch size + auto data_fetch_max_size_in_byte = HS_DYNAMIC_CONFIG(consensus.data_fetch_max_size_kb) * 1024ull; + RELEASE_ASSERT(data_fetch_max_size_in_byte <= INT_MAX, "data fetch size is larger than the grpc limit"); + if (HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size) < input.max_data_size || + HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size) < input.max_snapshot_batch_size || + HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size) < s_cast< int >(data_fetch_max_size_in_byte)) { + LOGERROR("max_grpc_message_size {} is too small to hold max_data_size {}, max_snapshot_batch_size {} and " + "data_fetch_max_size {}", + HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size), input.max_data_size, + input.max_snapshot_batch_size, data_fetch_max_size_in_byte); + throw std::invalid_argument("max_grpc_message_size is insufficient for the configured data or snapshot sizes"); + } + #ifdef _PRERELEASE // Start a default crash simulator which raises SIGKILL, in case user has not provided with_crash_simulator() // callback @@ -171,7 +196,9 @@ bool HomeStore::start(const hs_input_params& input, hs_before_services_starting_ if (has_repl_data_service()) { m_log_service = std::make_unique< LogStoreService >(); m_data_service = std::make_unique< BlkDataService >(std::move(s_custom_chunk_selector)); +#ifdef REPLICATION_SUPPORT m_repl_service = GenericReplService::create(std::move(s_repl_app)); +#endif } else { if (has_log_service()) { m_log_service = std::make_unique< LogStoreService >(); } if (has_data_service()) { @@ -243,11 +270,11 @@ void HomeStore::format_and_start(std::map< ServiceId, hs_format_params >&& forma } else if ((svc_id.type == ServiceType::DATA) && has_data_service()) { 
m_data_service->create_vdev(pct_to_size(fparams.size_pct, fparams.dev_type), fparams.dev_type, fparams.block_size, fparams.alloc_type, fparams.chunk_sel_type, - fparams.num_chunks); + fparams.num_chunks, fparams.chunk_size); } else if ((svc_id.type == ServiceType::REPLICATION) && has_repl_data_service()) { m_data_service->create_vdev(pct_to_size(fparams.size_pct, fparams.dev_type), fparams.dev_type, fparams.block_size, fparams.alloc_type, fparams.chunk_sel_type, - fparams.num_chunks); + fparams.num_chunks, fparams.chunk_size); } } @@ -265,7 +292,7 @@ void HomeStore::do_start() { const auto& inp_params = HomeStoreStaticConfig::instance().input; uint64_t cache_size = resource_mgr().get_cache_size(); - m_evictor = std::make_shared< sisl::LRUEvictor >(cache_size, 1000); + m_evictor = std::make_shared< sisl::LRUEvictor >(cache_size, HS_DYNAMIC_CONFIG(generic.cache_evictor_npartitions)); if (m_before_services_starting_cb) { m_before_services_starting_cb(); } @@ -279,7 +306,9 @@ void HomeStore::do_start() { if (has_index_service()) { m_index_service->start(); } if (has_repl_data_service()) { +#ifdef REPLICATION_SUPPORT s_cast< GenericReplService* >(m_repl_service.get())->start(); // Replservice starts logstore & data service +#endif } else { if (has_data_service()) { m_data_service->start(); } if (has_log_service() && inp_params.auto_recovery) { @@ -317,11 +346,13 @@ void HomeStore::shutdown() { m_resource_mgr->stop(); if (has_repl_data_service()) { +#ifdef REPLICATION_SUPPORT // Log and Data services are stopped by repl service s_cast< GenericReplService* >(m_repl_service.get())->stop(); m_log_service.reset(); m_data_service.reset(); m_repl_service.reset(); +#endif } else { if (has_log_service()) { m_log_service->stop(); @@ -346,8 +377,6 @@ void HomeStore::shutdown() { #ifdef _PRERELEASE flip::Flip::instance().stop_rpc_server(); #endif - - HomeStore::reset_instance(); LOGINFO("Homestore is completed its shutdown"); } diff --git a/src/lib/index/inplace_btree/index_cp.hpp b/src/lib/index/inplace_btree/index_cp.hpp index c8292c47f..b04b8f052 100644 --- a/src/lib/index/inplace_btree/index_cp.hpp +++ b/src/lib/index/inplace_btree/index_cp.hpp @@ -92,12 +92,12 @@ struct IndexCPContext : public VDevCPContext { } std::string parent_id_string() const { - return (has_inplace_parent == 0x1) ? fmt::format("chunk={}, blk={}", ids[0].second, ids[0].first) : "empty"; + return (has_inplace_parent == 0x1) ? fmt::format("{}", blk_id(0).to_integer()) : "empty"; } std::string child_id_string() const { auto const idx = (has_inplace_parent == 0x1) ? 1 : 0; - return (has_inplace_child == 0x1) ? fmt::format("chunk={}, blk={}", ids[idx].second, ids[idx].first) + return (has_inplace_child == 0x1) ? 
fmt::format("{}", blk_id(idx).to_integer()) : "empty"; } @@ -160,6 +160,7 @@ struct IndexCPContext : public VDevCPContext { std::optional< IndexBufferPtr > next_dirty(); std::string to_string(); std::string to_string_with_dags(); + uint16_t num_dags(); void to_string_dot(const std::string& filename); private: diff --git a/src/lib/index/inplace_btree/inplace_btree_store.h b/src/lib/index/inplace_btree/inplace_btree_store.h index 4552c2516..63e141bda 100644 --- a/src/lib/index/inplace_btree/inplace_btree_store.h +++ b/src/lib/index/inplace_btree/inplace_btree_store.h @@ -59,7 +59,28 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { superblk< index_table_sb > m_sb; shared< MetaIndexBuffer > m_sb_buffer; + // graceful shutdown +private: + std::atomic_bool m_stopping{false}; + mutable std::atomic_uint64_t pending_request_num{0}; + + bool is_stopping() const { return m_stopping.load(); } + void start_stopping() { m_stopping = true; } + + uint64_t get_pending_request_num() const { return pending_request_num.load(); } + + void incr_pending_request_num() const { pending_request_num++; } + void decr_pending_request_num() const { pending_request_num--; } + public: + void stop() { + start_stopping(); + while (true) { + if (!get_pending_request_num()) break; + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + } + IndexTable(uuid_t uuid, uuid_t parent_uuid, uint32_t user_sb_size, const BtreeConfig& cfg) : Btree< K, V >{cfg}, m_sb{"index"} { // Create a superblk for the index table and create MetaIndexBuffer corresponding to that @@ -100,9 +121,20 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } } - void destroy() override { - Btree< K, V >::destroy_btree(nullptr); + void audit_tree() override { + cp_mgr().cp_guard(); + Btree< K, V >::sanity_sub_tree(); + } + + btree_status_t destroy() override { + if (is_stopping()) return btree_status_t::stopping; + incr_pending_request_num(); + auto cpg = cp_mgr().cp_guard(); + Btree< K, V >::destroy_btree(cpg.context(cp_consumer_t::INDEX_SVC)); m_sb.destroy(); + m_sb_buffer->m_valid = false; + decr_pending_request_num(); + return btree_status_t::success; } uuid_t uuid() const override { return m_sb->uuid; } @@ -114,6 +146,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { template < typename ReqT > btree_status_t put(ReqT& put_req) { + if (is_stopping()) return btree_status_t::stopping; + incr_pending_request_num(); auto ret = btree_status_t::success; do { auto cpg = cp_mgr().cp_guard(); @@ -121,11 +155,14 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { ret = Btree< K, V >::put(put_req); if (ret == btree_status_t::cp_mismatch) { LOGTRACEMOD(wbcache, "CP Mismatch, retrying put"); } } while (ret == btree_status_t::cp_mismatch); + decr_pending_request_num(); return ret; } template < typename ReqT > btree_status_t remove(ReqT& remove_req) { + if (is_stopping()) return btree_status_t::stopping; + incr_pending_request_num(); auto ret = btree_status_t::success; do { auto cpg = cp_mgr().cp_guard(); @@ -133,14 +170,66 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { ret = Btree< K, V >::remove(remove_req); if (ret == btree_status_t::cp_mismatch) { LOGTRACEMOD(wbcache, "CP Mismatch, retrying remove"); } } while (ret == btree_status_t::cp_mismatch); + decr_pending_request_num(); return ret; } + template < typename ReqT > + btree_status_t get(ReqT& greq) const { + if (is_stopping()) return btree_status_t::stopping; + incr_pending_request_num(); + auto ret = Btree< 
K, V >::get(greq); + decr_pending_request_num(); + return ret; + } + + void repair_root_node(IndexBufferPtr const& idx_buf) override { + LOGTRACEMOD(wbcache, "check if this was the previous root node {} for buf {} ", m_sb->root_node, + idx_buf->to_string()); + if (m_sb->root_node == idx_buf->blkid().to_integer()) { + // This is the root node, we need to update the root node in superblk + LOGTRACEMOD(wbcache, "{} is old root so we need to update the meta node ", idx_buf->to_string()); + BtreeNode* n = this->init_node(idx_buf->raw_buffer(), idx_buf->blkid().to_integer(), false /* init_buf */, + BtreeNode::identify_leaf_node(idx_buf->raw_buffer())); + static_cast< IndexBtreeNode* >(n)->attach_buf(idx_buf); + auto edge_id = n->next_bnode(); + + BT_DBG_ASSERT(!n->has_valid_edge(), + "root {} already has a valid edge {}, so we should have found the new root node", + n->to_string(), n->get_edge_value().bnode_id()); + n->set_next_bnode(empty_bnodeid); + n->set_edge_value(BtreeLinkInfo{edge_id, 0}); + LOGTRACEMOD(wbcache, "change root node {}: edge updated to {} and invalidate the next node! ", n->node_id(), + edge_id); + auto cpg = cp_mgr().cp_guard(); + write_node_impl(n, (void*)cpg.context(cp_consumer_t::INDEX_SVC)); + + } else { + LOGTRACEMOD(wbcache, "This is not the root node, so we can ignore this repair call for buf {}", + idx_buf->to_string()); + } + } + + void delete_stale_children(IndexBufferPtr const& idx_buf) override { + BtreeNode::identify_leaf_node(idx_buf->raw_buffer())); + static_cast< IndexBtreeNode* >(n)->attach_buf(idx_buf); + auto cpg = cp_mgr().cp_guard(); + idx_buf->m_dirtied_cp_id = cpg->id(); + BtreeNodePtr bn = BtreeNodePtr{n}; + + if (!bn->is_leaf()) { + LOGTRACEMOD(wbcache, "delete_stale_links cp={} buf={}", cpg->id(), idx_buf->to_string()); + delete_stale_links(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC)); + } + } + } + void repair_node(IndexBufferPtr const& idx_buf) override { if (idx_buf->is_meta_buf()) { - // We cannot repair the meta buf on its own, we need to repair the root node which modifies the // meta_buf. It is ok to ignore this call, because repair will be done from root before meta_buf is // attempted to repair, which would have updated the meta_buf already. 
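The incr/decr bookkeeping repeated across put, remove, get and destroy above is the usual drain-on-shutdown idiom: refuse new work once stopping, and let stop() spin until the in-flight count reaches zero. A small RAII wrapper expresses the same idea without having to remember the decrement on every early return; this is a sketch of the idiom, not what the patch actually does:

#include <atomic>

// Every operation holds a PendingGuard for its lifetime, so all exit paths decrement the counter.
class PendingGuard {
    std::atomic_uint64_t& m_cnt;
public:
    explicit PendingGuard(std::atomic_uint64_t& cnt) : m_cnt{cnt} { ++m_cnt; }
    ~PendingGuard() { --m_cnt; }
};
// Usage inside an operation:
//   if (is_stopping()) return btree_status_t::stopping;
//   PendingGuard g{pending_request_num};
//   ... all subsequent returns automatically drop the in-flight count ...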
+ LOGTRACEMOD(wbcache, "Ignoring repair on meta buf {} root id {} ", idx_buf->to_string(), + this->root_node_id()); return; } BtreeNode* n = this->init_node(idx_buf->raw_buffer(), idx_buf->blkid().to_integer(), false /* init_buf */, @@ -153,12 +242,16 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { idx_buf->m_dirtied_cp_id = cpg->id(); BtreeNodePtr bn = BtreeNodePtr{n}; - LOGTRACEMOD(wbcache, "repair_node cp={} buf={}", cpg->id(), idx_buf->to_string()); - repair_links(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC)); + // Only for interior nodes we need to repair its links + if (!bn->is_leaf()) { + LOGTRACEMOD(wbcache, "repair_node cp={} buf={}", cpg->id(), idx_buf->to_string()); + repair_links(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC)); + } if (idx_buf->m_up_buffer && idx_buf->m_up_buffer->is_meta_buf()) { // Our up buffer is a meta buffer, which means that we are the new root node, we need to update the // meta_buf with new root as well + LOGTRACEMOD(wbcache, "root change for after repairing {}\n\n", idx_buf->to_string()); on_root_changed(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC)); } } @@ -179,6 +272,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { node->set_checksum(); auto prev_state = idx_node->m_idx_buf->m_state.exchange(index_buf_state_t::DIRTY); + idx_node->m_idx_buf->m_node_level = node->level(); if (prev_state == index_buf_state_t::CLEAN) { // It was clean before, dirtying it first time, add it to the wb_cache list to flush if (idx_node->m_idx_buf->m_dirtied_cp_id != -1) { @@ -192,9 +286,9 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { (int)prev_state, (int)index_buf_state_t::FLUSHING, "Writing on a node buffer which was currently in flushing state on cur_cp={} buffer_cp_id={}", cp_ctx->id(), idx_node->m_idx_buf->m_dirtied_cp_id); + BT_DBG_ASSERT_EQ(idx_node->m_idx_buf->m_dirtied_cp_id, cp_ctx->id(), } return btree_status_t::success; - } btree_status_t transact_nodes(const BtreeNodeList& new_nodes, const BtreeNodeList& freed_nodes, const BtreeNodePtr& left_child_node, const BtreeNodePtr& parent_node, @@ -243,14 +337,19 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { void free_node_impl(const BtreeNodePtr& node, void* context) override { auto n = static_cast< IndexBtreeNode* >(node.get()); + n->m_idx_buf->m_node_level = node->level(); wb_cache().free_buf(n->m_idx_buf, r_cast< CPContext* >(context)); } btree_status_t on_root_changed(BtreeNodePtr const& new_root, void* context) override { + // todo: if(m_sb->root_node == new_root->node_id() && m_sb->root_link_version == new_root->link_version()){ + // return btree_status_t::success;} + LOGTRACEMOD(wbcache, "root changed for index old_root={} new_root={}", m_sb->root_node, new_root->node_id()); m_sb->root_node = new_root->node_id(); m_sb->root_link_version = new_root->link_version(); if (!wb_cache().refresh_meta_buf(m_sb_buffer, r_cast< CPContext* >(context))) { + LOGTRACEMOD(wbcache, "CP mismatch error - discard transact for meta node"); return btree_status_t::cp_mismatch; } @@ -259,23 +358,132 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { return btree_status_t::success; } + btree_status_t delete_stale_links(BtreeNodePtr const& parent_node, void* cp_ctx) { + LOGTRACEMOD(wbcache, "deleting stale links for parent node [{}]", parent_node->to_string()); + BtreeNodeList free_nodes; + auto nentries = parent_node->total_entries(); + uint32_t deleted = 0; + for (uint32_t i = nentries; i-- > 0;) { + BtreeLinkInfo 
cur_child_info; + BtreeNodePtr child_node; + parent_node->get_nth_value(i, &cur_child_info, false /* copy */); + if (auto ret = read_node_impl(cur_child_info.bnode_id(), child_node); ret == btree_status_t::success) { + if (child_node->is_node_deleted()) { + LOGTRACEMOD(wbcache, "Deleting stale child node [{}] for parent node [{}]", child_node->to_string(), + parent_node->to_string()); + child_node->set_node_deleted(); + free_node_impl(child_node, cp_ctx); + + if (i > 0) { + BtreeLinkInfo pre_child_info; + parent_node->get_nth_value(i - 1, &pre_child_info, false /* copy */); + // auto ckey = parent_node->get_nth_key< K >(i-1, true); + // parent_node->set_nth_key(i-1, ckey); + parent_node->update(i, pre_child_info); + parent_node->remove(i - 1); + } else { + parent_node->remove(i); + } + + LOGTRACEMOD(wbcache, "so far parent node [{}]", parent_node->to_string()); + // free_nodes.push_back(child_node); + deleted++; + } + } else { + LOGTRACEMOD(wbcache, "Failed to read child node {} for parent node [{}] reason {}", + cur_child_info.bnode_id(), parent_node->to_string(), ret); + } + } + if (parent_node->has_valid_edge()) { + auto edge_info = parent_node->get_edge_value(); + BtreeNodePtr edge_node; + if (auto ret = read_node_impl(edge_info.bnode_id(), edge_node); ret == btree_status_t::success) { + if (edge_node->is_node_deleted()) { + LOGTRACEMOD(wbcache, "Deleting stale edge node [{}] for parent node [{}]", edge_node->to_string(), + parent_node->to_string()); + edge_node->set_node_deleted(); + free_node_impl(edge_node, cp_ctx); + if (parent_node->total_entries() == 0) { + parent_node->invalidate_edge(); + } else { + BtreeLinkInfo last_child_info; + parent_node->get_nth_value(parent_node->total_entries() - 1, &last_child_info, + false /* copy */); + parent_node->set_edge_value(last_child_info); + parent_node->remove(parent_node->total_entries() - 1); + LOGTRACEMOD(wbcache, "Replacing edge with previous child node [{}] for parent node [{}]", + last_child_info.bnode_id(), parent_node->to_string()); + } + + deleted++; + } + } else { + LOGTRACEMOD(wbcache, "Failed to read edge node {} for parent node [{}] reason {}", + edge_node->to_string(), parent_node->to_string(), ret); + } + } + if (deleted /*free_nodes.size()*/) { + btree_status_t ret = btree_status_t::success; + + if ((parent_node->total_entries() == 0) && !parent_node->has_valid_edge()) { + parent_node->set_node_deleted(); + LOGTRACEMOD(wbcache, + "Freeing parent node=[{}] because it is empty and not an edge node but had stale children", + parent_node->to_string()); + ret = write_node_impl(parent_node, cp_ctx); + free_node_impl(parent_node, cp_ctx); + LOGTRACEMOD(wbcache, + "Accomplishing deleting stale links. After removing {} stale links, parent node is [{}]", + deleted, parent_node->to_string()); + } else { + ret = write_node_impl(parent_node, cp_ctx); + if (ret != btree_status_t::success) { + LOGTRACEMOD(wbcache, "Failed to write parent node [{}] after deleting stale links", + parent_node->to_string()); + } else { + LOGTRACEMOD( + wbcache, + "Accomplishing deleting stale links. After removing {} stale links, parent node is [{}]", + deleted, parent_node->to_string()); + } + } + // auto ret = transact_nodes({}, free_nodes, parent_node, nullptr, cp_ctx); + return ret; + } else { + LOGTRACEMOD(wbcache, "Accomplishing deleting stale links. 
No stale links found for parent node [{}]", + parent_node->to_string()); + } + return btree_status_t::success; + } + + // btree_status_t repair_links(BtreeNodePtr const& parent_node, void* cp_ctx) { - BT_LOG(DEBUG, "Repairing links for parent node {}", parent_node->to_string()); + LOGTRACEMOD(wbcache, "Repairing links for parent node [{}]", parent_node->to_string()); + // TODO: is it possible that repairing many nodes causes an increase to level of btree? If so, then this + // needs to be handled. Get the last key in the node - // Get the last key in the node - auto const last_parent_key = parent_node->get_last_key< K >(); + auto last_parent_key = parent_node->get_last_key< K >(); auto const is_parent_edge_node = parent_node->has_valid_edge(); if ((parent_node->total_entries() == 0) && !is_parent_edge_node) { BT_LOG_ASSERT(false, "Parent node={} is empty and not an edge node but was asked to repair", parent_node->node_id()); return btree_status_t::not_found; } - BT_LOG(INFO, "Repairing node={} with last_parent_key={}", parent_node->to_string(), - last_parent_key.to_string()); + + // Get all original child ids as a support to check if we are beyond the last child node + std::unordered_map< bnodeid_t, K > orig_child_infos; + for (uint32_t i = 0; i < parent_node->total_entries(); ++i) { + BtreeLinkInfo link_info; + parent_node->get_nth_value(i, &link_info, true); + orig_child_infos[link_info.bnode_id()] = parent_node->get_nth_key< K >(i, false /* copy */); + } + LOGTRACEMOD(wbcache, "Repairing node=[{}] with last_parent_key={}", parent_node->to_string(), + last_parent_key.to_string()); // Get the first child node and its link info BtreeLinkInfo child_info; BtreeNodePtr child_node; + BtreeNodePtr pre_child_node; auto ret = this->get_child_and_lock_node(parent_node, 0, child_info, child_node, locktype_t::READ, locktype_t::READ, cp_ctx); if (ret != btree_status_t::success) { @@ -284,9 +492,122 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { return ret; } + // update the last key of parent for issue + // start from first child and store the last key of the child node, then traverse to next sibling + // 2-1- if this is greater than parent last key, traverse for sibling of parent until reaches to + // siblings which has keys more than Y or end of list (name this parent sibling node F), + // 2-2- Put last key of F to last key of P + // 2-3 - set F as Next of A + BtreeNodeList siblings; + BtreeNodePtr next_cur_child; + BT_DBG_ASSERT(parent_node->has_valid_edge() || parent_node->total_entries(), + "parent node {} doesn't have valid edge and no entries ", parent_node->to_string()); + if (parent_node->total_entries() > 0) { + auto updated_last_key = last_parent_key; + K last_child_last_key; + K last_child_neighbor_key; + BtreeNodePtr cur_child; + BtreeLinkInfo cur_child_info; + + bool found_child = false; + uint32_t nentries = parent_node->total_entries() + parent_node->has_valid_edge() ? 
1 : 0; + + for (uint32_t i = nentries; i-- > 0;) { + parent_node->get_nth_value(i, &cur_child_info, false /* copy */); + if (auto ret = read_node_impl(cur_child_info.bnode_id(), cur_child); ret == btree_status_t::success) { + if (!cur_child->is_node_deleted() && cur_child->total_entries()) { + last_child_last_key = cur_child->get_last_key< K >(); + if (cur_child->next_bnode() != empty_bnodeid && + read_node_impl(cur_child->next_bnode(), next_cur_child) == btree_status_t::success) { + LOGTRACEMOD( + wbcache, + "Last child last key {} for child_node [{}] parent node [{}], next neigbor is [{}]", + last_child_last_key.to_string(), cur_child->to_string(), parent_node->to_string(), + next_cur_child->to_string()); + found_child = true; + break; + } + found_child = true; + break; + } + LOGTRACEMOD(wbcache, "PASSING child node {} so we need to check next child node", + cur_child->to_string()); + } + } + + if (found_child) { + LOGTRACEMOD(wbcache, "Last child last key {} for parent node {}, child_node {}", + last_child_last_key.to_string(), parent_node->to_string(), cur_child->to_string()); + if (last_child_last_key.compare(last_parent_key) > 0) { + if (next_cur_child) { + last_child_neighbor_key = next_cur_child->get_last_key< K >(); + LOGTRACEMOD(wbcache, + "Voila !! last child_key of child [{}] is greater than its parents [{}] and its " + "next neighbor key is {}", + cur_child->to_string(), parent_node->to_string(), + last_child_neighbor_key.to_string()); + } else { + LOGTRACEMOD( + wbcache, + "Last child_key of child [{}] is greater than its parents [{}] and it has no next neighbor", + cur_child->to_string(), parent_node->to_string()); + } + + // 2-1 traverse for sibling of parent until reaches to siblings which has keys more than 7563 + // or end + // of list (put all siblings in a list, here is F) , + BtreeNodePtr sibling; + BtreeNodePtr true_sibling; + BtreeLinkInfo sibling_info; + + auto sibling_node_id = parent_node->next_bnode(); + while (sibling_node_id != empty_bnodeid) { + if (auto ret = read_node_impl(sibling_node_id, sibling); ret == btree_status_t::success) { + if (sibling->is_node_deleted()) { + // Do we need to free the sibling node here? 
+ siblings.push_back(sibling); + sibling_node_id = sibling->next_bnode(); + LOGTRACEMOD(wbcache, "Sibling node [{}] is deleted, continue to next sibling", + sibling->to_string()); + continue; + } + auto sibling_last_key = sibling->get_last_key< K >(); + if (next_cur_child && sibling_last_key.compare(last_child_neighbor_key) < 0) { + siblings.push_back(sibling); + sibling_node_id = sibling->next_bnode(); + } else { + true_sibling = sibling; + break; + } + } + } + if (true_sibling) { + LOGTRACEMOD(wbcache, "True sibling [{}] for parent_node {}", true_sibling->to_string(), + parent_node->to_string()); + } else { + LOGTRACEMOD(wbcache, "No true sibling found for parent_node [{}]", parent_node->to_string()); + } + if (sibling_node_id != empty_bnodeid) { + last_parent_key = last_child_last_key; + parent_node->set_next_bnode(true_sibling->node_id()); + for (auto sibling : siblings) { + LOGTRACEMOD(wbcache, "Sibling list [{}]", sibling->to_string()); + } + LOGTRACEMOD(wbcache, "True sibling [{}]", true_sibling->to_string()); + BtreeLinkInfo first_child_info; + parent_node->get_nth_value(0, &first_child_info, false); + } + } else { + LOGTRACEMOD(wbcache, + "No undeleted child found for parent_node [{}], keep normal repair (regular recovery)", + parent_node->to_string()); + next_cur_child = nullptr; + } + } + } + // Keep a copy of the node buffer, in case we need to revert back uint8_t* tmp_buffer = new uint8_t[this->m_node_size]; - std::memcpy(tmp_buffer, parent_node->m_phys_node_buf, this->m_node_size); // Remove all the entries in parent_node and let walk across child_nodes rebuild this node parent_node->remove_all(); @@ -295,22 +616,111 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { auto cur_parent = parent_node; BtreeNodeList new_parent_nodes; do { - if (child_node->has_valid_edge() || - (child_node->is_leaf() && (child_node->next_bnode() == empty_bnodeid))) { - BT_DBG_ASSERT(is_parent_edge_node, - "Child node={} is an edge node but parent_node={} is not an edge node", - child_node->node_id(), cur_parent->node_id()); - cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); + if (child_node->has_valid_edge() || (child_node->is_leaf() && child_node->next_bnode() == empty_bnodeid)) { + if (child_node->is_node_deleted()) { + // Edge node is merged, we need to set the current last entry as edge + if (cur_parent->total_entries() > 0) { + auto prev_val = V{}; + cur_parent->get_nth_value(cur_parent->total_entries() - 1, &prev_val, true); + cur_parent->remove(cur_parent->total_entries() - 1); + cur_parent->set_edge_value(prev_val); + LOGTRACEMOD(wbcache, + "Reparing node={}, child_node=[{}] is deleted, set previous as edge_value={}", + cur_parent->node_id(), child_node->to_string(), prev_val.to_string()); + } else { + LOGTRACEMOD(wbcache, "Found an empty interior node {} with maybe all childs deleted", + cur_parent->node_id()); + } + } else { + // Update edge and finish + if (is_parent_edge_node) { + cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); + } else { + auto tsib_id = find_true_sibling(cur_parent); + if (tsib_id != empty_bnodeid) { + cur_parent->set_next_bnode(tsib_id); + LOGTRACEMOD(wbcache, + "True sibling [{}] for parent_node [{}], So don't add child [{}] here ", + tsib_id, cur_parent->to_string(), child_node->to_string()); + } else { + cur_parent->set_next_bnode(empty_bnodeid); + // if this child node previously belonged to this parent node, we need to add it but as edge + // o.w, not 
this node + if (orig_child_infos.contains(child_node->node_id())) { + cur_parent->set_edge_value( + BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); + LOGTRACEMOD(wbcache, + "Child node [{}] is an edge node and previously belong to this parent, so " + "we need to add it as edge", + child_node->to_string()); + } else { + LOGTRACEMOD(wbcache, "No true sibling found for parent_node [{}]", + cur_parent->to_string()); + } + BT_REL_ASSERT(cur_parent->total_entries() != 0 || cur_parent->has_valid_edge(), + "Parent node [{}] cannot be empty", cur_parent->to_string()); + } + } + + // + // } + break; + } break; } - auto const child_last_key = child_node->get_last_key< K >(); - BT_LOG(INFO, "Repairing node={} child_node={} child_last_key={}", cur_parent->node_id(), - child_node->to_string(), child_last_key.to_string()); + auto child_last_key = child_node->get_last_key< K >(); + LOGTRACEMOD(wbcache, "Repairing node={}, child_node=[{}] child_last_key={}", cur_parent->node_id(), + child_node->to_string(), child_last_key.to_string()); + + // Check if we are beyond the last child node. + // + // There can be cases where the child level merge is successfully persisted but the parent level is + // not. In this case, you may have your rightmost child node with last key greater than the + // last_parent_key. That's why here we have to check if the child node is one of the original child + // nodes first. + if (!is_parent_edge_node && !orig_child_infos.contains(child_node->node_id())) { + if (child_last_key.compare(last_parent_key) > 0) { + // We have reached a child beyond this parent, we can stop now + // TODO this case if child last key is less than last parent key to update the parent node. + // this case can potentially break the btree for put and remove op. 
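Reviewer note: the early exit above can be stated as a standalone predicate. The helper below is purely illustrative (child_beyond_parent is not part of this patch) and only restates the rule the rebuild loop relies on, using the same K::compare convention as the surrounding code.

// Hypothetical predicate (not in the patch): a child that was not one of the
// parent's original children and whose last key sorts after the parent's last
// key belongs to a sibling parent, so the rebuild loop must stop consuming it.
template < typename K >
static bool child_beyond_parent(K const& child_last_key, K const& last_parent_key,
                                bool is_original_child, bool is_parent_edge_node) {
    if (is_parent_edge_node || is_original_child) { return false; }
    return child_last_key.compare(last_parent_key) > 0;
}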
+ break; + } + if (child_node->total_entries() == 0) { + // this child has no entries, but maybe in the middle of the parent node, we need to update the key + // of parent as previous one and go on + LOGTRACEMOD(wbcache, + "Reach to an empty child node {}, and this child doesn't belong to this parent; Hence " + "loop ends", + child_node->to_string()); + // now update the next of parent node by skipping all deleted siblings of this parent node + auto valid_sibling = cur_parent->next_bnode(); + while (valid_sibling != empty_bnodeid) { + BtreeNodePtr sibling; + if (read_node_impl(valid_sibling, sibling) == btree_status_t::success) { + if (sibling->is_node_deleted()) { + valid_sibling = sibling->next_bnode(); + continue; + } + // cur_parent->set_next_bnode(sibling->node_id()); + break; + } + LOGTRACEMOD(wbcache, "Failed to read child node {} for parent node [{}] reason {}", + valid_sibling, cur_parent->to_string(), ret); + } + if (valid_sibling != empty_bnodeid) { + cur_parent->set_next_bnode(valid_sibling); + LOGTRACEMOD(wbcache, "Repairing node={}, child_node=[{}] is an edge node, end loop", + cur_parent->node_id(), child_node->to_string()); + + } else { + cur_parent->set_next_bnode(empty_bnodeid); + LOGTRACEMOD(wbcache, "Repairing node={}, child_node=[{}] is an edge node, end loop", + cur_parent->node_id(), child_node->to_string()); + } - if (child_last_key.compare(last_parent_key) > 0) { - // We have reached the last key, we can stop now - break; + break; + } } if (!cur_parent->has_room_for_put(btree_put_type::INSERT, K::get_max_size(), @@ -332,33 +742,135 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } // Insert the last key of the child node into parent node - cur_parent->insert(cur_parent->total_entries(), child_last_key, - BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); + if (!child_node->is_node_deleted()) { + if (child_node->total_entries() == 0) { + if (orig_child_infos.contains(child_node->node_id())) { + child_last_key = orig_child_infos[child_node->node_id()]; + LOGTRACEMOD(wbcache, + "Reach to an empty child node [{}], but not the end of the parent node, so we need " + "to update the key of parent node as original one {}", + child_node->to_string(), child_last_key.to_string()); + } else { + LOGTRACEMOD(wbcache, + "Reach to an empty child node [{}] but not belonging to this parent (probably next " + "parent sibling); Hence end loop", + child_node->to_string()); + break; + } + } + cur_parent->insert(cur_parent->total_entries(), child_last_key, + BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); + } else { + // Node deleted indicates it's freed & no longer used during recovery + LOGTRACEMOD(wbcache, "Repairing node={}, child node=[{}] is deleted, skipping the insert", + cur_parent->node_id(), child_node->to_string()); + if (pre_child_node) { + // We need to update the next of the previous child node to this child node + + LOGTRACEMOD(wbcache, + "Repairing node={}, child_node=[{}] is deleted, set next of previous child node [{}] " + "to this child node [{}]", + cur_parent->node_id(), child_node->to_string(), pre_child_node->to_string(), + child_node->next_bnode()); + pre_child_node->set_next_bnode(child_node->next_bnode()); + // repairing the next of previous child node + // We need to set the state of the previous child node to clean, so that it can be flushed + IndexBtreeNode* idx_node = static_cast< IndexBtreeNode* >(pre_child_node.get()); + idx_node->m_idx_buf->set_state(index_buf_state_t::CLEAN); + 
write_node_impl(pre_child_node, cp_ctx); + // update the key of last entry of the parent with the last key of deleted child + child_last_key = orig_child_infos[child_node->node_id()]; + LOGTRACEMOD(wbcache, "updating parent [{}] current last key with {}", cur_parent->to_string(), + child_last_key.to_string()); + // update it here to go to the next child node and unlock this node + LOGTRACEMOD(wbcache, "update the child node next to the next of previous child node"); + child_node->set_next_bnode(child_node->next_bnode()); + } + } - BT_LOG(INFO, "Repairing node={}, repaired so_far={}", cur_parent->node_id(), cur_parent->to_string()); + LOGTRACEMOD(wbcache, "Repairing node={}, repaired so_far=[{}]", cur_parent->node_id(), + cur_parent->to_string()); // Move to the next child node - this->unlock_node(child_node, locktype_t::READ); auto const next_node_id = child_node->next_bnode(); + this->unlock_node(child_node, locktype_t::READ); + if (!child_node->is_node_deleted()) { + // We need to free the child node + pre_child_node = child_node; + } if (next_node_id == empty_bnodeid) { - BT_LOG_ASSERT(false, - "Child node={} next_node_id is empty, while its not a edge node, parent_node={} " - "repair is partial", - child_node->node_id(), parent_node->node_id()); - ret = btree_status_t::not_found; + // This can be a deleted edge node - only check if it is still valid + if (!child_node->is_node_deleted()) { + BT_LOG_ASSERT(false, + "Child node={} next_node_id is empty, while its not a edge node, parent_node={} " + "repair is partial", + child_node->node_id(), parent_node->node_id()); + ret = btree_status_t::not_found; + } + child_node = nullptr; + break; + } + if (next_cur_child && next_node_id == next_cur_child->node_id()) { + // We are at the last child node, we can stop now + LOGTRACEMOD( + wbcache, + "REACH Repairing node={}, child_node=[{}] is the true child of sibling parent; Hence, end loop", + child_node->node_id(), next_cur_child->to_string()); + child_node = nullptr; break; } - ret = this->read_and_lock_node(next_node_id, child_node, locktype_t::READ, locktype_t::READ, cp_ctx); if (ret != btree_status_t::success) { BT_LOG_ASSERT(false, "Parent node={} repair is partial, because child_node get has failed with ret={}", parent_node->node_id(), enum_name(ret)); + child_node = nullptr; break; } + } while (true); - this->unlock_node(child_node, locktype_t::READ); + + if (child_node) { this->unlock_node(child_node, locktype_t::READ); } + // if last parent has the key less than the last child key, then we need to update the parent node with + // the last child key if it doesn't have edge. 
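Reviewer note: the fix-up described above (and applied just below) exists to restore a single ordering invariant. The helper below is hypothetical and not part of the patch; it only spells out that invariant with the same compare convention used elsewhere in this file.

// Hypothetical post-repair invariant check (not in the patch): every separator
// key stored in the parent must sort at or after the last key actually present
// in the child it points to, otherwise keys between the two would be routed
// past the correct child on get/put.
template < typename K >
static bool separator_covers_child(K const& parent_separator_key, K const& child_last_key) {
    return parent_separator_key.compare(child_last_key) >= 0;
}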
+ auto last_parent = parent_node; + if (new_parent_nodes.size() > 0) { last_parent = new_parent_nodes[new_parent_nodes.size() - 1]; } + if (last_parent->total_entries() && !last_parent->has_valid_edge()) { + if (last_parent->compare_nth_key(last_parent_key, last_parent->total_entries() - 1) < 0) { + BtreeLinkInfo child_info; + last_parent->get_nth_value(last_parent->total_entries() - 1, &child_info, false /* copy */); + parent_node->update(parent_node->total_entries() - 1, last_parent_key, child_info); + LOGTRACEMOD(wbcache, "Repairing parent node={} with last_parent_key={} and child_info={}", + parent_node->node_id(), last_parent_key.to_string(), child_info.to_string()); + } + // if last key of children is less than the last key of parent, then we need to update the last key of non + // interior child + if (last_parent->level() > 1 && !last_parent->has_valid_edge()) { + // read last child + BtreeNodePtr last_child; + BtreeLinkInfo child_info; + auto total_entries = last_parent->total_entries(); + last_parent->get_nth_value(total_entries - 1, &child_info, false /* copy */); + if (ret = read_node_impl(child_info.bnode_id(), last_child); ret == btree_status_t::success) { + // get last key of cur child + auto last_child_key = last_child->get_last_key< K >(); + BtreeLinkInfo last_child_info; + last_child->get_nth_value(last_child->total_entries() - 1, &last_child_info, false /* copy*/); + if (last_parent->compare_nth_key(last_child_key, total_entries - 1) > 0) { + auto cur_child_st = last_child->to_string(); + last_child->update(last_child->total_entries() - 1, last_parent_key, last_child_info); + LOGTRACEMOD(wbcache, + "Updating interior child node={} with last_parent_key={} and child_info={}", + cur_child_st, last_parent_key.to_string(), last_child_info.to_string()); + write_node_impl(last_child, cp_ctx); + } + } + } + } if (ret == btree_status_t::success) { + // Make write_buf happy for the parent node in case of multiple write (stale repair and link repair) + IndexBtreeNode* p_node = static_cast< IndexBtreeNode* >(parent_node.get()); + p_node->m_idx_buf->set_state(index_buf_state_t::CLEAN); ret = transact_nodes(new_parent_nodes, {}, parent_node, nullptr, cp_ctx); } @@ -371,6 +883,49 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { delete[] tmp_buffer; return ret; } + + bnodeid_t find_true_sibling(BtreeNodePtr const& node) { + if (node == nullptr) return empty_bnodeid; + bnodeid_t sibling_id = empty_bnodeid; + if (node->has_valid_edge()) { + sibling_id = node->get_edge_value().bnode_id(); + } else { + sibling_id = node->next_bnode(); + } + if (sibling_id == empty_bnodeid) { + return empty_bnodeid; + } else { + BtreeNodePtr sibling_node; + if (read_node_impl(sibling_id, sibling_node) != btree_status_t::success) { return empty_bnodeid; } + + if (sibling_node->is_node_deleted()) { + LOGTRACEMOD(wbcache, "Sibling node [{}] is not the sibling for parent_node {}", + sibling_node->to_string(), node->to_string()); + return find_true_sibling(sibling_node); + } else { + return sibling_id; + } + } + return sibling_id; + } + + K get_last_true_child_key(BtreeNodePtr const& parent_node) { + uint32_t nentries = parent_node->total_entries() + parent_node->has_valid_edge() ? 
1 : 0; + BtreeLinkInfo cur_child_info; + BtreeNodePtr cur_child; + for (uint32_t i = nentries; i-- > 0;) { + parent_node->get_nth_value(i, &cur_child_info, false /* copy */); + if (auto ret = read_node_impl(cur_child_info.bnode_id(), cur_child); ret == btree_status_t::success) { + if (!cur_child->is_node_deleted()) { + if (cur_child->total_entries()) { + return cur_child->get_last_key< K >(); + } else { + LOGTRACEMOD(wbcache, "Last valid child {} has no entries", cur_child->to_string()); + } + } + } + } + } }; } // namespace homestore diff --git a/src/lib/index/inplace_btree/wb_cache.cpp b/src/lib/index/inplace_btree/wb_cache.cpp index 899d7475a..b888a8f71 100644 --- a/src/lib/index/inplace_btree/wb_cache.cpp +++ b/src/lib/index/inplace_btree/wb_cache.cpp @@ -43,13 +43,13 @@ IndexWBCacheBase& wb_cache() { IndexWBCache::IndexWBCache(const std::shared_ptr< VirtualDev >& vdev, std::pair< meta_blk*, sisl::byte_view > sb, const std::shared_ptr< sisl::Evictor >& evictor, uint32_t node_size) : m_vdev{vdev}, - m_cache{evictor, 100000, node_size, + m_cache{evictor, HS_DYNAMIC_CONFIG(generic.cache_hashmap_nbuckets), node_size, [](const BtreeNodePtr& node) -> BlkId { return static_cast< IndexBtreeNode* >(node.get())->m_idx_buf->m_blkid; }, [](const sisl::CacheRecord& rec) -> bool { const auto& hnode = (sisl::SingleEntryHashNode< BtreeNodePtr >&)rec; - return (hnode.m_value->m_refcount.test_le(1)); + return static_cast< IndexBtreeNode* >(hnode.m_value.get())->m_idx_buf->is_clean(); }}, m_node_size{node_size}, m_meta_blk{sb.first} { @@ -194,14 +194,19 @@ bool IndexWBCache::refresh_meta_buf(shared< MetaIndexBuffer >& meta_buf, CPConte return false; // meta_buf modified by a newer CP, we shouldn't overwrite that } else if (meta_buf->m_dirtied_cp_id == cp_ctx->id()) { // Modified by the same cp, no need to create new index buffer, but we only copy the superblk to the buffer + LOGTRACEMOD(wbcache, "meta buf {} is already dirtied in cp {} now is in recovery {}", meta_buf->to_string(), + cp_ctx->id(), m_in_recovery); meta_buf->copy_sb_to_buf(); + // TODO: corner case , meta buffer is dirtied by the same cp but not added to dirty list due to previously + // recovery mode } else { // We always create a new meta index buffer on every meta buf update, which copies the superblk auto new_buf = std::make_shared< MetaIndexBuffer >(meta_buf); new_buf->m_dirtied_cp_id = cp_ctx->id(); write_buf(nullptr, new_buf, cp_ctx); meta_buf = new_buf; // Replace the meta_buf with new buf - LOGTRACEMOD(wbcache, "meta buf {} is created in cp {}", meta_buf->to_string(), cp_ctx->id()); + LOGTRACEMOD(wbcache, "meta buf {} is created in cp {} in recovery = {}", meta_buf->to_string(), cp_ctx->id(), + m_in_recovery); } return true; } @@ -211,39 +216,55 @@ static void set_crash_flips(IndexBufferPtr const& parent_buf, IndexBufferPtr con IndexBufferPtrList const& new_node_bufs, IndexBufferPtrList const& freed_node_bufs) { // TODO: Need an API from flip to quickly check if flip is enabled, so this method doesn't check flip_enabled a // bunch of times. + // TODO: Need an API to check if a flip is triggered easilly to avoid the use of several atomics. 
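Reviewer note: the TODO above asks for a cheaper way to detect a triggered flip; independent of that, the flip branches that follow all repeat the same three steps (test the flip, set the crash flag, arm the crash simulator). A small wrapper built only from calls already used in this diff could collapse each branch to one line; arm_crash_flip below is a sketch, not an existing API.

// Hypothetical wrapper (not part of the patch): probe a crash flip and, if it
// fired, mark the buffer and arm the crash simulator in one place.
static bool arm_crash_flip(const char* flip_name, IndexBufferPtr const& buf) {
    if ((buf == nullptr) || !iomgr_flip::instance()->test_flip(flip_name)) { return false; }
    buf->set_crash_flag();
    hs()->crash_simulator().set_will_crash(true);
    return true;
}

With such a helper each case would read, for example, if (arm_crash_flip("crash_flush_on_split_at_parent", parent_buf)) {} else if (...), which also keeps the set_will_crash bookkeeping from being missed in future branches.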
if (parent_buf && parent_buf->is_meta_buf()) { // Split or merge happening on root if (iomgr_flip::instance()->test_flip("crash_flush_on_meta")) { parent_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_root")) { child_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } } else if ((new_node_bufs.size() == 1) && freed_node_bufs.empty()) { // Its a split node situation if (iomgr_flip::instance()->test_flip("crash_flush_on_split_at_parent")) { parent_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_split_at_left_child")) { child_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_split_at_right_child")) { new_node_bufs[0]->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } } else if (!freed_node_bufs.empty() && (new_node_bufs.size() != freed_node_bufs.size())) { // Its a merge nodes sitation if (iomgr_flip::instance()->test_flip("crash_flush_on_merge_at_parent")) { parent_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_merge_at_left_child")) { child_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_merge_at_right_child")) { - if (!new_node_bufs.empty()) { new_node_bufs[0]->set_crash_flag(); } + if (!new_node_bufs.empty()) { + new_node_bufs[0]->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); + } } } else if (!freed_node_bufs.empty() && (new_node_bufs.size() == freed_node_bufs.size())) { // Its a rebalance node situation if (iomgr_flip::instance()->test_flip("crash_flush_on_rebalance_at_parent")) { parent_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_rebalance_at_left_child")) { child_buf->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_rebalance_at_right_child")) { - if (!new_node_bufs.empty()) { new_node_bufs[0]->set_crash_flag(); } + if (!new_node_bufs.empty()) { + new_node_bufs[0]->set_crash_flag(); + hs()->crash_simulator().set_will_crash(true); + } } } } @@ -282,18 +303,52 @@ void IndexWBCache::transact_bufs(uint32_t index_ordinal, IndexBufferPtr const& p } icp_ctx->add_to_txn_journal(index_ordinal, parent_buf, nullptr, {child_buf}, {}); } else { - icp_ctx->add_to_txn_journal(index_ordinal, // Ordinal - child_buf->m_up_buffer, // real up buffer - new_node_bufs.empty() ? freed_node_bufs[0]->m_up_buffer - : new_node_bufs[0]->m_up_buffer, // real in place child - new_node_bufs, // new node bufs - freed_node_bufs // free_node_bufs - ); + icp_ctx->add_to_txn_journal(index_ordinal, child_buf->m_up_buffer /* real up buffer */, child_buf, + new_node_bufs, freed_node_bufs); + } +#ifdef _PRERELEASE + // log new nodes and freed nodes and parent and child + static uint32_t txn_id = 0; + static int last_cp_id = -2; + static std::string txn = ""; + if (last_cp_id != icp_ctx->id()) { + last_cp_id = icp_ctx->id(); + txn_id = 0; + txn = ""; } + + if (new_node_bufs.empty() && freed_node_bufs.empty()) { + fmt::format_to(std::back_inserter(txn), "\n{} - parent=[{}] child=[{}] new=[{}] freed=[{}]", txn_id, + (parent_buf && parent_buf->blkid().to_integer() != 0) + ? 
std::to_string(parent_buf->blkid().to_integer()) + : "empty", + child_buf->blkid().to_integer(), "empty", "empty"); + } else { + std::string new_nodes; + for (auto const& buf : new_node_bufs) { + new_nodes += std::to_string(buf->blkid().to_integer()) + ", "; + } + std::string freed_nodes; + for (auto const& buf : freed_node_bufs) { + freed_nodes += std::to_string(buf->blkid().to_integer()) + ", "; + } + std::string parent_str = (parent_buf && parent_buf->blkid().to_integer() != 0) + ? std::to_string(parent_buf->blkid().to_integer()) + : "empty"; + std::string child_str = (child_buf && child_buf->blkid().to_integer() != 0) + ? std::to_string(child_buf->blkid().to_integer()) + : "empty"; + + fmt::format_to(std::back_inserter(txn), "\n{} - parent={} child={} new=[{}] freed=[{}]", txn_id, parent_str, + child_str, new_nodes, freed_nodes); + } + LOGTRACEMOD(wbcache, "\ttranasction till now: cp: {} \n{}\n", icp_ctx->id(), txn); + txn_id++; +#endif #if 0 static int id = 0; - auto filename = "transact_bufs_"+std::to_string(id++)+ "_" +std::to_string(rand()%100)+".dot"; - LOGINFO("Transact cp is in cp\n{} and storing in {}\n\n\n", icp_ctx->to_string(), filename); + auto filename = fmt::format("txn_buf_{}_{}.dot", icp_ctx->id(), id++); + LOGTRACEMOD(wbcache,"Writing txn to file: {}", filename); icp_ctx->to_string_dot(filename); #endif } @@ -355,25 +410,20 @@ void IndexWBCache::link_buf(IndexBufferPtr const& up_buf, IndexBufferPtr const& HS_DBG_ASSERT((real_up_buf->m_dirtied_cp_id == down_buf->m_dirtied_cp_id) || (real_up_buf->is_meta_buf()), "Up buffer is not modified by current cp, but down buffer is linked to it"); #ifndef NDEBUG - bool found{false}; - for (auto const& dbuf : real_up_buf->m_down_buffers) { - if (dbuf.lock() == down_buf) { - found = true; - break; - } - } - HS_DBG_ASSERT(found, "Down buffer is linked to Up buf, but up_buf doesn't have down_buf in its list"); + HS_DBG_ASSERT(real_up_buf->is_in_down_buffers(down_buf), + "Down buffer is linked to Up buf, but up_buf doesn't have down_buf in its list"); #endif return; } } // Now we link the down_buffer to the real up_buffer - real_up_buf->m_wait_for_down_buffers.increment(1); + if (down_buf->m_up_buffer) { + // release existing up_buffer's wait count + down_buf->m_up_buffer->remove_down_buffer(down_buf); + } down_buf->m_up_buffer = real_up_buf; -#ifndef NDEBUG - real_up_buf->m_down_buffers.emplace_back(down_buf); -#endif + real_up_buf->add_down_buffer(down_buf); } void IndexWBCache::free_buf(const IndexBufferPtr& buf, CPContext* cp_ctx) { @@ -382,12 +432,92 @@ void IndexWBCache::free_buf(const IndexBufferPtr& buf, CPContext* cp_ctx) { bool done = m_cache.remove(buf->m_blkid, node); HS_REL_ASSERT_EQ(done, true, "Race on cache removal of btree blkid?"); } - + buf->m_node_freed = true; resource_mgr().inc_free_blk(m_node_size); m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(cp_ctx)); } //////////////////// Recovery Related section ///////////////////////////////// +void IndexWBCache::load_buf(IndexBufferPtr const& buf) { + if (buf->m_bytes == nullptr) { + buf->m_bytes = hs_utils::iobuf_alloc(m_node_size, sisl::buftag::btree_node, m_vdev->align_size()); + m_vdev->sync_read(r_cast< char* >(buf->m_bytes), m_node_size, buf->blkid()); + buf->m_dirtied_cp_id = BtreeNode::get_modified_cp_id(buf->m_bytes); + } +} + +struct DagNode { + IndexBufferPtr buffer; + std::vector< shared< DagNode > > children; +}; + +using DagPtr = std::shared_ptr< DagNode >; +using DagMap = std::map< IndexBufferPtr, DagPtr >; + +static DagMap 
generate_dag_buffers(std::map< BlkId, IndexBufferPtr >& bufmap) { + std::vector< IndexBufferPtr > bufs; + std::ranges::transform(bufmap, std::back_inserter(bufs), [](const auto& pair) { return pair.second; }); + + auto buildReverseMapping = [](const std::vector< IndexBufferPtr >& buffers) { + std::unordered_map< IndexBufferPtr, std::vector< IndexBufferPtr > > parentToChildren; + for (const auto& buffer : buffers) { + if (buffer->m_up_buffer) { parentToChildren[buffer->m_up_buffer].push_back(buffer); } + } + return parentToChildren; + }; + + std::function< DagPtr(IndexBufferPtr, std::unordered_map< IndexBufferPtr, std::vector< IndexBufferPtr > >&) > + buildDag; + buildDag = + [&buildDag](IndexBufferPtr buffer, + std::unordered_map< IndexBufferPtr, std::vector< IndexBufferPtr > >& parentToChildren) -> DagPtr { + auto dagNode = std::make_shared< DagNode >(); + dagNode->buffer = buffer; + if (parentToChildren.count(buffer)) { + for (const auto& child : parentToChildren[buffer]) { + dagNode->children.push_back(buildDag(child, parentToChildren)); + } + } + return dagNode; + }; + + auto generateDagMap = [&](const std::vector< IndexBufferPtr >& buffers) { + DagMap dagMap; + auto parentToChildren = buildReverseMapping(buffers); + for (const auto& buffer : buffers) { + if (!buffer->m_up_buffer) { // This is a root buffer + auto dagRoot = buildDag(buffer, parentToChildren); + dagMap[buffer] = dagRoot; + } + } + return dagMap; + }; + + return generateDagMap(bufs); +} + +static std::string to_string_dag_bufs(DagMap& dags, cp_id_t cp_id = 0) { + std::string str{fmt::format("#_of_dags={}\n", dags.size())}; + int cnt = 1; + for (const auto& [_, dag] : dags) { + std::vector< std::tuple< std::shared_ptr< DagNode >, int, int > > stack; + stack.emplace_back(dag, 0, cnt++); + while (!stack.empty()) { + auto [node, level, index] = stack.back(); + stack.pop_back(); + auto snew = node->buffer->m_created_cp_id == cp_id ? "NEW" : ""; + auto sfree = node->buffer->m_node_freed ? "FREED" : ""; + fmt::format_to(std::back_inserter(str), "{}{}-{} {} {}\n", std::string(level * 4, ' '), index, + node->buffer->to_string(), snew, sfree); + int c = node->children.size(); + for (const auto& d : node->children) { + stack.emplace_back(d, level + 1, c--); + } + } + } + return str; +} + void IndexWBCache::recover(sisl::byte_view sb) { // If sb is empty, its possible a first time boot. if ((sb.bytes() == nullptr) || (sb.size() == 0)) { @@ -406,6 +536,31 @@ void IndexWBCache::recover(sisl::byte_view sb) { LOGINFOMOD(wbcache, "Detected unclean shutdown, prior cp={} had to flush {} nodes, recovering... 
", icp_ctx->id(), bufs.size()); +#ifdef _PRERELEASE + auto detailed_log = [this](std::map< BlkId, IndexBufferPtr > const& bufs, + std::vector< IndexBufferPtr > const& pending_bufs) { + std::string log = fmt::format("\trecovered bufs (#of bufs = {})\n", bufs.size()); + for (auto const& [_, buf] : bufs) { + load_buf(buf); + fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); + } + + // list of new_bufs + if (!pending_bufs.empty()) { + fmt::format_to(std::back_inserter(log), "\n\tpending_bufs (#of bufs = {})\n", pending_bufs.size()); + for (auto const& buf : pending_bufs) { + fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); + } + } + return log; + }; + + std::string log = fmt::format("Recovering bufs (#of bufs = {}) before processing them\n", bufs.size()); + LOGTRACEMOD(wbcache, "{}\n{}", log, detailed_log(bufs, {})); + auto dags = generate_dag_buffers(bufs); + LOGTRACEMOD(wbcache, "Before recovery: {}", to_string_dag_bufs(dags, icp_ctx->id())); +#endif + // At this point, we have the DAG structure (up/down dependency graph), exactly the same as prior to crash, with one // addition of all freed buffers also put in the DAG structure. // @@ -416,6 +571,68 @@ void IndexWBCache::recover(sisl::byte_view sb) { // This has to be done before doing any repair, because repair can allocate blkids and we don't want to allocate // the same blkid which could clash with the blkid next in the buf list. // + // On the second pass, we only take part of the parents/siblings and then repair them, if needed. + std::vector< IndexBufferPtr > pending_bufs; + std::vector< IndexBufferPtr > deleted_bufs; + for (auto const& [_, buf] : bufs) { + if (buf->m_node_freed) { + // Freed node + load_buf(buf); + if (was_node_committed(buf)) { + // Mark this buffer as deleted, so that we can avoid using it anymore when repairing its parent's link + r_cast< persistent_hdr_t* >(buf->m_bytes)->node_deleted = true; + write_buf(nullptr, buf, icp_ctx); + deleted_bufs.push_back(buf); + pending_bufs.push_back(buf->m_up_buffer); + } else { + // (Up) buffer is not committed, node need to be kept and (potentially) repaired later + buf->m_node_freed = false; + if (buf->m_created_cp_id == icp_ctx->id()) { + // New nodes need to be commited first + m_vdev->commit_blk(buf->m_blkid); + // it can happen when children moved to one of right parent sibling and then the previous node is + // deleted but not commited during crash (upbuffer is not committed). but its children already + // committed. and freed (or changed) + if (buf->m_node_level) { potential_parent_recovered_bufs.insert(buf); } + } else { + LOGINFO("deleting and creating new buf {}", buf->to_string()); + deleted_bufs.push_back(buf); + } + // 1- upbuffer was dirtied by the same cp, so it is not commited, so we don't need to repair it. + // remove it from down_waiting list (probably recursively going up) 2- upbuffer was created and + // freed at the same cp, so it is not commited, so we don't need to repair it. + if (buf->m_up_buffer) { + LOGTRACEMOD(wbcache, "remove_down_buffer {} from up buffer {}", buf->to_string(), + buf->m_up_buffer->to_string()); + buf->m_up_buffer->remove_down_buffer(buf); + if (buf->m_up_buffer->m_wait_for_down_buffers.testz()) { + // if up buffer has upbuffer, then we need to decrement its wait_for_down_buffers + LOGINFOMOD(wbcache, + "\n\npruning up_buffer due to zero dependency of child\n up buffer {}\n buffer {}", + buf->m_up_buffer ? 
buf->m_up_buffer->to_string() : std::string("nullptr"), + buf->to_string()); + update_up_buffer_counters(buf->m_up_buffer /*,visited_bufs*/); + } + buf->m_up_buffer = nullptr; + } + pending_bufs.push_back(buf); + buf->m_wait_for_down_buffers.increment(1); // Purely for recover_buf() counter consistency + } + } else if (buf->m_created_cp_id == icp_ctx->id()) { + // New node + if (was_node_committed(buf) && was_node_committed(buf->m_up_buffer)) { + // Both current and up buffer is commited, we can safely commit the current block + m_vdev->commit_blk(buf->m_blkid); + pending_bufs.push_back(buf->m_up_buffer); + } else { + // Up buffer is not committed, we need to repair it first + buf->m_up_buffer->remove_down_buffer(buf); + // buf->m_up_buffer = nullptr; + if (buf->m_up_buffer->m_wait_for_down_buffers.testz()) { + // if up buffer has upbuffer, then we need to decrement its wait_for_down_buffers + update_up_buffer_counters(buf->m_up_buffer); + } + // On the second pass, we only take the new nodes/bufs and then repair their up buffers, if needed. std::vector< IndexBufferPtr > l0_bufs; for (auto const& [_, buf] : bufs) { @@ -436,38 +653,65 @@ void IndexWBCache::recover(sisl::byte_view sb) { } } +#ifdef _PRERELEASE LOGINFOMOD(wbcache, "Index Recovery detected {} nodes out of {} as new/freed nodes to be recovered in prev cp={}", - l0_bufs.size(), bufs.size(), icp_ctx->id()); - - auto detailed_log = [this](std::map< BlkId, IndexBufferPtr > const& bufs, - std::vector< IndexBufferPtr > const& l0_bufs) { - // Logs to detect down_waits are set correctly for up buffers list of all recovered bufs - std::string log = fmt::format("\trecovered bufs (#of bufs = {})\n", bufs.size()); - for (auto const& [_, buf] : bufs) { - fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); - } + pending_bufs.size(), bufs.size(), icp_ctx->id()); + LOGTRACEMOD(wbcache, "All unclean bufs list\n{}", detailed_log(bufs, pending_bufs)); + LOGTRACEMOD(wbcache, "After recovery: {}", to_string_dag_bufs(dags, icp_ctx->id())); +#endif - // list of new_bufs - fmt::format_to(std::back_inserter(log), "\n\tl0_bufs (#of bufs = {})\n", l0_bufs.size()); - for (auto const& buf : l0_bufs) { - fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); + uint32_t cnt = 0; + LOGTRACEMOD(wbcache, "Potential parent recovered bufs (#of bufs = {})", potential_parent_recovered_bufs.size()); + for (auto const& buf : potential_parent_recovered_bufs) { + LOGTRACEMOD(wbcache, " {} - check stale recovered buf {}", cnt++, buf->to_string()); + } + // This step is needed since there is a case where all(or some) children of an interior node is freed (after moving + // to a previous sibling parent) and after crash, this node has stale links to its children + cnt = 0; + std::vector< IndexBufferPtr > buffers_to_repair; + for (auto const& buf : potential_parent_recovered_bufs) { + LOGTRACEMOD(wbcache, " {} - potential parent recovered buf {}", cnt, buf->to_string()); + parent_recover(buf); + if (buf->m_bytes == nullptr || r_cast< persistent_hdr_t* >(buf->m_bytes)->node_deleted) { + // This buffer was marked as deleted during repair, so we also need to free it + deleted_bufs.push_back(buf); + } else { + // This buffer was not marked as deleted during repair, so we need to repair it + buffers_to_repair.push_back(buf); } - return log; - }; - LOGTRACEMOD(wbcache, "All unclean bufs list\n{}", detailed_log(bufs, l0_bufs)); + } - // Second iteration we start from the lowest levels (which are all new_bufs) and check if up_buffers need to be - // 
repaired. All L1 buffers are not needed to repair, because they are sibling nodes and so we pass false in - // do_repair flag. - for (auto const& buf : l0_bufs) { - recover_buf(buf->m_up_buffer); + for (auto const& buf : deleted_bufs) { + m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(icp_ctx)); } + m_in_recovery = false; m_vdev->recovery_completed(); } +// if buf->m_wait_for_down_buffers.testz() is true (which means that it has no dependency on any other buffer) then we +// can decrement the wait_for_down_buffers of its up buffer. If the up buffer has up buffer, then we need to decrement +// its wait_for_down_buffers. If the up buffer of up buffer has wait_for_down_buffers as 0, then we need to decrement +// its wait_for_down_buffers. This process continues until we reach the root buffer. If the root buffer has +// wait_for_down_buffers as 0, then we need to decrement its wait_for_down_buffers. +void IndexWBCache::update_up_buffer_counters(IndexBufferPtr const& buf) { + if (buf == nullptr || !buf->m_wait_for_down_buffers.testz() || buf->m_up_buffer == nullptr) { + LOGINFOMOD(wbcache, "Finish decrementing wait_for_down_buffers"); + return; + } + auto grand_buf = buf->m_up_buffer; + grand_buf->remove_down_buffer(buf); + LOGINFOMOD(wbcache, + "Decrementing wait_for_down_buffers for buffer {} due to zero dependency of child {}, Keep going up", + grand_buf->to_string(), buf->to_string()); + update_up_buffer_counters(grand_buf); +} + void IndexWBCache::recover_buf(IndexBufferPtr const& buf) { - if (!buf->m_wait_for_down_buffers.decrement_testz()) { return; } + if (!buf->m_wait_for_down_buffers.decrement_testz()) { + // TODO: remove the buf_>m_up_buffer from down_buffers list of buf->m_up_buffer + return; + } // All down buffers are completed and given a nod saying that they are committed. If this buffer is not committed, // then we need to repair this node/buffer. After that we will keep going to the next up level to repair them if @@ -478,6 +722,12 @@ void IndexWBCache::recover_buf(IndexBufferPtr const& buf) { } else { LOGTRACEMOD(wbcache, "Index Recovery detected up node [{}] as committed no need to repair that", buf->to_string()); + if (buf->m_up_buffer && buf->m_up_buffer->is_meta_buf()) { + // Our up buffer is a meta buffer, which means old root is dirtied and may need no repair but possible of + // new root on upper level so needs to be retore the edge + LOGTRACEMOD(wbcache, "check root change for without repairing {}", buf->to_string()); + index_service().update_root(buf->m_index_ordinal, buf); + } } if (buf->m_up_buffer) { recover_buf(buf->m_up_buffer); } @@ -493,21 +743,21 @@ bool IndexWBCache::was_node_committed(IndexBufferPtr const& buf) { } // All down_buf has indicated that they have seen this up buffer, now its time to repair them. 
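Reviewer note: update_up_buffer_counters above walks the dependency chain recursively; the same pruning can be written iteratively. The fragment below is a sketch only and assumes (consistently with this diff) that remove_down_buffer also drops the up buffer's wait_for_down_buffers count.

// Iterative equivalent of update_up_buffer_counters (sketch, not in the patch):
// keep pruning upward while the current buffer no longer waits on any down buffer.
void prune_up_chain(IndexBufferPtr buf) {
    while (buf && buf->m_wait_for_down_buffers.testz() && buf->m_up_buffer) {
        auto up = buf->m_up_buffer;
        up->remove_down_buffer(buf); // assumed to decrement up->m_wait_for_down_buffers
        buf = up;
    }
}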
- if (buf->m_bytes == nullptr) { - // Read the btree node and get its modified cp_id - buf->m_bytes = hs_utils::iobuf_alloc(m_node_size, sisl::buftag::btree_node, m_vdev->align_size()); - m_vdev->sync_read(r_cast< char* >(buf->m_bytes), m_node_size, buf->blkid()); - if (!BtreeNode::is_valid_node(sisl::blob{buf->m_bytes, m_node_size})) { return false; } - - buf->m_dirtied_cp_id = BtreeNode::get_modified_cp_id(buf->m_bytes); - } - auto cpg = cp_mgr().cp_guard(); - return (buf->m_dirtied_cp_id == cpg->id()); + load_buf(buf); + if (!BtreeNode::is_valid_node(sisl::blob{buf->m_bytes, m_node_size})) { return false; } + return (buf->m_dirtied_cp_id == cp_mgr().cp_guard()->id()); } //////////////////// CP Related API section ///////////////////////////////// folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) { - LOGTRACEMOD(wbcache, "Starting Index CP Flush with cp context={}", cp_ctx->to_string_with_dags()); + LOGTRACEMOD(wbcache, "Starting Index CP Flush with cp \ndag={}\n\n cp context {}", cp_ctx->to_string_with_dags(), + cp_ctx->to_string()); + // #ifdef _PRERELEASE + // static int id = 0; + // auto filename = "cp_" + std::to_string(id++) + "_" + std::to_string(rand() % 100) + ".dot"; + // LOGTRACEMOD(wbcache, "Transact cp storing in file {}\n\n\n", filename); + // cp_ctx->to_string_dot(filename); + // #endif if (!cp_ctx->any_dirty_buffers()) { if (cp_ctx->id() == 0) { // For the first CP, we need to flush the journal buffer to the meta blk @@ -521,17 +771,20 @@ folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) { #ifdef _PRERELEASE if (hs()->crash_simulator().is_crashed()) { - LOGINFOMOD(wbcache, "crash simulation is ongoing, so skip the cp flush"); + LOGINFO("crash simulation is ongoing, so skip the cp flush"); return folly::makeFuture< bool >(true); } #endif - // First thing is to flush the new_blks created as part of the CP. + // First thing is to flush the journal created as part of the CP. 
auto const& journal_buf = cp_ctx->journal_buf(); + auto txn = r_cast< IndexCPContext::txn_journal const* >(journal_buf.cbytes()); if (journal_buf.size() != 0) { if (m_meta_blk) { + LOGTRACEMOD(wbcache, " journal {} ", txn->to_string()); meta_service().update_sub_sb(journal_buf.cbytes(), journal_buf.size(), m_meta_blk); } else { + LOGTRACEMOD(wbcache, " First time journal {} ", txn->to_string()); meta_service().add_sub_sb("wb_cache", journal_buf.cbytes(), journal_buf.size(), m_meta_blk); } } @@ -554,44 +807,44 @@ folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) { void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const& buf, bool part_of_batch) { #ifdef _PRERELEASE + static std::once_flag flag; + if (hs()->crash_simulator().is_crashed()) { + std::call_once(flag, []() { LOGINFO("Crash simulation is ongoing; aid simulation by not flushing."); }); + return; + } if (buf->m_crash_flag_on) { -// std::string filename = "crash_buf_" + std::to_string(cp_ctx->id()) + ".dot"; -// LOGINFOMOD(wbcache, "Simulating crash while writing buffer {}, stored in file {}", buf->to_string(), filename); -// cp_ctx->to_string_dot(filename); - LOGINFOMOD(wbcache, "Simulating crash while writing buffer {}", buf->to_string()); + std::string filename = "crash_buf_" + std::to_string(cp_ctx->id()) + ".dot"; + LOGINFO("Simulating crash while writing buffer {}, stored in file {}", buf->to_string(), filename); + // cp_ctx->to_string_dot(filename); hs()->crash_simulator().crash(); cp_ctx->complete(true); return; - } else if (hs()->crash_simulator().is_crashed()) { - LOGINFOMOD(wbcache, "crash simulation is ongoing, aid simulation by not flushing"); - return; } #endif - LOGTRACEMOD(wbcache, "cp={} {}", cp_ctx->id(), buf->to_string()); buf->set_state(index_buf_state_t::FLUSHING); if (buf->is_meta_buf()) { - LOGTRACEMOD(wbcache, "flushing cp {} meta buf {} possibly because of root split", cp_ctx->id(), + LOGTRACEMOD(wbcache, "Flushing cp {} meta buf {} possibly because of root split", cp_ctx->id(), buf->to_string()); - auto const& sb = r_cast< MetaIndexBuffer* >(buf.get())->m_sb; - meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk()); + auto const sb_buf = r_cast< MetaIndexBuffer* >(buf.get()); + if (sb_buf->m_valid) { + auto const& sb = sb_buf->m_sb; + if (!sb.is_empty()) { meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk()); } + } process_write_completion(cp_ctx, buf); } else if (buf->m_node_freed) { LOGTRACEMOD(wbcache, "Not flushing buf {} as it was freed, its here for merely dependency", cp_ctx->id(), buf->to_string()); process_write_completion(cp_ctx, buf); } else { - LOGTRACEMOD(wbcache, "flushing cp {} buf {} info: {}", cp_ctx->id(), buf->to_string(), - BtreeNode::to_string_buf(buf->raw_buffer())); + LOGTRACEMOD(wbcache, "Flushing cp {} buf {}", cp_ctx->id(), buf->to_string()); m_vdev->async_write(r_cast< const char* >(buf->raw_buffer()), m_node_size, buf->m_blkid, part_of_batch) .thenValue([buf, cp_ctx](auto) { try { auto& pthis = s_cast< IndexWBCache& >(wb_cache()); pthis.process_write_completion(cp_ctx, buf); - } catch (const std::runtime_error& e) { - LOGERROR("Failed to access write-back cache: {}", e.what()); - } + } catch (const std::runtime_error& e) { LOGERROR("Failed to access write-back cache: {}", e.what()); } }); if (!part_of_batch) { m_vdev->submit_batch(); } @@ -600,8 +853,10 @@ void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const void IndexWBCache::process_write_completion(IndexCPContext* cp_ctx, 
IndexBufferPtr const& buf) { #ifdef _PRERELEASE + static std::once_flag flag; if (hs()->crash_simulator().is_crashed()) { - LOGINFOMOD(wbcache, "Crash simulation is ongoing, ignore all process_write_completion"); + std::call_once( + flag, []() { LOGINFOMOD(wbcache, "Crash simulation is ongoing, ignore all process_write_completion"); }); return; } #endif @@ -635,7 +890,10 @@ std::pair< IndexBufferPtr, bool > IndexWBCache::on_buf_flush_done_internal(Index IndexBufferPtr const& buf) { IndexBufferPtrList buf_list; #ifndef NDEBUG - buf->m_down_buffers.clear(); + { + std::lock_guard lg(buf->m_down_buffers_mtx); + buf->m_down_buffers.clear(); + } #endif buf->set_state(index_buf_state_t::CLEAN); @@ -683,7 +941,7 @@ void IndexWBCache::get_next_bufs_internal(IndexCPContext* cp_ctx, uint32_t max_c std::optional< IndexBufferPtr > buf = cp_ctx->next_dirty(); if (!buf) { break; } // End of list - if ((*buf)->m_wait_for_down_buffers.testz()) { + if ((*buf)->state() == index_buf_state_t::DIRTY && (*buf)->m_wait_for_down_buffers.testz()) { bufs.emplace_back(std::move(*buf)); ++count; } else { diff --git a/src/lib/index/inplace_btree/wb_cache.hpp b/src/lib/index/inplace_btree/wb_cache.hpp index 209d3845e..7d10d7f54 100644 --- a/src/lib/index/inplace_btree/wb_cache.hpp +++ b/src/lib/index/inplace_btree/wb_cache.hpp @@ -78,5 +78,7 @@ class IndexWBCache : public IndexWBCacheBase { void recover_buf(IndexBufferPtr const& buf); bool was_node_committed(IndexBufferPtr const& buf); + void load_buf(IndexBufferPtr const& buf); + void update_up_buffer_counters(IndexBufferPtr const& buf); }; } // namespace homestore diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp index 7cae168f3..2b3f88c30 100644 --- a/src/lib/logstore/log_dev.cpp +++ b/src/lib/logstore/log_dev.cpp @@ -65,10 +65,11 @@ void LogDev::start(bool format, std::shared_ptr< JournalVirtualDev > vdev) { // First read the info block if (format) { HS_LOG_ASSERT(m_logdev_meta.is_empty(), "Expected meta to be not present"); - m_logdev_meta.create(m_logdev_id); + m_logdev_meta.create(m_logdev_id, m_flush_mode); m_vdev_jd->update_data_start_offset(0); } else { - HS_LOG_ASSERT(!m_logdev_meta.is_empty(), "Expected meta data to be read already before loading"); + HS_LOG_ASSERT(!m_logdev_meta.is_empty(), + "Expected meta data to be read already before loading this log dev id: {}", m_logdev_id); auto const store_list = m_logdev_meta.load(); // Notify to the caller that a new log store was reserved earlier and it is being loaded, with its meta info @@ -133,6 +134,7 @@ void LogDev::stop() { m_log_idx.store(0); m_pending_flush_size.store(0); m_last_flush_idx = -1; + m_last_flush_ld_key = logdev_key{0, 0}; m_last_truncate_idx = -1; m_last_crc = INVALID_CRC32_VALUE; @@ -144,10 +146,37 @@ void LogDev::stop() { m_hs.reset(); } -bool LogDev::is_stopped() { - std::unique_lock lg = flush_guard(); - return m_stopped; +#if 0 +void LogDev::stop() { + start_stopping(); + while (true) { + if (!get_pending_request_num()) break; + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + { + std::unique_lock lg = flush_guard(); + // waiting under lock to make sure no new flush is started + while (m_pending_callback.load() > 0) { + THIS_LOGDEV_LOG(INFO, "Waiting for pending callbacks to complete, pending callbacks {}", + m_pending_callback.load()); + std::this_thread::sleep_for(std::chrono::milliseconds{1000}); + } + } + + folly::SharedMutexWritePriority::ReadHolder holder(m_store_map_mtx); + for (auto& [_, store] : m_id_logstore_map) { + 
store.log_store->stop(); + } + + // after we call stop, we need to do any pending device truncations + truncate(); + m_id_logstore_map.clear(); + if (allow_timer_flush()) { + auto f = stop_timer(); + std::move(f).get(); + } } +#endif void LogDev::destroy() { THIS_LOGDEV_LOG(INFO, "Logdev destroy metablks log_dev={}", m_logdev_id); @@ -164,12 +193,19 @@ void LogDev::start_timer() { }); } -void LogDev::stop_timer() { - if (m_flush_timer_hdl != iomgr::null_timer_handle) { - // cancel the timer - iomanager.run_on_wait(logstore_service().flush_thread(), - [this]() { iomanager.cancel_timer(m_flush_timer_hdl, true); }); - } +folly::Future< int > LogDev::stop_timer() { + // return future to the caller; + // this future will be completed when the timer is stopped + auto p = std::make_shared< folly::Promise< int > >(); + auto f = p->getFuture(); + iomanager.run_on_forget(logstore_service().flush_thread(), [this, p]() mutable { + if (m_flush_timer_hdl != iomgr::null_timer_handle) { + iomanager.cancel_timer(m_flush_timer_hdl, true); + m_flush_timer_hdl = iomgr::null_timer_handle; + } + p->setValue(0); + }); + return f; } void LogDev::do_load(off_t device_cursor) { @@ -202,8 +238,7 @@ void LogDev::do_load(off_t device_cursor) { // Loop through each record within the log group and do a callback decltype(header->nrecords()) i{0}; HS_REL_ASSERT_GT(header->nrecords(), 0, "nrecords greater then zero"); - const auto flush_ld_key = - logdev_key{header->start_idx() + header->nrecords(), group_dev_offset + header->total_size()}; + const auto flush_ld_key = logdev_key{header->start_idx(), group_dev_offset}; while (i < header->nrecords()) { const auto* rec = header->nth_record(i); const uint32_t data_offset = (rec->offset + (rec->get_inlined() ? 0 : header->oob_data_offset)); @@ -262,6 +297,7 @@ int64_t LogDev::append_async(logstore_id_t store_id, logstore_seq_num_t seq_num, } log_buffer LogDev::read(const logdev_key& key) { + std::unique_lock lg = flush_guard(); auto buf = sisl::make_byte_array(initial_read_size, m_flush_size_multiple, sisl::buftag::logread); auto ec = m_vdev_jd->sync_pread(buf->bytes(), initial_read_size, key.dev_offset); if (ec) { @@ -290,6 +326,7 @@ log_buffer LogDev::read(const logdev_key& key) { } void LogDev::read_record_header(const logdev_key& key, serialized_log_record& return_record_header) { + std::unique_lock lg = flush_guard(); auto buf = sisl::make_byte_array(initial_read_size, m_flush_size_multiple, sisl::buftag::logread); auto ec = m_vdev_jd->sync_pread(buf->bytes(), initial_read_size, key.dev_offset); if (ec) LOGERROR("Failed to read from Journal vdev log_dev={} {} {}", m_logdev_id, ec.value(), ec.message()); @@ -497,11 +534,12 @@ void LogDev::on_flush_completion(LogGroup* lg) { free_log_group(lg); m_log_records->truncate(upto_indx); m_last_flush_idx = upto_indx; + m_last_flush_ld_key = logdev_key{from_indx, dev_offset}; // since we support out-of-order lsn write, so no need to guarantee the order of logstore write completion for (auto const& [idx, req] : req_map) { m_pending_callback++; - iomanager.run_on_forget(iomgr::reactor_regex::random_worker, iomgr::fiber_regex::syncio_only, + iomanager.run_on_forget(iomgr::reactor_regex::random_worker, /* iomgr::fiber_regex::syncio_only, */ [this, dev_offset, idx, req]() { auto ld_key = logdev_key{idx, dev_offset}; auto comp_cb = req->log_store->get_comp_cb(); @@ -526,20 +564,31 @@ uint64_t LogDev::truncate() { auto lstore = store.log_store; if (lstore == nullptr) { continue; } auto const [trunc_lsn, trunc_ld_key, tail_lsn] = 
lstore->truncate_info(); - if (trunc_lsn == tail_lsn) { - THIS_LOGDEV_LOG(DEBUG, "Store_id={} didn't have any writes since last truncation, skipping ", store_id); - m_logdev_meta.remove_all_rollback_records(store_id, m_stopped /* persist_now */); - continue; - } - HS_DBG_ASSERT_GE(trunc_ld_key.idx, m_last_truncate_idx, "Trying to truncate logid which is already truncated"); m_logdev_meta.update_store_superblk(store_id, logstore_superblk(trunc_lsn + 1), m_stopped /* persist_now */); - // We found a new minimum logdev_key that we can truncate to - if (trunc_ld_key.idx > 0 && trunc_ld_key.idx < min_safe_ld_key.idx) { min_safe_ld_key = trunc_ld_key; } + if (trunc_ld_key.idx < min_safe_ld_key.idx) { min_safe_ld_key = trunc_ld_key; } + } + + // All log stores are empty, we can truncate logs depends on the last flushed logdev_key + if (min_safe_ld_key == logdev_key::out_of_bound_ld_key()) { + min_safe_ld_key = m_last_flush_ld_key; } // There are no writes or no truncation called for any of the store, so we can't truncate anything - if (min_safe_ld_key == logdev_key::out_of_bound_ld_key() || min_safe_ld_key.idx <= m_last_truncate_idx) return 0; + if (min_safe_ld_key.idx <= 0 || min_safe_ld_key.idx <= m_last_truncate_idx) { + // Persist the logstore superblock to ensure correct start LSN during recovery. Avoid such scenario: + // 1. Follower1 appends logs up to 100, then is stopped by a sigkill. + // 2. Upon restart, a baseline resync is triggered using snapshot 2000. + // 3. Baseline resync completed with start_lsn=2001, but m_trunc_ld_key remains {0,0} since we cannot get a + // valid + // device offset for LSN 2000 to update it. + // 4. Follower1 appends logs from 2001 to 2500, making tail_lsn > 2000. + // 5. Get m_trunc_ld_key={0,0}, goto here and return 0 without persist. + // 6. Follower1 is killed again, after restart, its start index remains 0, misinterpreting the range as + // [1,2500]. 
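Reviewer note: the new truncation decision (fall back to the last flushed logdev_key when every store is empty, and persist the per-store superblocks before returning 0 to avoid the resync scenario described above) can be summarized on its own. pick_truncation_point below is a hypothetical helper, not code from this patch.

// Hypothetical summary of the decision (not in the patch): returns true only if
// there is something safe to truncate, after applying the same fallbacks as above.
static bool pick_truncation_point(logdev_key& min_key, logdev_key const& last_flushed, logid_t last_truncate_idx) {
    if (min_key == logdev_key::out_of_bound_ld_key()) { min_key = last_flushed; }
    return (min_key.idx > 0) && (min_key.idx > last_truncate_idx);
}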
+ m_logdev_meta.persist(); + return 0; + } uint64_t const num_records_to_truncate = uint64_cast(min_safe_ld_key.idx - m_last_truncate_idx); @@ -615,7 +664,9 @@ std::shared_ptr< HomeLogStore > LogDev::create_new_log_store(bool append_mode) { return lstore; } -folly::Future< shared< HomeLogStore > > LogDev::open_log_store(logstore_id_t store_id, bool append_mode) { +folly::Future< shared< HomeLogStore > > LogDev::open_log_store(logstore_id_t store_id, bool append_mode, + log_found_cb_t log_found_cb, + log_replay_done_cb_t log_replay_done_cb) { folly::SharedMutexWritePriority::WriteHolder holder(m_store_map_mtx); auto it = m_id_logstore_map.find(store_id); if (it == m_id_logstore_map.end()) { @@ -624,6 +675,8 @@ folly::Future< shared< HomeLogStore > > LogDev::open_log_store(logstore_id_t sto logstore_info{ .log_store = nullptr, .append_mode = append_mode, + .log_found_cb = log_found_cb, + .log_replay_done_cb = log_replay_done_cb, })); HS_REL_ASSERT_EQ(happened, true, "Unable to insert logstore into id_logstore_map"); } @@ -635,7 +688,10 @@ void LogDev::remove_log_store(logstore_id_t store_id) { { folly::SharedMutexWritePriority::WriteHolder holder(m_store_map_mtx); auto ret = m_id_logstore_map.erase(store_id); - HS_REL_ASSERT((ret == 1), "try to remove invalid store_id {}-{}", m_logdev_id, store_id); + if (ret == 0) { + LOGWARN("try to remove invalid store_id {}-{}", m_logdev_id, store_id); + return; + } } unreserve_store_id(store_id); } @@ -656,6 +712,8 @@ void LogDev::on_log_store_found(logstore_id_t store_id, const logstore_superblk& logstore_info& info = it->second; info.log_store = std::make_shared< HomeLogStore >(shared_from_this(), store_id, info.append_mode, sb.m_first_seq_num); + info.log_store->register_log_found_cb(info.log_found_cb); + info.log_store->register_log_replay_done_cb(info.log_replay_done_cb); info.promise.setValue(info.log_store); } @@ -729,7 +787,7 @@ nlohmann::json LogDev::get_status(int verbosity) const { /////////////////////////////// LogDevMetadata Section /////////////////////////////////////// LogDevMetadata::LogDevMetadata() : m_sb{logdev_sb_meta_name}, m_rollback_sb{logdev_rollback_sb_meta_name} {} -logdev_superblk* LogDevMetadata::create(logdev_id_t id) { +logdev_superblk* LogDevMetadata::create(logdev_id_t id, flush_mode_t flush_mode) { logdev_superblk* sb = m_sb.create(logdev_sb_size_needed(0)); rollback_superblk* rsb = m_rollback_sb.create(rollback_superblk::size_needed(1)); @@ -738,6 +796,7 @@ logdev_superblk* LogDevMetadata::create(logdev_id_t id) { m_id_reserver = std::make_unique< sisl::IDReserver >(); m_sb->logdev_id = id; + m_sb->flush_mode = flush_mode; m_sb.write(); m_rollback_sb->logdev_id = id; diff --git a/src/lib/logstore/log_dev.hpp b/src/lib/logstore/log_dev.hpp index cf09e57bc..f3cc03f1d 100644 --- a/src/lib/logstore/log_dev.hpp +++ b/src/lib/logstore/log_dev.hpp @@ -404,6 +404,8 @@ struct logdev_superblk { uint32_t num_stores{0}; uint64_t start_dev_offset{0}; logid_t key_idx{0}; + flush_mode_t flush_mode; + // The meta data starts immediately after the super block // Equivalent of: // logstore_superblk meta[0]; @@ -481,7 +483,7 @@ class LogDevMetadata { LogDevMetadata& operator=(LogDevMetadata&&) noexcept = delete; ~LogDevMetadata() = default; - logdev_superblk* create(logdev_id_t id); + logdev_superblk* create(logdev_id_t id, flush_mode_t); void reset(); std::vector< std::pair< logstore_id_t, logstore_superblk > > load(); void persist(); @@ -564,18 +566,14 @@ class log_stream_reader { struct logstore_info { std::shared_ptr< HomeLogStore > 
log_store; bool append_mode; + log_found_cb_t log_found_cb{nullptr}; + log_replay_done_cb_t log_replay_done_cb{nullptr}; folly::SharedPromise< std::shared_ptr< HomeLogStore > > promise{}; }; static std::string const logdev_sb_meta_name{"Logdev_sb"}; static std::string const logdev_rollback_sb_meta_name{"Logdev_rollback_sb"}; -VENUM(flush_mode_t, uint32_t, // Various flush modes (can be or'ed together) - INLINE = 1 << 0, // Allow flush inline with the append - TIMER = 1 << 1, // Allow timer based automatic flush - EXPLICIT = 1 << 2, // Allow explcitly user calling flush -); - class LogDev : public std::enable_shared_from_this< LogDev > { friend class HomeLogStore; @@ -708,7 +706,9 @@ class LogDev : public std::enable_shared_from_this< LogDev > { /// @param append_mode Is this log store is append mode or not. If append mode, write_async call fails and only /// append_async calls succeed. /// @return future< shared< HomeLogStore > > : Future which will be set with the log store once it is opened - folly::Future< shared< HomeLogStore > > open_log_store(logstore_id_t store_id, bool append_mode); + folly::Future< shared< HomeLogStore > > open_log_store(logstore_id_t store_id, bool append_mode, + log_found_cb_t log_found_cb = nullptr, + log_replay_done_cb_t log_replay_done_cb = nullptr); /// @brief Remove the log store and its associated resources /// @param store_id Store id that was created/opened @@ -727,7 +727,7 @@ class LogDev : public std::enable_shared_from_this< LogDev > { private: void start_timer(); - void stop_timer(); + folly::Future< int > stop_timer(); bool allow_inline_flush() const { return uint32_cast(m_flush_mode) & uint32_cast(flush_mode_t::INLINE); } bool allow_timer_flush() const { return uint32_cast(m_flush_mode) & uint32_cast(flush_mode_t::TIMER); } @@ -791,8 +791,9 @@ class LogDev : public std::enable_shared_from_this< LogDev > { std::multimap< logid_t, logstore_id_t > m_garbage_store_ids; Clock::time_point m_last_flush_time; - logid_t m_last_flush_idx{-1}; // Track last flushed, last device offset and truncated log idx - logid_t m_last_truncate_idx{std::numeric_limits< logid_t >::min()}; // logdev truncate up to this idx + logid_t m_last_flush_idx{-1}; // Track last flushed, last device offset and truncated log idx + logdev_key m_last_flush_ld_key{0,0}; // Left interval of the last flush, 0 indicates the very beginning of logdev + logid_t m_last_truncate_idx{-1}; // Logdev truncate up to this idx crc32_t m_last_crc{INVALID_CRC32_VALUE}; // LogDev Info block related fields diff --git a/src/lib/logstore/log_store.cpp b/src/lib/logstore/log_store.cpp index e2ea0f333..1e3a1bea6 100644 --- a/src/lib/logstore/log_store.cpp +++ b/src/lib/logstore/log_store.cpp @@ -188,12 +188,27 @@ void HomeLogStore::truncate(logstore_seq_num_t upto_lsn, bool in_memory_truncate #endif + // In normal write and compact path, upto_lsn is expected to be no larger than m_tail_lsn after the flush. + // So upto_lsn > m_tail_lsn is expected to exist only in baseline resync path. + // In baseline resync path, we truncate all entries up to upto_lsn, and update m_tail_lsn and m_next_lsn + // to make sure logstore's idx is always = raft's idx - 1. 
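// Illustrative sketch (not part of the patch): the baseline-resync branch that follows advances m_tail_lsn and
// m_next_lsn with compare_exchange_weak loops. The underlying pattern is a lock-free "monotonic max": move the
// atomic forward only, never backward, even with concurrent updaters. A minimal generic form, with a
// hypothetical helper name (atomic_update_max), assuming nothing beyond standard <atomic>:
#include <atomic>
#include <cstdint>

// Advance `target` to `candidate` only if `candidate` is larger; returns true if this call did the update.
inline bool atomic_update_max(std::atomic< int64_t >& target, int64_t candidate,
                              std::memory_order order = std::memory_order_relaxed) {
    auto current = target.load(order);
    while (current < candidate) {
        if (target.compare_exchange_weak(current, candidate, order)) { return true; }
        // On failure (including spurious failures) compare_exchange_weak reloads `current`, so we simply retry.
    }
    return false;
}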
if (upto_lsn > m_tail_lsn) { THIS_LOGSTORE_LOG(WARN, - "Truncating issued on lsn={} which is greater than tail_lsn={}, truncating upto tail_lsn", + "Truncating issued on lsn={} which is greater than tail_lsn={}", upto_lsn, m_tail_lsn.load(std::memory_order_relaxed)); - m_trunc_ld_key = m_records.at(m_tail_lsn).m_trunc_key; - upto_lsn = m_tail_lsn; + // update m_tail_lsn if it is less than upto_lsn + auto current_tail_lsn = m_tail_lsn.load(std::memory_order_relaxed); + while (current_tail_lsn < upto_lsn && + !m_tail_lsn.compare_exchange_weak(current_tail_lsn, upto_lsn, std::memory_order_relaxed)) {} + + // update m_next_lsn if it is less than upto_lsn + 1 + auto current_next_lsn = m_next_lsn.load(std::memory_order_relaxed); + while (current_next_lsn < upto_lsn + 1 && + !m_next_lsn.compare_exchange_weak(current_next_lsn, upto_lsn + 1, std::memory_order_relaxed)) {} + + // insert an empty record to make sure m_records has enough size to truncate + logdev_key empty_ld_key; + m_records.create_and_complete(upto_lsn, logstore_record(empty_ld_key, empty_ld_key)); } else { m_trunc_ld_key = m_records.at(upto_lsn).m_trunc_key; THIS_LOGSTORE_LOG(TRACE, "Truncating logstore upto lsn={} , m_trunc_ld_key index {} offset {}", upto_lsn, @@ -206,7 +221,12 @@ void HomeLogStore::truncate(logstore_seq_num_t upto_lsn, bool in_memory_truncate std::tuple< logstore_seq_num_t, logdev_key, logstore_seq_num_t > HomeLogStore::truncate_info() const { auto const trunc_lsn = m_start_lsn.load(std::memory_order_relaxed) - 1; - return std::make_tuple(trunc_lsn, m_trunc_ld_key, m_tail_lsn.load(std::memory_order_relaxed)); + auto const tail_lsn = m_tail_lsn.load(std::memory_order_relaxed); + + // If the store is empty, return out_of_bound_ld_key as trunc_ld_key, allowing the caller to truncate freely. + // Otherwise, return the actual trunc_ld_key. + return (trunc_lsn == tail_lsn) ? 
std::make_tuple(trunc_lsn, logdev_key::out_of_bound_ld_key(), tail_lsn) + : std::make_tuple(trunc_lsn, m_trunc_ld_key, tail_lsn); } void HomeLogStore::fill_gap(logstore_seq_num_t seq_num) { @@ -276,10 +296,7 @@ void HomeLogStore::flush(logstore_seq_num_t upto_lsn) { return; } - if (upto_lsn == invalid_lsn()) { upto_lsn = m_records.active_upto(); } - - // if we have flushed already, we are done, else issue a flush - if (m_records.status(upto_lsn).is_active) m_logdev->flush_under_guard(); + m_logdev->flush_under_guard(); } bool HomeLogStore::rollback(logstore_seq_num_t to_lsn) { diff --git a/src/lib/logstore/log_store_service.cpp b/src/lib/logstore/log_store_service.cpp index c44291d69..7270a6184 100644 --- a/src/lib/logstore/log_store_service.cpp +++ b/src/lib/logstore/log_store_service.cpp @@ -135,10 +135,10 @@ logdev_id_t LogStoreService::get_next_logdev_id() { return id; } -logdev_id_t LogStoreService::create_new_logdev() { +logdev_id_t LogStoreService::create_new_logdev(flush_mode_t flush_mode) { folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); logdev_id_t logdev_id = get_next_logdev_id(); - auto logdev = create_new_logdev_internal(logdev_id); + auto logdev = create_new_logdev_internal(logdev_id, flush_mode); logdev->start(true /* format */, m_logdev_vdev); COUNTER_INCREMENT(m_metrics, logdevs_count, 1); HS_LOG(INFO, logstore, "Created log_dev={}", logdev_id); @@ -146,6 +146,8 @@ logdev_id_t LogStoreService::create_new_logdev() { } void LogStoreService::destroy_log_dev(logdev_id_t logdev_id) { + HS_LOG(INFO, logstore, "Destroying logdev {}", logdev_id); + folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); const auto it = m_id_logdev_map.find(logdev_id); if (it == m_id_logdev_map.end()) { @@ -155,20 +157,20 @@ void LogStoreService::destroy_log_dev(logdev_id_t logdev_id) { // Stop the logdev and release all the chunks from the journal vdev. auto& logdev = it->second; - if (!logdev->is_stopped()) { - // Stop the logdev if its started. - logdev->stop(); - } + // if (!logdev->is_stopped()) { + // Stop the logdev if its started. + logdev->stop(); + //} - // First release all chunks. - m_logdev_vdev->destroy(logdev_id); + // First release all chunks. + m_logdev_vdev->destroy(logdev_id); - // Destroy the metablks for logdev. - logdev->destroy(); + // Destroy the metablks for logdev. 
+ logdev->destroy(); - m_id_logdev_map.erase(it); - COUNTER_DECREMENT(m_metrics, logdevs_count, 1); - HS_LOG(INFO, logstore, "Removed log_dev={}", logdev_id); + m_id_logdev_map.erase(it); + COUNTER_DECREMENT(m_metrics, logdevs_count, 1); + HS_LOG(INFO, logstore, "Removed log_dev={}", logdev_id); } void LogStoreService::delete_unopened_logdevs() { @@ -179,19 +181,20 @@ void LogStoreService::delete_unopened_logdevs() { m_unopened_logdev.clear(); } -std::shared_ptr< LogDev > LogStoreService::create_new_logdev_internal(logdev_id_t logdev_id) { - auto logdev = std::make_shared< LogDev >(logdev_id); +std::shared_ptr< LogDev > LogStoreService::create_new_logdev_internal(logdev_id_t logdev_id, flush_mode_t flush_mode) { + auto logdev = std::make_shared< LogDev >(logdev_id, flush_mode); const auto it = m_id_logdev_map.find(logdev_id); HS_REL_ASSERT((it == m_id_logdev_map.end()), "logdev id {} already exists", logdev_id); m_id_logdev_map.insert(std::make_pair<>(logdev_id, logdev)); + LOGINFO("Created logdev {}", logdev_id); return logdev; } -void LogStoreService::open_logdev(logdev_id_t logdev_id) { +void LogStoreService::open_logdev(logdev_id_t logdev_id, flush_mode_t flush_mode) { folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); const auto it = m_id_logdev_map.find(logdev_id); if (it == m_id_logdev_map.end()) { - auto logdev = std::make_shared< LogDev >(logdev_id); + auto logdev = std::make_shared< LogDev >(logdev_id, flush_mode); m_id_logdev_map.emplace(logdev_id, logdev); LOGDEBUGMOD(logstore, "log_dev={} does not exist, created!", logdev_id); } @@ -224,13 +227,14 @@ void LogStoreService::logdev_super_blk_found(const sisl::byte_view& buf, void* m folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); std::shared_ptr< LogDev > logdev; auto id = sb->logdev_id; + auto flush_mode = sb->flush_mode; const auto it = m_id_logdev_map.find(id); // We could update the logdev map either with logdev or rollback superblks found callbacks. if (it != m_id_logdev_map.end()) { logdev = it->second; HS_LOG(DEBUG, logstore, "Log dev superblk found log_dev={}", id); } else { - logdev = std::make_shared< LogDev >(id); + logdev = std::make_shared< LogDev >(id, flush_mode); m_id_logdev_map.emplace(id, logdev); // when recover logdev meta blk, we get all the logdevs from the superblk. we put them in m_unopened_logdev // too. 
after logdev meta blks are all recovered, when a client opens a logdev, we remove it from @@ -272,20 +276,28 @@ std::shared_ptr< HomeLogStore > LogStoreService::create_new_log_store(logdev_id_ } folly::Future< shared< HomeLogStore > > LogStoreService::open_log_store(logdev_id_t logdev_id, logstore_id_t store_id, - bool append_mode) { + bool append_mode, log_found_cb_t log_found_cb, + log_replay_done_cb_t log_replay_done_cb) { folly::SharedMutexWritePriority::ReadHolder holder(m_logdev_map_mtx); const auto it = m_id_logdev_map.find(logdev_id); HS_REL_ASSERT((it != m_id_logdev_map.end()), "logdev id {} doesnt exist", logdev_id); COUNTER_INCREMENT(m_metrics, logstores_count, 1); - return it->second->open_log_store(store_id, append_mode); + return it->second->open_log_store(store_id, append_mode, log_found_cb, log_replay_done_cb); } void LogStoreService::remove_log_store(logdev_id_t logdev_id, logstore_id_t store_id) { + HS_LOG(INFO, logstore, "Removing logstore {} from logdev {}", store_id, logdev_id); + folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx); COUNTER_INCREMENT(m_metrics, logstores_count, 1); const auto it = m_id_logdev_map.find(logdev_id); - HS_REL_ASSERT((it != m_id_logdev_map.end()), "logdev id {} doesnt exist", logdev_id); + if (it == m_id_logdev_map.end()) { + HS_LOG(WARN, logstore, "logdev id {} doesnt exist", logdev_id); + return; + } it->second->remove_log_store(store_id); + HS_LOG(INFO, logstore, "Successfully removed logstore {} from logdev {}", store_id, logdev_id); + COUNTER_DECREMENT(m_metrics, logstores_count, 1); } diff --git a/src/lib/replication/log_store/home_raft_log_store.cpp b/src/lib/replication/log_store/home_raft_log_store.cpp index 4d80987d1..37ef04bee 100644 --- a/src/lib/replication/log_store/home_raft_log_store.cpp +++ b/src/lib/replication/log_store/home_raft_log_store.cpp @@ -47,6 +47,9 @@ static uint64_t extract_term(const log_buffer& log_bytes) { return (*r_cast< uint64_t const* >(raw_ptr)); } +#if 0 +// Since truncate_lsn can not accross compact_lsn passed down by raft server +// and compact will truncate logs upto compact_lsn, we don't need to re-truncate in this function now. 
void HomeRaftLogStore::truncate(uint32_t num_reserved_cnt, repl_lsn_t compact_lsn) { auto const last_lsn = last_index(); auto const start_lsn = start_index(); @@ -77,6 +80,7 @@ void HomeRaftLogStore::truncate(uint32_t num_reserved_cnt, repl_lsn_t compact_ls m_log_store->truncate(truncate_lsn); } } +#endif HomeRaftLogStore::HomeRaftLogStore(logdev_id_t logdev_id, logstore_id_t logstore_id, log_found_cb_t const& log_found_cb, log_replay_done_cb_t const& log_replay_done_cb) : @@ -86,7 +90,7 @@ HomeRaftLogStore::HomeRaftLogStore(logdev_id_t logdev_id, logstore_id_t logstore m_dummy_log_entry = nuraft::cs_new< nuraft::log_entry >(0, nuraft::buffer::alloc(0), nuraft::log_val_type::app_log); if (logstore_id == UINT32_MAX) { - m_logdev_id = logstore_service().create_new_logdev(); + m_logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); m_log_store = logstore_service().create_new_log_store(m_logdev_id, true); if (!m_log_store) { throw std::runtime_error("Failed to create log store"); } m_logstore_id = m_log_store->get_store_id(); @@ -95,15 +99,13 @@ HomeRaftLogStore::HomeRaftLogStore(logdev_id_t logdev_id, logstore_id_t logstore m_logdev_id = logdev_id; m_logstore_id = logstore_id; LOGDEBUGMOD(replication, "Opening existing home log_dev={} log_store={}", m_logdev_id, logstore_id); - logstore_service().open_logdev(m_logdev_id); + logstore_service().open_logdev(m_logdev_id, flush_mode_t::EXPLICIT); m_log_store_future = logstore_service() - .open_log_store(m_logdev_id, logstore_id, true) - .thenValue([this, log_found_cb, log_replay_done_cb](auto log_store) { + .open_log_store(m_logdev_id, logstore_id, true, log_found_cb, log_replay_done_cb) + .thenValue([this](auto log_store) { m_log_store = std::move(log_store); DEBUG_ASSERT_EQ(m_logstore_id, m_log_store->get_store_id(), "Mismatch in passed and create logstore id"); - m_log_store->register_log_found_cb(log_found_cb); - m_log_store->register_log_replay_done_cb(log_replay_done_cb); REPL_STORE_LOG(DEBUG, "Home Log store created/opened successfully"); }); } @@ -147,8 +149,11 @@ nuraft::ptr< nuraft::log_entry > HomeRaftLogStore::last_entry() const { auto log_bytes = m_log_store->read_sync(max_seq); nle = to_nuraft_log_entry(log_bytes); } catch (const std::exception& e) { - REPL_STORE_LOG(ERROR, "last_entry() out_of_range={}", max_seq); - throw e; + // all the log entries are truncated, so we should return a dummy log entry. + REPL_STORE_LOG(ERROR, "last_entry() out_of_range={}, {}", max_seq, e.what()); + // according to the contract, we should return a dummy log entry if the index is out of range. + // https://github.com/eBay/NuRaft/blob/50e2f949503081262cb21923e633eaa8dacad8fa/include/libnuraft/log_store.hxx#L56 + nle = m_dummy_log_entry; } return nle; @@ -182,6 +187,20 @@ void HomeRaftLogStore::write_at(ulong index, nuraft::ptr< nuraft::log_entry >& e m_log_store->append_async(sisl::io_blob{buf->data_begin(), uint32_cast(buf->size()), false /* is_aligned */}, nullptr /* cookie */, [buf](int64_t, sisl::io_blob&, logdev_key, void*) {}); + + auto position_in_cache = index % m_log_entry_cache.size(); + { + std::unique_lock lk(m_mutex); + m_log_entry_cache[position_in_cache] = std::make_pair(index, entry); + + // remove all cached entries after this index + for (size_t i{0}; i < m_log_entry_cache.size(); ++i) { + if (m_log_entry_cache[i].first > index) { m_log_entry_cache[i] = std::make_pair(0, nullptr); } + } + } + + // flushing the log before returning to ensure new(over-written) log is persisted to disk. 
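// Illustrative sketch (not part of the patch): write_at() above caches recently appended entries in a fixed-size
// array indexed by `index % cache_size`, and because write_at() rewrites the log suffix it also drops every cached
// entry with a higher index; the end_of_append_batch() call right below then persists the overwritten entry. A
// minimal standalone version of that "modular cache with suffix invalidation" idea, with hypothetical names
// (cached_entry, recent_entry_cache) standing in for nuraft::log_entry and m_log_entry_cache:
#include <cstdint>
#include <memory>
#include <utility>
#include <vector>

struct cached_entry {}; // stand-in for nuraft::log_entry

class recent_entry_cache {
public:
    explicit recent_entry_cache(size_t capacity) : m_slots(capacity) {}

    // Cache `entry` at `index` and invalidate anything cached beyond it (the suffix being overwritten).
    void put_and_truncate_after(uint64_t index, std::shared_ptr< cached_entry > entry) {
        m_slots[index % m_slots.size()] = {index, std::move(entry)};
        for (auto& slot : m_slots) {
            if (slot.first > index) { slot = {0, nullptr}; }
        }
    }

    // Return the cached entry only if this slot still holds exactly `index`.
    std::shared_ptr< cached_entry > get(uint64_t index) const {
        auto const& slot = m_slots[index % m_slots.size()];
        return (slot.first == index) ? slot.second : nullptr;
    }

private:
    std::vector< std::pair< uint64_t, std::shared_ptr< cached_entry > > > m_slots;
};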
+ end_of_append_batch(index, 1); } void HomeRaftLogStore::end_of_append_batch(ulong start, ulong cnt) { @@ -205,6 +224,31 @@ nuraft::ptr< std::vector< nuraft::ptr< nuraft::log_entry > > > HomeRaftLogStore: return out_vec; } +nuraft::ptr< std::vector< nuraft::ptr< nuraft::log_entry > > > +HomeRaftLogStore::log_entries_ext(ulong start, ulong end, int64_t batch_size_hint_in_bytes) { + // WARNING: we interpret batch_size_hint_in_bytes as a count for now. + auto batch_size_hint_cnt = batch_size_hint_in_bytes; + auto new_end = end; + // batch_size_hint_in_bytes < 0 indicates that the follower is busy now and does not want to receive any more log entries. + if (batch_size_hint_cnt < 0) + new_end = start; + else if (batch_size_hint_cnt > 0) { + // limit to the hint, also prevent overflow by a huge batch_size_hint_cnt + if (sisl_unlikely(start + (uint64_t)batch_size_hint_cnt < start)) { + new_end = end; + } else { + new_end = start + (uint64_t)batch_size_hint_cnt; + } + // limit to original end + new_end = std::min(new_end, end); + } + DEBUG_ASSERT(new_end <= end, "new end {} should be <= original end {}", new_end, end); + DEBUG_ASSERT(start <= new_end, "start {} should be <= new_end {}", start, new_end); + REPL_STORE_LOG(TRACE, "log_entries_ext, start={} end={}, hint {}, adjusted range {} ~ {}, cnt {}", start, end, + batch_size_hint_cnt, start, new_end, new_end - start); + return log_entries(start, new_end); +} + nuraft::ptr< nuraft::log_entry > HomeRaftLogStore::entry_at(ulong index) { auto positio_in_cache = index % m_log_entry_cache.size(); { @@ -315,14 +359,12 @@ bool HomeRaftLogStore::compact(ulong compact_lsn) { // release this assert if for some use case, we should tolorant this case; // for now, don't expect this case to happen. // RELEASE_ASSERT(false, "compact_lsn={} is beyond the current max_lsn={}", compact_lsn, cur_max_lsn); - REPL_STORE_LOG(DEBUG, "Adding dummy entries during compact from={} upto={}", cur_max_lsn + 1, - to_store_lsn(compact_lsn)); - // We need to fill the remaining entries with dummy data. - for (auto lsn{cur_max_lsn + 1}; lsn <= to_store_lsn(compact_lsn); ++lsn) { - append(m_dummy_log_entry); - } + + // if compact_lsn is beyond the current max_lsn, it indicates a hole from cur_max_lsn to compact_lsn. + // we directly compact and truncate up to compact_lsn assuming there are dummy logs.
+ REPL_STORE_LOG(DEBUG, "Compact with log holes from {} to={}", cur_max_lsn + 1, to_store_lsn(compact_lsn)); } - m_log_store->truncate(to_store_lsn(compact_lsn)); + m_log_store->truncate(to_store_lsn(compact_lsn), false); return true; } @@ -336,6 +378,13 @@ ulong HomeRaftLogStore::last_durable_index() { return to_repl_lsn(m_last_durable_lsn); } +void HomeRaftLogStore::purge_all_logs() { + auto last_lsn = m_log_store->get_contiguous_issued_seq_num(m_last_durable_lsn); + REPL_STORE_LOG(INFO, "Store={} LogDev={}: Purging all logs in the log store, last_lsn={}", m_logstore_id, + m_logdev_id, last_lsn); + m_log_store->truncate(last_lsn, false /* in_memory_truncate_only */); +} + void HomeRaftLogStore::wait_for_log_store_ready() { m_log_store_future.wait(); } void HomeRaftLogStore::set_last_durable_lsn(repl_lsn_t lsn) { m_last_durable_lsn = to_store_lsn(lsn); } diff --git a/src/lib/replication/log_store/home_raft_log_store.h b/src/lib/replication/log_store/home_raft_log_store.h index ccf46ef92..846b1de3c 100644 --- a/src/lib/replication/log_store/home_raft_log_store.h +++ b/src/lib/replication/log_store/home_raft_log_store.h @@ -99,12 +99,34 @@ class HomeRaftLogStore : public nuraft::log_store { /** * Get log entries with index [start, end). * + * Return nullptr to indicate error if any log entry within the requested range + * could not be retrieved (e.g. due to external log truncation). + * * @param start The start log index number (inclusive). * @param end The end log index number (exclusive). * @return The log entries between [start, end). */ virtual nuraft::ptr< std::vector< nuraft::ptr< nuraft::log_entry > > > log_entries(ulong start, ulong end) override; + /** + * Get log entries with index [start, end). + * + * The total size of the returned entries is limited by batch_size_hint. + * + * Return nullptr to indicate error if any log entry within the requested range + * could not be retrieved (e.g. due to external log truncation). + * + * @param start The start log index number (inclusive). + * @param end The end log index number (exclusive). + * @param batch_size_hint_in_bytes Total size (in bytes) of the returned entries, + * see the detailed comment at + * `state_machine::get_next_batch_size_hint_in_bytes()`. + * @return The log entries between [start, end) and limited by the total size + * given by the batch_size_hint_in_bytes. + */ + virtual nuraft::ptr< std::vector< nuraft::ptr< nuraft::log_entry > > > + log_entries_ext(ulong start, ulong end, int64_t batch_size_hint_in_bytes = 0) override; + /** * Get the log entry at the specified log index number. * @@ -182,6 +204,7 @@ class HomeRaftLogStore : public nuraft::log_store { */ ulong last_index() const; +#if 0 /** * Truncates the log store * @@ -190,6 +213,13 @@ class HomeRaftLogStore : public nuraft::log_store { * LSN; */ void truncate(uint32_t num_reserved_cnt, repl_lsn_t compact_lsn); +#endif + + /** + * Purge all logs in the log store + * It is a dangerous operation and not be used currently. 
+ */ + void purge_all_logs(); void wait_for_log_store_ready(); void set_last_durable_lsn(repl_lsn_t lsn); diff --git a/src/lib/replication/log_store/repl_log_store.cpp b/src/lib/replication/log_store/repl_log_store.cpp index 4271d8b88..f9b3d454e 100644 --- a/src/lib/replication/log_store/repl_log_store.cpp +++ b/src/lib/replication/log_store/repl_log_store.cpp @@ -10,16 +10,16 @@ uint64_t ReplLogStore::append(nuraft::ptr< nuraft::log_entry >& entry) { // We don't want to transform anything that is not an app log if (entry->get_val_type() != nuraft::log_val_type::app_log || entry->get_buf_ptr()->size() == 0) { ulong lsn = HomeRaftLogStore::append(entry); - RD_LOGD("append entry term={}, log_val_type={} lsn={} size={}", entry->get_term(), + RD_LOGD(NO_TRACE_ID, "None-APP log: append entry term={}, log_val_type={} lsn={} size={}", entry->get_term(), static_cast< uint32_t >(entry->get_val_type()), lsn, entry->get_buf().size()); return lsn; } repl_req_ptr_t rreq = m_sm.localize_journal_entry_finish(*entry); + RELEASE_ASSERT_NE(nullptr != rreq, "Failed to localize journal entry before appending log"); ulong lsn = HomeRaftLogStore::append(entry); m_sm.link_lsn_to_req(rreq, int64_cast(lsn)); - - RD_LOGD("Raft Channel: Received append log entry rreq=[{}]", rreq->to_compact_string()); + RD_LOGT(rreq->traceID(), "Raft Channel: Received append log entry rreq=[{}]", rreq->to_compact_string()); return lsn; } @@ -31,9 +31,10 @@ void ReplLogStore::write_at(ulong index, nuraft::ptr< nuraft::log_entry >& entry } repl_req_ptr_t rreq = m_sm.localize_journal_entry_finish(*entry); + RELEASE_ASSERT(nullptr != rreq, "Failed to localize journal entry before overwriting log at index {}", index); HomeRaftLogStore::write_at(index, entry); m_sm.link_lsn_to_req(rreq, int64_cast(index)); - RD_LOGD("Raft Channel: Received write_at log entry rreq=[{}]", rreq->to_compact_string()); + RD_LOGT(rreq->traceID(), "Raft Channel: Received write_at log entry rreq=[{}]", rreq->to_compact_string()); } void ReplLogStore::end_of_append_batch(ulong start_lsn, ulong count) { @@ -44,11 +45,8 @@ void ReplLogStore::end_of_append_batch(ulong start_lsn, ulong count) { auto proposer_reqs = sisl::VectorPool< repl_req_ptr_t >::alloc(); for (int64_t lsn = int64_cast(start_lsn); lsn <= end_lsn; ++lsn) { auto rreq = m_sm.lsn_to_req(lsn); - // Skip this call in proposer, since this method will synchronously flush the data, which is not required for - // leader. Proposer will call the flush as part of commit after receiving quorum, upon which time, there is a - // high possibility the log entry is already flushed. Skip it for rreq == nullptr which is the case for raft - // config entries. - if ((rreq == nullptr) /*|| rreq->is_proposer()*/) { + // Skip it for rreq == nullptr which is the case for raft config entries. 
+ if ((rreq == nullptr)) { continue; } else if (rreq->is_proposer()) { proposer_reqs->emplace_back(std::move(rreq)); @@ -57,49 +55,66 @@ void ReplLogStore::end_of_append_batch(ulong start_lsn, ulong count) { } } - RD_LOGT("Raft Channel: end_of_append_batch start_lsn={} count={} num_data_to_be_written={} {}", start_lsn, count, - reqs->size(), proposer_reqs->size()); + RD_LOGT(NO_TRACE_ID, "Raft Channel: end_of_append_batch start_lsn={} count={} num_data_to_be_written={} {}", + start_lsn, count, reqs->size(), proposer_reqs->size()); - // All requests are from proposer for data write, so as mentioned above we can skip the flush for now if (!reqs->empty()) { // Check the map if data corresponding to all of these requsts have been received and written. If not, schedule // a fetch and write. Once all requests are completed and written, these requests are poped out of the map and // the future will be ready. - auto fut = m_rd.notify_after_data_written(reqs); - - // In the meanwhile, we can flush the journal for this lsn batch. It is ok to flush the entries in log before - // actual data is written, because, even if we have the log, it doesn't mean data is committed, until state - // machine reports that. This way the flush and fetch both can run in parallel. auto cur_time = std::chrono::steady_clock::now(); - HomeRaftLogStore::end_of_append_batch(start_lsn, count); - HISTOGRAM_OBSERVE(m_rd.metrics(), raft_end_of_append_batch_latency_us, get_elapsed_time_us(cur_time)); - - cur_time = std::chrono::steady_clock::now(); + auto fut = m_rd.notify_after_data_written(reqs); // Wait for the fetch and write to be completed successfully. + // It is essential to complete the data write before appending to the log. If the logs are flushed + // before the data is written, a restart and subsequent log replay occurs, as the in-memory state is lost, + // it leaves us uncertain about whether the data was actually written, potentially leading to data + // inconsistency. std::move(fut).wait(); HISTOGRAM_OBSERVE(m_rd.metrics(), data_channel_wait_latency_us, get_elapsed_time_us(cur_time)); + } + + // Flushing logs now. + auto cur_time = std::chrono::steady_clock::now(); + HomeRaftLogStore::end_of_append_batch(start_lsn, count); + HISTOGRAM_OBSERVE(m_rd.metrics(), raft_end_of_append_batch_latency_us, get_elapsed_time_us(cur_time)); - // Mark all the reqs also completely written - for (auto const& rreq : *reqs) { - if (rreq) { rreq->add_state(repl_req_state_t::LOG_FLUSHED); } + // Mark all the reqs completely written + for (auto const& rreq : *reqs) { + if (rreq) { rreq->add_state(repl_req_state_t::LOG_FLUSHED); } + } + + // Data corresponding to proposer reqs have already been written before propose reqs to raft, + // so skip waiting data written and mark reqs as flushed here. + for (auto const& rreq : *proposer_reqs) { + if (rreq) { + RD_LOGT(rreq->traceID(), + "Raft Channel: end_of_append_batch, I am proposer for lsn {}, only flushed log for it", + rreq->lsn()); + rreq->add_state(repl_req_state_t::LOG_FLUSHED); } - } else if (!proposer_reqs->empty()) { - RD_LOGT("Raft Channel: end_of_append_batch, I am proposer, only flush log s from {} , count {}", start_lsn, - count); - // Mark all the reqs also completely written - HomeRaftLogStore::end_of_append_batch(start_lsn, count); - for (auto const& rreq : *proposer_reqs) { - if (rreq) { rreq->add_state(repl_req_state_t::LOG_FLUSHED); } + } + + // Convert volatile logs to non-volatile logs in state machine. 
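// Illustrative sketch (not part of the patch): the reworked end_of_append_batch() above enforces a strict
// "data first, then log" order: wait for the data-write futures, flush the log device, and only then mark
// requests LOG_FLUSHED (the loop below additionally clears the volatile flag). A compressed rendering of that
// ordering with hypothetical types (pending_req, flush_batch_after_data) and std::future in place of folly:
#include <future>
#include <vector>

struct pending_req { // stand-in for repl_req_ctx: just the two state transitions used here
    bool log_flushed{false};
    bool is_volatile{true};
};

template < typename FlushFn >
void flush_batch_after_data(std::vector< std::future< void > >& data_written, FlushFn&& flush_log,
                            std::vector< pending_req* > const& reqs) {
    for (auto& f : data_written) { f.wait(); } // 1. the linked data must be written before the log is made durable
    flush_log();                               // 2. only now flush the corresponding log entries
    for (auto* r : reqs) {                     // 3. finally publish the new per-request state
        if (r == nullptr) { continue; }
        r->log_flushed = true;
        r->is_volatile = false;
    }
}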
+ for (int64_t lsn = int64_cast(start_lsn); lsn <= end_lsn; ++lsn) { + auto rreq = m_sm.lsn_to_req(lsn); + if (rreq != nullptr) { + if (rreq->has_state(repl_req_state_t::ERRORED)) { + RD_LOGE(rreq->traceID(), "Raft Channel: rreq=[{}] met some errors before", rreq->to_compact_string()); + continue; + } + rreq->set_is_volatile(false); } } + sisl::VectorPool< repl_req_ptr_t >::free(reqs); sisl::VectorPool< repl_req_ptr_t >::free(proposer_reqs); } std::string ReplLogStore::rdev_name() const { return m_rd.rdev_name(); } +std::string ReplLogStore::identify_str() const { return m_rd.identify_str(); } bool ReplLogStore::compact(ulong compact_upto_lsn) { - RD_LOG(DEBUG, "Raft Channel: compact_to_lsn={}", compact_upto_lsn); + RD_LOGD(NO_TRACE_ID, "Raft Channel: compact_to_lsn={}", compact_upto_lsn); m_rd.on_compact(compact_upto_lsn); return HomeRaftLogStore::compact(compact_upto_lsn); } diff --git a/src/lib/replication/log_store/repl_log_store.h b/src/lib/replication/log_store/repl_log_store.h index a386d397b..bb19df119 100644 --- a/src/lib/replication/log_store/repl_log_store.h +++ b/src/lib/replication/log_store/repl_log_store.h @@ -30,6 +30,7 @@ class ReplLogStore : public HomeRaftLogStore { private: std::string rdev_name() const; + std::string identify_str() const; }; } // namespace homestore diff --git a/src/lib/replication/push_data_rpc.fbs b/src/lib/replication/push_data_rpc.fbs index 1f6d20546..d9a981e7c 100644 --- a/src/lib/replication/push_data_rpc.fbs +++ b/src/lib/replication/push_data_rpc.fbs @@ -2,6 +2,7 @@ native_include "boost/uuid/uuid.hpp"; namespace homestore; table PushDataRequest { + trace_id: uint64; // traceID for the REQ issuer_replica_id : int32; // Replica id of the issuer raft_term : uint64; // Raft term number dsn : uint64; // Data Sequence number diff --git a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp index 71927a3ad..6b8ce122b 100644 --- a/src/lib/replication/repl_dev/common.cpp +++ b/src/lib/replication/repl_dev/common.cpp @@ -6,11 +6,13 @@ #include #include "replication/repl_dev/common.h" #include +#include namespace homestore { -void repl_req_ctx::init(repl_key rkey, journal_type_t op_code, bool is_proposer, sisl::blob const& user_header, - sisl::blob const& key, uint32_t data_size) { +ReplServiceError repl_req_ctx::init(repl_key rkey, journal_type_t op_code, bool is_proposer, + sisl::blob const& user_header, sisl::blob const& key, uint32_t data_size, + cshared< ReplDevListener >& listener) { m_rkey = std::move(rkey); #ifndef NDEBUG if (data_size > 0) { @@ -24,6 +26,36 @@ void repl_req_ctx::init(repl_key rkey, journal_type_t op_code, bool is_proposer, m_header = user_header; m_key = key; m_is_jentry_localize_pending = (!is_proposer && (data_size > 0)); // Pending on the applier and with linked data + + // We need to allocate the block if the req has data linked, since entry doesn't exist or if it exist, two + // threads(data channel and raft channel) are trying to do the same thing. So take state mutex and allocate the blk + std::unique_lock< std::mutex > lg(m_state_mtx); + if (has_linked_data() && !has_state(repl_req_state_t::BLK_ALLOCATED)) { + ReplServiceError alloc_status; +#ifdef _PRERELEASE + if (iomgr_flip::instance()->test_flip("simulate_no_space_left") && !is_proposer) { + LOGERROR("Simulate no space left on follower for testing purposes"); + // TODO: support `simulate_no_space_left` for the leader, do not throw exception in on-error in the test + // framework, it will cause the leader to fail and exit. 
+ alloc_status = ReplServiceError::NO_SPACE_LEFT; + } else { + alloc_status = alloc_local_blks(listener, data_size); + if (alloc_status != ReplServiceError::OK) { + LOGERRORMOD(replication, "[traceID={}] Allocate blk for rreq failed error={}", m_rkey.traceID, + alloc_status); + } + } +#else + alloc_status = alloc_local_blks(listener, data_size); + if (alloc_status != ReplServiceError::OK) { + LOGERRORMOD(replication, "[traceID={}] Allocate blk for rreq failed error={}", m_rkey.traceID, + alloc_status); + } +#endif + return alloc_status; + } + + return ReplServiceError::OK; } repl_req_ctx::~repl_req_ctx() { @@ -31,7 +63,7 @@ repl_req_ctx::~repl_req_ctx() { } void repl_req_ctx::create_journal_entry(bool is_raft_buf, int32_t server_id) { - uint32_t val_size = has_linked_data() ? m_local_blkid.serialized_size() : 0; + uint32_t val_size = has_linked_data() ? blkids_serialized_size() : 0; uint32_t entry_size = sizeof(repl_journal_entry) + m_header.size() + m_key.size() + val_size; if (is_raft_buf) { @@ -43,6 +75,7 @@ void repl_req_ctx::create_journal_entry(bool is_raft_buf, int32_t server_id) { } m_journal_entry->code = m_op_code; + m_journal_entry->traceID = m_rkey.traceID; m_journal_entry->server_id = server_id; m_journal_entry->dsn = m_rkey.dsn; m_journal_entry->user_header_size = m_header.size(); @@ -61,14 +94,25 @@ void repl_req_ctx::create_journal_entry(bool is_raft_buf, int32_t server_id) { } if (has_linked_data()) { - auto const b = m_local_blkid.serialize(); - std::memcpy(raw_ptr, b.cbytes(), b.size()); + for (const auto& blkid : m_local_blkids) { + auto const b = blkid.serialize(); + std::memcpy(raw_ptr, b.cbytes(), b.size()); + raw_ptr += b.size(); + } } } uint32_t repl_req_ctx::journal_entry_size() const { return sizeof(repl_journal_entry) + m_header.size() + m_key.size() + - (has_linked_data() ? m_local_blkid.serialized_size() : 0); + (has_linked_data() ? 
blkids_serialized_size() : 0); +} + +uint32_t repl_req_ctx::blkids_serialized_size() const { + uint32_t blkids_serialized_size = 0; + for (const auto& blkid : m_local_blkids) { + blkids_serialized_size += blkid.serialized_size(); + } + return blkids_serialized_size; } void repl_req_ctx::change_raft_journal_buf(raft_buf_ptr_t new_buf, bool adjust_hdr_key) { @@ -88,15 +132,36 @@ void repl_req_ctx::change_raft_journal_buf(raft_buf_ptr_t new_buf, bool adjust_h ReplServiceError repl_req_ctx::alloc_local_blks(cshared< ReplDevListener >& listener, uint32_t data_size) { DEBUG_ASSERT(has_linked_data(), "Trying to allocate a block for non-inlined block"); - auto const hints_result = listener->get_blk_alloc_hints(m_header, data_size); + auto const hints_result = listener->get_blk_alloc_hints(m_header, data_size, repl_req_ptr_t(this)); if (hints_result.hasError()) { return hints_result.error(); } + if (hints_result.value().committed_blk_id.has_value()) { + // if the committed_blk_id is already present, use it and skip allocation and commitment + LOGINFOMOD(replication, "[traceID={}] For Repl_key=[{}] data already exists, skip", rkey().traceID, + rkey().to_string()); + m_local_blkids.emplace_back(hints_result.value().committed_blk_id.value()); + add_state(repl_req_state_t::BLK_ALLOCATED); + add_state(repl_req_state_t::DATA_RECEIVED); + add_state(repl_req_state_t::DATA_WRITTEN); + add_state(repl_req_state_t::DATA_COMMITTED); + m_data_received_promise.setValue(); + m_data_written_promise.setValue(); + return ReplServiceError::OK; + } + + std::vector< BlkId > blkids; auto status = data_service().alloc_blks(sisl::round_up(uint32_cast(data_size), data_service().get_blk_size()), - hints_result.value(), m_local_blkid); + hints_result.value(), blkids); if (status != BlkAllocStatus::SUCCESS) { + LOGWARNMOD(replication, "[traceID={}] block allocation failure, repl_key=[{}], status=[{}]", rkey().traceID, + rkey(), status); DEBUG_ASSERT_EQ(status, BlkAllocStatus::SUCCESS, "Unable to allocate blks"); return ReplServiceError::NO_SPACE_LEFT; } + + for (auto& blkid : blkids) { + m_local_blkids.emplace_back(blkid); + } add_state(repl_req_state_t::BLK_ALLOCATED); return ReplServiceError::OK; } @@ -109,7 +174,7 @@ void repl_req_ctx::set_lsn(int64_t lsn) { "Changing lsn for request={} on the fly can cause race condition, not expected. lsn {}, m_lsn {}", to_string(), lsn, m_lsn); m_lsn = lsn; - LOGTRACEMOD(replication, "Setting lsn={} for request={}", lsn, to_string()); + LOGTRACEMOD(replication, "[traceID={}] Setting lsn={} for request={}", rkey().traceID, lsn, to_string()); } bool repl_req_ctx::save_pushed_data(intrusive< sisl::GenericRpcData > const& pushed_data, uint8_t const* data, @@ -164,12 +229,21 @@ bool repl_req_ctx::add_state_if_not_already(repl_req_state_t s) { void repl_req_ctx::clear() { m_header = sisl::blob{}; m_key = sisl::blob{}; + m_pkts.clear(); +} + +// FIXME: Use lock to avoid concurrent release of data. 
+void repl_req_ctx::release_data() { + m_data = nullptr; + // explicitly clear m_buf_for_unaligned_data as unaligned pushdata/fetchdata will be saved here + m_buf_for_unaligned_data = sisl::io_blob_safe{}; if (m_pushed_data) { + LOGTRACEMOD(replication, "[traceID={}] m_pushed_data addr={}, m_rkey={}, m_lsn={}", rkey().traceID, + static_cast< void* >(m_pushed_data.get()), m_rkey.to_string(), m_lsn); m_pushed_data->send_response(); m_pushed_data = nullptr; } m_fetched_data = sisl::GenericClientResponse{}; - m_pkts.clear(); } static std::string req_state_name(uint32_t state) { @@ -188,15 +262,25 @@ std::string repl_req_ctx::to_string() const { return fmt::format("repl_key=[{}], lsn={} state=[{}] m_headersize={} m_keysize={} is_proposer={} " "local_blkid={} remote_blkid={}", m_rkey.to_string(), m_lsn, req_state_name(uint32_cast(state())), m_header.size(), m_key.size(), - m_is_proposer, m_local_blkid.to_string(), m_remote_blkid.blkid.to_string()); + m_is_proposer, blkids_to_string(), m_remote_blkid.blkid.to_string()); } std::string repl_req_ctx::to_compact_string() const { - if (m_op_code == journal_type_t::HS_CTRL_DESTROY) { + if (m_op_code == journal_type_t::HS_CTRL_DESTROY || m_op_code == journal_type_t::HS_CTRL_START_REPLACE || m_op_code == journal_type_t::HS_CTRL_COMPLETE_REPLACE) { return fmt::format("term={} lsn={} op={}", m_rkey.term, m_lsn, enum_name(m_op_code)); } + return fmt::format("dsn={} term={} lsn={} op={} local_blkid={} state=[{}]", m_rkey.dsn, m_rkey.term, m_lsn, - enum_name(m_op_code), m_local_blkid.to_string(), req_state_name(uint32_cast(state()))); + enum_name(m_op_code), blkids_to_string(), req_state_name(uint32_cast(state()))); +} + +std::string repl_req_ctx::blkids_to_string() const { + std::string str = fmt::format("["); + for (const auto& blkid : m_local_blkids) { + fmt::format_to(std::back_inserter(str), "{} ", blkid.to_string()); + } + fmt::format_to(std::back_inserter(str), "]"); + return str; } bool repl_req_ctx::is_expired() const { diff --git a/src/lib/replication/repl_dev/common.h b/src/lib/replication/repl_dev/common.h index cb8a57931..c3433083f 100644 --- a/src/lib/replication/repl_dev/common.h +++ b/src/lib/replication/repl_dev/common.h @@ -15,7 +15,7 @@ #pragma once #include - +#include #include #include #include @@ -35,8 +35,9 @@ struct repl_journal_entry { uint16_t minor_version{JOURNAL_ENTRY_MINOR}; journal_type_t code; - int32_t server_id; // Server id from where journal entry is originated - uint64_t dsn; // Data seq number + trace_id_t traceID; // traceID provided by application, mostly for consolidate logs. 
+ int32_t server_id; // Server id from where journal entry is originated + uint64_t dsn; // Data seq number uint32_t user_header_size; uint32_t key_size; uint32_t value_size; @@ -57,6 +58,7 @@ struct repl_journal_entry { struct repl_dev_superblk { static constexpr uint64_t REPL_DEV_SB_MAGIC = 0xABCDF00D; static constexpr uint32_t REPL_DEV_SB_VERSION = 1; + static constexpr size_t max_name_len = 64; uint64_t magic{REPL_DEV_SB_MAGIC}; uint32_t version{REPL_DEV_SB_VERSION}; @@ -67,9 +69,14 @@ struct repl_dev_superblk { repl_lsn_t checkpoint_lsn; // LSN upto which this replica have checkpointed the Data repl_lsn_t compact_lsn; // maximum LSN that can be compacted to uint64_t group_ordinal; // Ordinal number which will be used to indicate the rdevXYZ for debugging + char rdev_name[max_name_len]; // Short name for the group for easy debugging uint64_t get_magic() const { return magic; } uint32_t get_version() const { return version; } + void set_rdev_name(std::string const& name) { + std::strncpy(rdev_name, name.c_str(), max_name_len - 1); + rdev_name[max_name_len - 1] = '\0'; + } }; #pragma pack() @@ -88,4 +95,11 @@ auto make_async_success() { return folly::makeSemiFuture< ReplResult< folly::Unit > >(folly::Unit{}); } +inline uint64_t generateRandomTraceId() { + std::random_device rd; + std::mt19937_64 gen(rd()); + std::uniform_int_distribution< uint64_t > dis; + return dis(gen); +} + } // namespace homestore diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp index 45a018d92..2303fda68 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.cpp +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -15,9 +15,10 @@ #include "common/homestore_assert.hpp" #include "common/homestore_config.hpp" -// #include "common/homestore_flip.hpp" +#include "common/homestore_utils.hpp" #include "replication/service/raft_repl_service.h" #include "replication/repl_dev/raft_repl_dev.h" +#include "device/chunk.h" #include "device/device.h" #include "push_data_rpc_generated.h" #include "fetch_data_rpc_generated.h" @@ -39,14 +40,16 @@ RaftReplDev::RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk m_data_journal = std::make_shared< ReplLogStore >( *this, *m_state_machine, m_rd_sb->logdev_id, m_rd_sb->logstore_id, [this](logstore_seq_num_t lsn, log_buffer buf, void* key) { on_log_found(lsn, buf, key); }, - [this](std::shared_ptr< HomeLogStore > hs, logstore_seq_num_t lsn) { m_log_store_replay_done = true; }); + [this](std::shared_ptr< HomeLogStore > hs, logstore_seq_num_t lsn) { + m_log_store_replay_done = true; + set_log_store_last_durable_lsn(hs->tail_lsn()); + }); m_next_dsn = m_rd_sb->last_applied_dsn + 1; m_commit_upto_lsn = m_rd_sb->durable_commit_lsn; m_last_flushed_commit_lsn = m_commit_upto_lsn; m_compact_lsn = m_rd_sb->compact_lsn; - m_rdev_name = fmt::format("rdev{}", m_rd_sb->group_ordinal); - + m_rdev_name = m_rd_sb->rdev_name; // Its ok not to do compare exchange, because loading is always single threaded as of now if (m_rd_sb->group_ordinal >= s_next_group_ordinal.load()) { s_next_group_ordinal.store(m_rd_sb->group_ordinal + 1); @@ -66,54 +69,502 @@ RaftReplDev::RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk m_rd_sb->logstore_id = m_data_journal->logstore_id(); m_rd_sb->last_applied_dsn = 0; m_rd_sb->destroy_pending = 0x0; + m_rd_sb->last_snapshot_lsn = 0; m_rd_sb->group_ordinal = s_next_group_ordinal.fetch_add(1); m_rdev_name = fmt::format("rdev{}", m_rd_sb->group_ordinal); + 
m_rd_sb->set_rdev_name(m_rdev_name); if (m_rd_sb->is_timeline_consistent) { m_free_blks_journal = logstore_service().create_new_log_store(m_rd_sb->logdev_id, false /* append_mode */); m_rd_sb->free_blks_journal_id = m_free_blks_journal->get_store_id(); } m_rd_sb.write(); + bind_data_service(); } - RD_LOG(INFO, - "Started {} RaftReplDev group_id={}, replica_id={}, raft_server_id={} commited_lsn={}, " - "compact_lsn={}, checkpoint_lsn:{}, next_dsn={} " - "log_dev={} log_store={}", - (load_existing ? "Existing" : "New"), group_id_str(), my_replica_id_str(), m_raft_server_id, - m_commit_upto_lsn.load(), m_compact_lsn.load(), m_rd_sb->checkpoint_lsn, m_next_dsn.load(), - m_rd_sb->logdev_id, m_rd_sb->logstore_id); + m_identify_str = m_rdev_name + ":" + group_id_str(); + + RD_LOGI(NO_TRACE_ID, + "Started {} RaftReplDev group_id={}, replica_id={}, raft_server_id={} commited_lsn={}, " + "compact_lsn={}, checkpoint_lsn:{}, next_dsn={} " + "log_dev={} log_store={}", + (load_existing ? "Existing" : "New"), group_id_str(), my_replica_id_str(), m_raft_server_id, + m_commit_upto_lsn.load(), m_compact_lsn.load(), m_rd_sb->checkpoint_lsn, m_next_dsn.load(), + m_rd_sb->logdev_id, m_rd_sb->logstore_id); +} +bool RaftReplDev::bind_data_service() { + RD_LOGI(NO_TRACE_ID, "Starting data channel, group_id={}, replica_id={}", group_id_str(), my_replica_id_str()); + bool success = false; #ifdef _PRERELEASE - m_msg_mgr.bind_data_service_request(PUSH_DATA, m_group_id, [this](intrusive< sisl::GenericRpcData >& rpc_data) { - if (iomgr_flip::instance()->delay_flip("slow_down_data_channel", [this, rpc_data]() mutable { - RD_LOGI("Resuming after slow down data channel flip"); + success = + m_msg_mgr.bind_data_service_request(PUSH_DATA, m_group_id, [this](intrusive< sisl::GenericRpcData >& rpc_data) { + if (iomgr_flip::instance()->delay_flip("slow_down_data_channel", [this, rpc_data]() mutable { + RD_LOGI(NO_TRACE_ID, "Resuming after slow down data channel flip"); + on_push_data_received(rpc_data); + })) { + RD_LOGI(NO_TRACE_ID, "Slow down data channel flip is enabled, scheduling to call later"); + } else { on_push_data_received(rpc_data); - })) { - RD_LOGI("Slow down data channel flip is enabled, scheduling to call later"); - } else { - on_push_data_received(rpc_data); - } - }); + } + }); #else - m_msg_mgr.bind_data_service_request(PUSH_DATA, m_group_id, bind_this(RaftReplDev::on_push_data_received, 1)); + success = + m_msg_mgr.bind_data_service_request(PUSH_DATA, m_group_id, bind_this(RaftReplDev::on_push_data_received, 1)); #endif - - m_msg_mgr.bind_data_service_request(FETCH_DATA, m_group_id, bind_this(RaftReplDev::on_fetch_data_received, 1)); + if (!success) { + RD_LOGE(NO_TRACE_ID, "Failed to bind data service request for PUSH_DATA"); + return false; + } + success = + m_msg_mgr.bind_data_service_request(FETCH_DATA, m_group_id, bind_this(RaftReplDev::on_fetch_data_received, 1)); + if (!success) { + RD_LOGE(NO_TRACE_ID, "Failed to bind data service request for FETCH_DATA"); + return false; + } + return true; } bool RaftReplDev::join_group() { + bind_data_service(); auto raft_result = m_msg_mgr.join_group(m_group_id, "homestore_replication", std::dynamic_pointer_cast< nuraft_mesg::mesg_state_mgr >(shared_from_this())); if (!raft_result) { - HS_DBG_ASSERT(false, "Unable to join the group_id={} with error={}", boost::uuids::to_string(m_group_id), - raft_result.error()); + HS_DBG_ASSERT(false, "Unable to join the group_id={} with error={}", group_id_str(), raft_result.error()); return false; } return true; } +// All the 
steps in the implementation should be idempotent and retryable. +AsyncReplResult<> RaftReplDev::start_replace_member(const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum, + uint64_t trace_id) { + if (is_stopping()) { + RD_LOGI(trace_id, "repl dev is being shutdown!"); + return make_async_error<>(ReplServiceError::STOPPING); + } + incr_pending_request_num(); + + RD_LOGI(trace_id, "Start replace member, member_out={} member_in={}", boost::uuids::to_string(member_out.id), + boost::uuids::to_string(member_in.id)); + + if (commit_quorum >= 1) { + // Two members are down and leader cant form the quorum. Reduce the quorum size. + reset_quorum_size(commit_quorum, trace_id); + } + + // Step1, validate request + auto out_srv_cfg = raft_server()->get_config()->get_server(nuraft_mesg::to_server_id(member_out.id)); + if (!out_srv_cfg) { + auto in_srv_cfg = raft_server()->get_config()->get_server(nuraft_mesg::to_server_id(member_in.id)); + if (in_srv_cfg) { + RD_LOGI( + trace_id, + "Step1. Replace member, the intent has already been fulfilled, ignore it, member_out={} member_in={}", + boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_success<>(); + } + RD_LOGE(trace_id, "Step1. Replace member invalid parameter, out member is not found"); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); + } + if (m_my_repl_id != get_leader_id()) { + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(ReplServiceError::NOT_LEADER); + } + // Check if leader itself is requested to move out. + if (m_my_repl_id == member_out.id) { + // immediate=false successor=-1, nuraft will choose an alive peer with highest priority as successor, and wait + // until the successor finishes the catch-up of the latest log, and then resign. Return NOT_LEADER and let + // client retry. + raft_server()->yield_leadership(false /* immediate */, -1 /* successor */); + RD_LOGI(trace_id, "Step1. Replace member, leader is the member_out so yield leadership"); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(ReplServiceError::NOT_LEADER); + } + // quorum safety check. TODO currently only consider lsn, need to check last response time. + auto active_peers = get_active_peers(); + // active_peers doesn't include leader itself. + auto quorum = active_peers.size() + 1; + for (const auto& p : active_peers) { + quorum = p == member_out.id ? quorum - 1 : quorum; + quorum = p == member_in.id ? quorum - 1 : quorum; + } + RD_LOGD(trace_id, + "Step1. Replace member, quorum safety check, active_peers={}, active_peers_exclude_out/in_member={}, " + "commit_quorum={}", + active_peers.size(), quorum, commit_quorum); + // commit_quorum=0 means actual commit quorum is the majority. In this case, active normal member count should be + // greater than 1. To be more specific, if we have S1(leader), S2, S3(out), S4(in), we don't allow + // replace_member(S3, S4) if S2 is down or laggy. Needs to recover S2 first or retry with commit_quorum=1. + if (quorum <= 1 && commit_quorum == 0) { + RD_LOGE(trace_id, "Step1. 
Replace member, quorum safety check failed, active_peers={}, active_peers_exclude_out/in_member={}, commit_quorum={}", + active_peers.size(), quorum, commit_quorum); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(ReplServiceError::QUORUM_NOT_MET); + } + + // Step 2: Handle out member. +#ifdef _PRERELEASE + if (iomgr_flip::instance()->test_flip("replace_member_set_learner_failure")) { + RD_LOGE(trace_id, "Simulating set member to learner failure"); + return make_async_error(ReplServiceError::FAILED); + } +#endif + RD_LOGI(trace_id, "Step2. Replace member, flip out member to learner"); + auto learner_ret = do_flip_learner(member_out, true, true, trace_id); + if (learner_ret != ReplServiceError::OK) { + RD_LOGE(trace_id, "Step2. Replace member, failed to flip out member to learner {}", learner_ret); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error(std::move(learner_ret)); + } + RD_LOGI(trace_id, "Step2. Replace member, flip out member to learner and set priority to 0"); + + // Step 3. Append log entry to mark the old member is out and new member is added. + RD_LOGI(trace_id, "Step3. Replace member, propose to raft for HS_CTRL_START_REPLACE req, group_id={}", + group_id_str()); + auto rreq = repl_req_ptr_t(new repl_req_ctx{}); + replace_member_ctx members; + members.replica_out = member_out; + members.replica_in = member_in; + + sisl::blob header(r_cast< uint8_t* >(&members), sizeof(replace_member_ctx)); + rreq->init(repl_key{.server_id = server_id(), + .term = raft_server()->get_term(), + .dsn = m_next_dsn.fetch_add(1), + .traceID = trace_id}, + journal_type_t::HS_CTRL_START_REPLACE, true, header, sisl::blob{}, 0, m_listener); + + auto err = m_state_machine->propose_to_raft(std::move(rreq)); + if (err != ReplServiceError::OK) { + RD_LOGE(trace_id, "Step3. Replace member, propose to raft for HS_CTRL_START_REPLACE req failed {}", err); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(std::move(err)); + } + + // Step 4. Add the new member, new member will inherit the priority of the out member. +#ifdef _PRERELEASE + if (iomgr_flip::instance()->test_flip("replace_member_add_member_failure")) { + RD_LOGE(trace_id, "Simulating add member failure"); + return make_async_error(ReplServiceError::FAILED); + } +#endif + RD_LOGI(trace_id, "Step4. Replace member, propose to raft to add new member, group_id={}", group_id_str()); + auto ret = do_add_member(member_in, trace_id); + if (ret != ReplServiceError::OK) { + RD_LOGE(trace_id, "Step4. Replace member, add member failed {}", ret); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(std::move(ret)); + } + RD_LOGI(trace_id, "Step4. 
Replace member, proposed to raft to add member, member={}", boost::uuids::to_string(member_in.id)); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_success<>(); +} + +AsyncReplResult<> RaftReplDev::complete_replace_member(const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum, + uint64_t trace_id) { + if (is_stopping()) { + RD_LOGI(trace_id, "repl dev is being shutdown!"); + return make_async_error<>(ReplServiceError::STOPPING); + } + incr_pending_request_num(); + + RD_LOGI(trace_id, "Complete replace member, member={}", boost::uuids::to_string(member_out.id), + boost::uuids::to_string(member_out.id)); + + if (commit_quorum >= 1) { + // Two members are down and leader cant form the quorum. Reduce the quorum size. + reset_quorum_size(commit_quorum, trace_id); + } + + // Step 5: Remove member + RD_LOGI(trace_id, "Step5. Replace member, remove old member, member={}", boost::uuids::to_string(member_out.id)); +#ifdef _PRERELEASE + if (iomgr_flip::instance()->test_flip("replace_member_remove_member_failure")) { + RD_LOGE(trace_id, "Simulating remove member failure"); + return make_async_error(ReplServiceError::FAILED); + } +#endif + auto ret = do_remove_member(member_out, trace_id); + if (ret != ReplServiceError::OK) { + RD_LOGE(trace_id, "Step5. Replace member, failed to remove member, member={}, err={}", + boost::uuids::to_string(member_out.id), ret); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(std::move(ret)); + } + RD_LOGI(trace_id, "Step5. Replace member, proposed to raft to remove member, member={}", + boost::uuids::to_string(member_out.id)); + auto timeout = HS_DYNAMIC_CONFIG(consensus.wait_for_config_change_ms); + // TODO Move wait logic to nuraft_mesg + if (!wait_and_check( + [&]() { + auto srv_conf = raft_server()->get_srv_config(nuraft_mesg::to_server_id(member_out.id)); + if (srv_conf) { + RD_LOGD(trace_id, "out member still exists in raft group, member={}", + boost::uuids::to_string(member_out.id)); + return false; + } + return true; + }, + timeout)) { + RD_LOGD(trace_id, + "Step5. Replace member, wait for old member removed timed out, cancel the request, timeout: {}", + timeout); + // If the member_out is down, leader will force remove it after + // leave_timeout=leave_limit_(default=5)*heart_beat_interval_, it's better for client to retry it. + return make_async_error<>(ReplServiceError::CANCELLED); + } + RD_LOGD(trace_id, "Step5. Replace member, old member is removed, member={}", + boost::uuids::to_string(member_out.id)); + + // Step 2. Append log entry to complete replace member + RD_LOGI(trace_id, "Step6. Replace member, propose to raft for HS_CTRL_COMPLETE_REPLACE req, group_id={}", + group_id_str()); + auto rreq = repl_req_ptr_t(new repl_req_ctx{}); + replace_member_ctx members; + members.replica_out = member_out; + members.replica_in = member_in; + + sisl::blob header(r_cast< uint8_t* >(&members), sizeof(replace_member_ctx)); + rreq->init(repl_key{.server_id = server_id(), + .term = raft_server()->get_term(), + .dsn = m_next_dsn.fetch_add(1), + .traceID = trace_id}, + journal_type_t::HS_CTRL_COMPLETE_REPLACE, true, header, sisl::blob{}, 0, m_listener); + + auto err = m_state_machine->propose_to_raft(std::move(rreq)); + if (err != ReplServiceError::OK) { + RD_LOGE(trace_id, "Step6. 
Replace member, propose to raft for HS_CTRL_COMPLETE_REPLACE req failed , err={}", + err); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(std::move(err)); + } + + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + RD_LOGI(trace_id, "Complete replace member done, group_id={}, member_out={} member_in={}", group_id_str(), + boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); + return make_async_success<>(); +} + +ReplServiceError RaftReplDev::do_add_member(const replica_member_info& member, uint64_t trace_id) { + if (m_my_repl_id != get_leader_id()) { + RD_LOGI(trace_id, "Member to add failed, not leader"); + return ReplServiceError::BAD_REQUEST; + } + auto ret = retry_when_config_changing( + [&] { + auto rem_ret = m_msg_mgr.add_member(m_group_id, member.id) + .via(&folly::InlineExecutor::instance()) + .thenValue([this, member, trace_id](auto&& e) -> nuraft::cmd_result_code { + return e.hasError() ? e.error() : nuraft::cmd_result_code::OK; + }); + return rem_ret.value(); + }, + trace_id); + if (ret == nuraft::cmd_result_code::SERVER_ALREADY_EXISTS) { + RD_LOGW(trace_id, "Ignoring error returned from nuraft add_member, member={}, err={}", + boost::uuids::to_string(member.id), ret); + } else if (ret != nuraft::cmd_result_code::OK) { + // Its ok to retry this request as the request + // of replace member is idempotent. + RD_LOGE(trace_id, "Add member failed, member={}, err={}", boost::uuids::to_string(member.id), ret); + return ReplServiceError::RETRY_REQUEST; + } + RD_LOGI(trace_id, "Proposed to raft to add member, member={}", boost::uuids::to_string(member.id)); + return ReplServiceError::OK; +} + +ReplServiceError RaftReplDev::do_remove_member(const replica_member_info& member, uint64_t trace_id) { + // The member should not be the leader. + if (m_my_repl_id == member.id && m_my_repl_id == get_leader_id()) { + // If leader is the member requested to move out, then give up leadership and return error. + // Client will retry replace_member request to the new leader. + raft_server()->yield_leadership(false /* immediate */, -1 /* successor */); + RD_LOGI(trace_id, "Member to remove is the leader so yield leadership"); + return ReplServiceError::NOT_LEADER; + } + auto ret = retry_when_config_changing( + [&] { + auto rem_ret = m_msg_mgr.rem_member(m_group_id, member.id) + .via(&folly::InlineExecutor::instance()) + .thenValue([this, member, trace_id](auto&& e) -> nuraft::cmd_result_code { + return e.hasError() ? e.error() : nuraft::cmd_result_code::OK; + }); + return rem_ret.value(); + }, + trace_id); + if (ret == nuraft::cmd_result_code::SERVER_NOT_FOUND) { + RD_LOGW(trace_id, "Remove member not found in group error, ignoring, member={}", + boost::uuids::to_string(member.id)); + } else if (ret != nuraft::cmd_result_code::OK) { + // Its ok to retry this request as the request + // of replace member is idempotent. 
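+ // Any other result code from nuraft is reported as RETRY_REQUEST below; since replace member is idempotent, the caller can safely re-issue the request.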
+ RD_LOGE(trace_id, "Replace member failed to remove member, member={}, err={}", + boost::uuids::to_string(member.id), ret); + return ReplServiceError::RETRY_REQUEST; + } + RD_LOGI(trace_id, "Proposed to raft to remove member, member={}", boost::uuids::to_string(member.id)); + return ReplServiceError::OK; +} + +AsyncReplResult<> RaftReplDev::flip_learner_flag(const replica_member_info& member, bool target, uint32_t commit_quorum, + bool wait_and_verify, uint64_t trace_id) { + RD_LOGI(trace_id, "Flip learner flag to {}, member={}", target, boost::uuids::to_string(member.id)); + if (is_stopping()) { + RD_LOGI(trace_id, "repl dev is being shutdown!"); + return make_async_error<>(ReplServiceError::STOPPING); + } + incr_pending_request_num(); + + if (commit_quorum >= 1) { + // Two members are down and leader cant form the quorum. Reduce the quorum size. + reset_quorum_size(commit_quorum, trace_id); + } + auto ret = do_flip_learner(member, target, wait_and_verify, trace_id); + if (ret != ReplServiceError::OK) { + RD_LOGE(trace_id, "Flip learner flag failed {}, member={}", ret, boost::uuids::to_string(member.id)); + reset_quorum_size(0, trace_id); + decr_pending_request_num(); + return make_async_error<>(std::move(ret)); + } + RD_LOGI(trace_id, "Learner flag has been set to {}, member={}", target, boost::uuids::to_string(member.id)); + return make_async_success<>(); +} + +ReplServiceError RaftReplDev::do_flip_learner(const replica_member_info& member, bool target, bool wait_and_verify, + uint64_t trace_id) { + // 1. Prerequisite check + if (m_my_repl_id != get_leader_id()) { + RD_LOGI(trace_id, "flip learner flag failed, not leader"); + return ReplServiceError::NOT_LEADER; + } + if (!target && member.priority == 0) { + // If the intent is to take the learner back to normal member, then priority should not be 0(never has chance to + // become leader). Client need to trace the peers' priority, and give a meaningful value, currently default + // priorities of the quorum: leader=100, follower=66. + RD_LOGI(trace_id, "clear learner flag failed, priority is 0, member={}", boost::uuids::to_string(member.id)); + return ReplServiceError::BAD_REQUEST; + } + + // 2. Flip learner + RD_LOGI(trace_id, "flip learner flag to {}, member={}", target, boost::uuids::to_string(member.id)); + auto srv_cfg = raft_server()->get_config()->get_server(nuraft_mesg::to_server_id(member.id)); + if (!srv_cfg) { + RD_LOGE(trace_id, "invalid parameter, member is not found, member={}", boost::uuids::to_string(member.id)); + return ReplServiceError::SERVER_NOT_FOUND; + } + if (srv_cfg->is_learner() != target) { + auto ret = retry_when_config_changing( + [&] { + auto learner_ret = raft_server()->flip_learner_flag(nuraft_mesg::to_server_id(member.id), target); + return learner_ret->get_result_code(); + }, + trace_id); + if (ret != nuraft::cmd_result_code::OK) { + RD_LOGE(trace_id, "Propose to raft to flip learner failed, err: {}", ret); + return ReplServiceError::RETRY_REQUEST; + } + } else { + RD_LOGD(trace_id, "learner flag has already been set to {}, skip, member={}", target, + boost::uuids::to_string(member.id)); + } + + // 3. Set priority + // Based on the current nuraft implementation, learner could be elected as leader, so we set priority to 0 to avoid + // it. And in turn, we need to revert prioiry change if the member is going to become a normal member. + // FIXME after nuraft fixes the bug, we can remove this logic. + auto priority = target ? 
0 : member.priority; + RD_LOGI(trace_id, "Set the priority of the member to {}, member={}", priority, boost::uuids::to_string(member.id)); + if (srv_cfg->get_priority() != priority) { + auto priority_ret = set_priority(member.id, priority); + if (priority_ret != ReplServiceError::OK) { return ReplServiceError::NOT_LEADER; } + } else { + RD_LOGD(trace_id, "Priority has already been set to {}, skip, member={}", priority, + boost::uuids::to_string(member.id)); + } + + // 4. Verification + if (wait_and_verify) { + auto timeout = HS_DYNAMIC_CONFIG(consensus.wait_for_config_change_ms); + if (!wait_and_check( + [&]() { + auto srv_conf = raft_server()->get_srv_config(nuraft_mesg::to_server_id(member.id)); + return srv_conf->is_learner() && srv_conf->get_priority() == 0; + }, + timeout)) { + RD_LOGD(trace_id, "Wait for learner and priority config change timed out, cancel the request, timeout: {}", + timeout); + return ReplServiceError::CANCELLED; + } + } + + return ReplServiceError::OK; +} + +nuraft::cmd_result_code RaftReplDev::retry_when_config_changing(const std::function< nuraft::cmd_result_code() >& func, + uint64_t trace_id) { + auto ret = nuraft::cmd_result_code::OK; + int32_t retries = HS_DYNAMIC_CONFIG(consensus.config_changing_error_retries); + for (auto i = 0; i < retries; i++) { + ret = func(); + if (ret == nuraft::cmd_result_code::CONFIG_CHANGING) { + RD_LOGW(trace_id, "Propose to raft failed due to config_changing, attempt: {}", i); + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + continue; + } + break; + } + return ret; +} + +bool RaftReplDev::wait_and_check(const std::function< bool() >& check_func, uint32_t timeout_ms, uint32_t interval_ms) { + auto times = timeout_ms / interval_ms; + if (times == 0) { times = 1; } + for (auto i = 0; i < static_cast< int32_t >(times); i++) { + if (check_func()) { return true; } + std::this_thread::sleep_for(std::chrono::milliseconds(interval_ms)); + } + return false; +} + +ReplServiceError RaftReplDev::set_priority(const replica_id_t& member, int32_t priority, uint64_t trace_id) { + auto priority_ret = raft_server()->set_priority(nuraft_mesg::to_server_id(member), priority); + // Set_priority should be handled by leader, but if the intent is to set the leader's priority to 0, it returns + // BROADCAST. In this case return NOT_LEADER to let the client retry on the new leader. + // If there is an uncommitted_config, nuraft set_priority will honor this uncommitted config and generate new + // config based on it and won't have config_changing error. + if (priority_ret != nuraft::raft_server::PrioritySetResult::SET) { + RD_LOGE(trace_id, "Propose to raft to set priority failed, result: {}", + priority_ret == nuraft::raft_server::PrioritySetResult::BROADCAST ?
"BROADCAST" : "IGNORED"); + return ReplServiceError::NOT_LEADER; + } + return ReplServiceError::OK; +} + +void RaftReplDev::reset_quorum_size(uint32_t commit_quorum, uint64_t trace_id) { + RD_LOGI(trace_id, "Reset raft quorum size={}", commit_quorum); + nuraft::raft_params params = raft_server()->get_current_params(); + params.with_custom_commit_quorum_size(commit_quorum); + params.with_custom_election_quorum_size(commit_quorum); + raft_server()->update_params(params); +} + folly::SemiFuture< ReplServiceError > RaftReplDev::destroy_group() { // Set the intent to destroy the group m_stage.update([](auto* stage) { *stage = repl_dev_stage_t::DESTROYING; }); @@ -131,24 +582,34 @@ folly::SemiFuture< ReplServiceError > RaftReplDev::destroy_group() { // here, we set the dsn to a new one , which is definitely unique in the follower, so that the new rreq will not // have a conflict with the old rreq. - rreq->init(repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)}, - journal_type_t::HS_CTRL_DESTROY, true, sisl::blob{}, sisl::blob{}, 0); + auto err = init_req_ctx(rreq, + repl_key{.server_id = server_id(), + .term = raft_server()->get_term(), + .dsn = m_next_dsn.fetch_add(1), + .traceID = std::numeric_limits< uint64_t >::max()}, + journal_type_t::HS_CTRL_DESTROY, true, sisl::blob{}, sisl::blob{}, 0, m_listener); - auto err = m_state_machine->propose_to_raft(std::move(rreq)); + if (err != ReplServiceError::OK) { + // Failed to initialize the repl_req_ctx for replace member. + LOGERROR("Failed to initialize repl_req_ctx for destorying group, error={}", err); + return folly::makeSemiFuture< ReplServiceError >(std::move(err)); + } + + err = m_state_machine->propose_to_raft(std::move(rreq)); if (err != ReplServiceError::OK) { m_stage.update([](auto* stage) { *stage = repl_dev_stage_t::ACTIVE; }); return folly::makeSemiFuture< ReplServiceError >(std::move(err)); LOGERROR("RaftReplDev::destroy_group failed {}", err); } - LOGINFO("Raft repl dev destroy_group={}", boost::uuids::to_string(m_group_id)); + LOGINFO("Raft repl dev destroy_group={}", group_id_str()); return m_destroy_promise.getSemiFuture(); } void RaftReplDev::use_config(json_superblk raft_config_sb) { m_raft_config_sb = std::move(raft_config_sb); } void RaftReplDev::on_create_snapshot(nuraft::snapshot& s, nuraft::async_result< bool >::handler_type& when_done) { - RD_LOG(DEBUG, "create_snapshot last_idx={}/term={}", s.get_last_log_idx(), s.get_last_log_term()); + RD_LOGD(NO_TRACE_ID, "create_snapshot last_idx={}/term={}", s.get_last_log_idx(), s.get_last_log_term()); auto snp_ctx = std::make_shared< nuraft_snapshot_context >(s); auto result = m_listener->create_snapshot(snp_ctx).get(); auto null_except = std::shared_ptr< std::exception >(); @@ -159,13 +620,13 @@ void RaftReplDev::on_create_snapshot(nuraft::snapshot& s, nuraft::async_result< } void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& data, - repl_req_ptr_t rreq) { + repl_req_ptr_t rreq, bool part_of_batch, trace_id_t tid) { if (!rreq) { auto rreq = repl_req_ptr_t(new repl_req_ctx{}); } { auto const guard = m_stage.access(); if (auto const stage = *guard.get(); stage != repl_dev_stage_t::ACTIVE) { - RD_LOGW("Raft channel: Not ready to accept writes, stage={}", enum_name(stage)); + RD_LOGW(tid, "Raft channel: Not ready to accept writes, stage={}", enum_name(stage)); handle_error(rreq, (stage == repl_dev_stage_t::INIT) ? 
ReplServiceError::SERVER_IS_JOINING : ReplServiceError::SERVER_IS_LEAVING); @@ -173,9 +634,22 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& } } - rreq->init(repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)}, - data.size ? journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true /* is_proposer */, - header, key, data.size); + auto status = init_req_ctx(rreq, + repl_key{.server_id = server_id(), + .term = raft_server()->get_term(), + .dsn = m_next_dsn.fetch_add(1), + .traceID = tid}, + data.size ? journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, + true /* is_proposer */, header, key, data.size, m_listener); + + if (status != ReplServiceError::OK) { + RD_LOGI(tid, "Initializing rreq failed error={}, failing this req", status); + handle_error(rreq, status); + return; + } + + RD_LOGD(tid, "repl_key [{}], header size [{}] bytes, user_key size [{}] bytes, data size [{}] bytes", rreq->rkey(), + header.size(), key.size(), data.size); // Add the request to the repl_dev_rreq map, it will be accessed throughout the life cycle of this request auto const [it, happened] = m_repl_key_req_map.emplace(rreq->rkey(), rreq); @@ -183,16 +657,21 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& // If it is header only entry, directly propose to the raft if (rreq->has_linked_data()) { - push_data_to_all_followers(rreq, data); - - // Step 1: Alloc Blkid - auto const status = rreq->alloc_local_blks(m_listener, data.size); - if (status != ReplServiceError::OK) { - RD_LOGD("Allocating blks failed error={}, failing this req", status); - handle_error(rreq, status); + if (rreq->is_proposer() && rreq->has_state(repl_req_state_t::DATA_COMMITTED)) { + RD_LOGE(tid, "data blks has already been allocated and committed, failing this req"); + handle_error(rreq, ReplServiceError::DATA_DUPLICATED); return; } +#ifdef _PRERELEASE + if (iomgr_flip::instance()->test_flip("disable_leader_push_data")) { + RD_LOGD(tid, "Simulating push data failure, so that all the follower will have to fetch data"); + } else + push_data_to_all_followers(rreq, data); +#else + push_data_to_all_followers(rreq, data); +#endif + COUNTER_INCREMENT(m_metrics, total_write_cnt, 1); COUNTER_INCREMENT(m_metrics, outstanding_data_write_cnt, 1); @@ -221,7 +700,7 @@ void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& } }); } else { - RD_LOGD("Skipping data channel send since value size is 0"); + RD_LOGT(tid, "Skipping data channel send since value size is 0"); rreq->add_state(repl_req_state_t::DATA_WRITTEN); auto raft_status = m_state_machine->propose_to_raft(rreq); if (raft_status != ReplServiceError::OK) { handle_error(rreq, raft_status); } @@ -233,7 +712,7 @@ void RaftReplDev::push_data_to_all_followers(repl_req_ptr_t rreq, sisl::sg_list // Prepare the rpc request packet with all repl_reqs details builder.FinishSizePrefixed(CreatePushDataRequest( - builder, server_id(), rreq->term(), rreq->dsn(), + builder, rreq->traceID(), server_id(), rreq->term(), rreq->dsn(), builder.CreateVector(rreq->header().cbytes(), rreq->header().size()), builder.CreateVector(rreq->key().cbytes(), rreq->key().size()), data.size, get_time_since_epoch_ms())); @@ -244,30 +723,37 @@ void RaftReplDev::push_data_to_all_followers(repl_req_ptr_t rreq, sisl::sg_list flatbuffers::FlatBufferToString(builder.GetBufferPointer() + sizeof(flatbuffers::uoffset_t), PushDataRequestTypeTable()));*/ - RD_LOGD("Data Channel: 
Pushing data to all followers: rreq=[{}]", rreq->to_string()); - - group_msg_service() - ->data_service_request_unidirectional(nuraft_mesg::role_regex::ALL, PUSH_DATA, rreq->m_pkts) - .via(&folly::InlineExecutor::instance()) - .thenValue([this, rreq = std::move(rreq)](auto e) { - if (e.hasError()) { - RD_LOGE("Data Channel: Error in pushing data to all followers: rreq=[{}] error={}", rreq->to_string(), - e.error()); - handle_error(rreq, RaftReplService::to_repl_error(e.error())); - return; + auto peers = get_active_peers(); + auto calls = std::vector< nuraft_mesg::NullAsyncResult >(); + for (auto peer : peers) { + RD_LOGD(rreq->traceID(), "Data Channel: Pushing data to follower {}, rreq=[{}]", peer, rreq->to_string()); + calls.push_back(group_msg_service() + ->data_service_request_unidirectional(peer, PUSH_DATA, rreq->m_pkts) + .via(&folly::InlineExecutor::instance())); + } + folly::collectAllUnsafe(calls).thenValue([this, rreq](auto&& v_res) { + for (auto const& res : v_res) { + if (sisl_likely(res.value())) { + auto r = res.value(); + if (r.hasError()) { + // Just logging PushData error, no action is needed as follower can try by fetchData. + RD_LOGI(rreq->traceID(), "Data Channel: Error in pushing data to all followers: rreq=[{}] error={}", + rreq->to_string(), r.error()); + } } - // Release the buffer which holds the packets - RD_LOGD("Data Channel: Data push completed for rreq=[{}]", rreq->to_string()); - rreq->release_fb_builder(); - rreq->m_pkts.clear(); - }); + } + RD_LOGD(rreq->traceID(), "Data Channel: Data push completed for rreq=[{}]", rreq->to_compact_string()); + // Release the buffer which holds the packets + rreq->release_fb_builder(); + rreq->m_pkts.clear(); + }); } void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_data) { auto const push_data_rcv_time = Clock::now(); auto const& incoming_buf = rpc_data->request_blob(); if (!incoming_buf.cbytes()) { - RD_LOGW("Data Channel: PushData received with empty buffer, ignoring this call"); + RD_LOGW(NO_TRACE_ID, "Data Channel: PushData received with empty buffer, ignoring this call"); rpc_data->send_response(); return; } @@ -275,20 +761,30 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d auto const fb_size = flatbuffers::ReadScalar< flatbuffers::uoffset_t >(incoming_buf.cbytes()) + sizeof(flatbuffers::uoffset_t); auto push_req = GetSizePrefixedPushDataRequest(incoming_buf.cbytes()); - HS_DBG_ASSERT_EQ(fb_size + push_req->data_size(), incoming_buf.size(), "Size mismatch of data size vs buffer size"); - + if (fb_size + push_req->data_size() != incoming_buf.size()) { + RD_LOGW(NO_TRACE_ID, + "Data Channel: PushData received with size mismatch, header size {}, data size {}, received size {}", + fb_size, push_req->data_size(), incoming_buf.size()); + rpc_data->send_response(); + return; + } sisl::blob header = sisl::blob{push_req->user_header()->Data(), push_req->user_header()->size()}; sisl::blob key = sisl::blob{push_req->user_key()->Data(), push_req->user_key()->size()}; - repl_key rkey{.server_id = push_req->issuer_replica_id(), .term = push_req->raft_term(), .dsn = push_req->dsn()}; + repl_key rkey{.server_id = push_req->issuer_replica_id(), + .term = push_req->raft_term(), + .dsn = push_req->dsn(), + .traceID = push_req->trace_id()}; auto const req_orig_time_ms = push_req->time_ms(); - RD_LOGD("Data Channel: PushData received: time diff={} ms.", get_elapsed_time_ms(req_orig_time_ms)); + RD_LOGD(rkey.traceID, "Data Channel: PushData received: time diff={} ms.", 
get_elapsed_time_ms(req_orig_time_ms)); #ifdef _PRERELEASE if (iomgr_flip::instance()->test_flip("drop_push_data_request")) { - LOGINFO("Data Channel: Flip is enabled, skip on_push_data_received to simulate fetch remote data, " + RD_LOGI(rkey.traceID, + "Data Channel: Flip is enabled, skip on_push_data_received to simulate fetch remote data, " "server_id={}, term={}, dsn={}", push_req->issuer_replica_id(), push_req->raft_term(), push_req->dsn()); + rpc_data->send_response(); return; } #endif @@ -296,15 +792,18 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d auto rreq = applier_create_req(rkey, journal_type_t::HS_DATA_LINKED, header, key, push_req->data_size(), true /* is_data_channel */); if (rreq == nullptr) { - RD_LOG(ERROR, - "Data Channel: Creating rreq on applier has failed, will ignore the push and let Raft channel send " - "trigger a fetch explicitly if needed. rkey={}", - rkey.to_string()); + RD_LOGE(rkey.traceID, + "Data Channel: Creating rreq on applier has failed, will ignore the push and let Raft channel send " + "trigger a fetch explicitly if needed. rkey={}", + rkey.to_string()); + rpc_data->send_response(); return; } if (!rreq->save_pushed_data(rpc_data, incoming_buf.cbytes() + fb_size, push_req->data_size())) { - RD_LOGD("Data Channel: Data already received for rreq=[{}], ignoring this data", rreq->to_string()); + RD_LOGT(rkey.traceID, "Data Channel: Data already received for rreq=[{}], ignoring this data", + rreq->to_string()); + rpc_data->send_response(); return; } @@ -323,12 +822,15 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d RD_DBG_ASSERT(false, "Error in writing data, error_code={}", err.value()); handle_error(rreq, ReplServiceError::DRIVE_WRITE_ERROR); } else { + rreq->release_data(); rreq->add_state(repl_req_state_t::DATA_WRITTEN); rreq->m_data_written_promise.setValue(); + // if rreq create time is earlier than push_data receive time, that means the rreq was created by raft + // channel log. Otherwise set to zero as rreq is created by data channel. const auto data_log_diff_us = push_data_rcv_time.time_since_epoch().count() > rreq->created_time().time_since_epoch().count() ? 
get_elapsed_time_us(rreq->created_time(), push_data_rcv_time) - : get_elapsed_time_us(push_data_rcv_time, rreq->created_time()); + : 0; auto const data_write_latency = get_elapsed_time_us(push_data_rcv_time); auto const total_data_write_latency = get_elapsed_time_us(rreq->created_time()); @@ -338,67 +840,64 @@ void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_d HISTOGRAM_OBSERVE(m_metrics, rreq_push_data_latency_us, data_write_latency); HISTOGRAM_OBSERVE(m_metrics, rreq_total_data_write_latency_us, total_data_write_latency); - RD_LOGD("Data Channel: Data write completed for rreq=[{}], time_diff_data_log_us={}, " + RD_LOGD(rreq->traceID(), + "Data Channel: Data write completed for rreq=[{}], time_diff_data_log_us={}, " "data_write_latency_us={}, total_data_write_latency_us(rreq creation to write complete)={}, " "local_blkid.num_pieces={}", - rreq->to_string(), data_log_diff_us, data_write_latency, total_data_write_latency, + rreq->to_compact_string(), data_log_diff_us, data_write_latency, total_data_write_latency, write_num_pieces); } }); } repl_req_ptr_t RaftReplDev::applier_create_req(repl_key const& rkey, journal_type_t code, sisl::blob const& user_header, - sisl::blob const& key, uint32_t data_size, - [[maybe_unused]] bool is_data_channel) { + sisl::blob const& key, uint32_t data_size, bool is_data_channel, + int64_t lsn) { + if (is_data_channel) RD_DBG_ASSERT(-1 == lsn, "lsn from data channel should always be -1 , got lsn {}", lsn); + auto const [it, happened] = m_repl_key_req_map.try_emplace(rkey, repl_req_ptr_t(new repl_req_ctx())); RD_DBG_ASSERT((it != m_repl_key_req_map.end()), "Unexpected error in map_repl_key_to_req"); auto rreq = it->second; if (!happened) { - // We already have the entry in the map, check if we are already allocated the blk by previous caller, in - // that case we need to return the req. + // We already have the entry in the map, reset its start time to prevent it from being incorrectly gc during use. + rreq->set_created_time(); + // Check if we are already allocated the blk by previous caller, in that case we need to return the req. if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { // Do validation if we have the correct mapping // RD_REL_ASSERT(blob_equals(user_header, rreq->header), "User header mismatch for repl_key={}", // rkey.to_string()); // RD_REL_ASSERT(blob_equals(user_key, rreq->key), "User key mismatch for repl_key={}", rkey.to_string()); - RD_LOGD("Repl_key=[{}] already received ", rkey.to_string()); + RD_LOGT(rkey.traceID, "Repl_key=[{}] already received ", rkey.to_string()); return rreq; } } - // We need to allocate the block, since entry doesn't exist or if it exist, two threads are trying to do the same - // thing. 
So take state mutex and allocate the blk - std::unique_lock< std::mutex > lg(rreq->m_state_mtx); - rreq->init(rkey, code, false /* is_proposer */, user_header, key, data_size); - - // There is no data portion, so there is not need to allocate - if (!rreq->has_linked_data()) { return rreq; } - if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { return rreq; } - - auto alloc_status = rreq->alloc_local_blks(m_listener, data_size); -#ifdef _PRERELEASE - if (is_data_channel) { - if (iomgr_flip::instance()->test_flip("fake_reject_append_data_channel")) { - LOGINFO("Data Channel: Reject append_entries flip is triggered for rkey={}", rkey.to_string()); - alloc_status = ReplServiceError::NO_SPACE_LEFT; - } - } else { - if (iomgr_flip::instance()->test_flip("fake_reject_append_raft_channel")) { - LOGINFO("Raft Channel: Reject append_entries flip is triggered for rkey={}", rkey.to_string()); - alloc_status = ReplServiceError::NO_SPACE_LEFT; + // rreq->init will allocate the block if it has linked data. + auto status = init_req_ctx(rreq, rkey, code, false /* is_proposer */, user_header, key, data_size, m_listener); + + if (status != ReplServiceError::OK) { + RD_LOGD(rkey.traceID, "For Repl_key=[{}] alloc hints returned error={}, failing this req", rkey.to_string(), + status); + if (status == ReplServiceError::NO_SPACE_LEFT && !is_data_channel && !rreq->is_proposer()) { + const auto& chunk_id = rreq->local_blkid().chunk_num(); + RD_LOGD(rkey.traceID, + "For Repl_key=[{}] alloc hints returned error={} when trying to allocate blk on chunk={}", + rkey.to_string(), status, chunk_id); + m_listener->on_no_space_left(lsn, chunk_id); + } else { + RD_LOGD( + rkey.traceID, + "For Repl_key=[{}] alloc hints returned error={}, failing this req, data_channl: {}, is_proposer: {} ", + rkey.to_string(), status, is_data_channel, rreq->is_proposer()); } - } -#endif - - if (alloc_status != ReplServiceError::OK) { - RD_LOGE("For Repl_key=[{}] alloc hints returned error={}, failing this req", rkey.to_string(), alloc_status); // Do not call handle_error here, because handle_error is for rreq which needs to be terminated. This one can be // retried. 
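+ // Returning nullptr lets the raft channel (raft_event) reject the current append-entries batch so the leader resends it and the allocation is retried later, while the data channel simply drops the push and relies on a subsequent fetch.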
return nullptr; } - RD_LOGD("in follower_create_req: rreq={}, addr={}", rreq->to_string(), reinterpret_cast< uintptr_t >(rreq.get())); + RD_LOGD(rkey.traceID, "in follower_create_req: rreq={}, addr=0x{:x}", rreq->to_string(), + reinterpret_cast< uintptr_t >(rreq.get())); return rreq; } @@ -412,7 +911,7 @@ folly::Future< folly::Unit > RaftReplDev::notify_after_data_written(std::vector< if (!rreq->has_linked_data()) { continue; } auto const status = uint32_cast(rreq->state()); if (status & uint32_cast(repl_req_state_t::DATA_WRITTEN)) { - RD_LOGD("Raft Channel: Data write completed and blkid mapped: rreq=[{}]", rreq->to_string()); + RD_LOGD(rreq->traceID(), "Data written and blkid mapped: rkey=[{}]", rreq->to_compact_string()); continue; } @@ -455,15 +954,16 @@ folly::Future< folly::Unit > RaftReplDev::notify_after_data_written(std::vector< HS_DBG_ASSERT(rreq->has_state(repl_req_state_t::DATA_WRITTEN), "Data written promise raised without updating DATA_WRITTEN state for rkey={}", rreq->rkey().to_string()); - RD_LOGD("Raft Channel: Data write completed and blkid mapped: rreq=[{}]", rreq->to_string()); + RD_LOGD(rreq->traceID(), "Data write completed and blkid mapped: rreq=[{}]", rreq->to_compact_string()); } #endif - RD_LOGT("Data Channel: {} pending reqs's data are written", rreqs->size()); + RD_LOGT(NO_TRACE_ID, "{} pending reqs' data are written", rreqs->size()); return folly::makeFuture< folly::Unit >(folly::Unit{}); }); } -bool RaftReplDev::wait_for_data_receive(std::vector< repl_req_ptr_t > const& rreqs, uint64_t timeout_ms) { +bool RaftReplDev::wait_for_data_receive(std::vector< repl_req_ptr_t > const& rreqs, uint64_t timeout_ms, + std::vector< repl_req_ptr_t >* timeout_rreqs) { std::vector< folly::Future< folly::Unit > > futs; std::vector< repl_req_ptr_t > only_wait_reqs; only_wait_reqs.reserve(rreqs.size()); @@ -484,20 +984,27 @@ bool RaftReplDev::wait_for_data_receive(std::vector< repl_req_ptr_t > const& rre // sometime before do an explicit fetch. This is so that, it is possible raft channel has come ahead of data // channel and waiting for sometime avoid expensive fetch. On steady state, after a little bit of wait data // would be reached automatically. - RD_LOG(DEBUG, - "We haven't received data for {} out {} in reqs batch, will fetch and wait for {} ms, in_resync_mode()={} ", - only_wait_reqs.size(), rreqs.size(), timeout_ms, is_resync_mode()); + RD_LOGD(NO_TRACE_ID, + "We haven't received data for {} out of {} in reqs batch, will fetch and wait for {} ms, in_resync_mode()={} ", + only_wait_reqs.size(), rreqs.size(), timeout_ms, is_resync_mode()); // We are yet to support reactive fetch from remote.
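+ // Callers that need to know which requests were left unsatisfied pass a vector through timeout_rreqs, as the append-entries path in raft_event() does, e.g.: std::vector< repl_req_ptr_t > timed_out; if (!wait_for_data_receive(*reqs, HS_DYNAMIC_CONFIG(consensus.data_receive_timeout_ms), &timed_out)) { for (auto const& r : timed_out) { handle_error(r, ReplServiceError::TIMEOUT); } }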
if (is_resync_mode()) { - check_and_fetch_remote_data(std::move(only_wait_reqs)); + check_and_fetch_remote_data(only_wait_reqs); } else { - m_repl_svc.add_to_fetch_queue(shared_from_this(), std::move(only_wait_reqs)); + m_repl_svc.add_to_fetch_queue(shared_from_this(), only_wait_reqs); } // block waiting here until all the futs are ready (data channel filled in and promises are made); - auto all_futs = folly::collectAllUnsafe(futs).wait(std::chrono::milliseconds(timeout_ms)); - return (all_futs.isReady()); + auto all_futs_ready = folly::collectAllUnsafe(futs).wait(std::chrono::milliseconds(timeout_ms)).isReady(); + if (!all_futs_ready && timeout_rreqs != nullptr) { + timeout_rreqs->clear(); + for (size_t i{0}; i < futs.size(); ++i) { + if (!futs[i].isReady()) { timeout_rreqs->emplace_back(only_wait_reqs[i]); } + } + all_futs_ready = timeout_rreqs->empty(); + } + return all_futs_ready; } void RaftReplDev::check_and_fetch_remote_data(std::vector< repl_req_ptr_t > rreqs) { @@ -509,12 +1016,12 @@ void RaftReplDev::check_and_fetch_remote_data(std::vector< repl_req_ptr_t > rreq for (auto const& rreq : rreqs) { auto const cur_state = uint32_cast(rreq->state()); if (cur_state == uint32_cast(repl_req_state_t::ERRORED)) { - // We already received the data before, just ignore this data - RD_LOGD("Raft Channel: rreq=[{}] already errored out, ignoring the fetch", rreq->to_string()); + RD_LOGD(rreq->traceID(), "rreq=[{}] already errored out, ignoring the fetch", rreq->to_compact_string()); continue; } else if (cur_state == uint32_cast(repl_req_state_t::DATA_RECEIVED)) { // We already received the data before, just ignore this data - RD_LOGD("Raft Channel: Data already received for rreq=[{}], ignoring the fetch", rreq->to_string()); + RD_LOGD(rreq->traceID(), "Data already received for rreq=[{}], ignoring the fetch", + rreq->to_compact_string()); continue; } @@ -542,7 +1049,8 @@ void RaftReplDev::fetch_data_from_remote(std::vector< repl_req_ptr_t > rreqs) { entries.reserve(rreqs.size()); shared< flatbuffers::FlatBufferBuilder > builder = std::make_shared< flatbuffers::FlatBufferBuilder >(); - RD_LOGD("Data Channel : FetchData from remote: rreq.size={}, my server_id={}", rreqs.size(), server_id()); + RD_LOGD(NO_TRACE_ID, "Data Channel : FetchData from remote: rreq.size={}, my server_id={}", rreqs.size(), + server_id()); auto const& originator = rreqs.front()->remote_blkid().server_id; for (auto const& rreq : rreqs) { @@ -558,7 +1066,8 @@ void RaftReplDev::fetch_data_from_remote(std::vector< repl_req_ptr_t > rreqs) { RD_DBG_ASSERT_EQ(rreq->remote_blkid().server_id, originator, "Unexpected originator for rreq={}", rreq->to_string()); - RD_LOGT("Fetching data from originator={}, remote: rreq=[{}], remote_blkid={}, my server_id={}", originator, + RD_LOGT(rreq->traceID(), + "Fetching data from originator={}, remote: rreq=[{}], remote_blkid={}, my server_id={}", originator, rreq->to_string(), rreq->remote_blkid().blkid.to_string(), server_id()); } @@ -583,15 +1092,15 @@ void RaftReplDev::fetch_data_from_remote(std::vector< repl_req_ptr_t > rreqs) { auto const fetch_latency_us = get_elapsed_time_us(fetch_start_time); HISTOGRAM_OBSERVE(m_metrics, rreq_data_fetch_latency_us, fetch_latency_us); - RD_LOGD("Data Channel: FetchData from remote completed, time taken={} us", fetch_latency_us); + RD_LOGT(NO_TRACE_ID, "Data Channel: FetchData from remote completed, time taken={} us", fetch_latency_us); if (!response) { // if we are here, it means the original who sent the log entries are down. 
// we need to handle error and when the other member becomes leader, it will resend the log entries; - RD_LOG(ERROR, - "Not able to fetching data from originator={}, error={}, probably originator is down. Will " - "retry when new leader start appending log entries", - rreqs.front()->remote_blkid().server_id, response.error()); + RD_LOGE(NO_TRACE_ID, + "Not able to fetching data from originator={}, error={}, probably originator is down. Will " + "retry when new leader start appending log entries", + rreqs.front()->remote_blkid().server_id, response.error()); for (auto const& rreq : rreqs) { // TODO: Set the data_received promise with error, so that waiting threads can be unblocked and // reject the request. Without that, it will timeout and then reject it. @@ -619,13 +1128,14 @@ void RaftReplDev::fetch_data_from_remote(std::vector< repl_req_ptr_t > rreqs) { void RaftReplDev::on_fetch_data_received(intrusive< sisl::GenericRpcData >& rpc_data) { auto const& incoming_buf = rpc_data->request_blob(); if (!incoming_buf.cbytes()) { - RD_LOGW("Data Channel: PushData received with empty buffer, ignoring this call"); + RD_LOGW(NO_TRACE_ID, "Data Channel: PushData received with empty buffer, ignoring this call"); rpc_data->send_response(); return; } auto fetch_req = GetSizePrefixedFetchData(incoming_buf.cbytes()); - RD_LOGD("Data Channel: FetchData received: fetch_req.size={}", fetch_req->request()->entries()->size()); + RD_LOGT(NO_TRACE_ID, "Data Channel: FetchData received: fetch_req.size={}", + fetch_req->request()->entries()->size()); std::vector< sisl::sg_list > sgs_vec; std::vector< folly::Future< bool > > futs; @@ -636,33 +1146,29 @@ void RaftReplDev::on_fetch_data_received(intrusive< sisl::GenericRpcData >& rpc_ auto const& lsn = req->lsn(); auto const& originator = req->blkid_originator(); auto const& remote_blkid = req->remote_blkid(); - - // release this assert if in the future we want to fetch from non-originator; - RD_REL_ASSERT_EQ(originator, server_id(), - "Not expect to receive fetch data from remote when I am not the originator of this request"); - - // fetch data based on the remote_blkid - if (originator == server_id()) { - // We are the originator of the blkid, read data locally; - MultiBlkId local_blkid; - - // convert remote_blkid serialized data to local blkid - local_blkid.deserialize(sisl::blob{remote_blkid->Data(), remote_blkid->size()}, true /* copy */); - - RD_LOGD("Data Channel: FetchData received: dsn={} lsn={} my_blkid={}", req->dsn(), lsn, - local_blkid.to_string()); - - // prepare the sgs data buffer to read into; - auto const total_size = local_blkid.blk_count() * get_blk_size(); - sisl::sg_list sgs; - sgs.size = total_size; - sgs.iovs.emplace_back( - iovec{.iov_base = iomanager.iobuf_alloc(get_blk_size(), total_size), .iov_len = total_size}); - - // accumulate the sgs for later use (send back to the requester)); - sgs_vec.push_back(sgs); - futs.emplace_back(async_read(local_blkid, sgs, total_size)); + MultiBlkId local_blkid; + local_blkid.deserialize(sisl::blob{remote_blkid->Data(), remote_blkid->size()}, true /* copy */); + // prepare the sgs data buffer to read into; + auto const total_size = local_blkid.blk_count() * get_blk_size(); + sisl::sg_list sgs; + sgs.size = total_size; + sgs.iovs.emplace_back( + iovec{.iov_base = iomanager.iobuf_alloc(get_blk_size(), total_size), .iov_len = total_size}); + + // accumulate the sgs for later use (send back to the requester)); + sgs_vec.push_back(sgs); + + if (originator != server_id()) { + RD_LOGD(NO_TRACE_ID, 
"non-originator FetchData received: dsn={} lsn={} originator={}, my_server_id={}", + req->dsn(), lsn, originator, server_id()); + } else { + RD_LOGT(NO_TRACE_ID, "Data Channel: FetchData received: dsn={} lsn={}", req->dsn(), lsn); } + + auto const& header = req->user_header(); + sisl::blob user_header = sisl::blob{header->Data(), header->size()}; + RD_LOGT(NO_TRACE_ID, "Data Channel: FetchData handled, my_blkid={}", local_blkid.to_string()); + futs.emplace_back(std::move(m_listener->on_fetch_data(lsn, user_header, local_blkid, sgs))); } folly::collectAllUnsafe(futs).thenValue( @@ -677,7 +1183,7 @@ void RaftReplDev::on_fetch_data_received(intrusive< sisl::GenericRpcData >& rpc_ } } - RD_LOGD("Data Channel: FetchData data read completed for {} buffers", sgs_vec.size()); + RD_LOGT(NO_TRACE_ID, "Data Channel: FetchData data read completed for {} buffers", sgs_vec.size()); // now prepare the io_blob_list to response back to requester; nuraft_mesg::io_blob_list_t pkts = sisl::io_blob_list_t{}; @@ -709,7 +1215,7 @@ void RaftReplDev::handle_fetch_data_response(sisl::GenericClientResponse respons RD_DBG_ASSERT_GT(total_size, 0, "Empty response from remote"); RD_DBG_ASSERT(raw_data, "Empty response from remote"); - RD_LOGD("Data Channel: FetchData completed for {} requests", rreqs.size()); + RD_LOGD(NO_TRACE_ID, "Data Channel: FetchData completed for {} requests", rreqs.size()); for (auto const& rreq : rreqs) { auto const data_size = rreq->remote_blkid().blkid.blk_count() * get_blk_size(); @@ -720,8 +1226,9 @@ void RaftReplDev::handle_fetch_data_response(sisl::GenericClientResponse respons RD_DBG_ASSERT_EQ(data_size, local_size, "Data size mismatch for rreq={} remote size: {}, local size: {}", rreq->to_string(), data_size, local_size); - RD_LOGD("Data Channel: Data already received for rreq=[{}], skip and move on to next rreq.", - rreq->to_string()); + RD_LOGT(rreq->traceID(), + "Data Channel: Data already received for rreq=[{}], skip and move on to next rreq.", + rreq->to_compact_string()); } else { auto const data_write_start_time = Clock::now(); COUNTER_INCREMENT(m_metrics, total_write_cnt, 1); @@ -741,16 +1248,19 @@ void RaftReplDev::handle_fetch_data_response(sisl::GenericClientResponse respons RD_REL_ASSERT(!err, "Error in writing data"); // TODO: Find a way to return error to the Listener + rreq->release_data(); rreq->add_state(repl_req_state_t::DATA_WRITTEN); rreq->m_data_written_promise.setValue(); - RD_LOGD("Data Channel: Data Write completed rreq=[{}], data_write_latency_us={}, " + RD_LOGD(rreq->traceID(), + "Data Channel: Data Write completed rreq=[{}], data_write_latency_us={}, " "total_write_latency_us={}, write_num_pieces={}", - rreq->to_string(), data_write_latency, total_data_write_latency, write_num_pieces); + rreq->to_compact_string(), data_write_latency, total_data_write_latency, write_num_pieces); }); - RD_LOGD("Data Channel: Data fetched from remote: rreq=[{}], data_size: {}, total_size: {}, local_blkid: {}", - rreq->to_string(), data_size, total_size, rreq->local_blkid().to_string()); + RD_LOGT(rreq->traceID(), + "Data Channel: Data fetched from remote: rreq=[{}], data_size: {}, total_size: {}, local_blkid: {}", + rreq->to_compact_string(), data_size, total_size, rreq->local_blkid().to_string()); } raw_data += data_size; total_size -= data_size; @@ -770,24 +1280,46 @@ void RaftReplDev::commit_blk(repl_req_ptr_t rreq) { } } +void RaftReplDev::handle_rollback(repl_req_ptr_t rreq) { + // 1. 
call the listener to rollback + RD_LOGD(rreq->traceID(), "Rolling back rreq: {}", rreq->to_compact_string()); + m_listener->on_rollback(rreq->lsn(), rreq->header(), rreq->key(), rreq); + // 2. remove the request from maps + m_state_machine->unlink_lsn_to_req(rreq->lsn(), rreq); + m_repl_key_req_map.erase(rreq->rkey()); + + // 3. free the allocated blocks + if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { + auto blkid = rreq->local_blkid(); + data_service().async_free_blk(blkid).thenValue([this, blkid, rreq](auto&& err) { + HS_LOG_ASSERT(!err, "freeing blkid={} upon error failed, potential to cause blk leak", blkid.to_string()); + RD_LOGD(rreq->traceID(), "Releasing blkid={} freed successfully", blkid.to_string()); + }); + } +} + void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { - commit_blk(rreq); + if (!rreq->has_state(repl_req_state_t::DATA_COMMITTED)) { commit_blk(rreq); } // Remove the request from repl_key map. m_repl_key_req_map.erase(rreq->rkey()); // Remove the request from lsn map. - m_state_machine->unlink_lsn_to_req(rreq->lsn()); + m_state_machine->unlink_lsn_to_req(rreq->lsn(), rreq); auto cur_dsn = m_next_dsn.load(std::memory_order_relaxed); while (cur_dsn <= rreq->dsn()) { m_next_dsn.compare_exchange_strong(cur_dsn, rreq->dsn() + 1); } - RD_LOGD("Raft channel: Commit rreq=[{}]", rreq->to_string()); + RD_LOGD(rreq->traceID(), "Raft channel: Commit rreq=[{}]", rreq->to_compact_string()); if (rreq->op_code() == journal_type_t::HS_CTRL_DESTROY) { leave(); + } else if (rreq->op_code() == journal_type_t::HS_CTRL_START_REPLACE) { + start_replace_member(rreq); + } else if (rreq->op_code() == journal_type_t::HS_CTRL_COMPLETE_REPLACE) { + complete_replace_member(rreq); } else { - m_listener->on_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq->local_blkid(), rreq); + m_listener->on_commit(rreq->lsn(), rreq->header(), rreq->key(), {rreq->local_blkid()}, rreq); } if (!recovery) { @@ -796,23 +1328,55 @@ void RaftReplDev::handle_commit(repl_req_ptr_t rreq, bool recovery) { "Out of order commit of lsns, it is not expected in RaftReplDev. cur_lsns={}, prev_lsns={}", rreq->lsn(), prev_lsn); } - if (!rreq->is_proposer()) { rreq->clear(); } + + if (!rreq->is_proposer()) rreq->clear(); +} + +void RaftReplDev::handle_config_commit(const repl_lsn_t lsn, raft_cluster_config_ptr_t& new_conf) { + // when reaching here, the new config has already been applied to the cluster. + // since we didn't create repl req for config change, we just need to update m_commit_upto_lsn here. 
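+ // m_commit_upto_lsn only moves forward here: if a data log with a higher lsn has already been committed, the compare_exchange below fails and an error is logged instead of rolling the counter back.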
+ RD_LOGD(NO_TRACE_ID, "config commit on lsn {}", lsn); + // keep this variable in case it is needed later + (void) new_conf; + auto prev_lsn = m_commit_upto_lsn.load(std::memory_order_relaxed); + if (prev_lsn >= lsn || !m_commit_upto_lsn.compare_exchange_strong(prev_lsn, lsn)) { + RD_LOGE(NO_TRACE_ID, "Raft Channel: unexpected log {} commited before config {} committed", prev_lsn, lsn); + } +} + +void RaftReplDev::handle_config_rollback(const repl_lsn_t lsn, raft_cluster_config_ptr_t& conf) { + RD_LOGD(NO_TRACE_ID, "roll back config on lsn {}", lsn); + // keep this variable in case it is needed later + (void)conf; + m_listener->on_config_rollback(lsn); } void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) { if (err == ReplServiceError::OK) { return; } + RD_LOGE(rreq->traceID(), "Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); if (!rreq->add_state_if_not_already(repl_req_state_t::ERRORED)) { - RD_LOGE("Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); + RD_LOGE(rreq->traceID(), "Raft Channel: Error has been added for rreq=[{}] error={}", rreq->to_string(), err); return; } // Remove from the map and thus its no longer accessible from applier_create_req m_repl_key_req_map.erase(rreq->rkey()); - if (rreq->op_code() == journal_type_t::HS_DATA_INLINED) { + // Ensure non-volatile lsn not exist because handle_error should not be called after append entries. + auto exist_rreq = m_state_machine->lsn_to_req(rreq->lsn()); + if (exist_rreq != nullptr && !exist_rreq->is_volatile()) { + HS_REL_ASSERT(false, "Unexpected: LSN={} is already ready to commit, exist_rreq=[{}]", rreq->lsn(), + exist_rreq->to_string()); + } + if (err == ReplServiceError::DATA_DUPLICATED) { + RD_LOGE(rreq->traceID(), "Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); + m_listener->on_error(err, rreq->header(), rreq->key(), rreq); + rreq->clear(); + return; + } + if (rreq->op_code() == journal_type_t::HS_DATA_LINKED) { // Free the blks which is allocated already - RD_LOGE("Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { auto blkid = rreq->local_blkid(); data_service().async_free_blk(blkid).thenValue([blkid](auto&& err) { @@ -820,8 +1384,12 @@ void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) blkid.to_string()); }); } - } else if (rreq->op_code() == journal_type_t::HS_CTRL_DESTROY) { - if (rreq->is_proposer()) { m_destroy_promise.setValue(err); } + } else if (rreq->op_code() == journal_type_t::HS_CTRL_DESTROY || + rreq->op_code() == journal_type_t::HS_CTRL_COMPLETE_REPLACE) { + if (rreq->is_proposer()) { + RD_LOGE(rreq->traceID(), "Raft Channel: Error in processing rreq=[{}] error={}", rreq->to_string(), err); + m_destroy_promise.setValue(err); + } } // TODO: Validate if this is a correct assert or not. 
Is it possible that the log is already flushed and we receive @@ -836,6 +1404,35 @@ void RaftReplDev::handle_error(repl_req_ptr_t const& rreq, ReplServiceError err) rreq->clear(); } +void RaftReplDev::start_replace_member(repl_req_ptr_t rreq) { + auto members = r_cast< const replace_member_ctx* >(rreq->header().cbytes()); + + RD_LOGI(rreq->traceID(), "Raft repl start_replace_member commit member_out={} member_in={}", + boost::uuids::to_string(members->replica_out.id), boost::uuids::to_string(members->replica_in.id)); + + m_listener->on_start_replace_member(members->replica_out, members->replica_in, rreq->traceID()); + // record the replace_member intent + std::unique_lock lg{m_sb_mtx}; + m_rd_sb->replace_member_ctx.replica_in = members->replica_in.id; + m_rd_sb->replace_member_ctx.replica_out = members->replica_out.id; + m_rd_sb.write(); +} + +void RaftReplDev::complete_replace_member(repl_req_ptr_t rreq) { + auto members = r_cast< const replace_member_ctx* >(rreq->header().cbytes()); + + RD_LOGI(rreq->traceID(), "Raft repl complete_replace_member commit member_out={} member_in={}", + boost::uuids::to_string(members->replica_out.id), boost::uuids::to_string(members->replica_in.id)); + + m_listener->on_complete_replace_member(members->replica_out, members->replica_in, rreq->traceID()); + + // clear the replace_member intent + std::unique_lock lg{m_sb_mtx}; + m_rd_sb->replace_member_ctx = replace_member_ctx_superblk{}; + m_rd_sb.write(); + RD_LOGI(rreq->traceID(), "Raft repl replace_member_ctx has been cleared."); +} + static bool blob_equals(sisl::blob const& a, sisl::blob const& b) { if (a.size() != b.size()) { return false; } return (std::memcmp(a.cbytes(), b.cbytes(), a.size()) == 0); @@ -848,11 +1445,15 @@ repl_req_ptr_t RaftReplDev::repl_key_to_req(repl_key const& rkey) const { } folly::Future< std::error_code > RaftReplDev::async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, - bool part_of_batch) { + bool part_of_batch, trace_id_t tid) { + if (is_stopping()) { + LOGINFO("repl dev is being shutdown!"); + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); + } return data_service().async_read(bid, sgs, size, part_of_batch); } -void RaftReplDev::async_free_blks(int64_t, MultiBlkId const& bid) { +folly::Future< std::error_code > RaftReplDev::async_free_blks(int64_t, MultiBlkId const& bid, trace_id_t tid) { // TODO: For timeline consistency required, we should retain the blkid that is changed and write that to another // journal. 
data_service().async_free_blk(bid); @@ -861,7 +1462,8 @@ void RaftReplDev::async_free_blks(int64_t, MultiBlkId const& bid) { AsyncReplResult<> RaftReplDev::become_leader() { return m_msg_mgr.become_leader(m_group_id).via(&folly::InlineExecutor::instance()).thenValue([this](auto&& e) { if (e.hasError()) { - RD_LOGE("Error in becoming leader: {}", e.error()); + RD_LOGE(NO_TRACE_ID, "Error in becoming leader: {}", e.error()); + decr_pending_request_num(); return make_async_error<>(RaftReplService::to_repl_error(e.error())); } return make_async_success<>(); @@ -882,11 +1484,42 @@ std::vector< peer_info > RaftReplDev::get_replication_status() const { for (auto const& pinfo : rep_status) { pi.emplace_back(peer_info{.id_ = boost::lexical_cast< replica_id_t >(pinfo.id_), .replication_idx_ = pinfo.last_log_idx_, - .last_succ_resp_us_ = pinfo.last_succ_resp_us_}); + .last_succ_resp_us_ = pinfo.last_succ_resp_us_, + .priority_ = pinfo.priority_, + .can_vote = !pinfo.is_learner_}); } return pi; } +std::set< replica_id_t > RaftReplDev::get_active_peers() const { + auto repl_status = get_replication_status(); + std::set< replica_id_t > res; + auto my_committed_idx = m_commit_upto_lsn.load(); + auto laggy=HS_DYNAMIC_CONFIG(consensus.laggy_threshold); + uint64_t least_active_repl_idx = my_committed_idx > HS_DYNAMIC_CONFIG(consensus.laggy_threshold) + ? my_committed_idx - HS_DYNAMIC_CONFIG(consensus.laggy_threshold) + : 0; + // peer's last log idx should also >= leader's start_index-1(ensure existence), otherwise leader can't append log entries to it + // and baseline resync will be triggerred. Try to avoid conflict between baseline resync and normal replication. + least_active_repl_idx = std::max(least_active_repl_idx, m_data_journal->start_index() - 1); + for (auto p : repl_status) { + if (p.id_ == m_my_repl_id) { continue; } + if (p.replication_idx_ >= least_active_repl_idx) { + res.insert(p.id_); + RD_LOGT(NO_TRACE_ID, + "Found active peer {}, lag {}, my lsn {}, peer lsn {}, least_active_repl_idx {}, laggy={}", p.id_, + my_committed_idx - p.replication_idx_, my_committed_idx, p.replication_idx_, least_active_repl_idx, + laggy); + } else { + RD_LOGW(NO_TRACE_ID, + "Excluding peer {} from active_peers, lag {}, my lsn {}, peer lsn {}, least_active_repl_idx {}", + p.id_, my_committed_idx - p.replication_idx_, my_committed_idx, p.replication_idx_, + least_active_repl_idx); + } + } + return res; +} + uint32_t RaftReplDev::get_blk_size() const { return data_service().get_blk_size(); } nuraft_mesg::repl_service_ctx* RaftReplDev::group_msg_service() { return m_repl_svc_ctx.get(); } @@ -960,8 +1593,8 @@ nuraft::ptr< nuraft::cluster_config > RaftReplDev::load_config() { if (!js.contains("config")) { auto cluster_conf = nuraft::cs_new< nuraft::cluster_config >(); - cluster_conf->get_servers().push_back( - nuraft::cs_new< nuraft::srv_config >(m_raft_server_id, my_replica_id_str())); + cluster_conf->get_servers().push_back(nuraft::cs_new< nuraft::srv_config >( + m_raft_server_id, 0, my_replica_id_str(), "", false, raft_leader_priority)); js["config"] = serialize_cluster_config(*cluster_conf); } return deserialize_cluster_config(js["config"]); @@ -971,12 +1604,17 @@ void RaftReplDev::save_config(const nuraft::cluster_config& config) { std::unique_lock lg{m_config_mtx}; (*m_raft_config_sb)["config"] = serialize_cluster_config(config); m_raft_config_sb.write(); + RD_LOGI(NO_TRACE_ID, "Saved config {}", (*m_raft_config_sb)["config"].dump()); } void RaftReplDev::save_state(const nuraft::srv_state& state) { 
std::unique_lock lg{m_config_mtx}; - (*m_raft_config_sb)["state"] = nlohmann::json{{"term", state.get_term()}, {"voted_for", state.get_voted_for()}}; + (*m_raft_config_sb)["state"] = nlohmann::json{{"term", state.get_term()}, + {"voted_for", state.get_voted_for()}, + {"election_timer_allowed", state.is_election_timer_allowed()}, + {"catching_up", state.is_catching_up()}}; m_raft_config_sb.write(); + RD_LOGI(NO_TRACE_ID, "Saved state {}", (*m_raft_config_sb)["state"].dump()); } nuraft::ptr< nuraft::srv_state > RaftReplDev::read_state() { @@ -984,11 +1622,16 @@ nuraft::ptr< nuraft::srv_state > RaftReplDev::read_state() { auto& js = *m_raft_config_sb; auto state = nuraft::cs_new< nuraft::srv_state >(); if (js["state"].empty()) { - js["state"] = nlohmann::json{{"term", state->get_term()}, {"voted_for", state->get_voted_for()}}; + js["state"] = nlohmann::json{{"term", state->get_term()}, + {"voted_for", state->get_voted_for()}, + {"election_timer_allowed", state->is_election_timer_allowed()}, + {"catching_up", state->is_catching_up()}}; } else { try { state->set_term(uint64_cast(js["state"]["term"])); state->set_voted_for(static_cast< int >(js["state"]["voted_for"])); + state->allow_election_timer(static_cast< bool >(js["state"]["election_timer_allowed"])); + state->set_catching_up(static_cast< bool >(js["state"]["catching_up"])); } catch (std::out_of_range const&) { LOGWARN("State data was not in the expected format [group_id={}]!", m_group_id) } @@ -1000,7 +1643,7 @@ nuraft::ptr< nuraft::log_store > RaftReplDev::load_log_store() { return m_data_j int32_t RaftReplDev::server_id() { return m_raft_server_id; } -bool RaftReplDev::is_destroy_pending() const { return (m_rd_sb->destroy_pending == 0x1); } +bool RaftReplDev::is_destroy_pending() const { return (*m_stage.access().get() == repl_dev_stage_t::DESTROYED); } bool RaftReplDev::is_destroyed() const { return (*m_stage.access().get() == repl_dev_stage_t::PERMANENT_DESTROYED); } /////////////////////////////////// nuraft_mesg::mesg_state_mgr overrides //////////////////////////////////// @@ -1013,88 +1656,218 @@ uint32_t RaftReplDev::get_logstore_id() const { return m_data_journal->logstore_ std::shared_ptr< nuraft::state_machine > RaftReplDev::get_state_machine() { return m_state_machine; } void RaftReplDev::permanent_destroy() { - RD_LOGI("Permanent destroy for raft repl dev"); - m_rd_sb.destroy(); + RD_LOGI(NO_TRACE_ID, "Permanent destroy for raft repl dev group_id={}", group_id_str()); + // let the listener know at first, so that they can cleanup persistent structures before raft repl dev is destroyed + m_listener->on_destroy(group_id()); m_raft_config_sb.destroy(); m_data_journal->remove_store(); logstore_service().destroy_log_dev(m_data_journal->logdev_id()); m_stage.update([](auto* stage) { *stage = repl_dev_stage_t::PERMANENT_DESTROYED; }); + + // we should destroy repl_dev superblk only after all the resources are cleaned up, so that is crash recovery + // occurs, we have a chance to find the stale repl_dev and reclaim all the stale resources. + m_rd_sb.destroy(); } void RaftReplDev::leave() { + // this will be called in 3 cases : + // 1. commit log entry of journal_type_t::HS_CTRL_DESTROY + // 2. it is removed from the cluster and the new config(excluding this node) is being committed on this node + // 3. it is removed from the cluster , but the node is down and new config log(excluding this node) is not + // replicated to this removed node. 
when the node restarts, leader will not send any append entry to this node, + // since it is not a member of the raft group. it will become a candidate and send request-vote requests to other + // members of this raft group. a member will send RemovedFromCluster to the node if this member finds the node + // is no longer a member of the raft group. + + // leave() will never be called concurrently, since config change and journal_type_t::HS_CTRL_DESTROY are all log + // entries, which will be committed sequentially. + if (is_destroy_pending()) return; + + // We update that this repl_dev is in destroyed state, actual clean up of resources happens in reaper thread later m_stage.update([](auto* stage) { *stage = repl_dev_stage_t::DESTROYED; }); m_destroyed_time = Clock::now(); - // We let the listener know right away, so that they can cleanup persistent structures soonest. This will - // reduce the time window of leaked resources if any - m_listener->on_destroy(); - // Persist that destroy pending in superblk, so that in case of crash before cleanup of resources, it can be done // post restart. m_rd_sb->destroy_pending = 0x1; m_rd_sb.write(); - RD_LOGI("RaftReplDev leave group"); + RD_LOGI(NO_TRACE_ID, "RaftReplDev leave group_id={}", group_id_str()); m_destroy_promise.setValue(ReplServiceError::OK); // In case proposer is waiting for the destroy to complete } -std::pair< bool, nuraft::cb_func::ReturnCode > RaftReplDev::handle_raft_event(nuraft::cb_func::Type type, - nuraft::cb_func::Param* param) { +nuraft::cb_func::ReturnCode RaftReplDev::raft_event(nuraft::cb_func::Type type, nuraft::cb_func::Param* param) { auto ret = nuraft::cb_func::ReturnCode::Ok; - if (type == nuraft::cb_func::Type::GotAppendEntryReqFromLeader) { + switch (type) { + case nuraft::cb_func::Type::GotAppendEntryReqFromLeader: { auto raft_req = r_cast< nuraft::req_msg* >(param->ctx); auto const& entries = raft_req->log_entries(); auto start_lsn = raft_req->get_last_log_idx() + 1; - RD_LOGD("Raft channel: Received {} append entries on follower from leader, term {}, lsn {} ~ {} , my commited " "lsn {} , leader commmited lsn {}", + if (entries.size() == 0) { + RD_LOGT(NO_TRACE_ID, "Raft channel: Received no entry, leader committed lsn {}", + raft_req->get_commit_idx()); + return ret; + } + RD_LOGT(NO_TRACE_ID, + "Raft channel: Received {} append entries on follower from leader, term {}, lsn {} ~ {} , my " + "committed lsn {} , leader committed lsn {}", entries.size(), raft_req->get_last_log_term(), start_lsn, start_lsn + entries.size() - 1, m_commit_upto_lsn.load(), raft_req->get_commit_idx()); - if (!entries.empty()) { - RD_LOGT("Raft channel: Received {} append entries on follower from leader, localizing them", - entries.size()); - - auto reqs = sisl::VectorPool< repl_req_ptr_t >::alloc(); - for (auto& entry : entries) { - if (entry->get_val_type() != nuraft::log_val_type::app_log) { continue; } - if (entry->get_buf_ptr()->size() == 0) { continue; } - auto req = m_state_machine->localize_journal_entry_prepare(*entry); - if (req == nullptr) { - sisl::VectorPool< repl_req_ptr_t >::free(reqs); - return {true, nuraft::cb_func::ReturnCode::ReturnNull}; - } - reqs->emplace_back(std::move(req)); + auto reqs = sisl::VectorPool< repl_req_ptr_t >::alloc(); + auto last_commit_lsn = uint64_cast(get_last_commit_lsn()); + for (unsigned long i = 0; i < entries.size(); i++) { + auto& entry = entries[i]; + auto lsn = start_lsn + i; + auto term = entry->get_term(); + if (entry->get_val_type() != nuraft::log_val_type::app_log) { continue; } + if
(entry->get_buf_ptr()->size() == 0) { continue; } + // skipping localize for already committed log(dup), they anyway will be discarded + // by nuraft before append_log. + if (lsn <= last_commit_lsn) { + RD_LOGT(NO_TRACE_ID, "Raft channel: term {}, lsn {}, skipping dup, last_commit_lsn {}", term, lsn, + last_commit_lsn); + continue; } + // Those LSNs already in logstore but not yet committed, will be deduped here, + // applier_create_req will return same req as previous one + auto req = m_state_machine->localize_journal_entry_prepare(*entry, lsn); + if (req == nullptr) { + sisl::VectorPool< repl_req_ptr_t >::free(reqs); + // The hint set here will be used by the next after next appendEntry, the next one + // always goes with -1 from NuRaft code. + // + // We are rejecting this log entry, meaning we can accept previous log entries. + // If there is nothing we can accept (i==0), that means we are waiting for commit + // of previous lsn, set it to 1 in this case. + m_state_machine->reset_next_batch_size_hint(std::max(1ul, i)); + return nuraft::cb_func::ReturnCode::ReturnNull; + } + report_blk_metrics_if_needed(req); + reqs->emplace_back(std::move(req)); + } - // Wait till we receive the data from its originator for all the requests - if (!wait_for_data_receive(*reqs, HS_DYNAMIC_CONFIG(consensus.data_receive_timeout_ms))) { - for (auto const& rreq : *reqs) { - handle_error(rreq, ReplServiceError::TIMEOUT); - } - ret = nuraft::cb_func::ReturnCode::ReturnNull; + // Wait till we receive the data from its originator for all the requests + std::vector< repl_req_ptr_t > timeout_rreqs; + if (!wait_for_data_receive(*reqs, HS_DYNAMIC_CONFIG(consensus.data_receive_timeout_ms), &timeout_rreqs)) { + for (auto const& rreq : timeout_rreqs) { + handle_error(rreq, ReplServiceError::TIMEOUT); } - sisl::VectorPool< repl_req_ptr_t >::free(reqs); + ret = nuraft::cb_func::ReturnCode::ReturnNull; } - return {true, ret}; - } else { - return {false, ret}; + sisl::VectorPool< repl_req_ptr_t >::free(reqs); + if (ret == nuraft::cb_func::ReturnCode::Ok) { m_state_machine->inc_next_batch_size_hint(); } + return ret; + } + case nuraft::cb_func::Type::JoinedCluster: + RD_LOGD(NO_TRACE_ID, "Raft channel: Received JoinedCluster, implies become_follower"); + become_follower_cb(); + return nuraft::cb_func::ReturnCode::Ok; + case nuraft::cb_func::Type::BecomeFollower: { + RD_LOGD(NO_TRACE_ID, "Raft channel: Received BecomeFollower"); + become_follower_cb(); + return nuraft::cb_func::ReturnCode::Ok; + } + case nuraft::cb_func::Type::BecomeLeader: { + RD_LOGD(NO_TRACE_ID, "Raft channel: Received BecomeLeader"); + become_leader_cb(); + return nuraft::cb_func::ReturnCode::Ok; + } + + // RemovedFromCluster will be handled in nuraft_mesg::generic_raft_event_handler where leave() is called + + // TODO: Add more type handlers if necessary + default: + break; } + return nuraft::cb_func::ReturnCode::Ok; } void RaftReplDev::flush_durable_commit_lsn() { auto const lsn = m_commit_upto_lsn.load(); + m_listener->notify_committed_lsn(lsn); + + if (is_destroyed()) { + RD_LOGI(NO_TRACE_ID, "Raft repl dev is destroyed, ignore flush durable commit lsn"); + return; + } + + RD_LOGT(NO_TRACE_ID, "Flushing durable commit lsn to {}", lsn); std::unique_lock lg{m_sb_mtx}; m_rd_sb->durable_commit_lsn = lsn; m_rd_sb.write(); } +void RaftReplDev::check_replace_member_status() { + if (is_destroyed()) { + RD_LOGI(NO_TRACE_ID, "Raft repl dev is destroyed, ignore check replace member status"); + return; + } + if (!m_repl_svc_ctx || !is_leader()) { return; } + if
(m_rd_sb->replace_member_ctx.replica_in == boost::uuids::nil_uuid() || + m_rd_sb->replace_member_ctx.replica_out == boost::uuids::nil_uuid()) { + RD_LOGT(NO_TRACE_ID, "No replace member in progress, return"); + return; + } + + auto peers = get_replication_status(); + auto replica_in = m_rd_sb->replace_member_ctx.replica_in; + auto replica_out = m_rd_sb->replace_member_ctx.replica_out; + repl_lsn_t in_lsn = 0; + repl_lsn_t out_lsn = 0; + repl_lsn_t laggy = HS_DYNAMIC_CONFIG(consensus.laggy_threshold); + + for (auto& peer : peers) { + if (peer.id_ == replica_out) { + out_lsn = peer.replication_idx_; + RD_LOGD(NO_TRACE_ID, "Replica out {} with lsn {}", boost::uuids::to_string(peer.id_), out_lsn); + } else if (peer.id_ == replica_in) { + in_lsn = peer.replication_idx_; + RD_LOGD(NO_TRACE_ID, "Replica in {} with lsn {}", boost::uuids::to_string(peer.id_), in_lsn); + } + } + // TODO optimize the condition + bool catch_up = in_lsn + laggy >= out_lsn; + + if (!catch_up) { + RD_LOGD(NO_TRACE_ID, "Checking replace member status, replica_in={} with lsn={}, replica_out={} with lsn={}", + boost::uuids::to_string(replica_in), in_lsn, boost::uuids::to_string(replica_out), out_lsn); + return; + } + + RD_LOGD(NO_TRACE_ID, + "Checking replace member status, new member has caught up, replica_in={} with lsn={}, replica_out={} with " + "lsn={}", + boost::uuids::to_string(replica_in), in_lsn, boost::uuids::to_string(replica_out), out_lsn); + + trace_id_t trace_id = generateRandomTraceId(); + + RD_LOGD(trace_id, "Trigger complete_replace_member, replica_in={}, replica_out={}", + boost::uuids::to_string(replica_in), boost::uuids::to_string(replica_out)); + + replica_member_info out{replica_out, ""}; + replica_member_info in{replica_in, ""}; + auto ret = complete_replace_member(out, in, 0, trace_id).get(); + if (ret.hasError()) { + RD_LOGE(trace_id, "Failed to complete replace member, next time will retry it, error={}", ret.error()); + return; + } + RD_LOGI(trace_id, "Complete replace member, replica_in={}, replica_out={}", + boost::uuids::to_string(replica_in), boost::uuids::to_string(replica_out)) +} + /////////////////////////////////// Private metohds //////////////////////////////////// -void RaftReplDev::cp_flush(CP* cp) { - auto const lsn = m_commit_upto_lsn.load(); - auto const clsn = m_compact_lsn.load(); +void RaftReplDev::cp_flush(CP* cp, cshared< ReplDevCPContext > ctx) { + if (is_destroyed()) { + RD_LOGI(NO_TRACE_ID, "Raft repl dev is destroyed, ignore cp flush"); + return; + } + + auto const lsn = ctx->cp_lsn; + auto const clsn = ctx->compacted_to_lsn; + auto const dsn = ctx->last_applied_dsn; if (lsn == m_last_flushed_commit_lsn) { // Not dirtied since last flush ignore @@ -1103,58 +1876,116 @@ void RaftReplDev::cp_flush(CP* cp) { std::unique_lock lg{m_sb_mtx}; m_rd_sb->compact_lsn = clsn; - m_rd_sb->durable_commit_lsn = lsn; + // dc_lsn is also flushed in flush_durable_commit_lsn() + // we need to take a max to avoid rolling back. 
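// Illustrative standalone sketch, not part of the patch: why the durable commit LSN is
// persisted with std::max here. Two flush paths (the CP flush in this function and the
// periodic flush_durable_commit_lsn()) both write the field, and the CP context captures
// its LSN earlier, so it may be older than what the timer path already persisted. Taking
// the max keeps the on-disk value monotonic. FakeSuperblk is a made-up stand-in.
#include <algorithm>
#include <cassert>
#include <cstdint>

struct FakeSuperblk { int64_t durable_commit_lsn{0}; };

void persist_durable_commit_lsn(FakeSuperblk& sb, int64_t candidate_lsn) {
    // Never move the persisted LSN backwards, regardless of which path runs last.
    sb.durable_commit_lsn = std::max(candidate_lsn, sb.durable_commit_lsn);
}

int main() {
    FakeSuperblk sb;
    persist_durable_commit_lsn(sb, 100);   // timer path already flushed up to 100
    persist_durable_commit_lsn(sb, 80);    // an older CP context arrives later
    assert(sb.durable_commit_lsn == 100);  // no rollback of the durable LSN
    return 0;
}
// end of sketch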
+ m_rd_sb->durable_commit_lsn = std::max(lsn, m_rd_sb->durable_commit_lsn); m_rd_sb->checkpoint_lsn = lsn; - m_rd_sb->last_applied_dsn = m_next_dsn.load(); + m_rd_sb->last_applied_dsn = dsn; m_rd_sb.write(); m_last_flushed_commit_lsn = lsn; - RD_LOGD("cp flush in raft repl dev, lsn={}, clsn={}, next_dsn={}, cp string:{}", lsn, clsn, m_next_dsn.load(), - cp->to_string()); + RD_LOGD(NO_TRACE_ID, "cp flush in raft repl dev, lsn={}, clsn={}, next_dsn={}, cp string:{}", lsn, clsn, + m_next_dsn.load(), cp->to_string()); +} + +cshared< ReplDevCPContext > RaftReplDev::get_cp_ctx(CP* cp) { + auto const cp_lsn = m_commit_upto_lsn.load(); + auto const clsn = m_compact_lsn.load(); + auto const dsn = m_next_dsn.load(); + + RD_LOGD(NO_TRACE_ID, "getting cp_ctx for raft repl dev {}, cp_lsn={}, clsn={}, next_dsn={}, cp string:{}", + (void*)this, cp_lsn, clsn, dsn, cp->to_string()); + auto dev_ctx = std::make_shared< ReplDevCPContext >(); + dev_ctx->cp_lsn = cp_lsn; + dev_ctx->compacted_to_lsn = clsn; + dev_ctx->last_applied_dsn = dsn; + return dev_ctx; } void RaftReplDev::cp_cleanup(CP*) {} void RaftReplDev::gc_repl_reqs() { - std::vector< int64_t > expired_keys; - m_state_machine->iterate_repl_reqs([this, &expired_keys](auto key, auto rreq) { + auto cur_dsn = m_next_dsn.load(); + if (cur_dsn != 0) cur_dsn = cur_dsn - 1; + // On follower, DSN below cur_dsn should very likely be commited. + // It is not guaranteed because DSN and LSN are generated separately, + // DSN in async_alloc_write before pushing data, LSN later when + // proposing to raft. Two simultaneous write requests on leader can have + // and during the window. + std::vector< repl_req_ptr_t > expired_rreqs; + + auto req_map_size = m_repl_key_req_map.size(); + RD_LOGI(NO_TRACE_ID, "m_repl_key_req_map size is {};", req_map_size); + for (auto [key, rreq] : m_repl_key_req_map) { + // FIXME: Skipping proposer for now, the DSN in proposer increased in proposing stage, not when commit(). + // Need other mechanism. + if (rreq->is_proposer()) { + // don't clean up proposer's request + continue; + } + if (rreq->dsn() < cur_dsn && rreq->is_expired()) { + // The DSN can be out of order, wait till rreq expired. + RD_LOGD(rreq->traceID(), + "legacy req with commited DSN, rreq=[{}] , dsn = {}, next_dsn = {}, gap= {}, elapsed_time_sec {}", + rreq->to_string(), rreq->dsn(), cur_dsn, cur_dsn - rreq->dsn(), + get_elapsed_time_sec(rreq->created_time())); + expired_rreqs.push_back(rreq); + } + } + int sm_req_cnt = 0; + // FIXME: we ensured data written before appending log to log store, in which we add rreq to state_machine + // and during pre-commit/commit we retrieve rreq from state_machine. Removing requests outside of state + // machine is risky. + // Below logs are logging only, can be removed once we get more confidence. + m_state_machine->iterate_repl_reqs([this, cur_dsn, &sm_req_cnt](auto key, auto rreq) { + sm_req_cnt++; if (rreq->is_proposer()) { // don't clean up proposer's request return; } - if (rreq->is_expired()) { - expired_keys.push_back(key); - RD_LOGD("rreq=[{}] is expired, cleaning up; elapsed_time_sec{};", rreq->to_string(), + RD_LOGD(rreq->traceID(), "StateMachine: rreq=[{}] is expired, elapsed_time_sec{};", rreq->to_string(), get_elapsed_time_sec(rreq->created_time())); - - // do garbage collection - // 1. 
free the allocated blocks - if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { - auto blkid = rreq->local_blkid(); - data_service().async_free_blk(blkid).thenValue([this, blkid](auto&& err) { - HS_LOG_ASSERT(!err, "freeing blkid={} upon error failed, potential to cause blk leak", - blkid.to_string()); - RD_LOGD("blkid={} freed successfully", blkid.to_string()); - }); - } - - // 2. remove from the m_repl_key_req_map - // handle_error during fetch data response might have already removed the rreq from the this map - if (m_repl_key_req_map.find(rreq->rkey()) != m_repl_key_req_map.end()) { - m_repl_key_req_map.erase(rreq->rkey()); - } } }); + RD_LOGT(NO_TRACE_ID, "state_machine req map size is {};", sm_req_cnt); - for (auto const& l : expired_keys) { - m_state_machine->unlink_lsn_to_req(l); + for (auto removing_rreq : expired_rreqs) { + // once log flushed, the commit progress controlled by raft + if (removing_rreq->has_state(repl_req_state_t::LOG_FLUSHED)) { + RD_LOGT(removing_rreq->traceID(), "Skipping GC rreq [{}] because it is in state machine", + removing_rreq->to_string()); + continue; + } + // do garbage collection + // 1. free the allocated blocks + RD_LOGD(removing_rreq->traceID(), "Removing rreq [{}]", removing_rreq->to_string()); + if (removing_rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { + auto blkid = removing_rreq->local_blkid(); + data_service().async_free_blk(blkid).thenValue([this, blkid, removing_rreq](auto&& err) { + HS_LOG_ASSERT(!err, "freeing blkid={} upon error failed, potential to cause blk leak", + blkid.to_string()); + RD_LOGD(removing_rreq->traceID(), "GC rreq: Releasing blkid={} freed successfully", blkid.to_string()); + }); + } + // 2. remove from the m_repl_key_req_map + if (m_repl_key_req_map.find(removing_rreq->rkey()) != m_repl_key_req_map.end()) { + m_repl_key_req_map.erase(removing_rreq->rkey()); + } } } +void RaftReplDev::set_log_store_last_durable_lsn(store_lsn_t lsn) { m_data_journal->set_last_durable_lsn(lsn); } + void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx) { auto repl_lsn = to_repl_lsn(lsn); + if (need_skip_processing(repl_lsn)) { + RD_LOGI(NO_TRACE_ID, + "Raft Channel: Log {} is outdated and will be handled by baseline resync. Ignoring replay.", lsn); + return; + } + // apply the log entry if the lsn is between checkpoint lsn and durable commit lsn - if (repl_lsn < m_rd_sb->checkpoint_lsn) { return; } + if (repl_lsn <= m_rd_sb->checkpoint_lsn) { return; } // 1. 
Get the log entry and prepare rreq auto const lentry = to_nuraft_log_entry(buf); @@ -1166,8 +1997,9 @@ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx RELEASE_ASSERT_EQ(jentry->major_version, repl_journal_entry::JOURNAL_ENTRY_MAJOR, "Mismatched version of journal entry received from RAFT peer"); - RD_LOGT("Raft Channel: Applying Raft log_entry upon recovery: server_id={}, term={}, journal_entry=[{}] ", - jentry->server_id, lentry->get_term(), jentry->to_string()); + RD_LOGT(jentry->traceID, + "Raft Channel: Applying Raft log_entry upon recovery: server_id={}, term={}, lsn={}, journal_entry=[{}] ", + jentry->server_id, lentry->get_term(), repl_lsn, jentry->to_string()); auto entry_to_hdr = [](repl_journal_entry* jentry) { return sisl::blob{uintptr_cast(jentry) + sizeof(repl_journal_entry), jentry->user_header_size}; @@ -1184,7 +2016,8 @@ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx jentry->value_size}; }; - repl_key const rkey{.server_id = jentry->server_id, .term = lentry->get_term(), .dsn = jentry->dsn}; + repl_key const rkey{ + .server_id = jentry->server_id, .term = lentry->get_term(), .dsn = jentry->dsn, .traceID = jentry->traceID}; auto const [it, happened] = m_repl_key_req_map.try_emplace(rkey, repl_req_ptr_t(new repl_req_ctx())); RD_DBG_ASSERT((it != m_repl_key_req_map.end()), "Unexpected error in map_repl_key_to_req"); @@ -1192,33 +2025,97 @@ void RaftReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx RD_DBG_ASSERT(happened, "rreq already exists for rkey={}", rkey.to_string()); uint32_t data_size{0u}; + // If the data is linked and value_size is non-zero, it means blks have been allocated for data. + // Since the log is flushed after data is written, the data has already been received. if ((jentry->code == journal_type_t::HS_DATA_LINKED) && (jentry->value_size > 0)) { MultiBlkId entry_blkid; entry_blkid.deserialize(entry_to_val(jentry), true /* copy */); data_size = entry_blkid.blk_count() * get_blk_size(); - rreq->set_local_blkid(entry_blkid); + rreq->set_local_blkids({entry_blkid}); + rreq->add_state(repl_req_state_t::BLK_ALLOCATED); + rreq->add_state(repl_req_state_t::DATA_RECEIVED); } rreq->set_lsn(repl_lsn); // keep lentry in scope for the lyfe cycle of the rreq rreq->set_lentry(lentry); - rreq->init(rkey, jentry->code, false /* is_proposer */, entry_to_hdr(jentry), entry_to_key(jentry), data_size); - RD_LOGD("Replay log on restart, rreq=[{}]", rreq->to_string()); + auto status = init_req_ctx(rreq, rkey, jentry->code, false /* is_proposer */, entry_to_hdr(jentry), + entry_to_key(jentry), data_size, m_listener); + if (status != ReplServiceError::OK) { + RD_LOGE(jentry->traceID, "Initializing rreq failed, rreq=[{}], error={}", rreq->to_string(), status); + } + + // we load the log from log device, implies log flushed. We only flush log after data is written to data device. + rreq->add_state(repl_req_state_t::DATA_WRITTEN); + rreq->add_state(repl_req_state_t::LOG_RECEIVED); + rreq->add_state(repl_req_state_t::LOG_FLUSHED); + RD_LOGD(rreq->traceID(), "Replay log on restart, rreq=[{}]", rreq->to_string()); + // 2. Pre-commit the log entry as in nuraft pre-commit was called once log appended to logstore. + m_listener->on_pre_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq); + + // LSN above dc_lsn we forgot their states, they can either + // a. be committed before, but DC_LSN not yet flushed + // b. 
not yet committed, might be committed or rollback if (repl_lsn > m_rd_sb->durable_commit_lsn) { // In memory state of these blks is lost. Commit them now to avoid usage of same blk twice. commit_blk(rreq); + // add rreq to state machine, state-machine will decide to commit or rollback this rreq. m_state_machine->link_lsn_to_req(rreq, int64_cast(repl_lsn)); return; } - // 2. Pre-commit the log entry - m_listener->on_pre_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq); - // 3. Commit the log entry handle_commit(rreq, true /* recovery */); } +void RaftReplDev::create_snp_resync_data(raft_buf_ptr_t& data_out) { + snp_repl_dev_data msg; + auto msg_size = sizeof(snp_repl_dev_data); + msg.dsn = m_next_dsn; + auto crc = crc32_ieee(init_crc32, reinterpret_cast< const unsigned char* >(&msg), msg_size); + RD_LOGD(NO_TRACE_ID, "create snapshot resync msg, dsn={}, crc={}", msg.dsn, crc); + msg.crc = crc; + data_out = nuraft::buffer::alloc(msg_size); + std::memcpy(data_out->data_begin(), &msg, msg_size); +} + +bool RaftReplDev::save_snp_resync_data(nuraft::buffer& data, nuraft::snapshot& s) { + auto msg = r_cast< snp_repl_dev_data* >(data.data_begin()); + if (msg->magic_num != HOMESTORE_RESYNC_DATA_MAGIC || + msg->protocol_version != HOMESTORE_RESYNC_DATA_PROTOCOL_VERSION_V1) { + RD_LOGE(NO_TRACE_ID, "Snapshot resync data validation failed, magic={}, version={}", msg->magic_num, + msg->protocol_version); + return false; + } + auto received_crc = msg->crc; + RD_LOGD(NO_TRACE_ID, "received snapshot resync msg, dsn={}, crc={}, received crc={}", msg->dsn, msg->crc, + received_crc); + // Clear the crc field before verification, because the crc value computed by leader doesn't contain it. + msg->crc = 0; + auto computed_crc = + crc32_ieee(init_crc32, reinterpret_cast< const unsigned char* >(msg), sizeof(snp_repl_dev_data)); + if (received_crc != computed_crc) { + RD_LOGE(NO_TRACE_ID, "Snapshot resync data crc mismatch, received_crc={}, computed_crc={}", received_crc, + computed_crc); + return false; + } + { + // Save last_snapshot_lsn, so that we can skip the replay/commit operation for logs included in baseline resync. 
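// Illustrative standalone sketch, not part of the patch: the checksum-with-field-zeroed
// pattern used by the resync message above. The sender computes the checksum while the
// crc field is still zero and only then fills it in, so the receiver must clear the field
// again before recomputing. A trivial byte-sum stands in for crc32_ieee, and the struct
// layout here is a placeholder, not the real snp_repl_dev_data.
#include <cassert>
#include <cstdint>
#include <cstring>

struct ResyncMsg { uint64_t dsn; uint32_t crc; };

static uint32_t toy_checksum(const void* p, std::size_t n) {
    uint32_t sum = 0;
    const unsigned char* b = static_cast<const unsigned char*>(p);
    for (std::size_t i = 0; i < n; ++i) { sum = sum * 131u + b[i]; }
    return sum;
}

int main() {
    ResyncMsg msg;
    std::memset(&msg, 0, sizeof(msg));            // deterministic bytes, crc field == 0
    msg.dsn = 42;
    msg.crc = toy_checksum(&msg, sizeof(msg));    // computed while crc is still zero

    // Receiver side: stash the received crc, zero the field, recompute and compare.
    uint32_t received = msg.crc;
    msg.crc = 0;
    assert(received == toy_checksum(&msg, sizeof(msg)));
    return 0;
}
// end of sketch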
+ // The reason is baseline resync will clear existing resources on the upper layer, skipping replay/commit + // operations can avoid accessing unavailable resources + std::unique_lock lg{m_sb_mtx}; + m_rd_sb->last_snapshot_lsn = s_cast< repl_lsn_t >(s.get_last_log_idx()); + m_rd_sb.write(); + } + if (msg->dsn > m_next_dsn) { + m_next_dsn = msg->dsn; + RD_LOGD(NO_TRACE_ID, "Update next_dsn from {} to {}", m_next_dsn.load(), msg->dsn); + return true; + } + return true; +} + void RaftReplDev::on_restart() { m_listener->on_restart(); } bool RaftReplDev::is_resync_mode() { @@ -1227,10 +2124,116 @@ bool RaftReplDev::is_resync_mode() { auto diff = leader_commited_lsn - my_log_idx; bool resync_mode = (diff > HS_DYNAMIC_CONFIG(consensus.resync_log_idx_threshold)); if (resync_mode) { - RD_LOGD("Raft Channel: Resync mode, leader_commited_lsn={}, my_log_idx={}, diff={}", leader_commited_lsn, - my_log_idx, diff); + RD_LOGD(NO_TRACE_ID, "Raft Channel: Resync mode, leader_commited_lsn={}, my_log_idx={}, diff={}", + leader_commited_lsn, my_log_idx, diff); } return resync_mode; } +void RaftReplDev::report_blk_metrics_if_needed(repl_req_ptr_t rreq) { + auto chunk_id = rreq->local_blkid().chunk_num(); + auto chunk = hs()->device_mgr()->get_chunk(chunk_id); + if (chunk->get_blk_usage() >= chunk->get_blk_usage_report_threshold()) { + auto local_blk_num = rreq->local_blkid().blk_num(); + auto remote_blk_num = rreq->remote_blkid().blkid.blk_num(); + // Focus only on cases where the locally allocated blocks exceed the proposer's allocated blocks, + // as this indicates that the member might encounter NO_SPACE_LEFT before the proposer. + auto blk_diff_with_remote = local_blk_num > remote_blk_num ? local_blk_num - remote_blk_num : 0; + HISTOGRAM_OBSERVE(m_metrics, blk_diff_with_proposer, blk_diff_with_remote); + } +} + +void RaftReplDev::quiesce_reqs() { + // all the block allocation happens in rreq->init. so after we wait for all the pending req has been initialized we + // can make sure + // 1 all the pending reqs has allocated their blocks + // 2 no new pending reqs will be initialized again. 
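// Illustrative standalone sketch, not part of the patch: the quiesce pattern implemented
// below, i.e. an atomic gate plus an in-flight counter. A request bumps the counter first
// and only then checks the gate, so once the gate is set and the counter drains to zero,
// nothing is still inside init and nothing new can slip in. All names are placeholders
// for m_in_quience / m_pending_init_req_num / init_req_counter.
#include <atomic>
#include <cassert>
#include <cstdint>
#include <thread>

std::atomic<bool> g_quiesced{false};
std::atomic<uint64_t> g_inflight{0};

struct InflightGuard {               // RAII counter, mirrors the init_req_counter idea
    InflightGuard() { g_inflight.fetch_add(1, std::memory_order_acq_rel); }
    ~InflightGuard() { g_inflight.fetch_sub(1, std::memory_order_acq_rel); }
};

bool try_init_request() {
    InflightGuard guard;             // count ourselves before checking the gate
    if (g_quiesced.load(std::memory_order_acquire)) { return false; }  // rejected
    // ... allocate blocks / initialize the request here ...
    return true;
}

void quiesce() {
    g_quiesced.store(true, std::memory_order_release);
    while (g_inflight.load(std::memory_order_acquire) != 0) { std::this_thread::yield(); }
}

int main() {
    assert(try_init_request());      // accepted while the gate is open
    quiesce();                       // returns once no request is mid-init
    assert(!try_init_request());     // rejected afterwards
    return 0;
}
// end of sketch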
+ m_in_quience.store(true, std::memory_order_release); + RD_LOGD(NO_TRACE_ID, "enter quience state, waiting for all the pending req to be initialized"); + while (true) { + uint64_t pending_req_num = get_pending_init_req_num(); + if (pending_req_num) { + RD_LOGD(NO_TRACE_ID, "wait for {} pending create_req requests to be completed", pending_req_num); + std::this_thread::sleep_for(std::chrono::microseconds(1)); + } else + break; + } +} + +void RaftReplDev::resume_accepting_reqs() { m_in_quience.store(false, std::memory_order_release); } + +void RaftReplDev::clear_chunk_req(chunk_num_t chunk_id) { + RD_LOGD(NO_TRACE_ID, + "start cleaning all the in-memory rreqs, which has allocated blk on the emergent chunk={} before handling " + "no_space_left error", + chunk_id); + std::vector< folly::Future< folly::Unit > > futs; + for (auto& [key, rreq] : m_repl_key_req_map) { + if (rreq->has_state(repl_req_state_t::BLK_ALLOCATED)) { + auto blkid = rreq->local_blkid(); + if (chunk_id == blkid.chunk_num()) { + // only clean the rreqs which has allocated blks on the emergent chunk + futs.emplace_back( + std::move(data_service().async_free_blk(blkid).thenValue([this, &blkid, &key](auto&& err) { + HS_LOG_ASSERT(!err, "freeing blkid={} upon error failed, potential to cause blk leak", + blkid.to_string()); + RD_LOGD(NO_TRACE_ID, "blkid={} freed successfully for handling no_space_left error", + blkid.to_string()); + m_repl_key_req_map.erase(key); // remove from the req map after freeing the blk + }))); + } + } + } + + folly::collectAllUnsafe(futs) + .thenValue([this](auto&& vf) { + // TODO:: handle the error in freeing blk if necessary in the future. + // for nuobject case, error for freeing blk in the emergent chunk can be ingored + RD_LOGD( + NO_TRACE_ID, + "all the necessary in-memory rreqs which has allocated blks on the emergent chunk have been cleaned up " + "successfully, continue to handle no_space_left error."); + }) + // need to wait for the completion + .wait(); +} + +ReplServiceError RaftReplDev::init_req_ctx(repl_req_ptr_t rreq, repl_key rkey, journal_type_t op_code, bool is_proposer, + sisl::blob const& user_header, sisl::blob const& key, uint32_t data_size, + cshared< ReplDevListener >& listener) { + if (!rreq) { + RD_LOGD(rkey.traceID, "got nullptr for initing req, rkey=[{}]", rkey.to_string()); + return ReplServiceError::CANCELLED; + } + + init_req_counter counter(m_pending_init_req_num); + if (is_in_quience()) { + // In quience state, reject any new requests. + RD_LOGD(rkey.traceID, "Rejecting new request in quience state, rkey=[{}]", rkey.to_string()); + return ReplServiceError::QUIENCE_STATE; + } + + return rreq->init(rkey, op_code, is_proposer, user_header, key, data_size, m_listener); +} + +void RaftReplDev::become_leader_cb() { + auto new_gate = raft_server()->get_last_log_idx(); + repl_lsn_t existing_gate = 0; + if (!m_traffic_ready_lsn.compare_exchange_strong(existing_gate, new_gate)) { + // was a follower, m_traffic_ready_lsn should be zero on follower. 
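// Illustrative standalone sketch, not part of the patch: the traffic-ready gate that
// become_leader_cb() is arming here. On a leader switch the gate is set once, via
// compare-exchange, to the last appended log index, and traffic is allowed only after
// the commit LSN catches up to it; stepping down resets the gate to zero. Placeholder
// names only; the real members are m_traffic_ready_lsn and m_commit_upto_lsn.
#include <atomic>
#include <cassert>
#include <cstdint>

std::atomic<int64_t> g_traffic_ready_lsn{0};

void on_become_leader(int64_t last_log_idx) {
    int64_t expected = 0;
    // Only arm the gate if it was zero (i.e. this node was a follower before).
    g_traffic_ready_lsn.compare_exchange_strong(expected, last_log_idx);
}

void on_become_follower() { g_traffic_ready_lsn.store(0); }

bool ready_for_traffic(int64_t committed_lsn) { return committed_lsn >= g_traffic_ready_lsn.load(); }

int main() {
    on_become_leader(100);           // gate set to the last appended index
    assert(!ready_for_traffic(50));  // still committing entries from the old term
    assert(ready_for_traffic(100));  // caught up, safe to serve reads/writes
    on_become_follower();
    assert(ready_for_traffic(0));    // follower: gate is zero again
    return 0;
}
// end of sketch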
+ RD_REL_ASSERT(!existing_gate, "existing gate should be zero"); + } + RD_LOGD(NO_TRACE_ID, "become_leader_cb: setting traffic_ready_lsn from {} to {}", existing_gate, new_gate); +} + +bool RaftReplDev::is_ready_for_traffic() const { + if (is_stopping()) return false; + auto committed_lsn = m_commit_upto_lsn.load(); + auto gate = m_traffic_ready_lsn.load(); + bool ready = committed_lsn >= gate; + if (!ready) { + RD_LOGD(NO_TRACE_ID, "Not yet ready for traffic, committed to {} but gate is {}", committed_lsn, gate); + } + return ready; +} } // namespace homestore diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h index 41594b528..abede36bf 100644 --- a/src/lib/replication/repl_dev/raft_repl_dev.h +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -15,6 +15,10 @@ #include "replication/log_store/repl_log_store.h" namespace homestore { +struct replace_member_ctx_superblk { + replica_id_t replica_out; + replica_id_t replica_in; +}; #pragma pack(1) struct raft_repl_dev_superblk : public repl_dev_superblk { @@ -25,6 +29,8 @@ struct raft_repl_dev_superblk : public repl_dev_superblk { uint8_t is_timeline_consistent; // Flag to indicate whether the recovery of followers need to be timeline consistent uint64_t last_applied_dsn; // Last applied data sequence number uint8_t destroy_pending; // Flag to indicate whether the group is in destroy pending state + repl_lsn_t last_snapshot_lsn; // Last snapshot LSN follower received from leader + replace_member_ctx_superblk replace_member_ctx; // Replace members context, used to track the replace member status uint32_t get_raft_sb_version() const { return raft_sb_version; } }; @@ -35,6 +41,11 @@ using raft_cluster_config_ptr_t = nuraft::ptr< nuraft::cluster_config >; ENUM(repl_dev_stage_t, uint8_t, INIT, ACTIVE, DESTROYING, DESTROYED, PERMANENT_DESTROYED); +struct replace_member_ctx { + replica_member_info replica_out; + replica_member_info replica_in; +}; + class RaftReplDevMetrics : public sisl::MetricsGroup { public: explicit RaftReplDevMetrics(const char* inst_name) : sisl::MetricsGroup("RaftReplDev", inst_name) { @@ -84,6 +95,13 @@ class RaftReplDevMetrics : public sisl::MetricsGroup { REGISTER_HISTOGRAM(rreq_pieces_per_write, "Number of individual pieces per write", HistogramBucketsType(LinearUpto64Buckets)); + // In the identical layout chunk, the blk num of the follower and leader is expected to be the same. + // However, due to the concurrency between the data channel and the raft channel, there might be some + // allocation differences on the same lsn. When a leader switch occurs, these differences could become garbage. + // This metric can partially reflect the potential amount of garbage. 
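// Illustrative standalone sketch, not part of the patch: how the value observed into this
// histogram is derived (see report_blk_metrics_if_needed above). Only the case where the
// local allocation is ahead of the proposer's matters, since that follower would hit
// NO_SPACE_LEFT first, so the difference is clamped at zero; in the real path nothing is
// recorded at all below the usage threshold, here we simply return 0 for that case.
// Plain doubles and integers stand in for the real chunk and blkid types.
#include <cassert>
#include <cstdint>

uint64_t blk_diff_sample(double chunk_usage, double report_threshold,
                         uint64_t local_blk_num, uint64_t remote_blk_num) {
    if (chunk_usage < report_threshold) { return 0; }                    // not sampled yet
    return local_blk_num > remote_blk_num ? local_blk_num - remote_blk_num : 0;
}

int main() {
    assert(blk_diff_sample(0.95, 0.9, 120, 100) == 20);  // local ahead of proposer
    assert(blk_diff_sample(0.95, 0.9, 90, 100) == 0);    // proposer ahead: clamp to 0
    assert(blk_diff_sample(0.50, 0.9, 120, 100) == 0);   // below usage threshold
    return 0;
}
// end of sketch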
+ REGISTER_HISTOGRAM(blk_diff_with_proposer, + "allocated blk num diff on the same lsn with proposer when chunk usage >= 0.9"); + // Raft channel metrics REGISTER_HISTOGRAM(raft_end_of_append_batch_latency_us, "Raft end_of_append_batch latency in us", "raft_logstore_append_latency", {"op", "end_of_append_batch"}); @@ -102,18 +120,70 @@ class RaftReplDevMetrics : public sisl::MetricsGroup { class RaftReplService; class CP; +struct ReplDevCPContext { + repl_lsn_t cp_lsn; + repl_lsn_t compacted_to_lsn; + uint64_t last_applied_dsn; +}; + +class nuraft_snapshot_context : public snapshot_context { +public: + nuraft_snapshot_context(nuraft::snapshot& snp) : snapshot_context(snp.get_last_log_idx()) { + auto snp_buf = snp.serialize(); + snapshot_ = nuraft::snapshot::deserialize(*snp_buf); + } + + nuraft_snapshot_context(sisl::io_blob_safe const& snp_ctx) : snapshot_context(0) { deserialize(snp_ctx); } + + sisl::io_blob_safe serialize() override { + // Dump the context from nuraft buffer to the io blob. + auto snp_buf = snapshot_->serialize(); + sisl::io_blob_safe blob{s_cast< size_t >(snp_buf->size())}; + std::memcpy(blob.bytes(), snp_buf->data_begin(), snp_buf->size()); + return blob; + } + + void deserialize(const sisl::io_blob_safe& snp_ctx) { + // Load the context from the io blob to nuraft buffer. + auto snp_buf = nuraft::buffer::alloc(snp_ctx.size()); + snp_buf->put_raw(snp_ctx.cbytes(), snp_ctx.size()); + snp_buf->pos(0); + snapshot_ = nuraft::snapshot::deserialize(*snp_buf); + lsn_ = snapshot_->get_last_log_idx(); + } + + nuraft::ptr< nuraft::snapshot > nuraft_snapshot() { return snapshot_; } + +private: + nuraft::ptr< nuraft::snapshot > snapshot_; +}; + class RaftReplDev : public ReplDev, public nuraft_mesg::mesg_state_mgr, public std::enable_shared_from_this< RaftReplDev > { +private: + class init_req_counter { + public: + init_req_counter(std::atomic_uint64_t& counter) : my_counter(counter) { + my_counter.fetch_add(1, std::memory_order_acq_rel); + } + + ~init_req_counter() { my_counter.fetch_sub(1, std::memory_order_acq_rel); } + + private: + std::atomic_uint64_t& my_counter; + }; + private: shared< RaftStateMachine > m_state_machine; RaftReplService& m_repl_svc; folly::ConcurrentHashMap< repl_key, repl_req_ptr_t, repl_key::Hasher > m_repl_key_req_map; nuraft_mesg::Manager& m_msg_mgr; - group_id_t m_group_id; // Replication Group id - std::string m_rdev_name; // Short name for the group for easy debugging - replica_id_t m_my_repl_id; // This replica's uuid - int32_t m_raft_server_id; // Server ID used by raft (unique within raft group) + group_id_t m_group_id; // Replication Group id + std::string m_rdev_name; // Short name for the group for easy debugging + std::string m_identify_str; // combination of rdev_name:group_id + replica_id_t m_my_repl_id; // This replica's uuid + int32_t m_raft_server_id; // Server ID used by raft (unique within raft group) shared< ReplLogStore > m_data_journal; shared< HomeLogStore > m_free_blks_journal; sisl::urcu_scoped_ptr< repl_dev_stage_t > m_stage; @@ -124,8 +194,12 @@ class RaftReplDev : public ReplDev, mutable folly::SharedMutexWritePriority m_sb_lock; // Lock to protect staged sb and persisting sb raft_repl_dev_superblk m_sb_in_mem; // Cached version which is used to read and for staging - std::atomic< repl_lsn_t > m_commit_upto_lsn{0}; // LSN which was lastly written, to track flushes + std::atomic< repl_lsn_t > m_commit_upto_lsn{0}; // LSN which was lastly committed, to track flushes std::atomic< repl_lsn_t > m_compact_lsn{0}; // LSN upto which it 
was compacted, it is used to track where to + // The `traffic_ready_lsn` variable holds the Log Sequence Number (LSN) up to which + // the state machine should committed to before accepting traffic. This threshold ensures that + // all potential committed log be committed before handling incoming requests. + std::atomic< repl_lsn_t > m_traffic_ready_lsn{0}; std::mutex m_sb_mtx; // Lock to protect the repl dev superblock @@ -143,53 +217,123 @@ class RaftReplDev : public ReplDev, static std::atomic< uint64_t > s_next_group_ordinal; bool m_log_store_replay_done{false}; + // pending create requests, including both raft and data channel + std::atomic_uint64_t m_pending_init_req_num; + std::atomic< bool > m_in_quience; + public: friend class RaftStateMachine; RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk >&& rd_sb, bool load_existing); virtual ~RaftReplDev() = default; + bool bind_data_service(); bool join_group(); + AsyncReplResult<> start_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, + uint32_t commit_quorum = 0, uint64_t trace_id = 0); + AsyncReplResult<> complete_replace_member(const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum = 0, + uint64_t trace_id = 0); + AsyncReplResult<> flip_learner_flag(const replica_member_info& member, bool target, uint32_t commit_quorum, + bool wait_and_verify = true, uint64_t trace_id = 0); + ReplServiceError do_add_member(const replica_member_info& member, uint64_t trace_id = 0); + ReplServiceError do_remove_member(const replica_member_info& member, uint64_t trace_id = 0); + ReplServiceError do_flip_learner(const replica_member_info& member, bool target, bool wait_and_verify, + uint64_t trace_id = 0); + ReplServiceError set_priority(const replica_id_t& member, int32_t priority, uint64_t trace_id = 0); + nuraft::cmd_result_code retry_when_config_changing(const std::function< nuraft::cmd_result_code() >& func, + uint64_t trace_id = 0); + bool wait_and_check(const std::function< bool() >& check_func, uint32_t timeout_ms, uint32_t interval_ms = 100); + folly::SemiFuture< ReplServiceError > destroy_group(); //////////////// All ReplDev overrides/implementation /////////////////////// + virtual std::error_code alloc_blks(uint32_t size, const blk_alloc_hints& hints, + std::vector< MultiBlkId >& out_blkids) override { + RD_REL_ASSERT(false, "NOT SUPPORTED"); + return std::make_error_code(std::errc::operation_not_supported); + } + virtual folly::Future< std::error_code > async_write(const std::vector< MultiBlkId >& blkids, + sisl::sg_list const& value, bool part_of_batch = false, + trace_id_t tid = 0) override { + RD_REL_ASSERT(false, "NOT SUPPORTED"); + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_not_supported)); + } + + virtual void async_write_journal(const std::vector< MultiBlkId >& blkids, sisl::blob const& header, + sisl::blob const& key, uint32_t data_size, repl_req_ptr_t ctx, + trace_id_t tid = 0) override { + RD_REL_ASSERT(false, "NOT SUPPORTED"); + } + void async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, - repl_req_ptr_t ctx) override; + repl_req_ptr_t ctx, bool part_of_batch = false, trace_id_t tid = 0) override; folly::Future< std::error_code > async_read(MultiBlkId const& blkid, sisl::sg_list& sgs, uint32_t size, - bool part_of_batch = false) override; - void async_free_blks(int64_t lsn, MultiBlkId const& blkid) override; + bool part_of_batch = false, 
trace_id_t tid = 0) override; + folly::Future< std::error_code > async_free_blks(int64_t lsn, MultiBlkId const& blkid, trace_id_t tid = 0) override; AsyncReplResult<> become_leader() override; bool is_leader() const override; replica_id_t get_leader_id() const override; std::vector< peer_info > get_replication_status() const override; + std::set< replica_id_t > get_active_peers() const; group_id_t group_id() const override { return m_group_id; } + void set_custom_rdev_name(std::string const& name) override { + RD_LOGI(NO_TRACE_ID, "Resetting repl dev name from {} to {}", m_rdev_name, name); + m_rdev_name = name; + m_identify_str = name + ":" + group_id_str(); + m_rd_sb->set_rdev_name(m_rdev_name); + } std::string group_id_str() const { return boost::uuids::to_string(m_group_id); } - std::string rdev_name() const { return m_rdev_name; } + std::string rdev_name() const { return m_rd_sb->rdev_name; }; + std::string identify_str() const { return m_identify_str; }; std::string my_replica_id_str() const { return boost::uuids::to_string(m_my_repl_id); } uint32_t get_blk_size() const override; - repl_lsn_t get_last_commit_lsn() const { return m_commit_upto_lsn.load(); } + repl_lsn_t get_last_commit_lsn() const override { return m_commit_upto_lsn.load(); } void set_last_commit_lsn(repl_lsn_t lsn) { m_commit_upto_lsn.store(lsn); } + repl_lsn_t get_last_append_lsn() override { return raft_server()->get_last_log_idx(); } bool is_destroy_pending() const; bool is_destroyed() const; + Clock::time_point destroyed_time() const { return m_destroyed_time; } + bool is_ready_for_traffic() const override; + // purge all resources (e.g., logs in logstore) is a very dangerous operation, it is not supported yet. + void purge() override { RD_REL_ASSERT(false, "NOT SUPPORTED YET"); } + + std::shared_ptr< snapshot_context > deserialize_snapshot_context(sisl::io_blob_safe& snp_ctx) override { + return std::make_shared< nuraft_snapshot_context >(snp_ctx); + } //////////////// Accessor/shortcut methods /////////////////////// nuraft_mesg::repl_service_ctx* group_msg_service(); + nuraft::raft_server* raft_server(); RaftReplDevMetrics& metrics() { return m_metrics; } //////////////// Methods needed for other Raft classes to access ///////////////// void use_config(json_superblk raft_config_sb); void handle_commit(repl_req_ptr_t rreq, bool recovery = false); + void handle_config_commit(const repl_lsn_t lsn, raft_cluster_config_ptr_t& new_conf); + void handle_rollback(repl_req_ptr_t rreq); + void handle_config_rollback(const repl_lsn_t lsn, raft_cluster_config_ptr_t& old_conf); repl_req_ptr_t repl_key_to_req(repl_key const& rkey) const; repl_req_ptr_t applier_create_req(repl_key const& rkey, journal_type_t code, sisl::blob const& user_header, - sisl::blob const& key, uint32_t data_size, bool is_data_channel); + sisl::blob const& key, uint32_t data_size, bool is_data_channel, + int64_t lsn = -1 /*init lsn*/); folly::Future< folly::Unit > notify_after_data_written(std::vector< repl_req_ptr_t >* rreqs); void check_and_fetch_remote_data(std::vector< repl_req_ptr_t > rreqs); - void cp_flush(CP* cp); + void cp_flush(CP* cp, cshared< ReplDevCPContext > ctx); + cshared< ReplDevCPContext > get_cp_ctx(CP* cp); void cp_cleanup(CP* cp); void become_ready(); + void become_leader_cb(); + + void become_follower_cb() { + // m_traffic_ready_lsn should be zero on follower. 
+ m_traffic_ready_lsn.store(0); + RD_LOGD(NO_TRACE_ID, "become_follower_cb setting traffic_ready_lsn to 0"); + } + /// @brief This method is called when the data journal is compacted /// /// @param upto_lsn : LSN upto which the data journal was compacted @@ -207,6 +351,7 @@ class RaftReplDev : public ReplDev, */ void on_create_snapshot(nuraft::snapshot& s, nuraft::async_result< bool >::handler_type& when_done); +#if 0 /** * Truncates the replication log by providing a specified number of reserved entries. * @@ -215,6 +360,7 @@ class RaftReplDev : public ReplDev, void truncate(uint32_t num_reserved_entries) { m_data_journal->truncate(num_reserved_entries, m_compact_lsn.load()); } +#endif void wait_for_logstore_ready() { m_data_journal->wait_for_log_store_ready(); } @@ -225,6 +371,11 @@ class RaftReplDev : public ReplDev, */ void flush_durable_commit_lsn(); + /** + * Check the replace_member status, if the new member is fully synced up and ready to take over, remove the old member. + */ + void check_replace_member_status(); + /** * \brief This method is called during restart to notify the upper layer */ @@ -238,6 +389,22 @@ class RaftReplDev : public ReplDev, */ void force_leave() { leave(); } + /** + * \brief This method is called to check if the given LSN is within the last snapshot LSN received from the leader. + * All logs with LSN less than or equal to the last snapshot LSN are considered as part of the baseline resync, + * which doesn't need any more operations (e.g., replay, commit). + * + * \param lsn The LSN to be checked. + * \return true if the LSN is within the last snapshot LSN, false otherwise. + */ + bool need_skip_processing(const repl_lsn_t lsn) { return lsn <= m_rd_sb->last_snapshot_lsn; } + + void quiesce_reqs(); + void resume_accepting_reqs(); + + // clear reqs that has allocated blks on the given chunk. + void clear_chunk_req(chunk_num_t chunk_id); + protected: //////////////// All nuraft::state_mgr overrides /////////////////////// nuraft::ptr< nuraft::cluster_config > load_config() override; @@ -253,8 +420,8 @@ class RaftReplDev : public ReplDev, std::shared_ptr< nuraft::state_machine > get_state_machine() override; void permanent_destroy() override; void leave() override; - std::pair< bool, nuraft::cb_func::ReturnCode > handle_raft_event(nuraft::cb_func::Type, - nuraft::cb_func::Param*) override; + + nuraft::cb_func::ReturnCode raft_event(nuraft::cb_func::Type, nuraft::cb_func::Param*) override; private: shared< nuraft::log_store > data_journal() { return m_data_journal; } @@ -264,10 +431,32 @@ class RaftReplDev : public ReplDev, void fetch_data_from_remote(std::vector< repl_req_ptr_t > rreqs); void handle_fetch_data_response(sisl::GenericClientResponse response, std::vector< repl_req_ptr_t > rreqs); bool is_resync_mode(); + + /** + * \brief This method handles errors that occur during append entries or data receiving. + * It should not be called after the append entries phase. 
+ */ void handle_error(repl_req_ptr_t const& rreq, ReplServiceError err); - bool wait_for_data_receive(std::vector< repl_req_ptr_t > const& rreqs, uint64_t timeout_ms); + + bool wait_for_data_receive(std::vector< repl_req_ptr_t > const& rreqs, uint64_t timeout_ms, + std::vector< repl_req_ptr_t >* timeout_rreqs = nullptr); void on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx); + void set_log_store_last_durable_lsn(store_lsn_t lsn); void commit_blk(repl_req_ptr_t rreq); + void start_replace_member(repl_req_ptr_t rreq); + void complete_replace_member(repl_req_ptr_t rreq); + void reset_quorum_size(uint32_t commit_quorum, uint64_t trace_id); + void create_snp_resync_data(raft_buf_ptr_t& data_out); + bool save_snp_resync_data(nuraft::buffer& data, nuraft::snapshot& s); + + void report_blk_metrics_if_needed(repl_req_ptr_t rreq); + ReplServiceError init_req_ctx(repl_req_ptr_t rreq, repl_key rkey, journal_type_t op_code, bool is_proposer, + sisl::blob const& user_header, sisl::blob const& key, uint32_t data_size, + cshared< ReplDevListener >& listener); + + bool is_in_quience() { return m_in_quience.load(std::memory_order_acquire); } + + uint64_t get_pending_init_req_num() { return m_pending_init_req_num.load(std::memory_order_acquire); } }; } // namespace homestore diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp index 0b932bbe1..c0f910741 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.cpp +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -9,6 +10,7 @@ #include "repl_dev/raft_repl_dev.h" #include #include "common/homestore_config.hpp" +#include "common/crash_simulator.hpp" namespace homestore { @@ -30,7 +32,7 @@ static std::pair< sisl::blob, sisl::blob > header_only_extract(nuraft::buffer& b ReplServiceError RaftStateMachine::propose_to_raft(repl_req_ptr_t rreq) { rreq->create_journal_entry(true /* raft_buf */, m_rd.server_id()); - RD_LOGT("Raft Channel: propose journal_entry=[{}] ", rreq->journal_entry()->to_string()); + RD_LOGT(rreq->traceID(), "Raft Channel: propose journal_entry=[{}] ", rreq->journal_entry()->to_string()); auto* vec = sisl::VectorPool< raft_buf_ptr_t >::alloc(); vec->push_back(rreq->raft_journal_buf()); @@ -39,21 +41,21 @@ ReplServiceError RaftStateMachine::propose_to_raft(repl_req_ptr_t rreq) { sisl::VectorPool< raft_buf_ptr_t >::free(vec); if (append_status && !append_status->get_accepted()) { - RD_LOGE("Raft Channel: Failed to propose rreq=[{}] result_code={}", rreq->to_compact_string(), + RD_LOGE(rreq->traceID(), "Raft Channel: Failed to propose rreq=[{}] result_code={}", rreq->to_compact_string(), append_status->get_result_code()); return RaftReplService::to_repl_error(append_status->get_result_code()); } return ReplServiceError::OK; } -repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entry& lentry) { +repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entry& lentry, int64_t lsn) { // Validate the journal entry and see if it needs to be transformed repl_journal_entry* jentry = r_cast< repl_journal_entry* >(lentry.get_buf().data_begin()); RELEASE_ASSERT_EQ(jentry->major_version, repl_journal_entry::JOURNAL_ENTRY_MAJOR, "Mismatched version of journal entry received from RAFT peer"); - RD_LOGT("Raft Channel: Localizing Raft log_entry: server_id={}, term={}, journal_entry=[{}] ", jentry->server_id, - lentry.get_term(), jentry->to_string()); + 
RD_LOGT(jentry->traceID, "Raft Channel: Localizing Raft log_entry: server_id={}, term={}, journal_entry=[{}] ", + jentry->server_id, lentry.get_term(), jentry->to_string()); auto entry_to_hdr = [](repl_journal_entry* jentry) { return sisl::blob{uintptr_cast(jentry) + sizeof(repl_journal_entry), jentry->user_header_size}; @@ -70,7 +72,8 @@ repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entr jentry->value_size}; }; - repl_key const rkey{.server_id = jentry->server_id, .term = lentry.get_term(), .dsn = jentry->dsn}; + repl_key const rkey{ + .server_id = jentry->server_id, .term = lentry.get_term(), .dsn = jentry->dsn, .traceID = jentry->traceID}; // Create a new rreq (or) Pull rreq from the map given the repl_key, header and key. Any new rreq will // allocate the blks (in case of large data). We will use the new blkid and transform the current journal entry's @@ -80,8 +83,9 @@ repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entr MultiBlkId entry_blkid; entry_blkid.deserialize(entry_to_val(jentry), true /* copy */); - rreq = m_rd.applier_create_req(rkey, jentry->code, entry_to_hdr(jentry), entry_to_key(jentry), - (entry_blkid.blk_count() * m_rd.get_blk_size()), false /* is_data_channel */); + rreq = + m_rd.applier_create_req(rkey, jentry->code, entry_to_hdr(jentry), entry_to_key(jentry), + (entry_blkid.blk_count() * m_rd.get_blk_size()), false /* is_data_channel */, lsn); if (rreq == nullptr) { goto out; } rreq->set_remote_blkid(RemoteBlkId{jentry->server_id, entry_blkid}); @@ -106,7 +110,8 @@ repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entr std::memcpy(blkid_location, rreq->local_blkid().serialize().cbytes(), local_size); } else { rreq = m_rd.applier_create_req(rkey, jentry->code, entry_to_hdr(jentry), entry_to_key(jentry), - jentry->value_size, false /* is_data_channel */); + jentry->value_size, false /* is_data_channel */, lsn); + if (rreq == nullptr) goto out; } // We might have localized the journal entry with new blkid. 
We need to also update the header/key pointers pointing @@ -116,9 +121,9 @@ repl_req_ptr_t RaftStateMachine::localize_journal_entry_prepare(nuraft::log_entr out: if (rreq == nullptr) { - RD_LOG(ERROR, - "Failed to localize journal entry rkey={} jentry=[{}], we return error and let Raft resend this req", - rkey.to_string(), jentry->to_string()); + RD_LOGE(rkey.traceID, + "Failed to localize journal entry rkey={} jentry=[{}], we return error and let Raft resend this req", + rkey.to_string(), jentry->to_string()); } return rreq; } @@ -146,11 +151,13 @@ repl_req_ptr_t RaftStateMachine::localize_journal_entry_finish(nuraft::log_entry RELEASE_ASSERT_EQ(jentry->major_version, repl_journal_entry::JOURNAL_ENTRY_MAJOR, "Mismatched version of journal entry received from RAFT peer"); - repl_key rkey{.server_id = jentry->server_id, .term = lentry.get_term(), .dsn = jentry->dsn}; + repl_key rkey{ + .server_id = jentry->server_id, .term = lentry.get_term(), .dsn = jentry->dsn, .traceID = jentry->traceID}; auto rreq = m_rd.repl_key_to_req(rkey); if ((rreq == nullptr) || (rreq->is_localize_pending())) { - rreq = localize_journal_entry_prepare(lentry); + rreq = localize_journal_entry_prepare(lentry, + -1 /* lsn=-1, since this is a finish call and we don't have lsn yet */); if (rreq == nullptr) { RELEASE_ASSERT(rreq != nullptr, "We get an linked data for rkey=[{}], jentry=[{}] not as part of Raft Append but " @@ -176,7 +183,7 @@ raft_buf_ptr_t RaftStateMachine::pre_commit_ext(nuraft::state_machine::ext_op_pa int64_t lsn = s_cast< int64_t >(params.log_idx); repl_req_ptr_t rreq = lsn_to_req(lsn); - RD_LOGD("Raft channel: Precommit rreq=[{}]", rreq->to_compact_string()); + RD_LOGT(rreq->traceID(), "Precommit rreq=[{}]", rreq->to_compact_string()); m_rd.m_listener->on_pre_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq); return m_success_ptr; @@ -184,24 +191,91 @@ raft_buf_ptr_t RaftStateMachine::pre_commit_ext(nuraft::state_machine::ext_op_pa raft_buf_ptr_t RaftStateMachine::commit_ext(nuraft::state_machine::ext_op_params const& params) { int64_t lsn = s_cast< int64_t >(params.log_idx); - RD_LOGD("Raft channel: Received Commit message lsn {} store {} logdev {} size {}", lsn, - m_rd.m_data_journal->logstore_id(), m_rd.m_data_journal->logdev_id(), params.data->size()); repl_req_ptr_t rreq = lsn_to_req(lsn); + if (m_rd.need_skip_processing(lsn)) { + RD_LOGI(rreq->traceID(), "Raft Channel: Log {} is expected to be handled by snapshot. Skipping commit.", lsn); + return m_success_ptr; + } RD_DBG_ASSERT(rreq != nullptr, "Raft channel got null rreq for lsn={}", lsn); - RD_LOGD("Raft channel: Received Commit message rreq=[{}]", rreq->to_string()); + RD_LOGT(rreq->traceID(), "Raft channel: Received Commit message rreq=[{}]", rreq->to_string()); if (rreq->is_proposer()) { // This is the time to ensure flushing of journal happens in the proposer rreq->add_state(repl_req_state_t::LOG_FLUSHED); } - m_rd.handle_commit(rreq); - return m_success_ptr; } void RaftStateMachine::commit_config(const ulong log_idx, raft_cluster_config_ptr_t& new_conf) { - RD_LOGD("Raft channel: Commit new cluster conf , log_idx = {}", log_idx); - // TODO:add more logic here if necessary + // when reaching here, the config change log has already been committed, and the new config has been applied to the + // cluster + if (m_rd.need_skip_processing(s_cast< repl_lsn_t >(log_idx))) { + RD_LOGI(NO_TRACE_ID, "Raft Channel: Config {} is expected to be handled by snapshot. 
Skipping commit.", + log_idx); + return; + } + + RD_LOGD(NO_TRACE_ID, "Raft channel: Commit new cluster conf , log_idx = {}", log_idx); + +#ifdef _PRERELEASE + auto& servers_in_new_conf = new_conf->get_servers(); + std::vector< int32_t > server_ids_in_new_conf; + for (auto& server : servers_in_new_conf) + server_ids_in_new_conf.emplace_back(server->get_id()); + + auto my_id = m_rd.server_id(); + + std::ostringstream oss; + auto it = server_ids_in_new_conf.begin(); + if (it != server_ids_in_new_conf.end()) { + oss << *it; + ++it; + } + for (; it != server_ids_in_new_conf.end(); ++it) { + oss << "," << *it; + } + + RD_LOGI(NO_TRACE_ID, "Raft channel: server ids in new cluster conf : {}, my_id {}, group_id {}", oss.str(), my_id, + m_rd.group_id_str()); +#endif + + m_rd.handle_config_commit(s_cast< repl_lsn_t >(log_idx), new_conf); +} + +void RaftStateMachine::rollback_config(const ulong log_idx, raft_cluster_config_ptr_t& conf) { + RD_LOGD(NO_TRACE_ID, "Raft channel: Rollback cluster conf , log_idx = {}", log_idx); + m_rd.handle_config_rollback(s_cast< repl_lsn_t >(log_idx), conf); +} + +void RaftStateMachine::rollback_ext(const nuraft::state_machine::ext_op_params& params) { + int64_t lsn = s_cast< int64_t >(params.log_idx); + repl_req_ptr_t rreq = lsn_to_req(lsn); + if (rreq == nullptr) { + RD_LOGE(NO_TRACE_ID, "Raft channel: Rollback lsn {} rreq not found", lsn); + return; + } + + RD_LOGD(rreq->traceID(), "Raft channel: Rollback lsn {}, rreq=[{}]", lsn, rreq->to_string()); + m_rd.handle_rollback(rreq); +} + +int64_t RaftStateMachine::get_next_batch_size_hint_in_bytes() { return next_batch_size_hint; } + +int64_t RaftStateMachine::inc_next_batch_size_hint() { + constexpr int64_t next_batch_size_hint_limit = 16; + // set to minimal if previous hint is negative (i.e do not want any log) + if (next_batch_size_hint < 0) { + next_batch_size_hint = 1; + return next_batch_size_hint; + } + // Exponential growth till next_batch_size_hint_limit, set to 0 afterward means leader take control. + next_batch_size_hint = next_batch_size_hint * 2 > next_batch_size_hint_limit ? 0 : next_batch_size_hint * 2; + return next_batch_size_hint; +} + +int64_t RaftStateMachine::reset_next_batch_size_hint(int64_t new_hint) { + next_batch_size_hint = new_hint; + return next_batch_size_hint; } void RaftStateMachine::iterate_repl_reqs(std::function< void(int64_t, repl_req_ptr_t rreq) > const& cb) { @@ -211,18 +285,17 @@ void RaftStateMachine::iterate_repl_reqs(std::function< void(int64_t, repl_req_p } uint64_t RaftStateMachine::last_commit_index() { - RD_LOG(DEBUG, "Raft channel: last_commit_index {}", uint64_cast(m_rd.get_last_commit_lsn())); + RD_LOGD(NO_TRACE_ID, "Raft channel: last_commit_index {}", uint64_cast(m_rd.get_last_commit_lsn())); return uint64_cast(m_rd.get_last_commit_lsn()); } void RaftStateMachine::become_ready() { m_rd.become_ready(); } -void RaftStateMachine::unlink_lsn_to_req(int64_t lsn) { - auto const it = m_lsn_req_map.find(lsn); - if (it != m_lsn_req_map.cend()) { - RD_LOG(DEBUG, "Raft channel: erase lsn {}, rreq {}", lsn, it->second->to_string()); - m_lsn_req_map.erase(lsn); - } +void RaftStateMachine::unlink_lsn_to_req(int64_t lsn, repl_req_ptr_t rreq) { + // it is possible a LSN mapped to different rreq in history + // due to log overwritten. 
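// Illustrative standalone sketch, not part of the patch: the "erase only if the mapping
// still points at the request we expect" check used just below. When a raft log is
// overwritten, the same LSN can be re-linked to a newer rreq, so an unconditional erase
// could drop the wrong entry. std::unordered_map plus shared_ptr stand in for the real
// concurrent map and repl_req_ptr_t here.
#include <cassert>
#include <cstdint>
#include <memory>
#include <unordered_map>

using Req = std::shared_ptr<int>;   // placeholder for repl_req_ptr_t

bool erase_if_equal(std::unordered_map<int64_t, Req>& m, int64_t lsn, const Req& expected) {
    auto it = m.find(lsn);
    if (it == m.end() || it->second != expected) { return false; }  // re-linked or gone
    m.erase(it);
    return true;
}

int main() {
    std::unordered_map<int64_t, Req> lsn_map;
    auto old_req = std::make_shared<int>(1);
    auto new_req = std::make_shared<int>(2);

    lsn_map[10] = old_req;
    lsn_map[10] = new_req;                         // LSN 10 overwritten by a newer request

    assert(!erase_if_equal(lsn_map, 10, old_req)); // stale unlink is a no-op
    assert(lsn_map.count(10) == 1);                // newer mapping survives
    assert(erase_if_equal(lsn_map, 10, new_req));  // matching unlink removes it
    return 0;
}
// end of sketch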
Verify the rreq before removing + auto deleted = m_lsn_req_map.erase_if_equal(lsn, rreq); + if (deleted) { RD_LOGT(rreq->traceID(), "Raft channel: erase lsn {}, rreq {}", lsn, rreq->to_string()); } } void RaftStateMachine::link_lsn_to_req(repl_req_ptr_t rreq, int64_t lsn) { @@ -230,8 +303,12 @@ void RaftStateMachine::link_lsn_to_req(repl_req_ptr_t rreq, int64_t lsn) { rreq->add_state(repl_req_state_t::LOG_RECEIVED); // reset the rreq created_at time to now https://github.com/eBay/HomeStore/issues/506 rreq->set_created_time(); - [[maybe_unused]] auto r = m_lsn_req_map.insert(lsn, std::move(rreq)); - RD_DBG_ASSERT_EQ(r.second, true, "lsn={} already in precommit list", lsn); + auto r = m_lsn_req_map.insert(lsn, std::move(rreq)); + if (!r.second) { + RD_LOGE(rreq->traceID(), "lsn={} already in precommit list, exist_term={}, is_volatile={}", lsn, + r.first->second->term(), r.first->second->is_volatile()); + // TODO: we need to think about the case where volatile is in the map already, is it safe to overwrite it? + } } repl_req_ptr_t RaftStateMachine::lsn_to_req(int64_t lsn) { @@ -253,18 +330,39 @@ void RaftStateMachine::create_snapshot(nuraft::snapshot& s, nuraft::async_result int RaftStateMachine::read_logical_snp_obj(nuraft::snapshot& s, void*& user_ctx, ulong obj_id, raft_buf_ptr_t& data_out, bool& is_last_obj) { + + // Ensure all logs snapshot included are committed to prevent the following scenario: + // If a crash occurs during snapshot creation, the snapshot might be persisted while the rd's sb is not. + // This means the durable_commit_lsn is less than the snapshot's log_idx. Upon restart, the changes in + // uncommitted logs may or may not included in the snapshot data sent by leader, + // depending on the racing of commit vs snapshot read, leading to data inconsistency. + if (s_cast< repl_lsn_t >(s.get_last_log_idx()) > m_rd.get_last_commit_lsn()) { + RD_LOGW(NO_TRACE_ID, + "not ready to read because there are some uncommitted logs in snapshot, " + "let nuraft retry later. snapshot log_idx={}, last_commit_lsn={}", + s.get_last_log_idx(), m_rd.get_last_commit_lsn()); + return -1; + } + + // For Nuraft baseline resync, we separate the process into two layers: HomeStore layer and Application layer. + // We use the highest bit of the obj_id to indicate the message type: 0 is for HS, 1 is for Application. + if (is_hs_snp_obj(obj_id)) { + // This is the preserved msg for homestore to resync data + m_rd.create_snp_resync_data(data_out); + is_last_obj = false; + return 0; + } auto snp_ctx = std::make_shared< nuraft_snapshot_context >(s); - auto snp_data = std::make_shared< snapshot_data >(); + auto snp_data = std::make_shared< snapshot_obj >(); snp_data->user_ctx = user_ctx; snp_data->offset = obj_id; snp_data->is_last_obj = is_last_obj; // Listener will read the snapshot data and we pass through the same. - int ret = m_rd.m_listener->read_snapshot_data(snp_ctx, snp_data); + int ret = m_rd.m_listener->read_snapshot_obj(snp_ctx, snp_data); + user_ctx = snp_data->user_ctx; // Have to pass the user_ctx to NuRaft even if ret<0 to get it freed later if (ret < 0) return ret; - // Update user_ctx and whether is_last_obj - user_ctx = snp_data->user_ctx; is_last_obj = snp_data->is_last_obj; // We are doing a copy here. 
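// Illustrative standalone sketch, not part of the patch: the obj_id tagging scheme used by
// the snapshot read/save paths above. The top bit of the 64-bit snapshot object id selects
// the owning layer (0 = HomeStore-internal resync metadata, 1 = application objects), and
// the remaining bits stay available as the application's own offset. Only the constant and
// the bit test mirror the source; the offset helpers are hypothetical conveniences.
#include <cassert>
#include <cstdint>

static constexpr uint64_t kAppObjBit = 1ULL << 63;   // mirrors snp_obj_id_type_app

bool is_hs_obj(uint64_t obj_id) { return (obj_id & kAppObjBit) == 0; }
uint64_t make_app_obj_id(uint64_t app_offset) { return app_offset | kAppObjBit; }
uint64_t app_offset_of(uint64_t obj_id) { return obj_id & ~kAppObjBit; }

int main() {
    assert(is_hs_obj(0));                         // obj_id 0: HomeStore resync message
    uint64_t first_app = make_app_obj_id(0);      // hand-off to the application layer
    assert(!is_hs_obj(first_app));
    assert(app_offset_of(make_app_obj_id(42)) == 42);
    return 0;
}
// end of sketch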
@@ -276,28 +374,52 @@ int RaftStateMachine::read_logical_snp_obj(nuraft::snapshot& s, void*& user_ctx, void RaftStateMachine::save_logical_snp_obj(nuraft::snapshot& s, ulong& obj_id, nuraft::buffer& data, bool is_first_obj, bool is_last_obj) { + if (is_hs_snp_obj(obj_id)) { + // Homestore preserved msg + if (m_rd.save_snp_resync_data(data, s)) { + obj_id = snp_obj_id_type_app; + LOGDEBUG("save_snp_resync_data success, next obj_id={}", obj_id); + } + return; + } auto snp_ctx = std::make_shared< nuraft_snapshot_context >(s); - auto snp_data = std::make_shared< snapshot_data >(); + auto snp_data = std::make_shared< snapshot_obj >(); snp_data->offset = obj_id; snp_data->is_first_obj = is_first_obj; snp_data->is_last_obj = is_last_obj; // We are doing a copy here. - sisl::io_blob_safe blob{s_cast< size_t >(data.size())}; + sisl::io_blob_safe blob{static_cast< uint32_t >(data.size())}; std::memcpy(blob.bytes(), data.data_begin(), data.size()); snp_data->blob = std::move(blob); - m_rd.m_listener->write_snapshot_data(snp_ctx, snp_data); + m_rd.m_listener->write_snapshot_obj(snp_ctx, snp_data); + if (is_last_obj) { + hs()->cp_mgr().trigger_cp_flush(true).wait(); // ensure DSN is flushed to disk + } // Update the object offset. obj_id = snp_data->offset; + +#ifdef _PRERELEASE + if (iomgr_flip::instance()->test_flip("baseline_resync_restart_new_follower")) { + LOGINFO("Hit flip baseline_resync_restart_new_follower crashing"); + hs()->crash_simulator().crash_now(); + } +#endif } bool RaftStateMachine::apply_snapshot(nuraft::snapshot& s) { + // NOTE: Currently, NuRaft considers the snapshot applied once compaction and truncation are completed, even if a + // crash occurs before apply_snapshot() is called. Therefore, the LSN must be updated here to ensure it is + // persisted AFTER log truncation. m_rd.set_last_commit_lsn(s.get_last_log_idx()); m_rd.m_data_journal->set_last_durable_lsn(s.get_last_log_idx()); + auto snp_ctx = std::make_shared< nuraft_snapshot_context >(s); - return m_rd.m_listener->apply_snapshot(snp_ctx); + auto res = m_rd.m_listener->apply_snapshot(snp_ctx); + hs()->cp_mgr().trigger_cp_flush(true /* force */).get(); + return res; } nuraft::ptr< nuraft::snapshot > RaftStateMachine::last_snapshot() { @@ -308,6 +430,6 @@ nuraft::ptr< nuraft::snapshot > RaftStateMachine::last_snapshot() { void RaftStateMachine::free_user_snp_ctx(void*& user_snp_ctx) { m_rd.m_listener->free_user_snp_ctx(user_snp_ctx); } -std::string RaftStateMachine::rdev_name() const { return m_rd.rdev_name(); } +std::string RaftStateMachine::identify_str() const { return m_rd.identify_str(); } } // namespace homestore diff --git a/src/lib/replication/repl_dev/raft_state_machine.h b/src/lib/replication/repl_dev/raft_state_machine.h index b931e42f4..0de9b2744 100644 --- a/src/lib/replication/repl_dev/raft_state_machine.h +++ b/src/lib/replication/repl_dev/raft_state_machine.h @@ -24,43 +24,35 @@ namespace homestore { class ReplicaSetImpl; class StateMachineStore; -#define RD_LOG(level, msg, ...) \ - LOG##level##MOD_FMT(replication, ([&](fmt::memory_buffer& buf, const char* msgcb, auto&&... 
args) -> bool { \ - fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}:{}] "}, \ - fmt::make_format_args(file_name(__FILE__), __LINE__)); \ - fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}={}] "}, \ - fmt::make_format_args("rd", rdev_name())); \ - fmt::vformat_to(fmt::appender{buf}, fmt::string_view{msgcb}, \ - fmt::make_format_args(std::forward< decltype(args) >(args)...)); \ - return true; \ - }), \ - msg, ##__VA_ARGS__); +#define NO_TRACE_ID "n/a" +#define RD_LOG(level, traceID, msg, ...) \ + LOG##level##MOD(replication, "[traceID={}] [{}] " msg, traceID, identify_str(), ##__VA_ARGS__) #define RD_ASSERT_CMP(assert_type, val1, cmp, val2, ...) \ { \ assert_type##_ASSERT_CMP( \ val1, cmp, val2, \ [&](fmt::memory_buffer& buf, const char* const msgcb, auto&&... args) -> bool { \ - fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}:{}] "}, \ - fmt::make_format_args(file_name(__FILE__), __LINE__)); \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}:{}:{}] "}, \ + fmt::make_format_args(file_name(__FILE__), __LINE__, __FUNCTION__)); \ sisl::logging::default_cmp_assert_formatter(buf, msgcb, std::forward< decltype(args) >(args)...); \ - fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}={}] "}, \ - fmt::make_format_args("rd", rdev_name())); \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}] "}, fmt::make_format_args(identify_str())); \ return true; \ }, \ ##__VA_ARGS__); \ } #define RD_ASSERT(assert_type, cond, ...) \ { \ - assert_type##_ASSERT_FMT(cond, \ - ([&](fmt::memory_buffer& buf, const char* const msgcb, auto&&... args) -> bool { \ - fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}={}] "}, \ - fmt::make_format_args("rd", rdev_name())); \ - fmt::vformat_to(fmt::appender{buf}, fmt::string_view{msgcb}, \ - fmt::make_format_args(std::forward< decltype(args) >(args)...)); \ - return true; \ - }), \ - ##__VA_ARGS__); \ + assert_type##_ASSERT_FMT( \ + cond, ([&](fmt::memory_buffer& buf, const char* const msgcb, auto&&... args) -> bool { \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}:{}:{}] "}, \ + fmt::make_format_args(file_name(__FILE__), __LINE__, __FUNCTION__)); \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}] "}, fmt::make_format_args(identify_str())); \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{msgcb}, \ + fmt::make_format_args(std::forward< decltype(args) >(args)...)); \ + return true; \ + }), \ + ##__VA_ARGS__); \ } #define RD_DBG_ASSERT(cond, ...) RD_ASSERT(DEBUG, cond, ##__VA_ARGS__) @@ -79,12 +71,16 @@ class StateMachineStore; #define RD_REL_ASSERT_GT(val1, val2, ...) RD_ASSERT_CMP(RELEASE, val1, >, val2, ##__VA_ARGS__) #define RD_REL_ASSERT_GE(val1, val2, ...) RD_ASSERT_CMP(RELEASE, val1, >=, val2, ##__VA_ARGS__) -#define RD_LOGT(...) RD_LOG(TRACE, ##__VA_ARGS__) -#define RD_LOGD(...) RD_LOG(DEBUG, ##__VA_ARGS__) -#define RD_LOGI(...) RD_LOG(INFO, ##__VA_ARGS__) -#define RD_LOGW(...) RD_LOG(WARN, ##__VA_ARGS__) -#define RD_LOGE(...) RD_LOG(ERROR, ##__VA_ARGS__) -#define RD_LOGC(...) RD_LOG(CRITICAL, ##__VA_ARGS__) +#define RD_LOGT(traceID, ...) RD_LOG(TRACE, traceID, ##__VA_ARGS__) +#define RD_LOGD(traceID, ...) RD_LOG(DEBUG, traceID, ##__VA_ARGS__) +#define RD_LOGI(traceID, ...) RD_LOG(INFO, traceID, ##__VA_ARGS__) +#define RD_LOGW(traceID, ...) RD_LOG(WARN, traceID, ##__VA_ARGS__) +#define RD_LOGE(traceID, ...) RD_LOG(ERROR, traceID, ##__VA_ARGS__) +#define RD_LOGC(traceID, ...) 
RD_LOG(CRITICAL, traceID, ##__VA_ARGS__) + +// For the logic snapshot obj_id, we use the highest bit to indicate the type of the snapshot message. +// 0 is for HS, 1 is for Application. +static constexpr uint64_t snp_obj_id_type_app = 1ULL << 63; using AsyncNotify = folly::SemiFuture< folly::Unit >; using AsyncNotifier = folly::Promise< folly::Unit >; @@ -97,6 +93,7 @@ class RaftStateMachine : public nuraft::state_machine { nuraft::ptr< nuraft::buffer > m_success_ptr; // Preallocate the success return to raft // iomgr::timer_handle_t m_wait_blkid_write_timer_hdl{iomgr::null_timer_handle}; bool m_resync_mode{false}; + int64_t next_batch_size_hint{0}; public: RaftStateMachine(RaftReplDev& rd); @@ -109,8 +106,10 @@ class RaftStateMachine : public nuraft::state_machine { raft_buf_ptr_t pre_commit_ext(const nuraft::state_machine::ext_op_params& params) override; raft_buf_ptr_t commit_ext(const nuraft::state_machine::ext_op_params& params) override; void commit_config(const ulong log_idx, raft_cluster_config_ptr_t& new_conf) override; - void rollback(uint64_t lsn, nuraft::buffer&) override { LOGCRITICAL("Unimplemented rollback on: [{}]", lsn); } + void rollback_config(const ulong log_idx, raft_cluster_config_ptr_t& conf) override; + void rollback_ext(const nuraft::state_machine::ext_op_params& params) override; void become_ready(); + int64_t get_next_batch_size_hint_in_bytes() override; void create_snapshot(nuraft::snapshot& s, nuraft::async_result< bool >::handler_type& when_done) override; int read_logical_snp_obj(nuraft::snapshot& s, void*& user_ctx, ulong obj_id, raft_buf_ptr_t& data_out, @@ -123,16 +122,20 @@ class RaftStateMachine : public nuraft::state_machine { ////////// APIs outside of nuraft::state_machine requirements //////////////////// ReplServiceError propose_to_raft(repl_req_ptr_t rreq); - repl_req_ptr_t localize_journal_entry_prepare(nuraft::log_entry& lentry); + repl_req_ptr_t localize_journal_entry_prepare(nuraft::log_entry& lentry, int64_t lsn = -1); repl_req_ptr_t localize_journal_entry_finish(nuraft::log_entry& lentry); void link_lsn_to_req(repl_req_ptr_t rreq, int64_t lsn); - void unlink_lsn_to_req(int64_t lsn); + void unlink_lsn_to_req(int64_t lsn, repl_req_ptr_t rreq); repl_req_ptr_t lsn_to_req(int64_t lsn); nuraft_mesg::repl_service_ctx* group_msg_service(); void iterate_repl_reqs(std::function< void(int64_t, repl_req_ptr_t rreq) > const& cb); - std::string rdev_name() const; + std::string identify_str() const; + int64_t reset_next_batch_size_hint(int64_t new_hint); + int64_t inc_next_batch_size_hint(); + + static bool is_hs_snp_obj(uint64_t obj_id) { return (obj_id & snp_obj_id_type_app) == 0; } private: void after_precommit_in_leader(const nuraft::raft_server::req_ext_cb_params& params); diff --git a/src/lib/replication/repl_dev/solo_repl_dev.cpp b/src/lib/replication/repl_dev/solo_repl_dev.cpp index e5e2cb1a5..03b540184 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.cpp +++ b/src/lib/replication/repl_dev/solo_repl_dev.cpp @@ -1,6 +1,7 @@ #include #include "replication/repl_dev/solo_repl_dev.h" #include "replication/repl_dev/common.h" +#include #include #include #include @@ -10,44 +11,56 @@ namespace homestore { SoloReplDev::SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existing) : m_rd_sb{std::move(rd_sb)}, m_group_id{m_rd_sb->group_id} { if (load_existing) { - logstore_service().open_logdev(m_rd_sb->logdev_id); + m_logdev_id = m_rd_sb->logdev_id; + logstore_service().open_logdev(m_rd_sb->logdev_id, flush_mode_t::TIMER); logstore_service() 
.open_log_store(m_rd_sb->logdev_id, m_rd_sb->logstore_id, true /* append_mode */) .thenValue([this](auto log_store) { m_data_journal = std::move(log_store); m_rd_sb->logstore_id = m_data_journal->get_store_id(); m_data_journal->register_log_found_cb(bind_this(SoloReplDev::on_log_found, 3)); + m_is_recovered = true; }); } else { - m_logdev_id = logstore_service().create_new_logdev(); + m_logdev_id = logstore_service().create_new_logdev(flush_mode_t::TIMER); m_data_journal = logstore_service().create_new_log_store(m_logdev_id, true /* append_mode */); m_rd_sb->logstore_id = m_data_journal->get_store_id(); m_rd_sb->logdev_id = m_logdev_id; m_rd_sb.write(); + m_is_recovered = true; } } void SoloReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, - repl_req_ptr_t rreq) { + repl_req_ptr_t rreq, bool part_of_batch, trace_id_t tid) { if (!rreq) { auto rreq = repl_req_ptr_t(new repl_req_ctx{}); } - rreq->init(repl_key{.server_id = 0, .term = 1, .dsn = 1}, - value.size ? journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true, header, key, - value.size); + // incr_pending_request_num(); + auto status = rreq->init(repl_key{.server_id = 0, .term = 1, .dsn = 1, .traceID = tid}, + value.size ? journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true, + header, key, value.size, m_listener); + HS_REL_ASSERT_EQ(status, ReplServiceError::OK, "Error in allocating local blks"); // If it is header only entry, directly write to the journal - if (rreq->has_linked_data()) { - // Step 1: Alloc Blkid - auto const status = rreq->alloc_local_blks(m_listener, value.size); - HS_REL_ASSERT_EQ(status, ReplServiceError::OK, "Error in allocating local blks"); - + if (rreq->has_linked_data() && !rreq->has_state(repl_req_state_t::DATA_WRITTEN)) { // Write the data - data_service().async_write(value, rreq->local_blkid()).thenValue([this, rreq = std::move(rreq)](auto&& err) { + data_service().async_write(value, rreq->local_blkids()).thenValue([this, rreq = std::move(rreq)](auto&& err) { HS_REL_ASSERT(!err, "Error in writing data"); // TODO: Find a way to return error to the Listener write_journal(std::move(rreq)); }); - } else { - write_journal(std::move(rreq)); + } else { write_journal(std::move(rreq)); } +} + +// destroy is only called in worker thread; +void SoloReplDev::destroy() { + HS_REL_ASSERT(iomanager.am_i_worker_reactor(), "Destroy should be called in worker thread"); + while (!m_is_recovered) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); } + + hs()->logstore_service().remove_log_store(m_logdev_id, m_data_journal->get_store_id()); + hs()->logstore_service().destroy_log_dev(m_logdev_id); + + m_rd_sb.destroy(); } void SoloReplDev::write_journal(repl_req_ptr_t rreq) { @@ -62,17 +75,97 @@ void SoloReplDev::write_journal(repl_req_ptr_t rreq) { auto cur_lsn = m_commit_upto.load(); if (cur_lsn < lsn) { m_commit_upto.compare_exchange_strong(cur_lsn, lsn); } - data_service().commit_blk(rreq->local_blkid()); - m_listener->on_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq->local_blkid(), rreq); + for (const auto& blkid : rreq->local_blkids()) { + data_service().commit_blk(blkid); + } + m_listener->on_commit(rreq->lsn(), rreq->header(), rreq->key(), rreq->local_blkids(), rreq); + // decr_pending_request_num(); }); } +std::error_code SoloReplDev::alloc_blks(uint32_t data_size, const blk_alloc_hints& hints, + std::vector< MultiBlkId >& out_blkids) { + // if (is_stopping()) { return 
std::make_error_code(std::errc::operation_canceled); } + + // incr_pending_request_num(); + std::vector< BlkId > blkids; + auto status = + data_service().alloc_blks(sisl::round_up(uint32_cast(data_size), data_service().get_blk_size()), hints, blkids); + if (status != BlkAllocStatus::SUCCESS) { + DEBUG_ASSERT_EQ(status, BlkAllocStatus::SUCCESS, "Unable to allocate blks"); + // decr_pending_request_num(); + return std::make_error_code(std::errc::no_space_on_device); + } + for (auto& blkid : blkids) { + out_blkids.emplace_back(blkid); + } + // decr_pending_request_num(); + return std::error_code{}; +} + +folly::Future< std::error_code > SoloReplDev::async_write(const std::vector< MultiBlkId >& blkids, + sisl::sg_list const& value, bool part_of_batch, + trace_id_t tid) { + /*if (is_stopping()) { + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); + }*/ + + // incr_pending_request_num(); + HS_REL_ASSERT_GT(blkids.size(), 0, "Empty blkid vec"); + std::vector< folly::Future< std::error_code > > futs; + futs.reserve(blkids.size()); + sisl::sg_iterator sg_it{value.iovs}; + + for (const auto& blkid : blkids) { + auto sgs_size = blkid.blk_count() * data_service().get_blk_size(); + const auto iovs = sg_it.next_iovs(sgs_size); + uint32_t total_size = 0; + for (auto& iov : iovs) { + total_size += iov.iov_len; + } + if (total_size != sgs_size) { + LOGINFO("Block size mismatch total_size={} sgs_size={}", total_size, sgs_size); + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::invalid_argument)); + } + sisl::sg_list sgs{sgs_size, iovs}; + futs.emplace_back(data_service().async_write(sgs, blkid, part_of_batch)); + } + + return folly::collectAllUnsafe(futs).thenValue([this](auto&& v_res) { + for (const auto& err_c : v_res) { + if (sisl_unlikely(err_c.value())) { + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::io_error)); + } + } + + // decr_pending_request_num(); + return folly::makeFuture< std::error_code >(std::error_code{}); + }); +} + +void SoloReplDev::async_write_journal(const std::vector< MultiBlkId >& blkids, sisl::blob const& header, + sisl::blob const& key, uint32_t data_size, repl_req_ptr_t rreq, trace_id_t tid) { + // if (is_stopping()) { return; } + // incr_pending_request_num(); + + // We expect clients to provide valid repl req ctx with blocks allocated. + HS_REL_ASSERT(rreq, "Invalid repl req ctx"); + rreq->add_state(repl_req_state_t::BLK_ALLOCATED); + rreq->set_local_blkids(blkids); + auto status = rreq->init(repl_key{.server_id = 0, .term = 1, .dsn = 1, .traceID = tid}, + data_size ? journal_type_t::HS_DATA_LINKED : journal_type_t::HS_DATA_INLINED, true, header, + key, data_size, m_listener); + HS_REL_ASSERT_EQ(status, ReplServiceError::OK, "Error in initializing repl req context."); + + // Write to journal. 
+ write_journal(std::move(rreq)); +} + void SoloReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx) { repl_journal_entry const* entry = r_cast< repl_journal_entry const* >(buf.bytes()); uint32_t remain_size = buf.size() - sizeof(repl_journal_entry); HS_REL_ASSERT_EQ(entry->major_version, repl_journal_entry::JOURNAL_ENTRY_MAJOR, "Mismatched version of journal entry found"); - HS_REL_ASSERT_EQ(entry->code, journal_type_t::HS_DATA_LINKED, "Found a journal entry which is not data"); uint8_t const* raw_ptr = r_cast< uint8_t const* >(entry) + sizeof(repl_journal_entry); sisl::blob header{raw_ptr, entry->user_header_size}; @@ -85,24 +178,44 @@ void SoloReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx raw_ptr += entry->key_size; remain_size -= entry->key_size; - sisl::blob value_blob{raw_ptr, remain_size}; - MultiBlkId blkid; - if (remain_size) { blkid.deserialize(value_blob, true /* copy */); } + std::vector< MultiBlkId > blkids; + while (remain_size > 0) { + MultiBlkId blkid; + sisl::blob value_blob{raw_ptr, sizeof(BlkId)}; + blkid.deserialize(value_blob, true /* copy */); + raw_ptr += sizeof(BlkId); + remain_size -= sizeof(BlkId); + blkids.push_back(blkid); + } m_listener->on_pre_commit(lsn, header, key, nullptr); auto cur_lsn = m_commit_upto.load(); if (cur_lsn < lsn) { m_commit_upto.compare_exchange_strong(cur_lsn, lsn); } - m_listener->on_commit(lsn, header, key, blkid, nullptr); + m_listener->on_commit(lsn, header, key, blkids, nullptr); } folly::Future< std::error_code > SoloReplDev::async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, - bool part_of_batch) { - return data_service().async_read(bid, sgs, size, part_of_batch); + bool part_of_batch, trace_id_t tid) { + /*if (is_stopping()) { + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); + }*/ + // incr_pending_request_num(); + auto result = data_service().async_read(bid, sgs, size, part_of_batch); + // decr_pending_request_num(); + return result; } -void SoloReplDev::async_free_blks(int64_t, MultiBlkId const& bid) { data_service().async_free_blk(bid); } +folly::Future< std::error_code > SoloReplDev::async_free_blks(int64_t, MultiBlkId const& bid, trace_id_t tid) { + /*if (is_stopping()) { + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::operation_canceled)); + }*/ + // incr_pending_request_num(); + auto result = data_service().async_free_blk(bid); + // decr_pending_request_num(); + return result; +} uint32_t SoloReplDev::get_blk_size() const { return data_service().get_blk_size(); } diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index cddb94856..9cf41dcce 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -30,37 +31,70 @@ class CP; class SoloReplDev : public ReplDev { private: logdev_id_t m_logdev_id; - std::shared_ptr< HomeLogStore > m_data_journal; + std::shared_ptr< HomeLogStore > m_data_journal{nullptr}; superblk< repl_dev_superblk > m_rd_sb; uuid_t m_group_id; std::atomic< logstore_seq_num_t > m_commit_upto{-1}; + std::atomic< bool > m_is_recovered{false}; public: SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existing); virtual ~SoloReplDev() = default; + virtual std::error_code alloc_blks(uint32_t data_size, const blk_alloc_hints& hints, + std::vector< MultiBlkId >& out_blkids) override; + virtual 
folly::Future< std::error_code > async_write(const std::vector< MultiBlkId >& blkids, + sisl::sg_list const& value, bool part_of_batch = false, + trace_id_t tid = 0) override; + virtual void async_write_journal(const std::vector< MultiBlkId >& blkids, sisl::blob const& header, + sisl::blob const& key, uint32_t data_size, repl_req_ptr_t ctx, + trace_id_t tid = 0) override; + void async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, - repl_req_ptr_t ctx) override; + repl_req_ptr_t ctx, bool part_of_batch = false, trace_id_t tid = 0) override; folly::Future< std::error_code > async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, - bool part_of_batch = false) override; + bool part_of_batch = false, trace_id_t tid = 0) override; - void async_free_blks(int64_t lsn, MultiBlkId const& blkid) override; + folly::Future< std::error_code > async_free_blks(int64_t lsn, MultiBlkId const& blkid, trace_id_t tid = 0) override; AsyncReplResult<> become_leader() override { return make_async_error(ReplServiceError::OK); } bool is_leader() const override { return true; } replica_id_t get_leader_id() const override { return m_group_id; } std::vector< peer_info > get_replication_status() const override { - return std::vector< peer_info >{peer_info{.id_ = m_group_id, .replication_idx_ = 0, .last_succ_resp_us_ = 0}}; + return std::vector< peer_info >{ + peer_info{.id_ = m_group_id, .replication_idx_ = 0, .last_succ_resp_us_ = 0, .priority_ = 1}}; + } + bool is_ready_for_traffic() const override { return true; } + void purge() override {} + + std::shared_ptr< snapshot_context > deserialize_snapshot_context(sisl::io_blob_safe& snp_ctx) override { + return nullptr; } uuid_t group_id() const override { return m_group_id; } + void set_custom_rdev_name(std::string const& name) override { + std::strncpy(m_rd_sb->rdev_name, name.c_str(), m_rd_sb->max_name_len - 1); + m_rd_sb->rdev_name[m_rd_sb->max_name_len - 1] = '\0'; + } + + repl_lsn_t get_last_commit_lsn() const override { return 0; } + repl_lsn_t get_last_append_lsn() override { return 0; }; + uint32_t get_blk_size() const override; + void quiesce_reqs() override { return; } + void resume_accepting_reqs() override { return; } + + // clear reqs that has allocated blks on the given chunk. 
+ void clear_chunk_req(chunk_num_t chunk_id) override { return; } + void cp_flush(CP* cp); void cp_cleanup(CP* cp); + void destroy(); + private: void write_journal(repl_req_ptr_t rreq); void on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx); diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index 89800df3f..6f3861d59 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include "common/homestore_assert.hpp" #include "replication/service/generic_repl_svc.h" #include "replication/service/raft_repl_service.h" @@ -87,6 +88,9 @@ void SoloReplService::start() { } m_sb_bufs.clear(); + LOGINFO("Repl devs load completed, calling upper layer on_repl_devs_init_completed"); + m_repl_app->on_repl_devs_init_completed(); + hs()->data_service().start(); hs()->logstore_service().start(hs()->is_first_time_boot()); @@ -95,8 +99,23 @@ void SoloReplService::start() { } void SoloReplService::stop() { - GenericReplService::stop(); + /*start_stopping(); + while (true) { + auto pending_request_num = get_pending_request_num(); + if (!pending_request_num) break; + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + }*/ + + // stop all repl_devs + { + std::unique_lock lg(m_rd_map_mtx); + for (auto it = m_rd_map.begin(); it != m_rd_map.end(); ++it) { + auto rdev = std::dynamic_pointer_cast< SoloReplDev >(it->second); + rdev->stop(); + } + } hs()->logstore_service().stop(); + hs()->data_service().stop(); } AsyncReplResult< shared< ReplDev > > SoloReplService::create_repl_dev(group_id_t group_id, @@ -109,6 +128,7 @@ AsyncReplResult< shared< ReplDev > > SoloReplService::create_repl_dev(group_id_t auto listener = m_repl_app->create_repl_dev_listener(group_id); listener->set_repl_dev(rdev); rdev->attach_listener(std::move(listener)); + // incr_pending_request_num(); { std::unique_lock lg(m_rd_map_mtx); @@ -116,15 +136,42 @@ AsyncReplResult< shared< ReplDev > > SoloReplService::create_repl_dev(group_id_t if (!happened) { // We should never reach here, as we have failed to emplace in map, but couldn't find entry DEBUG_ASSERT(false, "Unable to put the repl_dev in rd map"); + // decr_pending_request_num(); return make_async_error< shared< ReplDev > >(ReplServiceError::SERVER_ALREADY_EXISTS); } } + // decr_pending_request_num(); return make_async_success< shared< ReplDev > >(rdev); } folly::SemiFuture< ReplServiceError > SoloReplService::remove_repl_dev(group_id_t group_id) { - return folly::makeSemiFuture< ReplServiceError >(ReplServiceError::NOT_IMPLEMENTED); + // RD_LOGI("Removing repl dev for group_id={}", boost::uuids::to_string(group_id)); + auto rdev = get_repl_dev(group_id); + if (rdev.hasError()) { return folly::makeSemiFuture(rdev.error()); } + + auto rdev_ptr = rdev.value(); + + // 1. Firstly stop the repl dev which waits for any outstanding requests to finish + rdev_ptr->stop(); + + // 2. Destroy the repl dev which will remove the logstore and free the memory; + dp_cast< SoloReplDev >(rdev_ptr)->destroy(); + + // 3. detaches both ways: + // detach rdev from its listener and listener from rdev; + rdev_ptr->detach_listener(); + { + // 4. remove from rd map which finally call SoloReplDev's destructor because this is the last one holding ref to + // this instance; + std::unique_lock lg(m_rd_map_mtx); + m_rd_map.erase(group_id); + } + + // 5. 
now destroy the upper layer's listener instance; + m_repl_app->destroy_repl_dev_listener(group_id); + + return folly::makeSemiFuture(ReplServiceError::OK); } void SoloReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) { @@ -147,23 +194,31 @@ void SoloReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki } } -AsyncReplResult<> SoloReplService::replace_member(group_id_t group_id, replica_id_t member_out, - replica_id_t member_in) const { +AsyncReplResult<> SoloReplService::replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum, + uint64_t trace_id) const { return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); } -std::unique_ptr< CPContext > SoloReplServiceCPHandler::on_switchover_cp(CP* cur_cp, CP* new_cp) { return nullptr; } +AsyncReplResult<> SoloReplService::flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, + uint32_t commit_quorum, bool wait_and_verify, uint64_t trace_id) const { + return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); +} + +std::unique_ptr< CPContext > SoloReplServiceCPHandler::on_switchover_cp(CP* cur_cp, CP* new_cp) { + return std::make_unique< CPContext >(new_cp); +} folly::Future< bool > SoloReplServiceCPHandler::cp_flush(CP* cp) { repl_service().iterate_repl_devs([cp](cshared< ReplDev >& repl_dev) { - if (repl_dev) { std::dynamic_pointer_cast< SoloReplDev >(repl_dev)->cp_flush(cp); } + if (repl_dev) { dp_cast< SoloReplDev >(repl_dev)->cp_flush(cp); } }); return folly::makeFuture< bool >(true); } void SoloReplServiceCPHandler::cp_cleanup(CP* cp) { repl_service().iterate_repl_devs([cp](cshared< ReplDev >& repl_dev) { - if (repl_dev) { std::dynamic_pointer_cast< SoloReplDev >(repl_dev)->cp_cleanup(cp); } + if (repl_dev) { dp_cast< SoloReplDev >(repl_dev)->cp_cleanup(cp); } }); } diff --git a/src/lib/replication/service/generic_repl_svc.h b/src/lib/replication/service/generic_repl_svc.h index e2d445427..cd63a8866 100644 --- a/src/lib/replication/service/generic_repl_svc.h +++ b/src/lib/replication/service/generic_repl_svc.h @@ -73,8 +73,12 @@ class SoloReplService : public GenericReplService { std::set< replica_id_t > const& members) override; folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) override; void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; - AsyncReplResult<> replace_member(group_id_t group_id, replica_id_t member_out, - replica_id_t member_in) const override; + AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum = 0, + uint64_t trace_id = 0) const override; + AsyncReplResult<> flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, + uint32_t commit_quorum, bool wait_and_verify = true, + uint64_t trace_id = 0) const override; }; class SoloReplServiceCPHandler : public CPCallbacks { diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp index 65d928390..8df5d5e6a 100644 --- a/src/lib/replication/service/raft_repl_service.cpp +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -59,6 +59,17 @@ ReplServiceError RaftReplService::to_repl_error(nuraft::cmd_result_code code) { return ret; } +// NuRaft priority decay coefficient is set to 0.8(currently not configurable). 
For more details, please refer to +// https://github.com/eBay/NuRaft/blob/master/docs/leader_election_priority.md +int32_t RaftReplService::compute_raft_follower_priority() { + auto max_wait_round = std::min(raft_priority_election_round_upper_limit, + HS_DYNAMIC_CONFIG(consensus.max_wait_rounds_of_priority_election)); + if (max_wait_round == 0) { return raft_leader_priority; } + auto priority = 1 + static_cast< int32_t >( + std::ceil(raft_leader_priority * std::pow(raft_priority_decay_coefficient, max_wait_round))); + return priority; +} + RaftReplService::RaftReplService(cshared< ReplApplication >& repl_app) : GenericReplService{repl_app} { m_config_sb_bufs.reserve(100); meta_service().register_handler( @@ -79,12 +90,20 @@ void RaftReplService::start() { .ssl_key_ = ioenvironment.get_ssl_key(), .ssl_cert_ = ioenvironment.get_ssl_cert(), .token_verifier_ = std::dynamic_pointer_cast< sisl::GrpcTokenVerifier >(ioenvironment.get_token_verifier()), - .token_client_ = std::dynamic_pointer_cast< sisl::GrpcTokenClient >(ioenvironment.get_token_client())}; + .token_client_ = std::dynamic_pointer_cast< sisl::GrpcTokenClient >(ioenvironment.get_token_client()), + .max_receive_message_size_ = HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size), + .max_send_message_size_ = HS_DYNAMIC_CONFIG(consensus.max_grpc_message_size)}; m_msg_mgr = nuraft_mesg::init_messaging(params, weak_from_this(), true /* with_data_channel */); LOGINFO("Starting RaftReplService with server_uuid={} port={}", boost::uuids::to_string(params.server_uuid_), params.mesg_port_); + // check if ssl cert files are provided, if yes, monitor the changes + if (!params.ssl_key_.empty() && !params.ssl_cert_.empty()) { + ioenvironment.with_file_watcher(); + monitor_cert_changes(); + } + // Step 2: Register all RAFT parameters. At the end of this step, raft is ready to be created/join group auto r_params = nuraft::raft_params() .with_election_timeout_lower(HS_DYNAMIC_CONFIG(consensus.elect_to_low_ms)) @@ -99,7 +118,13 @@ void RaftReplService::start() { .with_snapshot_enabled(HS_DYNAMIC_CONFIG(consensus.snapshot_freq_distance)) .with_leadership_expiry(HS_DYNAMIC_CONFIG(consensus.leadership_expiry_ms)) .with_reserved_log_items(HS_DYNAMIC_CONFIG(consensus.num_reserved_log_items)) + .with_snapshot_sync_ctx_timeout(HS_DYNAMIC_CONFIG(consensus.snapshot_sync_ctx_timeout_ms)) .with_auto_forwarding(false); + // new_joiner_type fully disabled log pack behavior. + // There is no callback available for handling and localizing the log entries within the pack, which could + // result in data corruption. + r_params.use_new_joiner_type_ = false; + r_params.use_bg_thread_for_snapshot_io_ = HS_DYNAMIC_CONFIG(consensus.use_bg_thread_for_snapshot_io); r_params.return_method_ = nuraft::raft_params::async_handler; m_msg_mgr->register_mgr_type(params.default_group_type_, r_params); @@ -118,23 +143,47 @@ void RaftReplService::start() { // We need to first load the repl_dev with its config and then attach the raft config to that repl dev. for (auto const& [buf, mblk] : m_config_sb_bufs) { auto rdev = raft_group_config_found(buf, voidptr_cast(mblk)); - rdev->on_restart(); + // if repl_dev is in destroy_pending state, it will not be loaded. + if (rdev) rdev->on_restart(); } m_config_sb_bufs.clear(); + LOGINFO("Repl devs load completed, calling upper layer on_repl_devs_init_completed"); + m_repl_app->on_repl_devs_init_completed(); // Step 5: Start the data and logstore service now. 
This step is essential before we can ask Raft to join groups etc - hs()->data_service().start(); + + // It is crucial to start the logstore before enabling the data channel. This is because during log replay, + // the commit_blks() function is called, which interacts with the allocator. + // Starting the data channel before the log replay is complete can lead to a race condition between + // PUSHDATA operations and log replay. + // For example, consider that LSN 100 in the log store is associated with PBA1. After a restart, the allocator + // is only aware of allocations up to the last checkpoint and may consider PBA1 as available. + // If a PUSHDATA request is received during this time, PBA1 could be allocated again to a new request, + // leading to data corruption by overwriting the data associated with LSN 100. + // Now the data channel is started in join_group(). + + LOGINFO("Starting LogStore service, first_boot = {}", hs()->is_first_time_boot()); hs()->logstore_service().start(hs()->is_first_time_boot()); + LOGINFO("Started LogStore service, log replay should already be done by this point"); + // all log stores are replayed, time to start data service. + LOGINFO("Starting DataService"); + hs()->data_service().start(); - // Step 6: Iterate all the repl dev and ask each one of the join the raft group. - for (auto it = m_rd_map.begin(); it != m_rd_map.end();) { - auto rdev = std::dynamic_pointer_cast< RaftReplDev >(it->second); - rdev->wait_for_logstore_ready(); - if (!rdev->join_group()) { - it = m_rd_map.erase(it); - } else { - ++it; - } + // Step 6: Iterate all the repl devs and ask each one of them to join the raft group concurrently. + std::vector< std::future< bool > > join_group_futures; + for (const auto& [_, repl_dev] : m_rd_map) { + join_group_futures.emplace_back(std::async(std::launch::async, [&repl_dev]() { + auto rdev = std::dynamic_pointer_cast< RaftReplDev >(repl_dev); + rdev->wait_for_logstore_ready(); + + // upper layer can register a callback to be notified when log replay is done. + if (auto listener = rdev->get_listener(); listener) listener->on_log_replay_done(rdev->group_id()); + return rdev->join_group(); + })); + } + + for (auto& future : join_group_futures) { + if (!future.get()) HS_REL_ASSERT(false, "FAILED TO JOIN GROUP, PANIC HERE"); } // Step 7: Register to CPManager to ensure we can flush the superblk. @@ -148,12 +197,75 @@ } void RaftReplService::stop() { - stop_reaper_thread(); - GenericReplService::stop(); +#if 0 + start_stopping(); + while (true) { + auto pending_request_num = get_pending_request_num(); + if (!pending_request_num) break; + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } +#endif + + // stop all repl_devs + { + std::unique_lock lg(m_rd_map_mtx); + for (auto it = m_rd_map.begin(); it != m_rd_map.end(); ++it) { + auto rdev = std::dynamic_pointer_cast< RaftReplDev >(it->second); + rdev->stop(); + } + } + + // this will stop and shutdown all the repl_dev and grpc server (data channel). + // for each raft_repl_dev: + // 1 Cancel snapshot requests if any exist. + // 2 Terminate background commit thread. + // 3 Cancel all scheduler tasks. + // after m_msg_mgr is reset, no further data will hit data service and no further log will hit log store.
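// ---------------------------------------------------------------------------------------------------------------
// Editor's note (not part of this patch): an illustrative, self-contained sketch of the ordering constraint
// explained in start() above. The names below (allocated_pbas, pba_of_lsn_100) are hypothetical; the point is only
// that a block referenced by an unreplayed log entry looks free until replay re-marks it as allocated, so the
// data channel (PUSHDATA) must not open before log replay completes.
#include <cassert>
#include <cstdint>
#include <set>

inline void replay_ordering_sketch() {
    std::set< uint64_t > allocated_pbas{1, 2, 3}; // allocator state rebuilt from the last checkpoint
    const uint64_t pba_of_lsn_100 = 7;            // allocated after the checkpoint, recorded only in the log

    // Before replay, PBA 7 looks free; a PUSHDATA served now could re-allocate and overwrite it.
    assert(allocated_pbas.count(pba_of_lsn_100) == 0);

    // Log replay (commit_blks) re-marks it as allocated; only after this is it safe to accept PUSHDATA.
    allocated_pbas.insert(pba_of_lsn_100);
    assert(allocated_pbas.count(pba_of_lsn_100) == 1);
}
// ---------------------------------------------------------------------------------------------------------------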
m_msg_mgr.reset(); hs()->logstore_service().stop(); } +void RaftReplService::monitor_cert_changes() { + auto fw = ioenvironment.get_file_watcher(); + auto cert_change_cb = [this](const std::string filepath, const bool deleted) { + LOGINFO("file change event for {}, deleted? {}", filepath, deleted) + // do not block file_watcher thread + std::thread restart_svc(&RaftReplService::restart_raft_svc, this, filepath, deleted); + restart_svc.detach(); + }; + + // monitor ssl cert file + if (!fw->register_listener(ioenvironment.get_ssl_cert(), "hs_ssl_cert_watcher", cert_change_cb)) { + LOGERROR("Failed to register listener, {} to watch file {}, Not monitoring cert files", "hs_ssl_cert_watcher", + ioenvironment.get_ssl_cert()); + } + // monitor ssl key file + if (!fw->register_listener(ioenvironment.get_ssl_key(), "hs_ssl_key_watcher", cert_change_cb)) { + LOGERROR("Failed to register listener, {} to watch file {}, Not monitoring cert files", "hs_ssl_key_watcher", + ioenvironment.get_ssl_key()); + } +} + +void RaftReplService::restart_raft_svc(const std::string filepath, const bool deleted) { + if (deleted && !wait_for_cert(filepath)) { + LOGINFO("file {} deleted, ", filepath) + // wait for the deleted file to be added again + throw std::runtime_error(fmt::format("file {} not found! Can not start grpc server", filepath)); + } + const std::unique_lock lock(raft_restart_mutex); + m_msg_mgr->restart_server(); + if (deleted) { monitor_cert_changes(); } +} + +bool RaftReplService::wait_for_cert(const std::string& filepath) { + auto attempts = cert_change_timeout / cert_check_sleep; + for (auto i = attempts; i > 0; --i) { + if (std::filesystem::exists(filepath)) { return true; } + std::this_thread::sleep_for(cert_check_sleep); + } + return false; +} + RaftReplDev* RaftReplService::raft_group_config_found(sisl::byte_view const& buf, void* meta_cookie) { json_superblk group_config; auto& js = group_config.load(buf, meta_cookie); @@ -234,14 +346,18 @@ AsyncReplResult< shared< ReplDev > > RaftReplService::create_repl_dev(group_id_t return make_async_error< shared< ReplDev > >(to_repl_error(status.error())); } + auto follower_priority = compute_raft_follower_priority(); + auto my_id = m_repl_app->get_my_repl_id(); for (auto& member : members) { if (member == my_id) { continue; } // Skip myself do { - auto const result = m_msg_mgr->add_member(group_id, member).get(); + auto srv_config = nuraft::srv_config(nuraft_mesg::to_server_id(member), 0, boost::uuids::to_string(member), "", + false, follower_priority); + auto const result = m_msg_mgr->add_member(group_id, srv_config).get(); if (result) { - LOGINFOMOD(replication, "Groupid={}, new member={} added", boost::uuids::to_string(group_id), - boost::uuids::to_string(member)); + LOGINFOMOD(replication, "Groupid={}, new member={} added with priority={}", boost::uuids::to_string(group_id), + boost::uuids::to_string(member), follower_priority); break; } else if (result.error() != nuraft::CONFIG_CHANGING) { LOGWARNMOD(replication, "Groupid={}, add member={} failed with error={}", @@ -293,7 +409,10 @@ folly::SemiFuture< ReplServiceError > RaftReplService::remove_repl_dev(group_id_ auto rdev_result = get_repl_dev(group_id); if (!rdev_result) { return folly::makeSemiFuture< ReplServiceError >(ReplServiceError::SERVER_NOT_FOUND); } - return std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value())->destroy_group(); + auto ret = std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value())->destroy_group(); + + // decr_pending_request_num(); + return ret; } void
RaftReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) { @@ -314,7 +433,22 @@ void RaftReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki } if (rd_sb->destroy_pending == 0x1) { - LOGINFOMOD(replication, "ReplDev group_id={} was destroyed, skipping the load", group_id); + LOGINFOMOD(replication, "ReplDev group_id={} was destroyed, reclaim the stale resource", group_id); + // if we do not add the repl_dev to m_rd_map, it will not be permanently destroyed since gc thread finds the + // pending destroy repl_dev only from m_rd_map. so, we should try to reclaim all the repl_dev stale resources + // here. + + // 1 since we permanently destroy the repl_dev here, it will not join the raft group where raft_server will be + // created. hence, no need to destroy it through nuraft_mesg, where raft_server will be shutdown. + // 2 m_raft_config_sb will be destroyed in raft_group_config_found() method if repl_dev is not found, so + // skip it. + + // 3 logdev will be destroyed in delete_unopened_logdevs() if we don't open it (create repl_dev) here, so skip + // it. + + // 4 destroy the superblk, and after this, the repl_dev will not be loaded and found again. + rd_sb.destroy(); + return; } @@ -325,9 +459,49 @@ void RaftReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki add_repl_dev(group_id, rdev); } -AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, replica_id_t member_out, - replica_id_t member_in) const { - return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); +// replace_member actually has two phases: +// 1. start_replace_member: flip member_out to learner and add member_in. +// 2. complete_replace_member: remove member_out. +// This function only invokes replDev start_replace_member. There is +// a background reaper thread that periodically checks the member_in replication status; after member_in has caught up, +// it will trigger replDev complete_replace_member.
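// ---------------------------------------------------------------------------------------------------------------
// Editor's note (not part of this patch): a hypothetical, self-contained model of the two-phase flow described
// above. The types and functions here are illustrative only; in HomeStore the start phase is
// RaftReplDev::start_replace_member() and completion is driven by the reaper timer via
// check_replace_member_status().
#include <functional>

namespace replace_member_sketch {
enum class Phase { idle, syncing, completed };

struct Flow {
    Phase phase{Phase::idle};

    // Phase 1: flip member_out to a learner and add member_in to the group.
    void start() { phase = Phase::syncing; }

    // Periodic check (reaper timer): once member_in has caught up, remove member_out.
    void check(std::function< bool() > const& member_in_caught_up) {
        if (phase == Phase::syncing && member_in_caught_up()) { phase = Phase::completed; }
    }
};
} // namespace replace_member_sketch
// ---------------------------------------------------------------------------------------------------------------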
+AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum, + uint64_t trace_id) const { + // if (is_stopping()) return make_async_error<>(ReplServiceError::STOPPING); + // incr_pending_request_num(); + auto rdev_result = get_repl_dev(group_id); + if (!rdev_result) { return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); } + + return std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value()) + ->start_replace_member(member_out, member_in, commit_quorum, trace_id) + .via(&folly::InlineExecutor::instance()) + .thenValue([this](auto&& e) mutable { + if (e.hasError()) { + // decr_pending_request_num(); + return make_async_error<>(e.error()); + } + // decr_pending_request_num(); + return make_async_success<>(); + }); +} + +AsyncReplResult<> RaftReplService::flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, uint32_t commit_quorum, + bool wait_and_verify, uint64_t trace_id) const { + // if (is_stopping()) return make_async_error<>(ReplServiceError::STOPPING); + // incr_pending_request_num(); + auto rdev_result = get_repl_dev(group_id); + if (!rdev_result) { + // decr_pending_request_num(); + return make_async_error<>(ReplServiceError::SERVER_NOT_FOUND); + } + return std::dynamic_pointer_cast< RaftReplDev >(rdev_result.value()) + ->flip_learner_flag(member, target, commit_quorum, wait_and_verify, trace_id) + .via(&folly::InlineExecutor::instance()) + .thenValue([this](auto&& e) mutable { + if (e.hasError()) { return make_async_error<>(e.error()); } + return make_async_success<>(); + }); } ////////////////////// Reaper Thread related ////////////////////////////////// @@ -344,7 +518,7 @@ void RaftReplService::start_reaper_thread() { m_rdev_gc_timer_hdl = iomanager.schedule_thread_timer( HS_DYNAMIC_CONFIG(generic.repl_dev_cleanup_interval_sec) * 1000 * 1000 * 1000, true /* recurring */, nullptr, [this](void*) { - LOGINFOMOD(replication, "Reaper Thread: Doing GC"); + LOGDEBUGMOD(replication, "Reaper Thread: Doing GC"); gc_repl_reqs(); gc_repl_devs(); }); @@ -361,12 +535,19 @@ void RaftReplService::start_reaper_thread() { HS_DYNAMIC_CONFIG(consensus.flush_durable_commit_interval_ms) * 1000 * 1000, true /* recurring */, nullptr, [this](void*) { flush_durable_commit_lsn(); }); + // Check replace_member sync status to see a new member is fully synced up and ready to remove the old member + m_replace_member_sync_check_timer_hdl = iomanager.schedule_thread_timer( + HS_DYNAMIC_CONFIG(consensus.replace_member_sync_check_interval_ms) * 1000 * 1000, true /* recurring */, + nullptr, [this](void*) { check_replace_member_status(); }); + + p.setValue(); } else { // Cancel all recurring timers started iomanager.cancel_timer(m_rdev_gc_timer_hdl, true /* wait */); iomanager.cancel_timer(m_rdev_fetch_timer_hdl, true /* wait */); iomanager.cancel_timer(m_flush_durable_commit_timer_hdl, true /* wait */); + iomanager.cancel_timer(m_replace_member_sync_check_timer_hdl, true /* wait */); } }); std::move(f).get(); @@ -407,21 +588,43 @@ void RaftReplService::gc_repl_reqs() { } void RaftReplService::gc_repl_devs() { - std::unique_lock lg(m_rd_map_mtx); - for (auto it = m_rd_map.begin(); it != m_rd_map.end();) { - auto rdev = std::dynamic_pointer_cast< RaftReplDev >(it->second); - if (rdev->is_destroy_pending() && - (get_elapsed_time_sec(rdev->destroyed_time()) >= - HS_DYNAMIC_CONFIG(generic.repl_dev_cleanup_interval_sec))) { - LOGINFOMOD(replication, - "ReplDev 
group_id={} was destroyed, shutting down the raft group in delayed fashion now", - rdev->group_id()); - m_msg_mgr->leave_group(rdev->group_id()); - it = m_rd_map.erase(it); - } else { - ++it; + /* incr_pending_request_num(); + // Skip gc when raft repl service is stopping to avoid concurrency issues between repl_dev's stop and destroy ops. + if (is_stopping()) { + LOGINFOMOD(replication, "ReplSvc is stopping, skipping GC"); + decr_pending_request_num(); + return; + } */ + + std::vector< group_id_t > groups_to_leave; + { + std::shared_lock lg(m_rd_map_mtx); + for (auto it = m_rd_map.begin(); it != m_rd_map.end(); ++it) { + auto rdev = std::dynamic_pointer_cast< RaftReplDev >(it->second); + if (rdev->is_destroy_pending() && + (get_elapsed_time_sec(rdev->destroyed_time()) >= + HS_DYNAMIC_CONFIG(generic.repl_dev_cleanup_interval_sec))) { + LOGINFOMOD(replication, + "ReplDev group_id={} was destroyed, shutting down the raft group in delayed fashion now", + rdev->group_id()); + groups_to_leave.push_back(rdev->group_id()); + } + } + } + + // Call leave_group to shut down the raft server and destroy all resources on the repl dev. + // This operation may require acquiring the m_rd_map_mtx lock for some steps (e.g., trigger cp flush). + // Therefore, we perform it outside the lock scope and then remove the group from m_rd_map. + for (const auto& group_id : groups_to_leave) { + m_msg_mgr->leave_group(group_id); + // notify consumer to clean up any resources associated with the listener itself; + m_repl_app->destroy_repl_dev_listener(group_id); + { + std::unique_lock lg(m_rd_map_mtx); + m_rd_map.erase(group_id); } } + // decr_pending_request_num(); } void RaftReplService::flush_durable_commit_lsn() { @@ -433,12 +636,53 @@ } } +void RaftReplService::check_replace_member_status() { + std::unique_lock lg(m_rd_map_mtx); + for (auto& rdev_parent : m_rd_map) { + auto rdev = std::dynamic_pointer_cast< RaftReplDev >(rdev_parent.second); + rdev->check_replace_member_status(); + } +} + ///////////////////// RaftReplService CP Callbacks ///////////////////////////// -std::unique_ptr< CPContext > RaftReplServiceCPHandler::on_switchover_cp(CP* cur_cp, CP* new_cp) { return nullptr; } +int ReplSvcCPContext::add_repl_dev_ctx(ReplDev* dev, cshared< ReplDevCPContext > dev_ctx) { + m_cp_ctx_map.emplace(dev, dev_ctx); + return 0; +} + +cshared< ReplDevCPContext > ReplSvcCPContext::get_repl_dev_ctx(ReplDev* dev) { + if (m_cp_ctx_map.count(dev) == 0) { + // this is possible if a repl dev was added during the cp flush + return std::make_shared< ReplDevCPContext >(); + } + return m_cp_ctx_map[dev]; + +} + +std::unique_ptr< CPContext > RaftReplServiceCPHandler::on_switchover_cp(CP* cur_cp, CP* new_cp) { + // checking if cur_cp == nullptr as on_switchover_cp will be called when registering the cp handler + if (cur_cp != nullptr) { + // Add cp info from all devices to current cp. + // We don't need to take cp_guard as cp_mgr has already taken it in do_trigger_cp_flush + auto cur_cp_ctx = s_cast< ReplSvcCPContext* >(cur_cp->context(cp_consumer_t::REPLICATION_SVC)); + repl_service().iterate_repl_devs([cur_cp, cur_cp_ctx](cshared< ReplDev >& repl_dev) { + // we need to collect the LSN of each repl dev and put it into the current CP. + // There are no dirty buffers accumulated in new_cp yet, as the cp_mgr ensures replication_svc + // is the first one being called during cp switchover.
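// ---------------------------------------------------------------------------------------------------------------
// Editor's note (not part of this patch): a simplified, self-contained sketch of the capture-then-flush pattern
// used above. The Dev/Ctx types are hypothetical stand-ins for ReplDev/ReplDevCPContext; the point is that the
// LSN is snapshotted per device at switchover, and that cp_flush falls back to a default context for devices
// added after the switchover.
#include <map>
#include <memory>
#include <vector>

namespace repl_cp_sketch {
struct Ctx { long captured_lsn{0}; };
struct Dev { long current_lsn{0}; };

struct SvcCpContext {
    std::map< Dev*, std::shared_ptr< Ctx > > ctx_map;

    // switchover: snapshot each device's LSN into the current CP's context
    void capture(std::vector< Dev* > const& devs) {
        for (auto* d : devs) { ctx_map[d] = std::make_shared< Ctx >(Ctx{d->current_lsn}); }
    }

    // flush: use the captured value; a device added after switchover gets a fresh/default context
    std::shared_ptr< Ctx > get(Dev* d) {
        auto it = ctx_map.find(d);
        return (it == ctx_map.end()) ? std::make_shared< Ctx >() : it->second;
    }
};
} // namespace repl_cp_sketch
// ---------------------------------------------------------------------------------------------------------------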
+ auto dev_ctx = std::static_pointer_cast< RaftReplDev >(repl_dev)->get_cp_ctx(cur_cp); + cur_cp_ctx->add_repl_dev_ctx(repl_dev.get(), std::move(dev_ctx)); + }); + } + // create new ctx + auto ctx = std::make_unique< ReplSvcCPContext >(new_cp); + return ctx; +} folly::Future< bool > RaftReplServiceCPHandler::cp_flush(CP* cp) { - repl_service().iterate_repl_devs( - [cp](cshared< ReplDev >& repl_dev) { std::static_pointer_cast< RaftReplDev >(repl_dev)->cp_flush(cp); }); + auto cp_ctx = s_cast< ReplSvcCPContext* >(cp->context(cp_consumer_t::REPLICATION_SVC)); + repl_service().iterate_repl_devs([cp, cp_ctx](cshared< ReplDev >& repl_dev) { + auto dev_ctx = cp_ctx->get_repl_dev_ctx(repl_dev.get()); + std::static_pointer_cast< RaftReplDev >(repl_dev)->cp_flush(cp, dev_ctx); + }); return folly::makeFuture< bool >(true); } diff --git a/src/lib/replication/service/raft_repl_service.h b/src/lib/replication/service/raft_repl_service.h index a38cbbccb..aa9550c4f 100644 --- a/src/lib/replication/service/raft_repl_service.h +++ b/src/lib/replication/service/raft_repl_service.h @@ -31,6 +31,12 @@ namespace homestore { +constexpr auto cert_change_timeout = std::chrono::seconds(1200); +constexpr auto cert_check_sleep = std::chrono::seconds(1); +constexpr int32_t raft_leader_priority = 100; +constexpr double raft_priority_decay_coefficient = 0.8; +constexpr uint32_t raft_priority_election_round_upper_limit = 5; + struct repl_dev_superblk; class RaftReplDev; @@ -46,12 +52,15 @@ class RaftReplService : public GenericReplService, iomgr::timer_handle_t m_rdev_fetch_timer_hdl; iomgr::timer_handle_t m_rdev_gc_timer_hdl; iomgr::timer_handle_t m_flush_durable_commit_timer_hdl; + iomgr::timer_handle_t m_replace_member_sync_check_timer_hdl; iomgr::io_fiber_t m_reaper_fiber; + std::mutex raft_restart_mutex; public: RaftReplService(cshared< ReplApplication >& repl_app); static ReplServiceError to_repl_error(nuraft::cmd_result_code code); + int32_t compute_raft_follower_priority(); ///////////////////// Overrides of nuraft_mesg::MessagingApplication //////////////////// std::string lookup_peer(nuraft_mesg::peer_id_t const&) override; @@ -69,8 +78,13 @@ class RaftReplService : public GenericReplService, std::set< replica_id_t > const& members) override; folly::SemiFuture< ReplServiceError > remove_repl_dev(group_id_t group_id) override; void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; - AsyncReplResult<> replace_member(group_id_t group_id, replica_id_t member_out, - replica_id_t member_in) const override; + AsyncReplResult<> replace_member(group_id_t group_id, const replica_member_info& member_out, + const replica_member_info& member_in, uint32_t commit_quorum = 0, + uint64_t trace_id = 0) const override; + + AsyncReplResult<> flip_learner_flag(group_id_t group_id, const replica_member_info& member, bool target, + uint32_t commit_quorum, bool wait_and_verify = true, + uint64_t trace_id = 0) const override; private: RaftReplDev* raft_group_config_found(sisl::byte_view const& buf, void* meta_cookie); @@ -80,7 +94,28 @@ class RaftReplService : public GenericReplService, void gc_repl_devs(); void gc_repl_reqs(); void flush_durable_commit_lsn(); + void check_replace_member_status(); + void monitor_cert_changes(); + void restart_raft_svc(const std::string filepath, const bool deleted); + bool wait_for_cert(const std::string& filepath); +}; + +// cp context for repl_dev, repl_dev cp_lsn is critical cursor in the system, +// anything below the cp_lsn we believed is persisted through cp and will not +// 
go through replay. The cp_lsn need to be kept into ctx when switchover_cp, +// and the persist of repl_dev_cp need to be done after all other consumers succeed. +struct ReplDevCPContext; + +class ReplSvcCPContext : public CPContext { + std::shared_mutex m_cp_map_mtx; + std::map< ReplDev*, cshared< ReplDevCPContext > > m_cp_ctx_map; + +public: + ReplSvcCPContext(CP* cp) : CPContext(cp) {}; + virtual ~ReplSvcCPContext() = default; + int add_repl_dev_ctx(ReplDev* dev, cshared< ReplDevCPContext > dev_ctx); + cshared< ReplDevCPContext > get_repl_dev_ctx(ReplDev* dev); }; class RaftReplServiceCPHandler : public CPCallbacks { diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index d922f71cb..dece4b36e 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -102,6 +102,41 @@ if (${io_tests}) target_link_libraries(test_cp_mgr homestore ${COMMON_TEST_DEPS} GTest::gtest) add_test(NAME CPMgr COMMAND test_cp_mgr) + can_build_epoll_io_tests(epoll_tests) + if(${epoll_tests}) + add_test(NAME LogDev-Epoll COMMAND test_log_dev) + add_test(NAME LogStore-Epoll COMMAND test_log_store) + add_test(NAME MetaBlkMgr-Epoll COMMAND test_meta_blk_mgr) + add_test(NAME DataService-Epoll COMMAND test_data_service) + endif() + + can_build_spdk_io_tests(spdk_tests) + if(${spdk_tests}) + add_test(NAME LogStore-Spdk COMMAND test_log_store -- --spdk "true") + add_test(NAME LogDev-Spdk COMMAND test_log_dev -- --spdk "true") + add_test(NAME MetaBlkMgr-Spdk COMMAND test_meta_blk_mgr -- --spdk "true") + add_test(NAME DataSerice-Spdk COMMAND test_data_service -- --spdk "true") + if(${epoll_tests}) + SET_TESTS_PROPERTIES(MetaBlkMgr-Spdk PROPERTIES DEPENDS LogStore-Spdk) + SET_TESTS_PROPERTIES(DataService-Spdk PROPERTIES DEPENDS MetaBlkMgr-Spdk) + endif() + endif() +endif() + +can_build_repl_tests(repl_tests) +if (${repl_tests}) + add_executable(test_repl_service) + target_sources(test_repl_service PRIVATE test_repl_service.cpp) + target_link_libraries(test_repl_service homestore ${COMMON_TEST_DEPS} GTest::gmock) + + add_executable(test_repl_log_store) + target_sources(test_repl_log_store PRIVATE test_repl_log_store.cpp) + target_link_libraries(test_repl_log_store hs_logdev homestore ${COMMON_TEST_DEPS} GTest::gmock) + + add_executable(test_repl_data_service) + target_sources(test_repl_data_service PRIVATE test_repl_data_service.cpp) + target_link_libraries(test_repl_data_service homestore ${COMMON_TEST_DEPS} GTest::gmock) + add_executable(test_solo_repl_dev) target_sources(test_solo_repl_dev PRIVATE test_solo_repl_dev.cpp) target_link_libraries(test_solo_repl_dev homestore ${COMMON_TEST_DEPS} GTest::gmock) @@ -114,30 +149,24 @@ if (${io_tests}) target_sources(test_raft_repl_dev PRIVATE test_raft_repl_dev.cpp) target_link_libraries(test_raft_repl_dev homestore ${COMMON_TEST_DEPS} GTest::gmock) + add_executable(test_raft_repl_dev_dynamic) + target_sources(test_raft_repl_dev_dynamic PRIVATE test_raft_repl_dev_dynamic.cpp) + target_link_libraries(test_raft_repl_dev_dynamic homestore ${COMMON_TEST_DEPS} GTest::gmock) + can_build_epoll_io_tests(epoll_tests) if(${epoll_tests}) - add_test(NAME LogDev-Epoll COMMAND test_log_dev) - add_test(NAME LogStore-Epoll COMMAND test_log_store) - add_test(NAME HomeRaftLogStore-Epoll COMMAND test_home_raft_logstore) - add_test(NAME MetaBlkMgr-Epoll COMMAND test_meta_blk_mgr) - add_test(NAME DataService-Epoll COMMAND test_data_service) add_test(NAME RaftReplDev-Epoll COMMAND test_raft_repl_dev) - # add_test(NAME SoloReplDev-Epoll COMMAND test_solo_repl_dev) + add_test(NAME 
RaftReplDevDynamic-Epoll COMMAND test_raft_repl_dev_dynamic --override_config homestore_config.consensus.replace_member_sync_check_interval_ms=1000) + add_test(NAME SoloReplDev-Epoll COMMAND test_solo_repl_dev) + add_test(NAME HomeRaftLogStore-Epoll COMMAND test_home_raft_logstore) endif() can_build_spdk_io_tests(spdk_tests) if(${spdk_tests}) - add_test(NAME LogStore-Spdk COMMAND test_log_store -- --spdk "true") - add_test(NAME LogDev-Spdk COMMAND test_log_dev -- --spdk "true") - add_test(NAME MetaBlkMgr-Spdk COMMAND test_meta_blk_mgr -- --spdk "true") - add_test(NAME DataSerice-Spdk COMMAND test_data_service -- --spdk "true") - add_test(NAME SoloReplDev-Spdk COMMAND test_solo_repl_dev -- --spdk "true") - add_test(NAME HomeRaftLogStore-Spdk COMMAND test_home_raft_logstore -- --spdk "true") add_test(NAME RaftReplDev-Spdk COMMAND test_raft_repl_dev -- --spdk "true") - if(${epoll_tests}) - SET_TESTS_PROPERTIES(MetaBlkMgr-Spdk PROPERTIES DEPENDS LogStore-Spdk) - SET_TESTS_PROPERTIES(DataService-Spdk PROPERTIES DEPENDS MetaBlkMgr-Spdk) - endif() + add_test(NAME RaftReplDevDynamic-Spdk COMMAND test_raft_repl_dev_dynamic -- --spdk "true" --override_config homestore_config.consensus.replace_member_sync_check_interval_ms=1000) + add_test(NAME SoloReplDev-Spdk COMMAND test_solo_repl_dev -- --spdk "true") + add_test(NAME HomeRaftLogStore-Spdk COMMAND test_home_raft_logstore -- --spdk "true") endif() endif() diff --git a/src/tests/btree_helpers/btree_test_helper.hpp b/src/tests/btree_helpers/btree_test_helper.hpp index a7e14df41..9b2b07c52 100644 --- a/src/tests/btree_helpers/btree_test_helper.hpp +++ b/src/tests/btree_helpers/btree_test_helper.hpp @@ -276,7 +276,7 @@ struct BtreeTestHelper { } void range_remove_existing_random() { - static std::uniform_int_distribution< uint32_t > s_rand_range_generator{2, 5}; + static std::uniform_int_distribution< uint32_t > s_rand_range_generator{2, 50}; auto const [start_k, end_k] = m_shadow_map.pick_random_existing_keys(s_rand_range_generator(m_re)); do_range_remove(start_k, end_k, true /* only_existing */); diff --git a/src/tests/btree_helpers/btree_test_kvs.hpp b/src/tests/btree_helpers/btree_test_kvs.hpp index cac6bc4dc..c1baa8f38 100644 --- a/src/tests/btree_helpers/btree_test_kvs.hpp +++ b/src/tests/btree_helpers/btree_test_kvs.hpp @@ -60,6 +60,17 @@ static std::string gen_random_string(size_t len, uint32_t preamble = std::numeri } return str; } +template < typename T > +static bool willAdditionOverflow(T a, int b) { + static_assert(std::is_integral< T >::value, "Template parameter must be an integral type."); + + if (b > 0) { + return a > std::numeric_limits< T >::max() - b; + } else if (b < 0) { + return a < std::numeric_limits< T >::min() - b; + } + return false; +} using namespace homestore; @@ -310,7 +321,7 @@ class TestIntervalKey : public BtreeIntervalKey { m_offset = other->m_offset; } - std::string to_string() const override { return fmt::format("{}.{}", m_base, m_offset); } + std::string to_string() const override { return fmt::format("{}", key()); } static uint32_t get_max_size() { return sizeof(TestIntervalKey); } @@ -323,9 +334,10 @@ class TestIntervalKey : public BtreeIntervalKey { int distance(BtreeKey const& f) const override { TestIntervalKey const& from = s_cast< TestIntervalKey const& >(f); - DEBUG_ASSERT_EQ(m_base, from.m_base, "Invalid from key for distance"); - DEBUG_ASSERT_GE(m_offset, from.m_offset, "Invalid from key for distance"); - return m_offset - from.m_offset; + uint64_t this_val = (uint64_cast(m_base) << 32) | m_offset; + 
uint64_t from_val = (uint64_cast(from.m_base) << 32) | from.m_offset; + DEBUG_ASSERT_GE(this_val, from_val, "Invalid from key for distance"); + return static_cast< int >(this_val - from_val); } bool is_interval_key() const override { return true; } @@ -519,7 +531,8 @@ class TestIntervalValue : public BtreeIntervalValue { m_offset = other->m_offset; } - std::string to_string() const override { return fmt::format("{}.{}", m_base_val, m_offset); } + std::string to_string() const override { return fmt::format("{}", value()); } + uint64_t value() const { return (uint64_cast(m_base_val) << 16) | m_offset; } friend std::ostream& operator<<(std::ostream& os, const TestIntervalValue& v) { os << v.to_string(); diff --git a/src/tests/btree_helpers/shadow_map.hpp b/src/tests/btree_helpers/shadow_map.hpp index 8aae946d3..7d2070e04 100644 --- a/src/tests/btree_helpers/shadow_map.hpp +++ b/src/tests/btree_helpers/shadow_map.hpp @@ -242,6 +242,7 @@ class ShadowMap { file << key.key() << " " << value << '\n'; } file.close(); + LOGINFO("Saved shadow map to file: {}", filename); } void load(const std::string& filename) { diff --git a/src/tests/log_store_benchmark.cpp b/src/tests/log_store_benchmark.cpp index a80f67b45..c34db76a3 100644 --- a/src/tests/log_store_benchmark.cpp +++ b/src/tests/log_store_benchmark.cpp @@ -55,7 +55,7 @@ class BenchLogStore { public: friend class SampleDB; BenchLogStore() { - m_logdev_id = logstore_service().create_new_logdev(); + m_logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); m_log_store = logstore_service().create_new_log_store(m_logdev_id, true /* append_mode */); m_log_store->register_log_found_cb(bind_this(BenchLogStore::on_log_found, 3)); m_nth_entry.store(0); diff --git a/src/tests/test_blk_read_tracker.cpp b/src/tests/test_blk_read_tracker.cpp index 0c91ea035..8c372cf55 100644 --- a/src/tests/test_blk_read_tracker.cpp +++ b/src/tests/test_blk_read_tracker.cpp @@ -25,8 +25,7 @@ using namespace homestore; - -SISL_OPTIONS_ENABLE(logging, test_blk_read_tracker, nuraft_mesg) +SISL_OPTIONS_ENABLE(logging, test_blk_read_tracker) VENUM(op_type_t, uint8_t, insert = 0, remove = 1, wait_on = 2, max_op = 3); class BlkReadTrackerTest : public testing::Test { diff --git a/src/tests/test_blkid.cpp b/src/tests/test_blkid.cpp index 2555d321d..93a1813b8 100644 --- a/src/tests/test_blkid.cpp +++ b/src/tests/test_blkid.cpp @@ -7,8 +7,7 @@ #include - -SISL_OPTIONS_ENABLE(logging, test_blkid, nuraft_mesg) +SISL_OPTIONS_ENABLE(logging, test_blkid) SISL_OPTION_GROUP(test_blkid, (num_iterations, "", "num_iterations", "number of iterations", diff --git a/src/tests/test_btree_long_running b/src/tests/test_btree_long_running index 2e24d18bf..3c9ff5ffa 100644 --- a/src/tests/test_btree_long_running +++ b/src/tests/test_btree_long_running @@ -39,7 +39,7 @@ SISL_OPTION_GROUP( (num_iters, "", "num_iters", "number of iterations for rand ops", ::cxxopts::value< uint32_t >()->default_value("500"), "number"), (num_entries, "", "num_entries", "number of entries to test with", - ::cxxopts::value< uint32_t >()->default_value("5000"), "number"), + ::cxxopts::value< uint32_t >()->default_value("7000"), "number"), (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), "seconds"), (disable_merge, "", "disable_merge", "disable_merge", ::cxxopts::value< bool >()->default_value("0"), ""), (operation_list, "", "operation_list", "operation list instead of default created following by percentage", @@ -269,6 +269,34 @@ TYPED_TEST(BtreeTest, 
RandomInsert) { this->get_all(); } +TYPED_TEST(BtreeTest, TriggerCacheEviction) { + // restart homestore with smaller cache % + HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { + s.resource_limits.cache_size_percent = 1u; + HS_SETTINGS_FACTORY().save(); + }); + + this->restart_homestore(); + + LOGINFO("TriggerCacheEviction test start"); + const auto num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); + LOGINFO("Step 1: Do insert for {} entries", num_entries); + for (uint32_t i{0}; i < num_entries; ++i) { + this->put(i, btree_put_type::INSERT); + // this->print(); + } + + this->get_all(); + + // reset cache pct + HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { + s.resource_limits.cache_size_percent = 65u; + HS_SETTINGS_FACTORY().save(); + }); + + LOGINFO("TriggerCacheEviction test end"); +} + TYPED_TEST(BtreeTest, SequentialRemove) { LOGINFO("SequentialRemove test start"); // Forward sequential insert @@ -633,6 +661,8 @@ struct BtreeConcurrentTest : public BtreeTestHelper< TestType >, public ::testin this->m_bt->count_keys(this->m_bt->root_node_id())); BtreeTestHelper< TestType >::TearDown(); m_helper.shutdown_homestore(false); + this->m_bt.reset(); + log_obj_life_counter(); } private: @@ -663,6 +693,10 @@ int main(int argc, char* argv[]) { auto seed = SISL_OPTIONS["seed"].as< uint64_t >(); LOGINFO("Using seed {} to sow the random generation", seed); g_re.seed(seed); + } else { + auto seed = std::chrono::system_clock::now().time_since_epoch().count(); + LOGINFO("No seed provided. Using randomly generated seed: {}", seed); + g_re.seed(seed); } auto ret = RUN_ALL_TESTS(); return ret; diff --git a/src/tests/test_btree_node.cpp b/src/tests/test_btree_node.cpp index 1634984f3..8698f5100 100644 --- a/src/tests/test_btree_node.cpp +++ b/src/tests/test_btree_node.cpp @@ -104,7 +104,7 @@ struct NodeTest : public testing::Test { } } - void put_range(uint32_t k, uint32_t count) { + void put_range(uint64_t k, uint32_t count) { btree_put_type put_type; if constexpr (!std::is_same_v< V, TestIntervalValue >) { // For non-interval values we support only update, so we need to first put the value @@ -341,6 +341,41 @@ TYPED_TEST(NodeTest, SequentialInsert) { this->validate_get_any(98, 102); } +TYPED_TEST(NodeTest, SimpleInsert) { + auto oc = this->m_node1->occupied_size(); + this->put(1, btree_put_type::INSERT); + this->put(2, btree_put_type::INSERT); + this->put(3, btree_put_type::INSERT); + this->remove(2); + this->remove(1); + this->remove(3); + auto oc2 = this->m_node1->occupied_size(); + ASSERT_EQ(oc, oc2) << "Occupied size cannot be more than original size"; + this->put(1, btree_put_type::INSERT); + this->put(2, btree_put_type::INSERT); + this->put(3, btree_put_type::INSERT); + this->remove(3); + this->remove(2); + this->remove(1); + ASSERT_EQ(oc, oc2) << "Occupied size must be the same as original size"; + + this->put(2, btree_put_type::INSERT); + this->put(1, btree_put_type::INSERT); + this->put(4, btree_put_type::INSERT); + this->put(3, btree_put_type::INSERT); + for (uint32_t i = 5; i <= 50; ++i) { + this->put(i, btree_put_type::INSERT); + } + LOGDEBUG("Creating a hole with size of 11 for prefix compaction usecase"); + for (uint32_t i = 10; i <= 20; ++i) { + this->remove(i); + } + this->m_node1->move_out_to_right_by_entries(*this->m_node2, 20); + uint32_t copy_idx{0u}; + this->m_node1->append_copy_in_upto_size(*this->m_node2, copy_idx, std::numeric_limits< uint32_t >::max(), + /*copy_only_if_fits=*/false); +} + TYPED_TEST(NodeTest, ReverseInsert) { for (uint32_t i{100}; (i > 0 
&& this->has_room()); --i) { this->put(i - 1, btree_put_type::INSERT); diff --git a/src/tests/test_common/homestore_test_common.hpp b/src/tests/test_common/homestore_test_common.hpp index 4df2a7231..404ba8247 100644 --- a/src/tests/test_common/homestore_test_common.hpp +++ b/src/tests/test_common/homestore_test_common.hpp @@ -32,6 +32,10 @@ #include #include +#ifdef _PRERELEASE +#include "common/crash_simulator.hpp" +#endif + const std::string SPDK_ENV_VAR_STRING{"USER_WANT_SPDK"}; const std::string HTTP_SVC_ENV_VAR_STRING{"USER_WANT_HTTP_OFF"}; const std::string CP_WATCHDOG_TIMER_SEC{"USER_SET_CP_WD_TMR_SEC"}; // used in nightly test; @@ -194,8 +198,8 @@ class HSTestHelper { } homestore::HomeStore::instance()->shutdown(); + iomanager.stop(); // Stop iomanager first in case any fiber is still referencing homestore resources homestore::HomeStore::reset_instance(); - iomanager.stop(); if (cleanup) { remove_files(m_generated_devs); @@ -208,9 +212,14 @@ class HSTestHelper { test_params& params(ServiceType svc) { return m_token.svc_params_[svc]; } #ifdef _PRERELEASE - void wait_for_crash_recovery() { + void wait_for_crash_recovery(bool check_will_crash = false) { + if(check_will_crash && !homestore::HomeStore::instance()->crash_simulator().will_crash()) { + return; + } + LOGDEBUG("Waiting for m_crash_recovered future"); m_crash_recovered.getFuture().get(); m_crash_recovered = folly::Promise< folly::Unit >(); + homestore::HomeStore::instance()->crash_simulator().set_will_crash(false); } #endif @@ -247,6 +256,11 @@ class HSTestHelper { m_fc.inject_delay_flip(flip_name, {null_cond}, freq, delay_usec); LOGDEBUG("Flip {} set", flip_name); } + + void remove_flip(const std::string flip_name) { + m_fc.remove_flip(flip_name); + LOGDEBUG("Flip {} removed", flip_name); + } #endif static void fill_data_buf(uint8_t* buf, uint64_t size, uint64_t pattern = 0) { @@ -335,7 +349,7 @@ class HSTestHelper { auto fut = homestore::hs()->cp_mgr().trigger_cp_flush(true /* force */); auto on_complete = [&](auto success) { HS_REL_ASSERT_EQ(success, true, "CP Flush failed"); - LOGINFO("CP Flush completed"); + LOGDEBUG("CP Flush completed"); }; if (wait) { @@ -458,7 +472,9 @@ class HSTestHelper { } else if ((svc == ServiceType::LOG)) { hsi->with_log_service(); } else if (svc == ServiceType::REPLICATION) { +#ifdef REPLICATION_SUPPORT hsi->with_repl_data_service(tp.repl_app, tp.custom_chunk_selector); +#endif } } #ifdef _PRERELEASE diff --git a/src/tests/test_common/hs_repl_test_common.hpp b/src/tests/test_common/hs_repl_test_common.hpp index 67abe2f8e..c00788127 100644 --- a/src/tests/test_common/hs_repl_test_common.hpp +++ b/src/tests/test_common/hs_repl_test_common.hpp @@ -17,6 +17,8 @@ */ #pragma once +#include "raft_repl_test_base.hpp" + #include #include #include @@ -35,9 +37,13 @@ #include #include "test_common/homestore_test_common.hpp" +#include + SISL_OPTION_GROUP(test_repl_common_setup, (replicas, "", "replicas", "Total number of replicas", ::cxxopts::value< uint32_t >()->default_value("3"), "number"), + (spare_replicas, "", "spare_replicas", "Additional number of spare replicas not part of repldev", + ::cxxopts::value< uint32_t >()->default_value("1"), "number"), (base_port, "", "base_port", "Port number of first replica", ::cxxopts::value< uint16_t >()->default_value("4000"), "number"), (replica_num, "", "replica_num", @@ -113,6 +119,9 @@ class HSReplTestHelper : public HSTestHelper { create_repl_dev_listener(homestore::group_id_t group_id) override { return helper_.get_listener(group_id); } + void 
destroy_repl_dev_listener(homestore::group_id_t) override {} + + void on_repl_devs_init_completed() { LOGINFO("Repl dev init completed CB called"); } std::pair< std::string, uint16_t > lookup_peer(homestore::replica_id_t replica_id) const override { uint16_t port; @@ -134,11 +143,12 @@ class HSReplTestHelper : public HSTestHelper { HSReplTestHelper(std::string const& name, std::vector< std::string > const& args, char** argv) : name_{name}, args_{args}, argv_{argv} {} - void setup() { + void setup(uint32_t num_replicas) { + num_replicas_ = num_replicas; replica_num_ = SISL_OPTIONS["replica_num"].as< uint16_t >(); + sisl::logging::SetLogger(name_ + std::string("_replica_") + std::to_string(replica_num_)); sisl::logging::SetLogPattern("[%D %T%z] [%^%L%$] [%n] [%t] %v"); - auto const num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >(); boost::uuids::string_generator gen; for (uint32_t i{0}; i < num_replicas; ++i) { @@ -226,7 +236,7 @@ class HSReplTestHelper : public HSTestHelper { void reset_setup() { teardown(); - setup(); + setup(num_replicas_); } void restart(uint32_t shutdown_delay_secs = 5u) { @@ -249,6 +259,10 @@ class HSReplTestHelper : public HSTestHelper { start_homestore(); } + void reinit_repl_app() { + m_token.params(HS_SERVICE::REPLICATION).repl_app = std::make_unique< TestReplApplication >(*this); + } + uint16_t replica_num() const { return replica_num_; } homestore::replica_id_t my_replica_id() const { return my_replica_id_; } homestore::replica_id_t replica_id(uint16_t member_id) const { @@ -273,8 +287,12 @@ class HSReplTestHelper : public HSTestHelper { if (replica_num_ == 0) { std::set< homestore::replica_id_t > members; - std::transform(members_.begin(), members_.end(), std::inserter(members, members.end()), - [](auto const& p) { return p.first; }); + // By default we create the repl dev with the number of members equal to the replicas argument. + // We don't add spare replicas to the group by default. 
+ for (auto& m : members_) { + if (m.second < SISL_OPTIONS["replicas"].as< uint32_t >()) { members.insert(m.first); } + } + group_id_t repl_group_id = hs_utils::gen_random_uuid(); { std::unique_lock lg(groups_mtx_); @@ -283,7 +301,21 @@ class HSReplTestHelper : public HSTestHelper { auto v = hs()->repl_service().create_repl_dev(repl_group_id, members).get(); ASSERT_EQ(v.hasValue(), true) - << "Error in creating repl dev for group_id=" << boost::uuids::to_string(repl_group_id).c_str(); + << "Error in creating repl dev for group_id=" << boost::uuids::to_string(repl_group_id).c_str() + << ", err=" << v.error(); + auto& raftService = dynamic_cast< RaftReplService& >(hs()->repl_service()); + auto follower_priority = raftService.compute_raft_follower_priority(); + auto repl_dev = v.value(); + ASSERT_EQ(my_replica_id_, repl_dev->get_leader_id()); + auto peer_info = repl_dev->get_replication_status(); + for (auto pinfo : peer_info) { + LOGINFO("Replica={} has priority={}", boost::uuids::to_string(pinfo.id_), pinfo.priority_); + if (pinfo.id_ == my_replica_id_) { + ASSERT_EQ(raft_leader_priority, pinfo.priority_); + } else { + ASSERT_EQ(follower_priority, pinfo.priority_); + } + } } } @@ -299,6 +331,7 @@ class HSReplTestHelper : public HSTestHelper { auto listener = std::move(pending_listeners_[0]); repl_groups_.insert(std::pair(group_id, listener)); pending_listeners_.erase(pending_listeners_.begin()); + LOGINFO("Got listener for group_id={} replica={}", boost::uuids::to_string(group_id), replica_num_); return listener; } @@ -309,6 +342,11 @@ class HSReplTestHelper : public HSTestHelper { } } + void add_listener(std::shared_ptr< ReplDevListener > listener) { + std::unique_lock lg(groups_mtx_); + pending_listeners_.emplace_back(listener); + } + size_t num_listeners() const { std::unique_lock lg(groups_mtx_); return repl_groups_.size(); @@ -346,6 +384,7 @@ class HSReplTestHelper : public HSTestHelper { std::string name_; std::vector< std::string > args_; char** argv_; + uint32_t num_replicas_; std::vector< homestore::dev_info > dev_list_; diff --git a/src/tests/test_common/raft_repl_test_base.hpp b/src/tests/test_common/raft_repl_test_base.hpp new file mode 100644 index 000000000..80eeb1573 --- /dev/null +++ b/src/tests/test_common/raft_repl_test_base.hpp @@ -0,0 +1,770 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. 
+ * + *********************************************************************************/ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include "common/homestore_config.hpp" +#include "common/homestore_assert.hpp" +#include "common/homestore_utils.hpp" + +#define private public +#include "test_common/hs_repl_test_common.hpp" +#include "replication/service/raft_repl_service.h" +#include "replication/repl_dev/raft_repl_dev.h" + +using namespace homestore; + +SISL_LOGGING_DEF(test_raft_repl_dev) +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS, nuraft_mesg, nuraft) + +SISL_OPTION_GROUP(test_raft_repl_dev, + (block_size, "", "block_size", "block size to io", + ::cxxopts::value< uint32_t >()->default_value("4096"), "number"), + (num_raft_groups, "", "num_raft_groups", "number of raft groups per test", + ::cxxopts::value< uint32_t >()->default_value("1"), "number"), + // for below replication parameter, their default value always get from dynamic config, only used + // when specified by user + (snapshot_distance, "", "snapshot_distance", "distance between snapshots", + ::cxxopts::value< uint32_t >()->default_value("0"), "number"), + (num_raft_logs_resv, "", "num_raft_logs_resv", "number of raft logs reserved", + ::cxxopts::value< uint32_t >()->default_value("0"), "number"), + (res_mgr_audit_timer_ms, "", "res_mgr_audit_timer_ms", "resource manager audit timer", + ::cxxopts::value< uint32_t >()->default_value("0"), "number")); + +SISL_OPTIONS_ENABLE(logging, test_raft_repl_dev, iomgr, config, test_common_setup, test_repl_common_setup) + +static std::unique_ptr< test_common::HSReplTestHelper > g_helper; +static std::random_device g_rd{}; +static std::default_random_engine g_re{g_rd()}; + +class TestReplicatedDB : public homestore::ReplDevListener { +public: + struct Key { + uint64_t id_; + bool operator<(Key const& other) const { return id_ < other.id_; } + }; + + struct Value { + int64_t lsn_; + uint64_t data_size_; + uint64_t data_pattern_; + MultiBlkId blkid_; + uint64_t id_; + }; + + struct KeyValuePair { + Key key; + Value value; + }; + + struct test_req : public repl_req_ctx { + struct journal_header { + uint64_t data_size; + uint64_t data_pattern; + uint64_t key_id; // put it in header to test duplication in alloc_local_blks + }; + journal_header jheader; + uint64_t key_id; + sisl::sg_list write_sgs; + sisl::sg_list read_sgs; + + sisl::blob header_blob() { return sisl::blob(uintptr_cast(&jheader), sizeof(journal_header)); } + sisl::blob key_blob() { return sisl::blob{uintptr_cast(&key_id), sizeof(uint64_t)}; } + + test_req() { + write_sgs.size = 0; + read_sgs.size = 0; + key_id = (uint64_t)rand() << 32 | rand(); + jheader.key_id = key_id; + } + + ~test_req() { + for (auto const& iov : write_sgs.iovs) { + iomanager.iobuf_free(uintptr_cast(iov.iov_base)); + } + + for (auto const& iov : read_sgs.iovs) { + iomanager.iobuf_free(uintptr_cast(iov.iov_base)); + } + } + }; + + TestReplicatedDB() = default; + virtual ~TestReplicatedDB() = default; + + void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, + std::vector< MultiBlkId > const& blkids, cintrusive< repl_req_ctx >& ctx) override { + ASSERT_EQ(header.size(), sizeof(test_req::journal_header)); + ASSERT_EQ(blkids.size(), 1); + + auto jheader = r_cast< test_req::journal_header const* >(header.cbytes()); + Key k{.id_ = *(r_cast< uint64_t const* >(key.cbytes()))}; 
+ Value v{.lsn_ = lsn, + .data_size_ = jheader->data_size, + .data_pattern_ = jheader->data_pattern, + .blkid_ = blkids[0], + .id_ = k.id_}; + + LOGINFOMOD(replication, "[Replica={}] Received commit on lsn={} dsn={} key={} value[blkid={} pattern={}]", + g_helper->replica_num(), lsn, ctx->dsn(), k.id_, v.blkid_.to_string(), v.data_pattern_); + + { + std::unique_lock lk(db_mtx_); + inmem_db_.insert_or_assign(k, v); + lsn_index_.emplace(lsn, v); + last_committed_lsn = lsn; + ++commit_count_; + } + + if (ctx->is_proposer()) { g_helper->runner().next_task(); } + } + + bool on_pre_commit(int64_t lsn, const sisl::blob& header, const sisl::blob& key, + cintrusive< repl_req_ctx >& ctx) override { + LOGINFOMOD(replication, "[Replica={}] Received pre-commit on lsn={} dsn={}", g_helper->replica_num(), lsn, + ctx->dsn()); + return true; + } + + void on_rollback(int64_t lsn, const sisl::blob& header, const sisl::blob& key, + cintrusive< repl_req_ctx >& ctx) override { + LOGINFOMOD(replication, "[Replica={}] Received rollback on lsn={}", g_helper->replica_num(), lsn); + } + + void on_restart() { + LOGINFOMOD(replication, "restarted repl dev for [Replica={}] Group={}", g_helper->replica_num(), + boost::uuids::to_string(repl_dev()->group_id())); + } + + void on_error(ReplServiceError error, const sisl::blob& header, const sisl::blob& key, + cintrusive< repl_req_ctx >& ctx) override { + LOGINFOMOD(replication, "[Replica={}] Received error={} on key={}", g_helper->replica_num(), enum_name(error), + *(r_cast< uint64_t const* >(key.cbytes()))); + g_helper->runner().comp_promise_.setException(folly::make_exception_wrapper< ReplServiceError >(error)); + } + + void notify_committed_lsn(int64_t lsn) override { + LOGINFOMOD(replication, "[Replica={}] Received notify_committed_lsn={}", g_helper->replica_num(), lsn); + } + + void on_config_rollback(int64_t lsn) override { + LOGINFOMOD(replication, "[Replica={}] Received config rollback at lsn={}", g_helper->replica_num(), lsn); + } + void on_no_space_left(repl_lsn_t lsn, chunk_num_t chunk_id) override { + LOGINFOMOD(replication, "[Replica={}] Received no_space_left at lsn={}, chunk_id={}", g_helper->replica_num(), + lsn, chunk_id); + } + + AsyncReplResult<> create_snapshot(shared< snapshot_context > context) override { + std::lock_guard< std::mutex > lock(m_snapshot_lock); + auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); + LOGINFOMOD(replication, "[Replica={}] Got snapshot callback term={} idx={}", g_helper->replica_num(), + s->get_last_log_term(), s->get_last_log_idx()); + m_last_snapshot = context; + return make_async_success<>(); + } + + static int64_t get_next_lsn(uint64_t& obj_id) { return obj_id & ((1ULL << 63) - 1); } + static void set_resync_msg_type_bit(uint64_t& obj_id) { obj_id |= 1ULL << 63; } + + int read_snapshot_obj(shared< snapshot_context > context, shared< snapshot_obj > snp_data) override { + auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); + if (RaftStateMachine::is_hs_snp_obj(snp_data->offset)) { + LOGERRORMOD(replication, "invalid snapshot offset={}", snp_data->offset); + return -1; + } + if ((snp_data->offset & snp_obj_id_type_app) == 0) { + LOGERRORMOD(replication, "invalid snapshot offset={}", snp_data->offset); + return -1; + } + + int64_t next_lsn = get_next_lsn(snp_data->offset); + if (next_lsn == 0) { + snp_data->is_last_obj = false; + snp_data->blob = sisl::io_blob_safe(sizeof(ulong)); + LOGINFOMOD(replication, + "[Replica={}] Read logical snapshot 
callback first message obj_id={} term={} idx={}", + g_helper->replica_num(), snp_data->offset, s->get_last_log_term(), s->get_last_log_idx()); + return 0; + } + + std::vector< KeyValuePair > kv_snapshot_obj; + // We cannot use find to get the next element, since if the next lsn is a config lsn, it will not be put into + // lsn_index_ and, as a result, find will return the end of the map. So here we use lower_bound to get the + // first element to be read and transferred. + for (auto iter = lsn_index_.lower_bound(next_lsn); iter != lsn_index_.end(); iter++) { + auto& v = iter->second; + kv_snapshot_obj.emplace_back(Key{v.id_}, v); + LOGTRACEMOD(replication, "[Replica={}] Read logical snapshot callback fetching lsn={} size={} pattern={}", + g_helper->replica_num(), v.lsn_, v.data_size_, v.data_pattern_); + if (kv_snapshot_obj.size() >= 10) { break; } + } + + if (kv_snapshot_obj.size() == 0) { + snp_data->is_last_obj = true; + LOGINFOMOD(replication, "Snapshot is_last_obj is true"); + return 0; + } + + int64_t kv_snapshot_obj_size = sizeof(KeyValuePair) * kv_snapshot_obj.size(); + sisl::io_blob_safe blob{static_cast< uint32_t >(kv_snapshot_obj_size)}; + std::memcpy(blob.bytes(), kv_snapshot_obj.data(), kv_snapshot_obj_size); + snp_data->blob = std::move(blob); + snp_data->is_last_obj = false; + LOGINFOMOD(replication, "[Replica={}] Read logical snapshot callback obj_id={} term={} idx={} num_items={}", + g_helper->replica_num(), snp_data->offset, s->get_last_log_term(), s->get_last_log_idx(), + kv_snapshot_obj.size()); + + return 0; + } + + void snapshot_obj_write(uint64_t data_size, uint64_t data_pattern, MultiBlkId& out_blkids) { + auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); + auto write_sgs = test_common::HSTestHelper::create_sgs(data_size, block_size, data_pattern); + auto fut = homestore::data_service().async_alloc_write(write_sgs, blk_alloc_hints{}, out_blkids); + std::move(fut).get(); + for (auto const& iov : write_sgs.iovs) { + iomanager.iobuf_free(uintptr_cast(iov.iov_base)); + } + } + + void write_snapshot_obj(shared< snapshot_context > context, shared< snapshot_obj > snp_data) override { + if (RaftStateMachine::is_hs_snp_obj(snp_data->offset)) { + LOGERRORMOD(replication, "invalid snapshot offset={}", snp_data->offset); + return; + } + int64_t next_lsn = get_next_lsn(snp_data->offset); + auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); + auto last_committed_idx = + std::dynamic_pointer_cast< RaftReplDev >(repl_dev())->raft_server()->get_committed_log_idx(); + if (next_lsn == 0) { + snp_data->offset = last_committed_lsn + 1; + set_resync_msg_type_bit(snp_data->offset); + LOGINFOMOD(replication, "[Replica={}] Save logical snapshot callback return obj_id={}", + g_helper->replica_num(), snp_data->offset); + return; + } + + size_t kv_snapshot_obj_size = snp_data->blob.size(); + if (kv_snapshot_obj_size == 0) return; + + size_t num_items = kv_snapshot_obj_size / sizeof(KeyValuePair); + std::unique_lock lk(db_mtx_); + auto ptr = r_cast< const KeyValuePair* >(snp_data->blob.bytes()); + for (size_t i = 0; i < num_items; i++) { + auto key = ptr->key; + auto value = ptr->value; + LOGTRACEMOD(replication, "[Replica={}] Save logical snapshot got lsn={} data_size={} data_pattern={}", + g_helper->replica_num(), value.lsn_, value.data_size_, value.data_pattern_); + + // Write to data service and inmem map. 
+ MultiBlkId out_blkids; + if (value.data_size_ != 0) { + snapshot_obj_write(value.data_size_, value.data_pattern_, out_blkids); + value.blkid_ = out_blkids; + } + inmem_db_.insert_or_assign(key, value); + last_committed_lsn = value.lsn_; + ++commit_count_; + ptr++; + } + + snp_data->offset = last_committed_lsn + 1; + set_resync_msg_type_bit(snp_data->offset); + LOGINFOMOD(replication, + "[Replica={}] Save logical snapshot callback obj_id={} term={} idx={} is_last={} num_items={}", + g_helper->replica_num(), snp_data->offset, s->get_last_log_term(), s->get_last_log_idx(), + snp_data->is_last_obj, num_items); + } + + bool apply_snapshot(shared< snapshot_context > context) override { + std::lock_guard< std::mutex > lock(m_snapshot_lock); + auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); + LOGINFOMOD(replication, "[Replica={}] Apply snapshot term={} idx={}", g_helper->replica_num(), + s->get_last_log_term(), s->get_last_log_idx()); + m_last_snapshot = context; + return true; + } + + shared< snapshot_context > last_snapshot() override { + std::lock_guard< std::mutex > lock(m_snapshot_lock); + if (!m_last_snapshot) return nullptr; + + auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(m_last_snapshot)->nuraft_snapshot(); + LOGINFOMOD(replication, "[Replica={}] Last snapshot term={} idx={}", g_helper->replica_num(), + s->get_last_log_term(), s->get_last_log_idx()); + return m_last_snapshot; + } + + void free_user_snp_ctx(void*& user_snp_ctx) override {} + + ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size, + cintrusive< homestore::repl_req_ctx >& hs_ctx) override { + auto jheader = r_cast< test_req::journal_header const* >(header.cbytes()); + Key k{.id_ = jheader->key_id}; + auto iter = inmem_db_.find(k); + if (iter != inmem_db_.end()) { + LOGDEBUG("data already exists in mem db, key={}", k.id_); + auto hints = blk_alloc_hints{}; + hints.committed_blk_id = iter->second.blkid_; + return hints; + } + return blk_alloc_hints{}; + } + void on_start_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) override { + LOGINFO("[Replica={}] start replace member out {} in {}", g_helper->replica_num(), + boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); + } + + void on_complete_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) override { + LOGINFO("[Replica={}] complete replace member out {} in {}", g_helper->replica_num(), + boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id)); + } + + void on_destroy(const group_id_t& group_id) override { + LOGINFOMOD(replication, "[Replica={}] Group={} is being destroyed", g_helper->replica_num(), + boost::uuids::to_string(group_id)); + g_helper->unregister_listener(group_id); + } + + void db_write(uint64_t data_size, uint32_t max_size_per_iov) { + static std::atomic< uint32_t > s_uniq_num{0}; + auto req = intrusive< test_req >(new test_req()); + req->jheader.data_size = data_size; + req->jheader.data_pattern = ((long long)rand() << 32) | ++s_uniq_num; + req->jheader.key_id = req->key_id; + auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); + + LOGINFOMOD(replication, "[Replica={}] Db write key={} data_size={} pattern={} block_size={}", + g_helper->replica_num(), req->key_id, data_size, req->jheader.data_pattern, block_size); + + if (data_size != 0) { + req->write_sgs = + 
test_common::HSTestHelper::create_sgs(data_size, max_size_per_iov, req->jheader.data_pattern); + } + + repl_dev()->async_alloc_write(req->header_blob(), req->key_blob(), req->write_sgs, req, false, s_uniq_num); + } + + void validate_db_data() { + g_helper->runner().set_num_tasks(inmem_db_.size()); + while (!repl_dev()->is_ready_for_traffic()) { + LOGINFO("not yet ready for traffic, waiting"); + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + } + + LOGINFOMOD(replication, "[{}]: Total {} keys committed, validating them", + boost::uuids::to_string(repl_dev()->group_id()), inmem_db_.size()); + auto it = inmem_db_.begin(); + g_helper->runner().set_task([this, &it]() { + Key k; + Value v; + { + std::unique_lock lk(db_mtx_); + std::tie(k, v) = *it; + ++it; + } + + if (v.data_size_ != 0) { + auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); + auto read_sgs = test_common::HSTestHelper::create_sgs(v.data_size_, block_size); + + repl_dev()->async_read(v.blkid_, read_sgs, v.data_size_).thenValue([read_sgs, k, v](auto const ec) { + LOGINFOMOD(replication, "Validating key={} value[blkid={} pattern={}]", k.id_, v.blkid_.to_string(), + v.data_pattern_); + RELEASE_ASSERT(!ec, "Read of blkid={} for key={} error={}", v.blkid_.to_string(), k.id_, + ec.message()); + for (auto const& iov : read_sgs.iovs) { + test_common::HSTestHelper::validate_data_buf(uintptr_cast(iov.iov_base), iov.iov_len, + v.data_pattern_); + iomanager.iobuf_free(uintptr_cast(iov.iov_base)); + } + g_helper->runner().next_task(); + }); + } else { + g_helper->runner().next_task(); + } + }); + g_helper->runner().execute().get(); + } + + uint64_t db_commit_count() const { + std::shared_lock lk(db_mtx_); + return commit_count_; + } + + uint64_t db_size() const { + std::shared_lock lk(db_mtx_); + return inmem_db_.size(); + } + + void create_snapshot() { + auto raft_repl_dev = std::dynamic_pointer_cast< RaftReplDev >(repl_dev()); + ulong snapshot_idx = raft_repl_dev->raft_server()->create_snapshot(); + LOGINFO("Manually create snapshot got index {}", snapshot_idx); + } + + void truncate(int num_reserved_entries) { + auto raft_repl_dev = std::dynamic_pointer_cast< RaftReplDev >(repl_dev()); + // raft_repl_dev->truncate(num_reserved_entries); + LOGINFO("Manually truncated"); + } + + void set_zombie() { zombie_ = true; } + bool is_zombie() { + // Whether a group is a zombie (non-recoverable) + return zombie_; + } + +private: + std::map< Key, Value > inmem_db_; + std::map< int64_t, Value > lsn_index_; + uint64_t commit_count_{0}; + std::shared_mutex db_mtx_; + uint64_t last_committed_lsn{0}; + std::shared_ptr< snapshot_context > m_last_snapshot{nullptr}; + std::mutex m_snapshot_lock; + bool zombie_{false}; +}; + +class RaftReplDevTestBase : public testing::Test { +public: + void SetUp() override { + // By default it will create one db + for (uint32_t i{0}; i < SISL_OPTIONS["num_raft_groups"].as< uint32_t >(); ++i) { + auto db = std::make_shared< TestReplicatedDB >(); + g_helper->register_listener(db); + dbs_.emplace_back(std::move(db)); + } + } + + void TearDown() override { + for (auto const& db : dbs_) { + if (db->is_zombie()) { continue; } + run_on_leader(db, [this, db]() { + auto err = hs()->repl_service().remove_repl_dev(db->repl_dev()->group_id()).get(); + ASSERT_EQ(err, ReplServiceError::OK) << "Error in destroying the group"; + }); + } + + for (auto const& db : dbs_) { + if (db->is_zombie()) { continue; } + auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); + if (!repl_dev) continue; + int i = 
0; + bool force_leave = false; + do { + std::this_thread::sleep_for(std::chrono::seconds(1)); + auto& raft_repl_svc = dynamic_cast< RaftReplService& >(hs()->repl_service()); + raft_repl_svc.gc_repl_devs(); + LOGINFO("Waiting for repl dev to get destroyed"); + + // TODO: if the leader is destroyed but the follower does not receive the notification, it will never be + // destroyed. We need to handle this in raft_repl_dev; revisit here after making changes on the + // raft_repl_dev side to handle this case. This is a workaround to avoid the infinite loop for now. + if (i++ > 10 && !force_leave) { + LOGWARN("has already waited for repl dev to get destroyed for 10 times, so do a force leave"); + repl_dev->force_leave(); + force_leave = true; + } + + } while (!repl_dev->is_destroyed()); + } + } + + void generate_writes(uint64_t data_size, uint32_t max_size_per_iov, shared< TestReplicatedDB > db = nullptr) { + if (db == nullptr) { db = pick_one_db(); } + // LOGINFO("Writing on group_id={}", db->repl_dev()->group_id()); + db->db_write(data_size, max_size_per_iov); + } + + void wait_for_all_commits() { wait_for_commits(written_entries_); } + + void wait_for_commits(uint64_t exp_writes) { + uint64_t total_writes{0}; + while (true) { + total_writes = 0; + for (auto const& db : dbs_) { + total_writes += db->db_commit_count(); + } + + if (total_writes >= exp_writes) { break; } + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + LOGINFO("Replica={} received {} commits but expected {}", g_helper->replica_num(), total_writes, + exp_writes); + } + LOGINFO("Replica={} has received {} commits as expected", g_helper->replica_num(), total_writes); + } + + void validate_data() { + for (auto const& db : dbs_) { + db->validate_db_data(); + } + } + + shared< TestReplicatedDB > pick_one_db() { return dbs_[0]; } + + void assign_leader(uint16_t replica) { + LOGINFO("Switch the leader to replica_num = {}", replica); + if (g_helper->replica_num() == replica) { + for (auto const& db : dbs_) { + do { + auto result = db->repl_dev()->become_leader().get(); + if (result.hasError()) { + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } else { + break; + } + } while (true); + } + } else { + for (auto const& db : dbs_) { + homestore::replica_id_t leader_uuid; + while (true) { + leader_uuid = db->repl_dev()->get_leader_id(); + if (!leader_uuid.is_nil() && (g_helper->member_id(leader_uuid) == replica)) { break; } + + LOGINFO("Waiting for replica={} to become leader", replica); + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + } + } + } + } + + void run_on_leader(std::shared_ptr< TestReplicatedDB > db, auto&& lambda) { + if (!db || !db->repl_dev()) { + // Spares which are not added to the group will not have a repl dev. 
+ return; + } + + do { + auto leader_uuid = db->repl_dev()->get_leader_id(); + + if (leader_uuid.is_nil()) { + LOGINFO("Waiting for leader to be elected for group={}", db->repl_dev()->group_id()); + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + } else if (leader_uuid == g_helper->my_replica_id()) { + lambda(); + break; + } else { + break; + } + } while (true); + } + + void write_on_leader(uint32_t num_entries, bool wait_for_commit = true, shared< TestReplicatedDB > db = nullptr, + uint64_t* data_size = nullptr) { + if (dbs_[0]->repl_dev() == nullptr) return; + + do { + auto repl_dev = dbs_[0]->repl_dev(); + auto leader_uuid = repl_dev->get_leader_id(); + + if (leader_uuid.is_nil()) { + LOGINFO("Waiting for leader to be elected"); + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + } else if (leader_uuid == g_helper->my_replica_id()) { + LOGINFO("Writing {} entries since I am the leader my_uuid={}", num_entries, + boost::uuids::to_string(g_helper->my_replica_id())); + if (!repl_dev->is_ready_for_traffic()) { + LOGINFO("leader is not yet ready for traffic, waiting"); + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + } + auto const block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); + g_helper->runner().set_num_tasks(num_entries); + + LOGINFO("Run on worker threads to schedule append on repldev for {} Bytes.", block_size); + g_helper->runner().set_task([this, block_size, db, data_size]() { + static std::normal_distribution<> num_blks_gen{3.0, 2.0}; + uint64_t size = + data_size == nullptr ? std::abs(std::lround(num_blks_gen(g_re))) * block_size : *data_size; + this->generate_writes(size, block_size, db); + }); + if (wait_for_commit) { g_helper->runner().execute().get(); } + break; + } else { + LOGINFO("{} entries were written on the leader_uuid={} my_uuid={}", num_entries, + boost::uuids::to_string(leader_uuid), boost::uuids::to_string(g_helper->my_replica_id())); + break; + } + } while (true); + + written_entries_ += num_entries; + if (wait_for_commit) { this->wait_for_all_commits(); } + } + replica_id_t wait_and_get_leader_id() { + do { + auto leader_uuid = dbs_[0]->repl_dev()->get_leader_id(); + if (leader_uuid.is_nil()) { + LOGINFO("Waiting for leader to be elected"); + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + } else { + return leader_uuid; + } + } while (true); + } + + ReplServiceError write_with_id(uint64_t id, bool wait_for_commit = true, shared< TestReplicatedDB > db = nullptr) { + if (dbs_[0]->repl_dev() == nullptr) return ReplServiceError::FAILED; + if (db == nullptr) { db = pick_one_db(); } + LOGINFO("Writing data {} since I am the leader my_uuid={}", id, + boost::uuids::to_string(g_helper->my_replica_id())); + auto const block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); + + LOGINFO("Run on worker threads to schedule append on repldev for {} Bytes.", block_size); + g_helper->runner().set_num_tasks(1); + g_helper->runner().set_task([this, block_size, db, id]() { + static std::normal_distribution<> num_blks_gen{3.0, 1.0}; + auto data_size = std::max(1L, std::abs(std::lround(num_blks_gen(g_re)))) * block_size; + ASSERT_GT(data_size, 0); + LOGINFO("data_size larger than 0, go ahead, data_size= {}.", data_size); + static std::atomic< uint32_t > s_uniq_num{0}; + auto req = intrusive(new TestReplicatedDB::test_req()); + req->jheader.data_size = data_size; + req->jheader.data_pattern = ((long long)rand() << 32) | ++s_uniq_num; + // overwrite the key_id with the id passed in + req->jheader.key_id = id; + 
req->key_id = id; + + LOGINFOMOD(replication, "[Replica={}] Db write key={} data_size={} pattern={} block_size={}", + g_helper->replica_num(), req->key_id, data_size, req->jheader.data_pattern, block_size); + + if (data_size != 0) { + req->write_sgs = + test_common::HSTestHelper::create_sgs(data_size, block_size, req->jheader.data_pattern); + } + + db->repl_dev()->async_alloc_write(req->header_blob(), req->key_blob(), req->write_sgs, req); + }); + + if (!wait_for_commit) { return ReplServiceError::OK; } + try { + g_helper->runner().execute().get(); + LOGDEBUG("write data task complete, id={}", id) + } catch (const ReplServiceError& e) { + LOGERRORMOD(replication, "[Replica={}] Error in writing data: id={}, error={}", g_helper->replica_num(), id, + enum_name(e)); + return e; + } + + written_entries_ += 1; + LOGINFO("wait_for_commit={}", written_entries_); + this->wait_for_all_commits(); + return ReplServiceError::OK; + } + + void remove_db(std::shared_ptr< TestReplicatedDB > db, bool wait_for_removal) { + this->run_on_leader(db, [this, db]() { + auto err = hs()->repl_service().remove_repl_dev(db->repl_dev()->group_id()).get(); + ASSERT_EQ(err, ReplServiceError::OK) << "Error in destroying the group"; + }); + + // Remove the db from the dbs_ list and check if count matches with repl_device + for (auto it = dbs_.begin(); it != dbs_.end(); ++it) { + if (*it == db) { + dbs_.erase(it); + break; + } + } + + if (wait_for_removal) { wait_for_listener_destroy(dbs_.size()); } + } + + void wait_for_listener_destroy(uint64_t exp_listeners) { + while (true) { + auto total_listeners = g_helper->num_listeners(); + if (total_listeners == exp_listeners) { break; } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + } + + void restart_replica(uint16_t replica, uint32_t shutdown_delay_sec = 5u) { + if (g_helper->replica_num() == replica) { + LOGINFO("Restart homestore: replica_num = {}", replica); + g_helper->restart(shutdown_delay_sec); + // g_helper->sync_for_test_start(); + } else { + LOGINFO("Wait for replica={} to completely go down and removed from alive raft-groups", replica); + std::this_thread::sleep_for(std::chrono::seconds{5}); + } + } + + void shutdown_replica(uint16_t replica) { + if (g_helper->replica_num() == replica) { + LOGINFO("Shutdown homestore: replica_num = {}", replica); + g_helper->shutdown(); + } else { + LOGINFO("Wait for replica={} to completely go down and removed from alive raft-groups", replica); + std::this_thread::sleep_for(std::chrono::seconds{5}); + } + } + + void start_replica(uint16_t replica) { + if (g_helper->replica_num() == replica) { + LOGINFO("Start homestore: replica_num = {}", replica); + g_helper->start(); + } + } + + void create_snapshot() { dbs_[0]->create_snapshot(); } + void truncate(int num_reserved_entries) { dbs_[0]->truncate(num_reserved_entries); } + + void replace_member(std::shared_ptr< TestReplicatedDB > db, replica_id_t member_out, replica_id_t member_in, + uint32_t commit_quorum = 0, ReplServiceError error = ReplServiceError::OK) { + this->run_on_leader(db, [this, error, db, member_out, member_in, commit_quorum]() { + LOGINFO("Start replace member out={} in={}", boost::uuids::to_string(member_out), + boost::uuids::to_string(member_in)); + + replica_member_info out{member_out, ""}; + replica_member_info in{member_in, ""}; + auto result = hs()->repl_service().replace_member(db->repl_dev()->group_id(), out, in, commit_quorum).get(); + if (error == ReplServiceError::OK) { + ASSERT_EQ(result.hasError(), false) << "Error in replacing 
member, err=" << result.error(); + } else { + ASSERT_EQ(result.hasError(), true); + ASSERT_EQ(result.error(), error) << "Error in replacing member, err=" << result.error(); + } + }); + } + +protected: + std::vector< std::shared_ptr< TestReplicatedDB > > dbs_; + uint32_t written_entries_{0}; + +#ifdef _PRERELEASE + flip::FlipClient m_fc{iomgr_flip::instance()}; +#endif +}; diff --git a/src/tests/test_data_service.cpp b/src/tests/test_data_service.cpp index 0974ca431..e6c47e211 100644 --- a/src/tests/test_data_service.cpp +++ b/src/tests/test_data_service.cpp @@ -445,7 +445,7 @@ class BlkDataServiceTest : public testing::Test { void read_io(uint32_t io_size) { auto remaining_io_size = io_size; while (remaining_io_size > 0) { - auto const bid = get_rand_blkid_to_read(io_size); + auto const bid = get_rand_blkid_to_read(remaining_io_size); if (!bid.is_valid()) { // didn't find any block to read, either write blk map is empty or // all blks are pending on free. @@ -455,6 +455,7 @@ class BlkDataServiceTest : public testing::Test { // every piece in bid is a single block, e.g. nblks = 1 auto const nbids = bid.num_pieces(); auto sub_io_size = nbids * inst().get_blk_size(); + HS_REL_ASSERT_LE(sub_io_size, remaining_io_size, "not expecting sub_io_size to exceed remaining_io_size"); // we pass crc from lambda becaues if there is any async_free_blk, the written blks in the blkcrc map will // be removed by the time read thenVlue is called; @@ -581,7 +582,7 @@ class BlkDataServiceTest : public testing::Test { auto nbids = io_size / inst().get_blk_size(); // number of blks to read; // nbids should not exceed max pieces that MultiBlkId can hold; - nbids = std::max(nbids, MultiBlkId::max_addln_pieces); + nbids = std::min(nbids, MultiBlkId::max_addln_pieces); // make sure skip + nbids are in the range of m_blk_crc_map; if (skip_nbids + nbids > m_blk_crc_map.size()) { skip_nbids = m_blk_crc_map.size() - nbids; } diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp index 77fdfb651..35b44eeaf 100644 --- a/src/tests/test_index_crash_recovery.cpp +++ b/src/tests/test_index_crash_recovery.cpp @@ -33,25 +33,32 @@ SISL_OPTIONS_ENABLE(logging, test_index_crash_recovery, iomgr, test_common_setup // TODO Add tests to do write,remove after recovery. // TODO Test with var len key with io mgr page size is 512. 
-SISL_OPTION_GROUP(test_index_crash_recovery, - (num_iters, "", "num_iters", "number of iterations for rand ops", - ::cxxopts::value< uint32_t >()->default_value("500"), "number"), - (num_entries, "", "num_entries", "number of entries to test with", - ::cxxopts::value< uint32_t >()->default_value("5000"), "number"), - (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), - "seconds"), - (max_keys_in_node, "", "max_keys_in_node", "max_keys_in_node", - ::cxxopts::value< uint32_t >()->default_value("0"), ""), - (operation_list, "", "operation_list", - "operation list instead of default created following by percentage", - ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), - (preload_size, "", "preload_size", "number of entries to preload tree with", - ::cxxopts::value< uint32_t >()->default_value("1000"), "number"), - (init_device, "", "init_device", "init device", ::cxxopts::value< bool >()->default_value("1"), ""), - (cleanup_after_shutdown, "", "cleanup_after_shutdown", "cleanup after shutdown", - ::cxxopts::value< bool >()->default_value("1"), ""), - (seed, "", "seed", "random engine seed, use random if not defined", - ::cxxopts::value< uint64_t >()->default_value("0"), "number")) +SISL_OPTION_GROUP( + test_index_crash_recovery, + (num_iters, "", "num_iters", "number of iterations for rand ops", + ::cxxopts::value< uint32_t >()->default_value("500"), "number"), + (num_entries, "", "num_entries", "number of entries to test with", + ::cxxopts::value< uint32_t >()->default_value("5000"), "number"), + (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), "seconds"), + (num_rounds, "", "num_rounds", "number of rounds to test with", + ::cxxopts::value< uint32_t >()->default_value("100"), "number"), + (num_entries_per_rounds, "", "num_entries_per_rounds", "number of entries per rounds", + ::cxxopts::value< uint32_t >()->default_value("40"), "number"), + (max_keys_in_node, "", "max_keys_in_node", "max_keys_in_node", ::cxxopts::value< uint32_t >()->default_value("20"), + ""), + (min_keys_in_node, "", "min_keys_in_node", "min_keys_in_node", ::cxxopts::value< uint32_t >()->default_value("6"), + ""), + (operation_list, "", "operation_list", "operation list instead of default created following by percentage", + ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), + (preload_size, "", "preload_size", "number of entries to preload tree with", + ::cxxopts::value< uint32_t >()->default_value("1000"), "number"), + (init_device, "", "init_device", "init device", ::cxxopts::value< bool >()->default_value("1"), ""), + (load_from_file, "", "load_from_file", "load from file", ::cxxopts::value< bool >()->default_value("0"), ""), + (save_to_file, "", "save_to_file", "save to file", ::cxxopts::value< bool >()->default_value("0"), ""), + (cleanup_after_shutdown, "", "cleanup_after_shutdown", "cleanup after shutdown", + ::cxxopts::value< bool >()->default_value("1"), ""), + (seed, "", "seed", "random engine seed, use random if not defined", + ::cxxopts::value< uint64_t >()->default_value("0"), "number")) void log_obj_life_counter() { std::string str; @@ -73,8 +80,6 @@ class SequenceGenerator { public: SequenceGenerator(int putFreq, int removeFreq, uint64_t start_range, uint64_t end_range) : putFreq_(putFreq), removeFreq_(removeFreq), start_range_(start_range), end_range_(end_range) { - std::random_device rd; - gen_ = std::mt19937(rd()); keyDist_ = 
std::uniform_int_distribution<>(start_range_, end_range_); updateOperationTypeDistribution(); } @@ -95,27 +100,48 @@ class SequenceGenerator { keyDist_ = std::uniform_int_distribution<>(start_range_, end_range_); } + void fillRange(uint64_t start, uint64_t end) { + for (uint64_t i = start; i <= end; ++i) { + keyStates[i] = true; + } + } + OperationList generateOperations(size_t numOperations, bool reset = false) { std::vector< Operation > operations; if (reset) { this->reset(); } - for (size_t i = 0; i < numOperations; ++i) { - uint32_t key = keyDist_(gen_); + if (putFreq_ == 100 && end_range_ - start_range_ + 1 - in_use_key_cnt_.load() < numOperations) { + LOGDEBUG("All keys are in use, skipping operation generation. end_range_ {} start_range_ {} " + "in_use_key_cnt_ {}, numOperations {}", + end_range_, start_range_, in_use_key_cnt_.load(), numOperations); + return operations; + } + if (removeFreq_ == 100 && in_use_key_cnt_.load() < numOperations) { + LOGDEBUG("Not enough keys are in use, skipping operation generation. in_use_key_cnt_ {} numOperations {}", + in_use_key_cnt_.load(), numOperations); + return operations; + } + + while (operations.size() < numOperations) { + uint32_t key = keyDist_(g_re); auto [it, inserted] = keyStates.try_emplace(key, false); auto& inUse = it->second; - OperationType operation = static_cast< OperationType >(opTypeDist_(gen_)); + OperationType operation = static_cast< OperationType >(opTypeDist_(g_re)); if (operation == OperationType::Put && !inUse) { operations.emplace_back(key, OperationType::Put); inUse = true; + in_use_key_cnt_.fetch_add(1); } else if (operation == OperationType::Remove && inUse) { operations.emplace_back(key, OperationType::Remove); inUse = false; + in_use_key_cnt_.fetch_sub(1); } } return operations; } + __attribute__((noinline)) std::string showKeyState(uint64_t key) const { auto it = keyStates.find(key); if (it != keyStates.end()) { return it->second ? "Put" : "Remove"; } @@ -130,15 +156,18 @@ class SequenceGenerator { } return occurrences; } - __attribute__((noinline)) std::string printOperations(const OperationList& operations) const { + + __attribute__((noinline)) static std::string printOperations(const OperationList& operations) { std::ostringstream oss; + auto count = 1; for (const auto& [key, opType] : operations) { std::string opTypeStr = (opType == OperationType::Put) ? "Put" : "Remove"; - oss << "{" << key << ", " << opTypeStr << "}\n"; + oss << count++ << "- {" << key << ", " << opTypeStr << "}\n"; } return oss.str(); } - __attribute__((noinline)) std::string printKeysOccurrences(const OperationList& operations) const { + + __attribute__((noinline)) static std::string printKeysOccurrences(const OperationList& operations) { std::set< uint64_t > keys = collectUniqueKeys(operations); std::ostringstream oss; for (auto key : keys) { @@ -151,16 +180,52 @@ class SequenceGenerator { } return oss.str(); } - __attribute__((noinline)) std::string printKeyOccurrences(const OperationList& operations, uint64_t key ) const { + + __attribute__((noinline)) static std::string printKeyOccurrences(const OperationList& operations, uint64_t key) { std::ostringstream oss; auto keyOccurrences = inspect(operations, key); oss << "Occurrences of key " << key << ":\n"; for (const auto& [index, operation] : keyOccurrences) { std::string opTypeStr = (operation == OperationType::Put) ? 
"Put" : "Remove"; - oss << "Index: " << index << ", Operation: " << opTypeStr << "\n"; + oss << "Index: " << index << ", Operation: " << opTypeStr << "\n"; } return oss.str(); } + + static std::set< uint64_t > collectUniqueKeys(const OperationList& operations) { + std::set< uint64_t > keys; + for (const auto& [key, _] : operations) { + keys.insert(key); + } + return keys; + } + static void save_to_file(std::string filename, const OperationList& operations) { + std::ofstream file(filename); + if (file.is_open()) { + for (const auto& [key, opType] : operations) { + file << key << " " << static_cast< int >(opType) << "\n"; + } + file.close(); + } + } + + static OperationList load_from_file(std::string filename) { + std::ifstream file(filename); + OperationList operations; + if (file.is_open()) { + std::string line; + while (std::getline(file, line)) { + std::istringstream iss(line); + uint64_t key; + int opType; + iss >> key >> opType; + operations.emplace_back(key, static_cast< OperationType >(opType)); + } + file.close(); + } + return operations; + } + void reset() { keyStates.clear(); } private: @@ -168,25 +233,31 @@ class SequenceGenerator { int removeFreq_; uint64_t start_range_; uint64_t end_range_; - std::mt19937 gen_; std::uniform_int_distribution<> keyDist_; std::discrete_distribution<> opTypeDist_; std::map< uint64_t, bool > keyStates; + std::atomic< uint64_t > in_use_key_cnt_{0}; void updateOperationTypeDistribution() { opTypeDist_ = std::discrete_distribution<>({static_cast< double >(putFreq_), static_cast< double >(removeFreq_)}); } - - std::set< uint64_t > collectUniqueKeys(const OperationList& operations) const { - std::set< uint64_t > keys; - for (const auto& [key, _] : operations) { - keys.insert(key); - } - return keys; - } }; + #ifdef _PRERELEASE + +struct long_running_crash_options { + uint32_t put_freq; + std::vector< std::string > put_flips{}; + std::vector< std::string > remove_flips{}; + uint32_t num_entries{SISL_OPTIONS["num_entries"].as< uint32_t >()}; + uint32_t preload_size{SISL_OPTIONS["preload_size"].as< uint32_t >()}; + uint32_t rounds{SISL_OPTIONS["num_rounds"].as< uint32_t >()}; + uint32_t num_entries_per_rounds{SISL_OPTIONS["num_entries_per_rounds"].as< uint32_t >()}; + bool load_mode{SISL_OPTIONS.count("load_from_file") > 0}; + bool save_mode{SISL_OPTIONS.count("save_to_file") > 0}; +}; + template < typename TestType > struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestType >, public ::testing::Test { using T = TestType; @@ -197,12 +268,15 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT TestIndexServiceCallbacks(IndexCrashTest* test) : m_test(test) {} std::shared_ptr< IndexTableBase > on_index_table_found(superblk< index_table_sb >&& sb) override { - LOGINFO("Index table recovered, root bnode_id {} version {}", sb->root_node, sb->root_link_version); + LOGINFO("Index table recovered, root bnode_id {} uuid {} ordinal {} version {}", + static_cast< uint64_t >(sb->root_node), boost::uuids::to_string(sb->uuid), sb->ordinal, + sb->root_link_version); m_test->m_cfg = BtreeConfig(hs()->index_service().node_size()); m_test->m_cfg.m_leaf_node_type = T::leaf_node_type; m_test->m_cfg.m_int_node_type = T::interior_node_type; m_test->m_cfg.m_max_keys_in_node = SISL_OPTIONS["max_keys_in_node"].as< uint32_t >(); + m_test->m_cfg.m_min_keys_in_node = SISL_OPTIONS["min_keys_in_node"].as< uint32_t >(); m_test->m_bt = std::make_shared< typename T::BtreeType >(std::move(sb), m_test->m_cfg); return m_test->m_bt; } @@ 
-228,9 +302,11 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT {HS_SERVICE::INDEX, {.size_pct = 70.0, .index_svc_cbs = new TestIndexServiceCallbacks(this)}}}, nullptr, {}, SISL_OPTIONS["init_device"].as< bool >()); - LOGINFO("Node size {} ", hs()->index_service().node_size()); this->m_cfg = BtreeConfig(hs()->index_service().node_size()); this->m_cfg.m_max_keys_in_node = SISL_OPTIONS["max_keys_in_node"].as< uint32_t >(); + this->m_cfg.m_min_keys_in_node = SISL_OPTIONS["min_keys_in_node"].as< uint32_t >(); + LOGINFO("Node size {}, max_keys_in_node {}, min_keys_in_node {}", this->m_cfg.node_size(), + this->m_cfg.m_max_keys_in_node, this->m_cfg.m_min_keys_in_node); auto uuid = boost::uuids::random_generator()(); auto parent_uuid = boost::uuids::random_generator()(); @@ -240,28 +316,44 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT BtreeTestHelper< TestType >::SetUp(); if (this->m_bt == nullptr || SISL_OPTIONS["init_device"].as< bool >()) { this->m_bt = std::make_shared< typename T::BtreeType >(uuid, parent_uuid, 0, this->m_cfg); + auto num_keys = this->m_bt->count_keys(this->m_bt->root_node_id()); + // LOGINFO("Creating new index table with uuid {} - init_device:{:s} bt: {} root id {}, num of + // keys {}", boost::uuids::to_string(uuid), SISL_OPTIONS["init_device"].as< bool >(), + // this->m_bt, this->m_bt->root_node_id(), num_keys); + LOGINFO("Creating new index table with uuid {} - root id {}, num of keys {}", boost::uuids::to_string(uuid), + this->m_bt->root_node_id(), num_keys); + } else { populate_shadow_map(); } hs()->index_service().add_index_table(this->m_bt); - LOGINFO("Added index table to index service"); + LOGINFO("Added index table to index service with uuid {} - total tables in the system is currently {}", + boost::uuids::to_string(uuid), hs()->index_service().num_tables()); } void populate_shadow_map() { + LOGINFO("Populating shadow map"); this->m_shadow_map.load(m_shadow_filename); - ASSERT_EQ(this->m_shadow_map.size(), this->m_bt->count_keys(this->m_bt->root_node_id())) - << "shadow map size and tree size mismatch"; + auto num_keys = this->m_bt->count_keys(this->m_bt->root_node_id()); + LOGINFO("Shadow map size {} - btree keys {} - root id {}", this->m_shadow_map.size(), num_keys, + this->m_bt->root_node_id()); + ASSERT_EQ(this->m_shadow_map.size(), num_keys) << "shadow map size and tree size mismatch"; this->get_all(); } void reset_btree() { + hs()->index_service().remove_index_table(this->m_bt); this->m_bt->destroy(); + this->trigger_cp(true); + auto uuid = boost::uuids::random_generator()(); auto parent_uuid = boost::uuids::random_generator()(); this->m_bt = std::make_shared< typename T::BtreeType >(uuid, parent_uuid, 0, this->m_cfg); hs()->index_service().add_index_table(this->m_bt); this->m_shadow_map.range_erase(0, SISL_OPTIONS["num_entries"].as< uint32_t >() - 1); + this->m_shadow_map.save(m_shadow_filename); + LOGINFO("Reset btree with uuid {} - erase shadow map {}", boost::uuids::to_string(uuid), m_shadow_filename); } void restart_homestore(uint32_t shutdown_delay_sec = 3) override { @@ -273,7 +365,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT void reapply_after_crash() { ShadowMap< K, V > snapshot_map{this->m_shadow_map.max_keys()}; snapshot_map.load(m_shadow_filename); - LOGDEBUG("\tSnapshot before crash\n{}", snapshot_map.to_string()); + // LOGINFO("\tSnapshot before crash\n{}", snapshot_map.to_string()); auto diff = 
this->m_shadow_map.diff(snapshot_map); // visualize tree after crash @@ -281,20 +373,28 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT // this->visualize_keys(recovered_tree_filename); // LOGINFO(" tree after recovered stored in {}", recovered_tree_filename); - std::string dif_str = "KEY \tADDITION\n"; - for (const auto& [k, addition] : diff) { - dif_str += fmt::format(" {} \t{}\n", k.key(), addition); + std::string dif_str = "Keys["; + for (const auto& [k, _] : diff) { + dif_str += fmt::format("{} ", k.key()); } - LOGDEBUG("Diff between shadow map and snapshot map\n{}\n", dif_str); + dif_str += "]"; + LOGINFO("Diff between shadow map and snapshot map\n{}\n", dif_str); for (const auto& [k, addition] : diff) { // this->print_keys(fmt::format("reapply: before inserting key {}", k.key())); // this->visualize_keys(recovered_tree_filename); - if (addition) { this->force_upsert(k.key()); } + if (addition) { + LOGDEBUG("Reapply: Inserting key {}", k.key()); + this->force_upsert(k.key()); + } else { + LOGDEBUG("Reapply: Removing key {}", k.key()); + this->remove_one(k.key(), false); + } } - test_common::HSTestHelper::trigger_cp(true); + trigger_cp(true); this->m_shadow_map.save(m_shadow_filename); } + void reapply_after_crash(OperationList& operations) { for (const auto& [key, opType] : operations) { switch (opType) { @@ -308,7 +408,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT break; } } - test_common::HSTestHelper::trigger_cp(true); + trigger_cp(true); } void TearDown() override { @@ -323,60 +423,321 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT LOGINFO("Error: failed to remove {}", m_shadow_filename); } } - LOGINFO("Teardown with Root bnode_id {} tree size: {}", this->m_bt->root_node_id(), - this->m_bt->count_keys(this->m_bt->root_node_id())); + LOGINFO("Teardown with Root bnode_id {} tree size: {}", this->m_bt->root_node_id(), this->tree_key_count()); BtreeTestHelper< TestType >::TearDown(); this->shutdown_homestore(false); } void crash_and_recover(uint32_t s_key, uint32_t e_key) { - this->print_keys("Btree prior to CP and susbsequent simulated crash: "); - test_common::HSTestHelper::trigger_cp(false); - this->wait_for_crash_recovery(); + // this->print_keys("Btree prior to CP and susbsequent simulated crash: "); + trigger_cp(false); + this->wait_for_crash_recovery(true); // this->visualize_keys("tree_after_crash_" + std::to_string(s_key) + "_" + std::to_string(e_key) + ".dot"); - this->print_keys("Post crash and recovery, btree structure: "); + // this->print_keys("Post crash and recovery, btree structure: "); this->reapply_after_crash(); + // this->print_keys("Post reapply, btree structure: "); + this->get_all(); LOGINFO("Expect to have [{},{}) in tree and it is actually{} ", s_key, e_key, tree_key_count()); - ASSERT_EQ(this->m_shadow_map.size(), this->m_bt->count_keys(this->m_bt->root_node_id())) - << "shadow map size and tree size mismatch"; + ASSERT_EQ(this->m_shadow_map.size(), this->tree_key_count()) << "shadow map size and tree size mismatch"; + } + + void sanity_check(OperationList& operations) const { + std::set< uint64_t > new_keys; + std::transform(operations.begin(), operations.end(), std::inserter(new_keys, new_keys.end()), + [](const Operation& operation) { return operation.first; }); + uint32_t count = 0; + this->m_shadow_map.foreach ([this, new_keys, &count](K key, V value) { + // discard the new keys to check + if (new_keys.find(key.key()) != new_keys.end()) { 
return; }
+            count++;
+            auto copy_key = std::make_unique< K >();
+            *copy_key = key;
+            auto out_v = std::make_unique< V >();
+            auto req = BtreeSingleGetRequest{copy_key.get(), out_v.get()};
+            req.enable_route_tracing();
+            const auto ret = this->m_bt->get(req);
+            if (ret != btree_status_t::success) {
+                this->print_keys(fmt::format("Sanity check: key {}", key.key()));
+                this->dump_to_file("sanity_fail.txt");
+            }
+            ASSERT_EQ(ret, btree_status_t::success) << "Missing key " << key << " in btree but present in shadow map";
+        });
+        LOGINFO("Sanity check passed for {} keys!", count);
     }

-    void crash_and_recover(OperationList& operations, std::string filename = "") {
-        // this->print_keys("Btree prior to CP and susbsequent simulated crash: ");
-        test_common::HSTestHelper::trigger_cp(false);
-        this->wait_for_crash_recovery();
-        // this->print_keys("Post crash and recovery, btree structure:");
+    void crash_and_recover_common(OperationList& operations, std::string filename = "") {
+        // this->print_keys("Btree prior to CP and subsequent simulated crash: ");
+        LOGINFO("Before Crash: {} keys in shadow map and actually {} keys in tree - operations size {}",
+                this->m_shadow_map.size(), tree_key_count(), operations.size());
         if (!filename.empty()) {
-            LOGINFO("Visualize the tree file {}", filename);
-            this->visualize_keys(filename);
+            std::string b_filename = filename + "_before_crash.dot";
+            LOGINFO("Visualize the tree before crash file {}", b_filename);
+            this->visualize_keys(b_filename);
         }
-        this->reapply_after_crash(operations);
+        trigger_cp(false);
+        LOGINFO("Waiting for crash recovery");
+        this->wait_for_crash_recovery(true);

-        // this->print_keys("\n\nafter reapply keys");
         if (!filename.empty()) {
-            LOGINFO("Visualize the tree file after_reapply__{}", filename);
-            this->visualize_keys("after_reapply__" + filename);
+            std::string rec_filename = filename + "_after_recovery.dot";
+            LOGINFO("Visualize the tree file after recovery: {}", rec_filename);
+            this->visualize_keys(rec_filename);
+        }
+        // this->print_keys("Post crash and recovery, btree structure: ");
+        sanity_check(operations);
+        // Added to the index service right after recovery. Not needed here
+        // test_common::HSTestHelper::trigger_cp(true);
+        LOGINFO("Before Reapply: {} keys in shadow map and actually {} in tree - operations size {}",
+                this->m_shadow_map.size(), tree_key_count(), operations.size());
+        this->reapply_after_crash(operations);
+        if (!filename.empty()) {
+            std::string re_filename = filename + "_after_reapply.dot";
+            LOGINFO("Visualize the tree after reapply {}", re_filename);
+            this->visualize_keys(re_filename);
+        }
+        // this->print_keys("Post reapply, btree structure: ");
         this->get_all();
+        LOGINFO("After reapply: {} keys in shadow map and actually {} in tree", this->m_shadow_map.size(),
+                tree_key_count());
+        ASSERT_EQ(this->m_shadow_map.size(), this->m_bt->count_keys(this->m_bt->root_node_id()))
+            << "shadow map size and tree size mismatch";
+    }
+
+    void crash_and_recover(std::string& flip, OperationList& operations, std::string filename = "") {
+        this->remove_flip(flip);
+        this->crash_and_recover_common(operations, filename);
+    }
+
+    void crash_and_recover(std::vector< std::string >& flips, OperationList& operations, std::string filename = "") {
+        for (auto const& flip : flips) {
+            this->remove_flip(flip);
+        }
+        this->crash_and_recover_common(operations, filename);
     }

     uint32_t tree_key_count() { return this->m_bt->count_keys(this->m_bt->root_node_id()); }

+    void long_running_crash(long_running_crash_options const& crash_test_options) {
+        // set putFreq 100 for the initial load
+        SequenceGenerator generator(100 /*putFreq*/, 0 /* removeFreq*/, 0 /*start_range*/,
+                                    crash_test_options.num_entries - 1 /*end_range*/);
+
+        std::vector< std::string > flips;
+        OperationList operations;
+        auto m_start_time = Clock::now();
+        auto time_to_stop = [this, m_start_time]() { return (get_elapsed_time_sec(m_start_time) > this->m_run_time); };
+        double elapsed_time, progress_percent, last_progress_time = 0;
+        bool renew_btree_after_crash = false;
+        auto cur_put_flip_idx = 0;
+        auto cur_remove_flip_idx = 0;
+        std::uniform_int_distribution<> dis(1, 100);
+        int flip_percentage = 90; // Set the desired percentage here
+        bool normal_execution = true;
+        bool clean_shutdown = true;
+        // in save mode, delete all previously saved operation and flip files
+        if (crash_test_options.save_mode) {
+            std::filesystem::remove_all("/tmp/operations_*.txt");
+            std::filesystem::remove_all("/tmp/flips_history.txt");
+        }
+        // init tree
+        LOGINFO("Step 0: Fill up the tree with {} entries", crash_test_options.preload_size);
+        if (crash_test_options.load_mode) {
+            operations = SequenceGenerator::load_from_file(fmt::format("/tmp/operations_0.txt"));
+        } else {
+            operations = generator.generateOperations(crash_test_options.preload_size, true /* reset */);
+            if (crash_test_options.save_mode) {
+                SequenceGenerator::save_to_file(fmt::format("/tmp/operations_0.txt"), operations);
+            }
+        }
+
+        LOGDEBUG("Operations before crash:\n{}", SequenceGenerator::printOperations(operations));
+        uint32_t num_keys{0};
+
+        for (auto [k, _] : operations) {
+            this->put(k, btree_put_type::INSERT, true /* expect_success */);
+            num_keys++;
+        }
+
+        generator.setPutFrequency(crash_test_options.put_freq);
+        generator.setRemoveFrequency(100 - crash_test_options.put_freq);
+
+        // Trigger a cp to make sure the preloaded entries are persisted
+        LOGINFO("Step 0-1: Flush all the entries so far");
+        test_common::HSTestHelper::trigger_cp(true);
+        this->get_all();
+        this->m_shadow_map.save(this->m_shadow_filename);
+        // this->print_keys("reapply: after preload");
+        this->visualize_keys("tree_after_preload.dot");
+
+        for (uint32_t round
<= crash_test_options.rounds && !time_to_stop(); round++) { + LOGINFO("\n\n\n\n\n\nRound {} of {}\n\n\n\n\n\n", round, crash_test_options.rounds); + bool print_time = false; + elapsed_time = get_elapsed_time_sec(m_start_time); + + if (crash_test_options.load_mode) { + operations = SequenceGenerator::load_from_file(fmt::format("/tmp/operations_{}.txt", round)); + } else { + operations = generator.generateOperations(crash_test_options.num_entries_per_rounds, + renew_btree_after_crash /* reset */); + if (crash_test_options.save_mode) { + SequenceGenerator::save_to_file(fmt::format("/tmp/operations_{}.txt", round), operations); + } + } + if (operations.empty()) { + LOGDEBUG("No operations generated, skipping round {}", round); + continue; + } + + flips.clear(); + if (crash_test_options.load_mode) { + std::ifstream file("/tmp/flips_history.txt"); + std::string line; + bool found = false; + for (uint32_t i = 0; i < round && std::getline(file, line); i++) { + if (i == round - 1) { + found = true; + break; + } + } + if (found && !line.empty()) { + if (line == "normal") { + normal_execution = true; + } else { + normal_execution = false; + std::istringstream iss(line); + std::string flip; + while (iss >> flip) { + flips.emplace_back(flip); + } + auto log_str = fmt::format("Step 1-{}: Set flag", round); + for (auto const& f : flips) { + log_str += fmt::format(" {}", f); + this->set_basic_flip(f, 1, 100); + } + LOGINFO("{}", log_str); + } + } + file.close(); + } else { + if (dis(g_re) <= flip_percentage) { + if (!crash_test_options.put_flips.empty()) { + flips.emplace_back( + crash_test_options.put_flips[cur_put_flip_idx++ % crash_test_options.put_flips.size()]); + } + if (!crash_test_options.remove_flips.empty()) { + flips.emplace_back(crash_test_options.remove_flips[cur_remove_flip_idx++ % + crash_test_options.remove_flips.size()]); + } + auto log_str = fmt::format("Step 1-{}: Set flag", round); + for (auto const& f : flips) { + log_str += fmt::format(" {}", f); + this->set_basic_flip(f, 1, 100); + } + LOGINFO("{}", log_str); + normal_execution = false; + } else { + normal_execution = true; + LOGINFO("Step 1-{}: No flip set", round); + } + if (crash_test_options.save_mode) { + // save the filp name to a file for later use + std::ofstream file("/tmp/flips_history.txt", std::ios::app); + if (file.is_open()) { + std::string out_line{"normal"}; + if (!normal_execution) { + out_line = flips[0]; + for (size_t i = 1; i < flips.size(); i++) { + out_line += " " + flips[i]; + } + } + file << out_line << "\n"; + } + file.close(); + } + } + + LOGDEBUG("Lets before crash print operations\n{}", SequenceGenerator::printOperations(operations)); + + for (auto [k, op] : operations) { + if (op == OperationType::Remove) { + if (num_keys < 1) { + // remove flips and continue + for (auto const& flip : flips) { + this->remove_flip(flip); + } + continue; + } + LOGDEBUG("Removing key {}", k); + this->remove_one(k, true /* expect_success */); + num_keys--; + } else { + if (num_keys >= crash_test_options.num_entries) { + // remove flips and continue + for (auto const& flip : flips) { + this->remove_flip(flip); + } + continue; + } + LOGDEBUG("Inserting key {}", k); + this->put(k, btree_put_type::INSERT, true /* expect_success */); + num_keys++; + } + if (!time_to_stop()) { + static bool print_alert = false; + if (print_alert) { + LOGINFO("It is time to stop but let's finish this round and then stop!"); + print_alert = false; + } + } + } + if (normal_execution) { + if (clean_shutdown) { + 
this->m_shadow_map.save(this->m_shadow_filename); + this->restart_homestore(); + } else { + test_common::HSTestHelper::trigger_cp(true); + this->get_all(); + } + } else { + // remove the flips so that they do not get triggered erroneously + this->crash_and_recover(flips, operations, fmt::format("long_tree_{}", round)); + } + if (elapsed_time - last_progress_time > 30) { + last_progress_time = elapsed_time; + print_time = true; + } + if (print_time) { + LOGINFO( + "\n\n\n\t\t\tProgress: {} rounds of total {} ({:.2f}%) completed - Elapsed time: {:.0f} seconds of " + "total {} ({:.2f}%) - {} keys of maximum {} keys ({:.2f}%) inserted\n\n\n", + round, crash_test_options.rounds, round * 100.0 / crash_test_options.rounds, elapsed_time, + this->m_run_time, elapsed_time * 100.0 / this->m_run_time, this->tree_key_count(), + crash_test_options.num_entries, this->tree_key_count() * 100.0 / crash_test_options.num_entries); + } + // this->print_keys(fmt::format("reapply: after round {}", round)); + if (renew_btree_after_crash) { this->reset_btree(); }; + } + this->destroy_btree(); + log_obj_life_counter(); + } + protected: const std::string m_shadow_filename = "/tmp/shadow_map_index_recovery.txt"; }; // Crash recovery can test one simple btree, since focus is not on btree test itself, but index recovery -using BtreeTypes = testing::Types< FixedLenBtree >; +using BtreeTypes = testing::Types< FixedLenBtree, PrefixIntervalBtree >; TYPED_TEST_SUITE(IndexCrashTest, BtreeTypes); TYPED_TEST(IndexCrashTest, CrashBeforeFirstCp) { + this->m_shadow_map.range_erase(0, SISL_OPTIONS["num_entries"].as< uint32_t >() - 1); + this->m_shadow_map.save(this->m_shadow_filename); // Simulate the crash even before first cp this->set_basic_flip("crash_flush_on_root"); @@ -385,13 +746,15 @@ TYPED_TEST(IndexCrashTest, CrashBeforeFirstCp) { // Trigger a cp, which should induce the crash and wait for hs to recover test_common::HSTestHelper::trigger_cp(false); - this->wait_for_crash_recovery(); + this->wait_for_crash_recovery(true); // Post crash, load the shadow_map into a new instance and compute the diff. 
Redo the operation this->reapply_after_crash(); } TYPED_TEST(IndexCrashTest, SplitOnLeftEdge) { + this->m_shadow_map.range_erase(0, SISL_OPTIONS["num_entries"].as< uint32_t >() - 1); + this->m_shadow_map.save(this->m_shadow_filename); // Insert into 4 phases, first fill up the last part, since we need to test split on left edge LOGINFO("Step 1: Fill up the last quarter of the tree"); auto const num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); @@ -441,82 +804,6 @@ TYPED_TEST(IndexCrashTest, SplitOnLeftEdge) { this->query_all_paginate(80); } -/* -TYPED_TEST(IndexCrashTest, ManualMergeCrash){ - // Define the lambda function - const uint32_t num_entries = 30; - - auto initTree = [this, num_entries]() { - for (uint64_t k = 0u; k < num_entries; ++k) { - this->force_upsert(k); - } - test_common::HSTestHelper::trigger_cp(true); - this->m_shadow_map.save(this->m_shadow_filename); - }; - - std::vector< OperationList > removing_scenarios = { - {{29, OperationType::Remove}, - {28, OperationType::Remove}, - {27, OperationType::Remove}, - {26, OperationType::Remove}, - {25, OperationType::Remove}, - {24, OperationType::Remove}} - }; - - auto scenario = removing_scenarios[0]; - - LOGINFO("Step 1-1: Populate some keys and flush"); - initTree(); - this->visualize_keys("tree_init.dot"); - LOGINFO("Step 2-1: Set crash flag, remove some keys in reverse order"); - this->set_basic_flip("crash_flush_on_merge_at_parent"); - - for (auto [k, _] : scenario) { - LOGINFO("\n\n\t\t\t\t\t\t\t\t\t\t\t\t\tRemoving entry {}", k); - this->remove_one(k); - } - this->visualize_keys("tree_before_crash.dot"); - - LOGINFO("Step 3-1: Trigger cp to crash"); - this->crash_and_recover(scenario, "recover_tree_crash_1.dot"); - test_common::HSTestHelper::trigger_cp(true); - this->get_all(); - - LOGINFO("Step 1-2: Populate some keys and flush"); - initTree(); - this->visualize_keys("tree_init_02.dot"); - LOGINFO("Step 2-2: Set crash flag, remove some keys in reverse order"); - this->set_basic_flip("crash_flush_on_merge_at_left_child"); - for (auto [k, _] : scenario) { - LOGINFO("\n\n\t\t\t\t\t\t\t\t\t\t\t\t\tRemoving entry {}", k); - this->remove_one(k); - } - this->visualize_keys("tree_before_crash_2.dot"); - - LOGINFO("Step 3-2: Trigger cp to crash"); - this->crash_and_recover(scenario, "recover_tree_crash_2.dot"); - test_common::HSTestHelper::trigger_cp(true); - this->get_all(); - - LOGINFO("Step 1-3: Populate some keys and flush"); - initTree(); - this->visualize_keys("tree_init_03.dot"); - LOGINFO("Step 2-3: Set crash flag, remove some keys in reverse order"); - this->set_basic_flip("crash_flush_on_freed_child"); - for (auto [k, _] : scenario) { - LOGINFO("\n\n\t\t\t\t\t\t\t\t\t\t\t\t\tRemoving entry {}", k); - this->remove_one(k); - } - LOGINFO("Step 2-3: Set crash flag, remove some keys in reverse order"); - this->visualize_keys("tree_before_crash_3.dot"); - - LOGINFO("Step 3-3: Trigger cp to crash"); - this->crash_and_recover(scenario, "recover_tree_crash_3.dot"); - test_common::HSTestHelper::trigger_cp(true); - this->get_all(); -} -*/ - TYPED_TEST(IndexCrashTest, SplitCrash1) { // Define the lambda function auto const num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); @@ -524,11 +811,11 @@ TYPED_TEST(IndexCrashTest, SplitCrash1) { vector< std::string > flips = {"crash_flush_on_split_at_parent", "crash_flush_on_split_at_left_child", "crash_flush_on_split_at_right_child"}; OperationList operations; + bool renew_btree_after_crash = true; for (size_t i = 0; i < flips.size(); ++i) { - this->reset_btree(); 
LOGINFO("Step 1-{}: Set flag {}", i + 1, flips[i]); this->set_basic_flip(flips[i]); - operations = generator.generateOperations(num_entries -1 , true /* reset */); + operations = generator.generateOperations(num_entries - 1, renew_btree_after_crash /* reset */); // LOGINFO("Batch {} Operations:\n {} \n ", i + 1, generator.printOperations(operations)); // LOGINFO("Detailed Key Occurrences for Batch {}:\n {} \n ", i + 1, // generator.printKeyOccurrences(operations)); @@ -536,52 +823,266 @@ TYPED_TEST(IndexCrashTest, SplitCrash1) { // LOGINFO("\t\t\t\t\t\t\t\t\t\t\t\t\tupserting entry {}", k); this->put(k, btree_put_type::INSERT, true /* expect_success */); } - this->crash_and_recover(operations, fmt::format("recover_tree_crash_{}.dot", i + 1)); + this->crash_and_recover(flips[i], operations, fmt::format("recover_tree_crash_{}.dot", i + 1)); + if (renew_btree_after_crash) { this->reset_btree(); }; } } TYPED_TEST(IndexCrashTest, long_running_put_crash) { - // Define the lambda function - auto const num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); - SequenceGenerator generator(100 /*putFreq*/, 0 /* removeFreq*/, 0 /*start_range*/, num_entries - 1 /*end_range*/); - vector< std::string > flips = {"crash_flush_on_split_at_parent", "crash_flush_on_split_at_left_child", - "crash_flush_on_split_at_right_child"}; - OperationList operations; - auto m_start_time = Clock::now(); - auto time_to_stop = [this, m_start_time]() { return (get_elapsed_time_sec(m_start_time) > this->m_run_time); }; - double elapsed_time, progress_percent, last_progress_time = 0; - for (size_t i = 0; !time_to_stop(); ++i) { - bool print_time = false; - elapsed_time = get_elapsed_time_sec(m_start_time); + long_running_crash_options crash_test_options{ + .put_freq = 100, + .put_flips = {"crash_flush_on_split_at_parent", "crash_flush_on_split_at_left_child", + "crash_flush_on_split_at_right_child"}, + }; + this->long_running_crash(crash_test_options); +} + +TYPED_TEST(IndexCrashTest, long_running_remove_crash) { + long_running_crash_options crash_test_options{ + .put_freq = 0, + .remove_flips = {"crash_flush_on_merge_at_parent", "crash_flush_on_merge_at_left_child" + /*, "crash_flush_on_freed_child"*/}, + .preload_size = SISL_OPTIONS["num_entries"].as< uint32_t >(), + }; + this->long_running_crash(crash_test_options); +} + +TYPED_TEST(IndexCrashTest, long_running_put_remove_crash) { + long_running_crash_options crash_test_options{ + .put_freq = 50, + .put_flips = {"crash_flush_on_split_at_parent", "crash_flush_on_split_at_left_child", + "crash_flush_on_split_at_right_child"}, + .remove_flips = {"crash_flush_on_merge_at_parent", "crash_flush_on_merge_at_left_child" + /*, "crash_flush_on_freed_child"*/}, + }; + this->long_running_crash(crash_test_options); +} + +// Basic reverse and forward order remove with different flip points +TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { + vector< std::string > flip_points = { + "crash_flush_on_merge_at_parent", "crash_flush_on_merge_at_left_child", + // "crash_flush_on_freed_child", + }; + for (size_t i = 0; i < flip_points.size(); ++i) { this->reset_btree(); - auto flip = flips[i % flips.size()]; - LOGINFO("Step 1-{}: Set flag {}", i + 1, flip); - this->set_basic_flip(flip, 1, 10); - operations = generator.generateOperations(num_entries -1, true /* reset */); - // operations = generator.generateOperations(num_entries/10, false /* reset */); - // LOGINFO("Batch {} Operations:\n {} \n ", i + 1, generator.printOperations(operations)); - // LOGINFO("Detailed Key Occurrences for Batch 
{}:\n {} \n ", i + 1, - // generator.printKeyOccurrences(operations)); - for (auto [k, _] : operations) { - // LOGINFO("\t\t\t\t\t\t\t\t\t\t\t\t\tupserting entry {}", k); + auto& flip_point = flip_points[i]; + LOGINFO("=== Testing flip point: {} - {} ===", i + 1, flip_point); + + // Populate some keys [1,num_entries) and trigger cp to persist + LOGINFO("Step {}-1: Populate some keys and flush", i + 1); + auto const num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); + for (auto k = 0u; k < num_entries; ++k) { this->put(k, btree_put_type::INSERT, true /* expect_success */); } - this->crash_and_recover(operations/*, fmt::format("recover_tree_crash_{}.dot", i + 1)*/); - if (elapsed_time - last_progress_time > 30) { - last_progress_time = elapsed_time; - print_time = true; + test_common::HSTestHelper::trigger_cp(true); + this->m_shadow_map.save(this->m_shadow_filename); + + this->visualize_keys("tree_merge_full.dot"); + + // Split keys into batches and remove the last one in reverse order + LOGINFO("Step {}-2: Set crash flag, remove some keys in reverse order", i + 1); + int batch_num = 4; + { + int n = batch_num; + auto r = num_entries * n / batch_num - 1; + auto l = num_entries * (n - 1) / batch_num; + OperationList ops; + for (auto k = r; k >= l; --k) { + ops.emplace_back(k, OperationType::Remove); + } + LOGINFO("Step {}-2-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, r, l); + + this->set_basic_flip(flip_point); + for (auto [k, _] : ops) { + this->remove_one(k, true); + } + this->visualize_keys("tree_merge_before_first_crash.dot"); + + LOGINFO("Step {}-2-2: Trigger cp to crash", i + 1); + this->crash_and_recover(ops); } - if (print_time) { - LOGINFO("\n\n\n\t\t\tProgress: {} iterations completed - Elapsed time: {:.0f} seconds of total " - "{} ({:.2f}%)\n\n\n", - i, elapsed_time, this->m_run_time, elapsed_time * 100.0 / this->m_run_time); + + // Remove the next batch of keys in forward order + LOGINFO("Step {}-3: Remove another batch in ascending order", i + 1) { + int n = batch_num - 1; + auto r = num_entries * n / batch_num - 1; + auto l = num_entries * (n - 1) / batch_num; + OperationList ops; + for (auto k = l; k <= r; ++k) { + ops.emplace_back(k, OperationType::Remove); + } + LOGINFO("Step {}-3-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, l, r); + + this->set_basic_flip(flip_point); + for (auto [k, _] : ops) { + this->remove_one(k, true); + } + this->visualize_keys("tree_merge_before_second_crash.dot"); + + LOGINFO("Step {}-3-2: Trigger cp to crash", i + 1); + this->crash_and_recover(ops); } - this->print_keys(fmt::format("reapply: after iteration {}", i)); + // Remove the next batch of keys in random order + LOGINFO("Step {}-4: Remove another batch in random order", i + 1) { + int n = batch_num - 2; + auto r = num_entries * n / batch_num - 1; + auto l = num_entries * (n - 1) / batch_num; + SequenceGenerator generator(0, 100, l, r); + generator.fillRange(l, r); + OperationList ops = generator.generateOperations(r - l + 1, false); + + LOGINFO("Step {}-4-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, l, r); + + this->set_basic_flip(flip_point); + for (auto [k, _] : ops) { + this->remove_one(k, true); + } + this->visualize_keys("tree_merge_before_third_crash.dot"); + + LOGINFO("Step {}-4-2: Trigger cp to crash", i + 1); + this->crash_and_recover(ops); + } + + LOGINFO("Step {}-5: Cleanup the tree", i + 1); + for (auto k = 0u; k < num_entries; ++k) { + this->remove_one(k, false); + } + test_common::HSTestHelper::trigger_cp(true); 
+ this->get_all(); } } + +// +// TYPED_TEST(IndexCrashTest, MergeCrash1) { +// auto const num_entries = SISL_OPTIONS["num_entries"].as(); +// vector flips = { +// "crash_flush_on_merge_at_parent", "crash_flush_on_merge_at_left_child", +// }; +// SequenceGenerator generator(0 /*putFreq*/, 100 /* removeFreq*/, 0 /*start_range*/, num_entries - 1 +// /*end_range*/); OperationList operations; for (size_t i = 0; i < flips.size(); ++i) { +// this->reset_btree(); +// LOGINFO("Step {}-1: Init btree", i + 1); +// for (auto k = 0u; k < num_entries; ++k) { +// this->put(k, btree_put_type::INSERT, true /* expect_success */); +// } +// test_common::HSTestHelper::trigger_cp(true); +// this->print_keys("Inited tree"); +// +// LOGINFO("Step {}-2: Set flag {}", i + 1, flips[i]); +// this->set_basic_flip(flips[i], 1, 10); +// generator.reset(); +// generator.fillRange(0, num_entries - 1); +// +// // Randomly remove some keys +// std::random_device rd; +// std::mt19937 gen(rd()); +// std::uniform_int_distribution<> dis(num_entries / 4, num_entries / 2); +// auto num_keys_to_remove = dis(gen); +// LOGINFO("Removing {} keys before crash", num_keys_to_remove); +// operations = generator.generateOperations(num_keys_to_remove, false /* reset */); +// for (auto [k, _]: operations) { +// LOGINFO("Removing key {}", k); +// this->remove_one(k, true); +// } +// +// LOGINFO("Step {}-3: Simulate crash and recover", i + 1); +// this->crash_and_recover(operations, fmt::format("recover_tree_crash_{}.dot", i + 1)); +// } +// } +// +// TYPED_TEST(IndexCrashTest, MergeManualCrash) { +// std::vector flip_points = { +// "crash_flush_on_merge_at_parent", +// "crash_flush_on_merge_at_left_child", +// }; +// +// constexpr uint32_t num_entries = 28; // with max=5 & min=3 +// +// auto initTree = [this, num_entries]() { +// for (auto k = 0u; k < num_entries; ++k) { +// this->put(k, btree_put_type::INSERT, true /* expect_success */); +// } +// test_common::HSTestHelper::trigger_cp(true); +// this->m_shadow_map.save(this->m_shadow_filename); +// }; +// +// std::vector removing_scenarios = { +// { +// {27, OperationType::Remove}, +// {26, OperationType::Remove}, +// {25, OperationType::Remove}, +// {24, OperationType::Remove}, +// {23, OperationType::Remove}, +// {22, OperationType::Remove}, +// }, // Merge 2 rightmost leaf nodes in 1 action +// { +// {27, OperationType::Remove}, +// {26, OperationType::Remove}, +// {25, OperationType::Remove}, +// {24, OperationType::Remove}, +// {23, OperationType::Remove}, +// {20, OperationType::Remove}, +// {19, OperationType::Remove}, +// }, // Merge 3 rightmost leaf nodes in 1 action +// { +// {27, OperationType::Remove}, +// {26, OperationType::Remove}, +// {25, OperationType::Remove}, +// {24, OperationType::Remove}, +// {23, OperationType::Remove}, +// {22, OperationType::Remove}, +// {21, OperationType::Remove}, +// {20, OperationType::Remove}, +// {19, OperationType::Remove}, +// }, // Merge 3 rightmost leaf nodes in 2 actions +// { +// {23, OperationType::Remove}, +// {22, OperationType::Remove}, +// {11, OperationType::Remove}, +// {10, OperationType::Remove}, +// {13, OperationType::Remove}, +// }, // Merge from level=0 then level=1 +// // { +// // {16, OperationType::Remove}, +// // }, // Merge from level=1 then level=0 - need to set min=4 +// }; +// +// for (int i = 0; i < static_cast(removing_scenarios.size()); i++) { +// auto scenario = removing_scenarios[i]; +// auto s_idx = i + 1; +// LOGINFO("\n\tTesting scenario {}", s_idx); +// for (int j = 0; j < static_cast(flip_points.size()); 
j++) { +// const auto &flip_point = flip_points[j]; +// auto f_idx = j + 1; +// LOGINFO("\n\t\t\t\tTesting flip point: {}", flip_point); +// +// LOGINFO("Step {}-{}-1: Populate keys and flush", s_idx, f_idx); +// initTree(); +// this->visualize_keys(fmt::format("tree_init.{}_{}.dot", s_idx, f_idx)); +// +// LOGINFO("Step {}-{}-2: Set crash flag, remove keys in reverse order", s_idx, f_idx); +// this->set_basic_flip(flip_point); +// for (auto k: scenario) { +// LOGINFO("Removing entry {}", k.first); +// this->remove_one(k.first); +// } +// this->visualize_keys(fmt::format("tree_before_first_crash.{}_{}.dot", s_idx, f_idx)); +// this->remove_flip(flip_point); +// +// LOGINFO("Step {}-{}-3: Trigger cp to crash", s_idx, f_idx); +// this->crash_and_recover(scenario); +// test_common::HSTestHelper::trigger_cp(true); +// this->get_all(); +// +// this->reset_btree(); +// test_common::HSTestHelper::trigger_cp(true); +// } +// } +// } #endif int main(int argc, char* argv[]) { diff --git a/src/tests/test_log_dev.cpp b/src/tests/test_log_dev.cpp index cbe8ff760..871eafdaf 100644 --- a/src/tests/test_log_dev.cpp +++ b/src/tests/test_log_dev.cpp @@ -157,6 +157,32 @@ class LogDevTest : public ::testing::Test { } } + void insert_batch_sync(std::shared_ptr< HomeLogStore > log_store, logstore_seq_num_t& lsn, int64_t batch, + uint32_t fixed_size = 0) { + bool io_memory{false}; + std::vector< test_log_data* > data_vector; + + for (int64_t i = 0; i < batch; ++i) { + auto* d = prepare_data(lsn + i, io_memory, fixed_size); + data_vector.push_back(d); // Store the pointer in the vector + log_store->write_async(lsn + i, {uintptr_cast(d), d->total_size(), false}, nullptr, nullptr); + LOGINFO("Written async data for LSN -> {}:{}", log_store->get_store_id(), lsn + i); + } + + log_store->flush(); + LOGINFO("Flush data from {} to {}", lsn, lsn + batch); + lsn += batch; + + // Free all the allocated memory after the batch insert + for (auto* d : data_vector) { + if (io_memory) { + iomanager.iobuf_free(uintptr_cast(d)); + } else { + std::free(voidptr_cast(d)); + } + } + } + void kickstart_inserts(std::shared_ptr< HomeLogStore > log_store, logstore_seq_num_t& cur_lsn, int64_t batch, uint32_t fixed_size = 0) { auto last = cur_lsn + batch; @@ -200,8 +226,13 @@ class LogDevTest : public ::testing::Test { read_all_verify(log_store); } - void truncate_validate(std::shared_ptr< HomeLogStore > log_store) { + void truncate_validate(std::shared_ptr< HomeLogStore > log_store, logstore_seq_num_t* trunc_lsn = nullptr) { auto upto = log_store->get_contiguous_completed_seq_num(-1); + if (trunc_lsn && *trunc_lsn != upto) { + LOGWARN("Truncate issued upto {} but real upto lsn in log store is {}", *trunc_lsn, upto); + upto = *trunc_lsn; + } + LOGINFO("truncate_validate upto {}", upto); log_store->truncate(upto); read_all_verify(log_store); @@ -212,6 +243,20 @@ class LogDevTest : public ::testing::Test { auto actual_count = log_store->get_logdev()->log_dev_meta().num_rollback_records(log_store->get_store_id()); ASSERT_EQ(actual_count, expected_count); } + + logid_t get_last_truncate_idx(logdev_id_t logdev_id) { + auto status = logstore_service().get_logdev(logdev_id)->get_status(0); + if (status.contains("last_truncate_log_idx")) { return s_cast< logid_t >(status["last_truncate_log_idx"]); } + LOGERROR("Failed to get last_truncate_log_idx from logdev status for logdev_id {}", logdev_id); + return static_cast< logid_t >(-1); + } + + logid_t get_current_log_idx(logdev_id_t logdev_id) { + auto status = 
logstore_service().get_logdev(logdev_id)->get_status(0); + if (status.contains("current_log_idx")) { return s_cast< logid_t >(status["current_log_idx"]); } + LOGERROR("Failed to get current_log_idx from logdev status for logdev_id {}", logdev_id); + return static_cast< logid_t >(-1); + } }; TEST_F(LogDevTest, WriteSyncThenRead) { @@ -219,7 +264,7 @@ TEST_F(LogDevTest, WriteSyncThenRead) { for (uint32_t iteration{0}; iteration < iterations; ++iteration) { LOGINFO("Iteration {}", iteration); - auto logdev_id = logstore_service().create_new_logdev(); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); auto log_store = logstore_service().create_new_log_store(logdev_id, false); const auto store_id = log_store->get_store_id(); @@ -239,7 +284,7 @@ TEST_F(LogDevTest, WriteSyncThenRead) { TEST_F(LogDevTest, Rollback) { LOGINFO("Step 1: Create a single logstore to start rollback test"); - auto logdev_id = logstore_service().create_new_logdev(); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); auto log_store = logstore_service().create_new_log_store(logdev_id, false); auto store_id = log_store->get_store_id(); @@ -247,7 +292,7 @@ TEST_F(LogDevTest, Rollback) { auto restart = [&]() { std::promise< bool > p; auto starting_cb = [&]() { - logstore_service().open_logdev(logdev_id); + logstore_service().open_logdev(logdev_id, flush_mode_t::EXPLICIT); logstore_service().open_log_store(logdev_id, store_id, false /* append_mode */).thenValue([&](auto store) { log_store = store; p.set_value(true); @@ -304,6 +349,351 @@ TEST_F(LogDevTest, Rollback) { rollback_records_validate(log_store, 0 /* expected_count */); } +TEST_F(LogDevTest, ReTruncate) { + LOGINFO("Step 1: Create a single logstore to start re-truncate test"); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); + s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); + auto log_store = logstore_service().create_new_log_store(logdev_id, false); + + LOGINFO("Step 2: Issue sequential inserts with q depth of 10"); + logstore_seq_num_t cur_lsn = 0; + kickstart_inserts(log_store, cur_lsn, 500); + + LOGINFO("Step 3: Truncate all entries"); + logstore_seq_num_t ls_last_lsn = 499; + truncate_validate(log_store, &ls_last_lsn); + ASSERT_EQ(log_store->start_lsn(), ls_last_lsn + 1); + ASSERT_EQ(log_store->tail_lsn(), ls_last_lsn); + ASSERT_EQ(log_store->truncated_upto(), ls_last_lsn); + + LOGINFO("Step 4: Truncate again"); + truncate_validate(log_store, &ls_last_lsn); + ASSERT_EQ(log_store->start_lsn(), ls_last_lsn + 1); + ASSERT_EQ(log_store->tail_lsn(), ls_last_lsn); + ASSERT_EQ(log_store->truncated_upto(), ls_last_lsn); + + LOGINFO("Step 5: Read and verify all entries again"); + read_all_verify(log_store); +} + +TEST_F(LogDevTest, TruncateWithExceedingLSN) { + LOGINFO("Step 1: Create a single logstore to start truncate with exceeding LSN test"); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); + s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); + auto log_store = logstore_service().create_new_log_store(logdev_id, false); + + LOGINFO("Step 2: Insert 500 entries"); + logstore_seq_num_t cur_lsn = 0; + kickstart_inserts(log_store, cur_lsn, 500); + + LOGINFO("Step 3: Read and verify all 
entries"); + read_all_verify(log_store); + + LOGINFO("Step 4: Truncate 100 entries"); + logstore_seq_num_t trunc_lsn = 99; + truncate_validate(log_store, &trunc_lsn); + ASSERT_EQ(log_store->start_lsn(), trunc_lsn + 1); + ASSERT_EQ(log_store->tail_lsn(), 499); + ASSERT_EQ(log_store->next_lsn(), 500); + ASSERT_EQ(log_store->truncated_upto(), trunc_lsn); + + LOGINFO("Step 5: Read and verify all entries"); + read_all_verify(log_store); + + LOGINFO("Step 6: Truncate all with exceeding lsn"); + trunc_lsn = 1999999; + truncate_validate(log_store, &trunc_lsn); + ASSERT_EQ(log_store->start_lsn(), trunc_lsn + 1); + ASSERT_EQ(log_store->tail_lsn(), trunc_lsn); + ASSERT_EQ(log_store->next_lsn(), 2000000); + ASSERT_EQ(log_store->truncated_upto(), trunc_lsn); + + LOGINFO("Step 7 Read and verify all entries"); + read_all_verify(log_store); + + LOGINFO("Step 8: Append 500 entries"); + cur_lsn = log_store->next_lsn(); + kickstart_inserts(log_store, cur_lsn, 500); + ASSERT_EQ(log_store->next_lsn(), 2000500); + + LOGINFO("Step 9: Read and verify all entries"); + read_all_verify(log_store); +} + +TEST_F(LogDevTest, TruncateAfterRestart) { + LOGINFO("Step 1: Create a single logstore to start truncate with overlapping LSN test"); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); + s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); + auto log_store = logstore_service().create_new_log_store(logdev_id, false); + auto store_id = log_store->get_store_id(); + + auto restart = [&]() { + std::promise< bool > p; + auto starting_cb = [&]() { + logstore_service().open_logdev(logdev_id, flush_mode_t::EXPLICIT); + logstore_service().open_log_store(logdev_id, store_id, false /* append_mode */).thenValue([&](auto store) { + log_store = store; + p.set_value(true); + }); + }; + start_homestore(true /* restart */, starting_cb); + p.get_future().get(); + }; + + LOGINFO("Step 2: Insert 500 entries"); + logstore_seq_num_t cur_lsn = 0; + kickstart_inserts(log_store, cur_lsn, 500); + + LOGINFO("Step 3: Read and verify all entries"); + read_all_verify(log_store); + + LOGINFO("Step 4: Truncate 100 entries"); + logstore_seq_num_t trunc_lsn = 99; + truncate_validate(log_store, &trunc_lsn); + ASSERT_EQ(log_store->start_lsn(), trunc_lsn + 1); + ASSERT_EQ(log_store->tail_lsn(), 499); + ASSERT_EQ(log_store->next_lsn(), 500); + ASSERT_EQ(log_store->truncated_upto(), trunc_lsn); + + LOGINFO("Step 5: Read and verify all entries"); + read_all_verify(log_store); + + LOGINFO("Step 6: Restart and verify all entries"); + restart(); + read_all_verify(log_store); + auto const [last_trunc_lsn, trunc_ld_key, tail_lsn] = log_store->truncate_info(); + ASSERT_EQ(last_trunc_lsn, trunc_lsn); + ASSERT_EQ(trunc_ld_key.idx, 0); + ASSERT_EQ(tail_lsn, log_store->tail_lsn()); + + LOGINFO("Step 7: call log dev truncate again and read verify") + logstore_service().device_truncate(); + read_all_verify(log_store); +} + +TEST_F(LogDevTest, TruncateAcrossMultipleStores) { + LOGINFO("Step 1: Create 3 log stores to start truncate across multiple stores test"); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); + s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); + auto store1 = logstore_service().create_new_log_store(logdev_id, false); + auto store2 = logstore_service().create_new_log_store(logdev_id, false); + auto store3 = logstore_service().create_new_log_store(logdev_id, false); + + LOGINFO("Step 2: Insert 100 entries to store 
{}", store1->get_store_id()); + logstore_seq_num_t cur_lsn = 0; + kickstart_inserts(store1, cur_lsn, 100); + ASSERT_EQ(get_current_log_idx(logdev_id), 100); + + LOGINFO("Step 3: Insert 200 entries to store {}", store2->get_store_id()); + cur_lsn = 0; + kickstart_inserts(store2, cur_lsn, 200); + ASSERT_EQ(get_current_log_idx(logdev_id), 300); + + LOGINFO("Step 4: Insert 200 entries to store {}", store3->get_store_id()); + cur_lsn = 0; + kickstart_inserts(store3, cur_lsn, 200); + ASSERT_EQ(get_current_log_idx(logdev_id), 500); + + LOGINFO("Step 5: Read and verify all stores"); + read_all_verify(store1); + ASSERT_EQ(store1->start_lsn(), 0); + ASSERT_EQ(store1->tail_lsn(), 99); + ASSERT_EQ(store1->truncated_upto(), -1); + read_all_verify(store2); + ASSERT_EQ(store2->start_lsn(), 0); + ASSERT_EQ(store2->tail_lsn(), 199); + ASSERT_EQ(store2->truncated_upto(), -1); + read_all_verify(store3); + ASSERT_EQ(store3->start_lsn(), 0); + ASSERT_EQ(store3->tail_lsn(), 199); + ASSERT_EQ(store3->truncated_upto(), -1); + // log dev should not truncate any logs due to no truncate in log stores happened + ASSERT_EQ(get_last_truncate_idx(logdev_id), -1); + + LOGINFO("Step 6: Truncate 100 entries in store {}", store2->get_store_id()); + logstore_seq_num_t trunc_lsn = 99; + truncate_validate(store2, &trunc_lsn); + + LOGINFO("Step 7: Read and verify all stores"); + read_all_verify(store1); + ASSERT_EQ(store1->start_lsn(), 0); + ASSERT_EQ(store1->tail_lsn(), 99); + ASSERT_EQ(store1->truncated_upto(), -1); + read_all_verify(store2); + ASSERT_EQ(store2->start_lsn(), 100); + ASSERT_EQ(store2->tail_lsn(), 199); + ASSERT_EQ(store2->truncated_upto(), 99); + read_all_verify(store3); + ASSERT_EQ(store3->start_lsn(), 0); + ASSERT_EQ(store3->tail_lsn(), 199); + ASSERT_EQ(store3->truncated_upto(), -1); + // log dev should not truncate any logs due to store1 has valid logs + ASSERT_EQ(get_last_truncate_idx(logdev_id), -1); + + LOGINFO("Step 8: Truncate 500 entries in store {}", store3->get_store_id()); + trunc_lsn = 499; + truncate_validate(store3, &trunc_lsn); + + LOGINFO("Step 9: Read and verify all stores"); + read_all_verify(store1); + ASSERT_EQ(store1->start_lsn(), 0); + ASSERT_EQ(store1->tail_lsn(), 99); + ASSERT_EQ(store1->truncated_upto(), -1); + read_all_verify(store2); + ASSERT_EQ(store2->start_lsn(), 100); + ASSERT_EQ(store2->tail_lsn(), 199); + ASSERT_EQ(store2->truncated_upto(), 99); + read_all_verify(store3); + ASSERT_EQ(store3->start_lsn(), 500); + ASSERT_EQ(store3->tail_lsn(), 499); + ASSERT_EQ(store3->truncated_upto(), 499); + + // log dev should truncate not truncate any logs due to store1 has valid logs + ASSERT_EQ(get_last_truncate_idx(logdev_id), -1); + + LOGINFO("Step 10: Truncate 100 entries in store {}", store1->get_store_id()); + trunc_lsn = 99; + truncate_validate(store1, &trunc_lsn); + + LOGINFO("Step 11: Read and verify all stores"); + read_all_verify(store1); + ASSERT_EQ(store1->start_lsn(), 100); + ASSERT_EQ(store1->tail_lsn(), 99); + ASSERT_EQ(store1->truncated_upto(), 99); + read_all_verify(store2); + ASSERT_EQ(store2->start_lsn(), 100); + ASSERT_EQ(store2->tail_lsn(), 199); + ASSERT_EQ(store2->truncated_upto(), 99); + read_all_verify(store3); + ASSERT_EQ(store3->start_lsn(), 500); + ASSERT_EQ(store3->tail_lsn(), 499); + ASSERT_EQ(store3->truncated_upto(), 499); + + // log dev should truncate logs upto 199, as store2 has valid logs + ASSERT_EQ(get_last_truncate_idx(logdev_id), 199); + + LOGINFO("Step 12: Truncate 300 entries in store {}", store2->get_store_id()); + trunc_lsn = 299; + 
truncate_validate(store2, &trunc_lsn); + + LOGINFO("Step 13: Read and verify all stores"); + read_all_verify(store1); + ASSERT_EQ(store1->start_lsn(), 100); + ASSERT_EQ(store1->tail_lsn(), 99); + ASSERT_EQ(store1->truncated_upto(), 99); + read_all_verify(store2); + ASSERT_EQ(store2->start_lsn(), 300); + ASSERT_EQ(store2->tail_lsn(), 299); + ASSERT_EQ(store2->truncated_upto(), 299); + read_all_verify(store3); + ASSERT_EQ(store3->start_lsn(), 500); + ASSERT_EQ(store3->tail_lsn(), 499); + ASSERT_EQ(store3->truncated_upto(), 499); + + // log dev should truncate all logs as all stores are empty + ASSERT_EQ(get_last_truncate_idx(logdev_id), 499); + + LOGINFO("Step 14: Insert 100 entries in store {}", store1->get_store_id()); + cur_lsn = 100; + kickstart_inserts(store1, cur_lsn, 100); + ASSERT_EQ(get_current_log_idx(logdev_id), 600); + + LOGINFO("Step 15: Read and verify all stores"); + read_all_verify(store1); + ASSERT_EQ(store1->start_lsn(), 100); + ASSERT_EQ(store1->tail_lsn(), 199); + ASSERT_EQ(store1->truncated_upto(), 99); + read_all_verify(store2); + ASSERT_EQ(store2->start_lsn(), 300); + ASSERT_EQ(store2->tail_lsn(), 299); + ASSERT_EQ(store2->truncated_upto(), 299); + read_all_verify(store3); + ASSERT_EQ(store3->start_lsn(), 500); + ASSERT_EQ(store3->tail_lsn(), 499); + ASSERT_EQ(store3->truncated_upto(), 499); + + // log dev should not truncate since no new truncate happened + ASSERT_EQ(get_last_truncate_idx(logdev_id), 499); + + LOGINFO("Step 16: Truncate 500 entries in store {}", store1->get_store_id()); + trunc_lsn = 499; + truncate_validate(store1, &trunc_lsn); + + LOGINFO("Step 17: Read and verify all stores"); + read_all_verify(store1); + ASSERT_EQ(store1->start_lsn(), 500); + ASSERT_EQ(store1->tail_lsn(), 499); + ASSERT_EQ(store1->truncated_upto(), 499); + read_all_verify(store2); + ASSERT_EQ(store2->start_lsn(), 300); + ASSERT_EQ(store2->tail_lsn(), 299); + ASSERT_EQ(store2->truncated_upto(), 299); + read_all_verify(store3); + ASSERT_EQ(store3->start_lsn(), 500); + ASSERT_EQ(store3->tail_lsn(), 499); + ASSERT_EQ(store3->truncated_upto(), 499); + + // make sure new logs can truncate successfully when there are empty log stores + ASSERT_EQ(get_last_truncate_idx(logdev_id), 599); +} + +TEST_F(LogDevTest, TruncateLogsAfterFlushAndRestart) { + LOGINFO("Step 1: Create a single logstore to start truncate-logs-after-flush-and-restart test"); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); + s_max_flush_multiple = logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); + auto log_store = logstore_service().create_new_log_store(logdev_id, false); + auto store_id = log_store->get_store_id(); + + auto restart = [&]() { + std::promise< bool > p; + auto starting_cb = [&]() { + logstore_service().open_logdev(logdev_id, flush_mode_t::EXPLICIT); + logstore_service().open_log_store(logdev_id, store_id, false /* append_mode */).thenValue([&](auto store) { + log_store = store; + p.set_value(true); + }); + }; + start_homestore(true /* restart */, starting_cb); + p.get_future().get(); + }; + + LOGINFO("Step 2: Insert 100 entries"); + logstore_seq_num_t cur_lsn = 0; + insert_batch_sync(log_store, cur_lsn, 100, 0); + + LOGINFO("Step 3: Read and verify all entries"); + read_all_verify(log_store); + ASSERT_EQ(log_store->get_contiguous_issued_seq_num(-1), 99); + + LOGINFO("Step 4: Append 100 entries"); + insert_batch_sync(log_store, cur_lsn, 100, 0); + ASSERT_EQ(log_store->get_contiguous_issued_seq_num(-1), 199); + + LOGINFO("Step 5: Read and verify all 
entries"); + read_all_verify(log_store); + + LOGINFO("Step 6: restart and verify"); + restart(); + read_all_verify(log_store); + ASSERT_EQ(log_store->get_contiguous_issued_seq_num(-1), 199); + + LOGINFO("Step 7: Truncate 50 entries"); + logstore_seq_num_t trunc_lsn = 49; + truncate_validate(log_store, &trunc_lsn); + ASSERT_EQ(log_store->start_lsn(), trunc_lsn + 1); + ASSERT_EQ(log_store->tail_lsn(), 199); + ASSERT_EQ(log_store->truncated_upto(), trunc_lsn); + ASSERT_EQ(log_store->get_contiguous_issued_seq_num(-1), 199); + + LOGINFO("Step 8: restart and verify"); + restart(); + read_all_verify(log_store); + ASSERT_EQ(log_store->start_lsn(), trunc_lsn + 1); + ASSERT_EQ(log_store->tail_lsn(), 199); + ASSERT_EQ(log_store->truncated_upto(), trunc_lsn); + ASSERT_EQ(log_store->get_contiguous_issued_seq_num(-1), 199); +} + TEST_F(LogDevTest, CreateRemoveLogDev) { auto num_logdev = SISL_OPTIONS["num_logdevs"].as< uint32_t >(); std::vector< std::shared_ptr< HomeLogStore > > log_stores; @@ -317,7 +707,7 @@ TEST_F(LogDevTest, CreateRemoveLogDev) { ASSERT_EQ(vdev->num_descriptors(), 0); for (uint32_t i{0}; i < num_logdev; ++i) { - auto id = logstore_service().create_new_logdev(); + auto id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); s_max_flush_multiple = logstore_service().get_logdev(id)->get_flush_size_multiple(); auto store = logstore_service().create_new_log_store(id, false); log_stores.push_back(store); @@ -365,7 +755,7 @@ TEST_F(LogDevTest, DeleteUnopenedLogDev) { // Test deletion of unopened logdev. std::set< logdev_id_t > id_set, unopened_id_set; for (uint32_t i{0}; i < num_logdev; ++i) { - auto id = logstore_service().create_new_logdev(); + auto id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); id_set.insert(id); if (i >= num_logdev / 2) { unopened_id_set.insert(id); } s_max_flush_multiple = logstore_service().get_logdev(id)->get_flush_size_multiple(); @@ -389,7 +779,7 @@ TEST_F(LogDevTest, DeleteUnopenedLogDev) { auto starting_cb = [&]() { auto it = id_set.begin(); for (uint32_t i{0}; i < id_set.size() / 2; i++, it++) { - logstore_service().open_logdev(*it); + logstore_service().open_logdev(*it, flush_mode_t::EXPLICIT); } }; start_homestore(true /* restart */, starting_cb); diff --git a/src/tests/test_log_store.cpp b/src/tests/test_log_store.cpp index 43e57ff7c..1aa580bba 100644 --- a/src/tests/test_log_store.cpp +++ b/src/tests/test_log_store.cpp @@ -455,7 +455,7 @@ class SampleDB { for (uint32_t i{0}; i < n_log_stores; ++i) { SampleLogStoreClient* client = m_log_store_clients[i].get(); - logstore_service().open_logdev(client->m_logdev_id); + logstore_service().open_logdev(client->m_logdev_id, flush_mode_t::EXPLICIT); logstore_service() .open_log_store(client->m_logdev_id, client->m_store_id, false /* append_mode */) .thenValue([i, this, client](auto log_store) { client->set_log_store(log_store); }); @@ -479,7 +479,7 @@ class SampleDB { std::vector< logdev_id_t > logdev_id_vec; for (uint32_t i{0}; i < n_log_devs; ++i) { - logdev_id_vec.push_back(logstore_service().create_new_logdev()); + logdev_id_vec.push_back(logstore_service().create_new_logdev(flush_mode_t::EXPLICIT)); } for (uint32_t i{0}; i < n_log_stores; ++i) { @@ -1225,7 +1225,7 @@ TEST_F(LogStoreTest, WriteSyncThenRead) { for (uint32_t iteration{0}; iteration < iterations; ++iteration) { LOGINFO("Iteration {}", iteration); - auto logdev_id = logstore_service().create_new_logdev(); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); auto tmp_log_store = 
logstore_service().create_new_log_store(logdev_id, false); const auto store_id = tmp_log_store->get_store_id(); LOGINFO("Created new log store -> id {}", store_id); diff --git a/src/tests/test_log_store_long_run.cpp b/src/tests/test_log_store_long_run.cpp index e9808da65..507e51633 100644 --- a/src/tests/test_log_store_long_run.cpp +++ b/src/tests/test_log_store_long_run.cpp @@ -294,7 +294,7 @@ class LogStoreLongRun : public ::testing::Test { HS_SETTINGS_FACTORY().save(); for (uint32_t i{0}; i < n_log_stores; ++i) { SampleLogStoreClient* client = m_log_store_clients[i].get(); - logstore_service().open_logdev(client->m_logdev_id); + logstore_service().open_logdev(client->m_logdev_id, flush_mode_t::EXPLICIT); logstore_service() .open_log_store(client->m_logdev_id, client->m_store_id, false /* append_mode */) .thenValue([i, this, client](auto log_store) { client->set_log_store(log_store); }); @@ -318,7 +318,7 @@ class LogStoreLongRun : public ::testing::Test { std::vector< logdev_id_t > logdev_id_vec; for (uint32_t i{0}; i < n_log_devs; ++i) - logdev_id_vec.push_back(logstore_service().create_new_logdev()); + logdev_id_vec.push_back(logstore_service().create_new_logdev(flush_mode_t::EXPLICIT)); for (uint32_t i{0}; i < n_log_stores; ++i) m_log_store_clients.push_back(std::make_unique< SampleLogStoreClient >( @@ -466,7 +466,7 @@ class LogStoreLongRun : public ::testing::Test { validate_num_stores(); // Create a new logstore. - auto logdev_id = logstore_service().create_new_logdev(); + auto logdev_id = logstore_service().create_new_logdev(flush_mode_t::EXPLICIT); m_log_store_clients.push_back(std::make_unique< SampleLogStoreClient >( logdev_id, bind_this(LogStoreLongRun::on_log_insert_completion, 3))); validate_num_stores(); diff --git a/src/tests/test_mem_btree.cpp b/src/tests/test_mem_btree.cpp index 45681f412..83330422d 100644 --- a/src/tests/test_mem_btree.cpp +++ b/src/tests/test_mem_btree.cpp @@ -46,6 +46,8 @@ SISL_OPTION_GROUP( ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), (preload_size, "", "preload_size", "number of entries to preload tree with", ::cxxopts::value< uint32_t >()->default_value("1000"), "number"), + (max_keys_in_node, "", "max_keys_in_node", "max_keys_in_node", ::cxxopts::value< uint32_t >()->default_value("0"), + ""), (seed, "", "seed", "random engine seed, use random if not defined", ::cxxopts::value< uint64_t >()->default_value("0"), "number"), (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), "seconds")) @@ -330,6 +332,10 @@ int main(int argc, char* argv[]) { auto seed = SISL_OPTIONS["seed"].as< uint64_t >(); LOGINFO("Using seed {} to sow the random generation", seed); g_re.seed(seed); + } else { + auto seed = std::chrono::system_clock::now().time_since_epoch().count(); + LOGINFO("No seed provided. 
Using randomly generated seed: {}", seed); + g_re.seed(seed); } auto ret = RUN_ALL_TESTS(); return ret; diff --git a/src/tests/test_meta_blk_mgr.cpp b/src/tests/test_meta_blk_mgr.cpp index 870dd5191..d3c5401e9 100644 --- a/src/tests/test_meta_blk_mgr.cpp +++ b/src/tests/test_meta_blk_mgr.cpp @@ -185,7 +185,7 @@ class VMetaBlkMgrTest : public ::testing::Test { uint64_t total_size_written(const void* cookie) { return m_mbm->meta_size(cookie); } void do_write_to_full() { - static constexpr uint64_t blkstore_overhead = 4 * 1024ul * 1024ul; // 4MB + static constexpr uint64_t blkstore_overhead = 256 * 1024ul * 1024ul; // 256MB ssize_t free_size = uint64_cast(m_mbm->total_size() - m_mbm->used_size() - blkstore_overhead); HS_REL_ASSERT_GT(free_size, 0); @@ -193,7 +193,10 @@ class VMetaBlkMgrTest : public ::testing::Test { uint64_t size_written{0}; while (free_size > 0) { - if (free_size >= gp.max_wrt_sz) { + LOGDEBUG("free size: {}, total size: {}, used size: {}, available blks: {}", free_size, m_mbm->total_size(), + m_mbm->used_size(), m_mbm->available_blks()); + // if it is overflow, 2 extra blocks are needed for ovf blk header and meta blk; + if (free_size - 2 * m_mbm->block_size() >= gp.max_wrt_sz) { size_written = do_sb_write(do_overflow(), 0); } else { size_written = do_sb_write(false, m_mbm->meta_blk_context_sz()); diff --git a/src/tests/test_raft_repl_dev.cpp b/src/tests/test_raft_repl_dev.cpp index f8aa06c5c..f6d458943 100644 --- a/src/tests/test_raft_repl_dev.cpp +++ b/src/tests/test_raft_repl_dev.cpp @@ -12,606 +12,57 @@ * specific language governing permissions and limitations under the License. * *********************************************************************************/ -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include "common/homestore_config.hpp" -#include "common/homestore_assert.hpp" -#include "common/homestore_utils.hpp" - -#define private public -#include "test_common/hs_repl_test_common.hpp" -#include "replication/service/raft_repl_service.h" -#include "replication/repl_dev/raft_repl_dev.h" - -using namespace homestore; - -SISL_OPTION_GROUP(test_raft_repl_dev, - (block_size, "", "block_size", "block size to io", - ::cxxopts::value< uint32_t >()->default_value("4096"), "number"), - (num_raft_groups, "", "num_raft_groups", "number of raft groups per test", - ::cxxopts::value< uint32_t >()->default_value("1"), "number"), - // for below replication parameter, their default value always get from dynamic config, only used - // when specified by user - (snapshot_distance, "", "snapshot_distance", "distance between snapshots", - ::cxxopts::value< uint32_t >()->default_value("0"), "number"), - (num_raft_logs_resv, "", "num_raft_logs_resv", "number of raft logs reserved", - ::cxxopts::value< uint32_t >()->default_value("0"), "number"), - (res_mgr_audit_timer_ms, "", "res_mgr_audit_timer_ms", "resource manager audit timer", - ::cxxopts::value< uint32_t >()->default_value("0"), "number")); - -SISL_OPTIONS_ENABLE(logging, test_raft_repl_dev, iomgr, config, test_common_setup, test_repl_common_setup) - -static std::unique_ptr< test_common::HSReplTestHelper > g_helper; -static std::random_device g_rd{}; -static std::default_random_engine g_re{g_rd()}; - -class TestReplicatedDB : public homestore::ReplDevListener { -public: - struct Key { - uint64_t id_; - bool operator<(Key const& other) const { return id_ < other.id_; } - 
}; - - struct Value { - int64_t lsn_; - uint64_t data_size_; - uint64_t data_pattern_; - MultiBlkId blkid_; - uint64_t id_; - }; - - struct KeyValuePair { - Key key; - Value value; - }; - - struct test_req : public repl_req_ctx { - struct journal_header { - uint64_t data_size; - uint64_t data_pattern; - }; - - journal_header jheader; - uint64_t key_id; - sisl::sg_list write_sgs; - sisl::sg_list read_sgs; - - sisl::blob header_blob() { return sisl::blob(uintptr_cast(&jheader), sizeof(journal_header)); } - sisl::blob key_blob() { return sisl::blob{uintptr_cast(&key_id), sizeof(uint64_t)}; } - - test_req() { - write_sgs.size = 0; - read_sgs.size = 0; - key_id = (uint64_t)rand() << 32 | rand(); - } - - ~test_req() { - for (auto const& iov : write_sgs.iovs) { - iomanager.iobuf_free(uintptr_cast(iov.iov_base)); - } - - for (auto const& iov : read_sgs.iovs) { - iomanager.iobuf_free(uintptr_cast(iov.iov_base)); - } - } - }; - - TestReplicatedDB() = default; - virtual ~TestReplicatedDB() = default; - - void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, MultiBlkId const& blkids, - cintrusive< repl_req_ctx >& ctx) override { - ASSERT_EQ(header.size(), sizeof(test_req::journal_header)); - - auto jheader = r_cast< test_req::journal_header const* >(header.cbytes()); - Key k{.id_ = *(r_cast< uint64_t const* >(key.cbytes()))}; - Value v{.lsn_ = lsn, - .data_size_ = jheader->data_size, - .data_pattern_ = jheader->data_pattern, - .blkid_ = blkids, - .id_ = k.id_}; - - LOGINFOMOD(replication, "[Replica={}] Received commit on lsn={} dsn={} key={} value[blkid={} pattern={}]", - g_helper->replica_num(), lsn, ctx->dsn(), k.id_, v.blkid_.to_string(), v.data_pattern_); - - { - std::unique_lock lk(db_mtx_); - inmem_db_.insert_or_assign(k, v); - lsn_index_.emplace(lsn, v); - last_data_committed_lsn = lsn; - ++commit_count_; - } - - if (ctx->is_proposer()) { g_helper->runner().next_task(); } - } - - bool on_pre_commit(int64_t lsn, const sisl::blob& header, const sisl::blob& key, - cintrusive< repl_req_ctx >& ctx) override { - LOGINFOMOD(replication, "[Replica={}] Received pre-commit on lsn={} dsn={}", g_helper->replica_num(), lsn, - ctx->dsn()); - return true; - } - - void on_rollback(int64_t lsn, const sisl::blob& header, const sisl::blob& key, - cintrusive< repl_req_ctx >& ctx) override { - LOGINFOMOD(replication, "[Replica={}] Received rollback on lsn={}", g_helper->replica_num(), lsn); - } - - void on_restart() { - LOGINFOMOD(replication, "restarted repl dev for [Replica={}] Group={}", g_helper->replica_num(), - boost::uuids::to_string(repl_dev()->group_id())); - } - - void on_error(ReplServiceError error, const sisl::blob& header, const sisl::blob& key, - cintrusive< repl_req_ctx >& ctx) override { - LOGINFOMOD(replication, "[Replica={}] Received error={} on key={}", g_helper->replica_num(), enum_name(error), - *(r_cast< uint64_t const* >(key.cbytes()))); - } - - AsyncReplResult<> create_snapshot(shared< snapshot_context > context) override { - std::lock_guard< std::mutex > lock(m_snapshot_lock); - auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); - LOGINFOMOD(replication, "[Replica={}] Got snapshot callback term={} idx={}", g_helper->replica_num(), - s->get_last_log_term(), s->get_last_log_idx()); - m_last_snapshot = context; - return make_async_success<>(); - } - - int read_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data) override { - auto s = std::dynamic_pointer_cast< nuraft_snapshot_context 
>(context)->nuraft_snapshot(); - - if (snp_data->offset == 0) { - snp_data->is_last_obj = false; - snp_data->blob = sisl::io_blob_safe(sizeof(ulong)); - LOGINFOMOD(replication, "[Replica={}] Read logical snapshot callback obj_id={} term={} idx={}", - g_helper->replica_num(), snp_data->offset, s->get_last_log_term(), s->get_last_log_idx()); - return 0; - } - - int64_t next_lsn = snp_data->offset; - std::vector< KeyValuePair > kv_snapshot_data; - // we can not use find to get the next element, since if the next lsn is a config lsn , it will not be put into - // lsn_index_ and as a result, the find will return the end of the map. so here we use lower_bound to get the - // first element to be read and transfered. - for (auto iter = lsn_index_.lower_bound(next_lsn); iter != lsn_index_.end(); iter++) { - auto& v = iter->second; - kv_snapshot_data.emplace_back(Key{v.id_}, v); - LOGTRACEMOD(replication, "[Replica={}] Read logical snapshot callback fetching lsn={} size={} pattern={}", - g_helper->replica_num(), v.lsn_, v.data_size_, v.data_pattern_); - if (kv_snapshot_data.size() >= 1000) { break; } - } - - if (kv_snapshot_data.size() == 0) { - snp_data->is_last_obj = true; - LOGINFOMOD(replication, "Snapshot is_last_obj is true"); - return 0; - } - - int64_t kv_snapshot_data_size = sizeof(KeyValuePair) * kv_snapshot_data.size(); - sisl::io_blob_safe blob{static_cast< uint32_t >(kv_snapshot_data_size)}; - std::memcpy(blob.bytes(), kv_snapshot_data.data(), kv_snapshot_data_size); - snp_data->blob = std::move(blob); - snp_data->is_last_obj = false; - LOGINFOMOD(replication, "[Replica={}] Read logical snapshot callback obj_id={} term={} idx={} num_items={}", - g_helper->replica_num(), snp_data->offset, s->get_last_log_term(), s->get_last_log_idx(), - kv_snapshot_data.size()); - - return 0; - } - - void snapshot_data_write(uint64_t data_size, uint64_t data_pattern, MultiBlkId& out_blkids) { - auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); - auto write_sgs = test_common::HSTestHelper::create_sgs(data_size, block_size, data_pattern); - auto fut = homestore::data_service().async_alloc_write(write_sgs, blk_alloc_hints{}, out_blkids); - std::move(fut).get(); - for (auto const& iov : write_sgs.iovs) { - iomanager.iobuf_free(uintptr_cast(iov.iov_base)); - } - } - - void write_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data) override { - auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); - if (snp_data->offset == 0) { - snp_data->offset = last_data_committed_lsn + 1; - LOGINFOMOD(replication, "[Replica={}] Save logical snapshot callback return obj_id={}", - g_helper->replica_num(), snp_data->offset); - return; - } - - size_t kv_snapshot_data_size = snp_data->blob.size(); - if (kv_snapshot_data_size == 0) return; - - size_t num_items = kv_snapshot_data_size / sizeof(KeyValuePair); - std::unique_lock lk(db_mtx_); - auto ptr = r_cast< const KeyValuePair* >(snp_data->blob.bytes()); - for (size_t i = 0; i < num_items; i++) { - auto key = ptr->key; - auto value = ptr->value; - LOGTRACEMOD(replication, "[Replica={}] Save logical snapshot got lsn={} data_size={} data_pattern={}", - g_helper->replica_num(), value.lsn_, value.data_size_, value.data_pattern_); - - // Write to data service and inmem map. 
- MultiBlkId out_blkids; - if (value.data_size_ != 0) { - snapshot_data_write(value.data_size_, value.data_pattern_, out_blkids); - value.blkid_ = out_blkids; - } - last_data_committed_lsn = value.lsn_; - inmem_db_.insert_or_assign(key, value); - ++commit_count_; - ptr++; - } - - LOGINFOMOD(replication, - "[Replica={}] Save logical snapshot callback obj_id={} term={} idx={} is_last={} num_items={}", - g_helper->replica_num(), snp_data->offset, s->get_last_log_term(), s->get_last_log_idx(), - snp_data->is_last_obj, num_items); - - // before we finish install snapshot, raft_server()->get_committed_log_idx() will always be the same. so we need - // last_data_committed_lsn to notify leader to transfer new data to follower. - snp_data->offset = last_data_committed_lsn + 1; - } - - bool apply_snapshot(shared< snapshot_context > context) override { - std::lock_guard< std::mutex > lock(m_snapshot_lock); - auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot(); - LOGINFOMOD(replication, "[Replica={}] Apply snapshot term={} idx={}", g_helper->replica_num(), - s->get_last_log_term(), s->get_last_log_idx()); - m_last_snapshot = context; - return true; - } - - shared< snapshot_context > last_snapshot() override { - std::lock_guard< std::mutex > lock(m_snapshot_lock); - if (!m_last_snapshot) return nullptr; - - auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(m_last_snapshot)->nuraft_snapshot(); - LOGINFOMOD(replication, "[Replica={}] Last snapshot term={} idx={}", g_helper->replica_num(), - s->get_last_log_term(), s->get_last_log_idx()); - return m_last_snapshot; - } - - void free_user_snp_ctx(void*& user_snp_ctx) override {} - - ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size) override { - return blk_alloc_hints{}; - } - - void on_destroy() override { - LOGINFOMOD(replication, "[Replica={}] Group={} is being destroyed", g_helper->replica_num(), - boost::uuids::to_string(repl_dev()->group_id())); - g_helper->unregister_listener(repl_dev()->group_id()); - } - - void db_write(uint64_t data_size, uint32_t max_size_per_iov) { - static std::atomic< uint32_t > s_uniq_num{0}; - auto req = intrusive< test_req >(new test_req()); - req->jheader.data_size = data_size; - req->jheader.data_pattern = ((long long)rand() << 32) | ++s_uniq_num; - auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); +#include "test_common/raft_repl_test_base.hpp" - LOGINFOMOD(replication, "[Replica={}] Db write key={} data_size={} pattern={} block_size={}", - g_helper->replica_num(), req->key_id, data_size, req->jheader.data_pattern, block_size); - - if (data_size != 0) { - req->write_sgs = - test_common::HSTestHelper::create_sgs(data_size, max_size_per_iov, req->jheader.data_pattern); - } - - repl_dev()->async_alloc_write(req->header_blob(), req->key_blob(), req->write_sgs, req); - } - - void validate_db_data() { - g_helper->runner().set_num_tasks(inmem_db_.size()); - - LOGINFOMOD(replication, "[{}]: Total {} keys committed, validating them", - boost::uuids::to_string(repl_dev()->group_id()), inmem_db_.size()); - auto it = inmem_db_.begin(); - g_helper->runner().set_task([this, &it]() { - Key k; - Value v; - { - std::unique_lock lk(db_mtx_); - std::tie(k, v) = *it; - ++it; - } - - if (v.data_size_ != 0) { - auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); - auto read_sgs = test_common::HSTestHelper::create_sgs(v.data_size_, block_size); - - repl_dev()->async_read(v.blkid_, read_sgs, v.data_size_).thenValue([read_sgs, 
k, v](auto const ec) { - LOGINFOMOD(replication, "Validating key={} value[blkid={} pattern={}]", k.id_, v.blkid_.to_string(), - v.data_pattern_); - RELEASE_ASSERT(!ec, "Read of blkid={} for key={} error={}", v.blkid_.to_string(), k.id_, - ec.message()); - for (auto const& iov : read_sgs.iovs) { - test_common::HSTestHelper::validate_data_buf(uintptr_cast(iov.iov_base), iov.iov_len, - v.data_pattern_); - iomanager.iobuf_free(uintptr_cast(iov.iov_base)); - } - g_helper->runner().next_task(); - }); - } else { - g_helper->runner().next_task(); - } - }); - g_helper->runner().execute().get(); - } - - uint64_t db_commit_count() const { - std::shared_lock lk(db_mtx_); - return commit_count_; - } - - uint64_t db_size() const { - std::shared_lock lk(db_mtx_); - return inmem_db_.size(); - } - - void create_snapshot() { - auto raft_repl_dev = std::dynamic_pointer_cast< RaftReplDev >(repl_dev()); - ulong snapshot_idx = raft_repl_dev->raft_server()->create_snapshot(); - LOGINFO("Manually create snapshot got index {}", snapshot_idx); - } - - void truncate(int num_reserved_entries) { - auto raft_repl_dev = std::dynamic_pointer_cast< RaftReplDev >(repl_dev()); - raft_repl_dev->truncate(num_reserved_entries); - LOGINFO("Manually truncated"); - } - - void set_zombie() { zombie_ = true; } - bool is_zombie() { - // Wether a group is zombie(non recoverable) - return zombie_; - } - -private: - std::map< Key, Value > inmem_db_; - std::map< int64_t, Value > lsn_index_; - uint64_t commit_count_{0}; - // this is the last lsn for data, might not be the same with the real last committed lsn - // which should be get by raft_server()->get_committed_log_idx() - uint64_t last_data_committed_lsn{0}; - std::shared_mutex db_mtx_; - std::shared_ptr< snapshot_context > m_last_snapshot{nullptr}; - std::mutex m_snapshot_lock; - bool zombie_{false}; -}; - -class RaftReplDevTest : public testing::Test { -public: - void SetUp() override { - // By default it will create one db - for (uint32_t i{0}; i < SISL_OPTIONS["num_raft_groups"].as< uint32_t >(); ++i) { - auto db = std::make_shared< TestReplicatedDB >(); - g_helper->register_listener(db); - dbs_.emplace_back(std::move(db)); - } - } - - void TearDown() override { - for (auto const& db : dbs_) { - if (db->is_zombie()) { continue; } - run_on_leader(db, [this, db]() { - auto err = hs()->repl_service().remove_repl_dev(db->repl_dev()->group_id()).get(); - ASSERT_EQ(err, ReplServiceError::OK) << "Error in destroying the group"; - }); - } - - for (auto const& db : dbs_) { - if (db->is_zombie()) { continue; } - auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); - int i = 0; - bool force_leave = false; - do { - std::this_thread::sleep_for(std::chrono::seconds(1)); - auto& raft_repl_svc = dynamic_cast< RaftReplService& >(hs()->repl_service()); - raft_repl_svc.gc_repl_devs(); - LOGINFO("Waiting for repl dev to get destroyed"); - - // TODO: if leader is destroyed, but the follower does not receive the notification, it will not be - // destroyed for ever. we need handle this in raft_repl_dev. revisit here after making changes at - // raft_repl_dev side to hanle this case. this is a workaround to avoid the infinite loop for now. 
- if (i++ > 10 && !force_leave) { - LOGWARN("has already waited for repl dev to get destroyed for 10 times, so do a force leave"); - repl_dev->force_leave(); - force_leave = true; - } - - } while (!repl_dev->is_destroyed()); - } - } - - void generate_writes(uint64_t data_size, uint32_t max_size_per_iov, shared< TestReplicatedDB > db = nullptr) { - if (db == nullptr) { db = pick_one_db(); } - // LOGINFO("Writing on group_id={}", db->repl_dev()->group_id()); - db->db_write(data_size, max_size_per_iov); - } - - void wait_for_all_commits() { wait_for_commits(written_entries_); } - - void wait_for_commits(uint64_t exp_writes) { - uint64_t total_writes{0}; - while (true) { - total_writes = 0; - for (auto const& db : dbs_) { - total_writes += db->db_commit_count(); - } - - if (total_writes >= exp_writes) { break; } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } - LOGINFO("Replica={} has received {} commits as expected", g_helper->replica_num(), total_writes); - } - - void validate_data() { - for (auto const& db : dbs_) { - db->validate_db_data(); - } - } - - shared< TestReplicatedDB > pick_one_db() { return dbs_[0]; } - - void assign_leader(uint16_t replica) { - LOGINFO("Switch the leader to replica_num = {}", replica); - if (g_helper->replica_num() == replica) { - for (auto const& db : dbs_) { - do { - auto result = db->repl_dev()->become_leader().get(); - if (result.hasError()) { - std::this_thread::sleep_for(std::chrono::milliseconds(1000)); - } else { - break; - } - } while (true); - } - } else { - for (auto const& db : dbs_) { - homestore::replica_id_t leader_uuid; - while (true) { - leader_uuid = db->repl_dev()->get_leader_id(); - if (!leader_uuid.is_nil() && (g_helper->member_id(leader_uuid) == replica)) { break; } - - LOGINFO("Waiting for replica={} to become leader", replica); - std::this_thread::sleep_for(std::chrono::milliseconds{500}); - } - } - } - } - - void run_on_leader(std::shared_ptr< TestReplicatedDB > db, auto&& lambda) { - do { - auto leader_uuid = db->repl_dev()->get_leader_id(); - - if (leader_uuid.is_nil()) { - LOGINFO("Waiting for leader to be elected for group={}", db->repl_dev()->group_id()); - std::this_thread::sleep_for(std::chrono::milliseconds{500}); - } else if (leader_uuid == g_helper->my_replica_id()) { - lambda(); - break; - } else { - break; - } - } while (true); - } - - void write_on_leader(uint32_t num_entries, bool wait_for_commit = true, shared< TestReplicatedDB > db = nullptr) { - do { - auto leader_uuid = dbs_[0]->repl_dev()->get_leader_id(); - - if (leader_uuid.is_nil()) { - LOGINFO("Waiting for leader to be elected"); - std::this_thread::sleep_for(std::chrono::milliseconds{500}); - } else if (leader_uuid == g_helper->my_replica_id()) { - LOGINFO("Writing {} entries since I am the leader my_uuid={}", num_entries, - boost::uuids::to_string(g_helper->my_replica_id())); - auto const block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); - g_helper->runner().set_num_tasks(num_entries); - - LOGINFO("Run on worker threads to schedule append on repldev for {} Bytes.", block_size); - g_helper->runner().set_task([this, block_size, db]() { - static std::normal_distribution<> num_blks_gen{3.0, 2.0}; - this->generate_writes(std::abs(std::lround(num_blks_gen(g_re))) * block_size, block_size, db); - }); - if (wait_for_commit) { g_helper->runner().execute().get(); } - break; - } else { - LOGINFO("{} entries were written on the leader_uuid={} my_uuid={}", num_entries, - boost::uuids::to_string(leader_uuid), 
boost::uuids::to_string(g_helper->my_replica_id())); - break; - } - } while (true); - - written_entries_ += num_entries; - if (wait_for_commit) { this->wait_for_all_commits(); } - } - - void remove_db(std::shared_ptr< TestReplicatedDB > db, bool wait_for_removal) { - this->run_on_leader(db, [this, db]() { - auto err = hs()->repl_service().remove_repl_dev(db->repl_dev()->group_id()).get(); - ASSERT_EQ(err, ReplServiceError::OK) << "Error in destroying the group"; - }); - - // Remove the db from the dbs_ list and check if count matches with repl_device - for (auto it = dbs_.begin(); it != dbs_.end(); ++it) { - if (*it == db) { - dbs_.erase(it); - break; - } - } - - if (wait_for_removal) { wait_for_listener_destroy(dbs_.size()); } - } - - void wait_for_listener_destroy(uint64_t exp_listeners) { - while (true) { - auto total_listeners = g_helper->num_listeners(); - if (total_listeners == exp_listeners) { break; } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } - } - - void restart_replica(uint16_t replica, uint32_t shutdown_delay_sec = 5u) { - if (g_helper->replica_num() == replica) { - LOGINFO("Restart homestore: replica_num = {}", replica); - g_helper->restart(shutdown_delay_sec); - // g_helper->sync_for_test_start(); - } else { - LOGINFO("Wait for replica={} to completely go down and removed from alive raft-groups", replica); - std::this_thread::sleep_for(std::chrono::seconds{5}); - } - } - - void shutdown_replica(uint16_t replica) { - if (g_helper->replica_num() == replica) { - LOGINFO("Shutdown homestore: replica_num = {}", replica); - g_helper->shutdown(); - } else { - LOGINFO("Wait for replica={} to completely go down and removed from alive raft-groups", replica); - std::this_thread::sleep_for(std::chrono::seconds{5}); - } +class RaftReplDevTest : public RaftReplDevTestBase {}; +TEST_F(RaftReplDevTest, Write_Duplicated_Data) { + uint64_t total_writes = 1; + g_helper->runner().qdepth_ = total_writes; + g_helper->runner().total_tasks_ = total_writes; + LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); + g_helper->sync_for_test_start(); + auto leader_uuid = wait_and_get_leader_id(); + + uint64_t id; + TestReplicatedDB::Key stored_key; + TestReplicatedDB::Value stored_val; + if (leader_uuid == g_helper->my_replica_id()) { + id = (uint64_t)rand() << 32 | rand(); + LOGINFO("going to write data with id={}", id); + this->write_with_id(id, true /* wait_for_commit */); + stored_key = dbs_[0]->inmem_db_.cbegin()->first; + ASSERT_EQ(id, stored_key.id_); + } else { + LOGINFO("I am not leader, leader_uuid={} my_uuid={}, do nothing", boost::uuids::to_string(leader_uuid), + boost::uuids::to_string(g_helper->my_replica_id())); } + wait_for_commits(total_writes); - void start_replica(uint16_t replica) { - if (g_helper->replica_num() == replica) { - LOGINFO("Start homestore: replica_num = {}", replica); - g_helper->start(); - } + g_helper->sync_for_verify_start(); + LOGINFO("Validate all data written so far by reading them"); + this->validate_data(); + /* test duplication + if duplication found in leader proposal, reject it; + if duplication found in the followers, skip it. + */ + // 1. write the same data again on leader, should fail + if (leader_uuid == g_helper->my_replica_id()) { + auto err = this->write_with_id(id, true /* wait_for_commit */); + ASSERT_EQ(ReplServiceError::DATA_DUPLICATED, err); + + // 2. 
delete it from the db to simulate duplication in followers(skip the duplication check in leader side) + dbs_[0]->inmem_db_.erase(stored_key); + LOGINFO("data with id={} has been deleted from db", id); + err = this->write_with_id(id, true /* wait_for_commit */); + ASSERT_EQ(ReplServiceError::OK, err); + } + if (leader_uuid != g_helper->my_replica_id()) { + wait_for_commits(total_writes + 1); + ASSERT_EQ(dbs_[0]->inmem_db_.size(), total_writes); } - void create_snapshot() { dbs_[0]->create_snapshot(); } - void truncate(int num_reserved_entries) { dbs_[0]->truncate(num_reserved_entries); } - -protected: - std::vector< std::shared_ptr< TestReplicatedDB > > dbs_; - uint32_t written_entries_{0}; - -#ifdef _PRERELEASE - flip::FlipClient m_fc{iomgr_flip::instance()}; -#endif -}; + g_helper->sync_for_cleanup_start(); +} TEST_F(RaftReplDevTest, Write_Restart_Write) { LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); @@ -658,6 +109,41 @@ TEST_F(RaftReplDevTest, Follower_Fetch_OnActive_ReplicaGroup) { g_helper->sync_for_cleanup_start(); } + +TEST_F(RaftReplDevTest, Write_With_Diabled_Leader_Push_Data) { + g_helper->set_basic_flip("disable_leader_push_data", std::numeric_limits< int >::max(), 100); + LOGINFO("Homestore replica={} setup completed, all the push_data from leader are disabled", + g_helper->replica_num()); + LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); + g_helper->sync_for_test_start(); + + this->write_on_leader(20, true /* wait_for_commit */); + + g_helper->sync_for_verify_start(); + + LOGINFO("Validate all data written so far by reading them"); + this->validate_data(); + + g_helper->sync_for_cleanup_start(); + g_helper->remove_flip("disable_leader_push_data"); +} + +TEST_F(RaftReplDevTest, Write_With_Handling_No_Space_Left) { + g_helper->set_basic_flip("simulate_no_space_left", std::numeric_limits< int >::max(), 50); + LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); + g_helper->sync_for_test_start(); + + this->write_on_leader(20, true /* wait_for_commit */); + + g_helper->sync_for_verify_start(); + + LOGINFO("Validate all data written so far by reading them"); + this->validate_data(); + + g_helper->sync_for_cleanup_start(); + g_helper->remove_flip("simulate_no_space_left"); +} + #endif // do some io before restart; @@ -749,6 +235,7 @@ TEST_F(RaftReplDevTest, Resync_From_Non_Originator) { } #if 0 + TEST_F(RaftReplDevTest, Leader_Restart) { LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); g_helper->sync_for_test_start(); @@ -773,7 +260,6 @@ TEST_F(RaftReplDevTest, Leader_Restart) { g_helper->sync_for_cleanup_start(); } - TEST_F(RaftReplDevTest, Drop_Raft_Entry_Switch_Leader) { LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); g_helper->sync_for_test_start(); @@ -958,7 +444,7 @@ TEST_F(RaftReplDevTest, BaselineTest) { // Leader does manual snapshot and truncate LOGINFO("Leader create snapshot and truncate"); this->create_snapshot(); - this->truncate(0); + // this->truncate(0); } } @@ -982,6 +468,96 @@ TEST_F(RaftReplDevTest, BaselineTest) { LOGINFO("BaselineTest done"); } +TEST_F(RaftReplDevTest, LargeDataWrite) { + LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); + g_helper->sync_for_test_start(); + + // TODO: Increase the data size (e.g., to 16MB) for testing. + // For now, use 4MB to ensure the test passes since there are issues with larger IO sizes on the uring drive. 
+ uint64_t entries_per_attempt = SISL_OPTIONS["num_io"].as< uint64_t >(); + uint64_t data_size = 4 * 1024 * 1024; + this->write_on_leader(entries_per_attempt, true /* wait_for_commit */, nullptr, &data_size); + + g_helper->sync_for_verify_start(); + LOGINFO("Validate all data written so far by reading them"); + this->validate_data(); + g_helper->sync_for_cleanup_start(); +} + +TEST_F(RaftReplDevTest, PriorityLeaderElection) { + LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); + g_helper->sync_for_test_start(); + uint64_t entries_per_attempt = SISL_OPTIONS["num_io"].as< uint64_t >(); + if (g_helper->replica_num() == 0) { + auto leader = this->wait_and_get_leader_id(); + ASSERT_EQ(leader, g_helper->my_replica_id()); + } + this->write_on_leader(entries_per_attempt, true /* wait_for_commit */); + + g_helper->sync_for_verify_start(); + LOGINFO("Validate all data written so far by reading them"); + this->validate_data(); + g_helper->sync_for_cleanup_start(); + + LOGINFO("Restart leader"); + if (g_helper->replica_num() == 0) { g_helper->restart_homestore(); } + g_helper->sync_for_test_start(); + + LOGINFO("Validate leader switched"); + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + auto leader = this->wait_and_get_leader_id(); + if (g_helper->replica_num() == 0) { ASSERT_NE(leader, g_helper->my_replica_id()); } + g_helper->sync_for_verify_start(); + + if (leader == g_helper->my_replica_id()) { + LOGINFO("Resign and trigger a priority leader election"); + // resign and trigger a priority leader election + g_helper->restart_homestore(); + } + g_helper->sync_for_test_start(); + + std::this_thread::sleep_for(std::chrono::milliseconds{500}); + leader = this->wait_and_get_leader_id(); + LOGINFO("Validate leader switched back to initial replica"); + if (g_helper->replica_num() == 0) { ASSERT_EQ(leader, g_helper->my_replica_id()); } + g_helper->sync_for_verify_start(); + + LOGINFO("Post restart write the data again on the leader"); + this->write_on_leader(entries_per_attempt, true /* wait_for_commit */); + + LOGINFO("Validate all data written (including pre-restart data) by reading them"); + this->validate_data(); + g_helper->sync_for_cleanup_start(); +} + +TEST_F(RaftReplDevTest, ComputePriority) { + g_helper->sync_for_test_start(); + auto& raftService = dynamic_cast< RaftReplService& >(hs()->repl_service()); + + HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { s.consensus.max_wait_rounds_of_priority_election = 0; }); + HS_SETTINGS_FACTORY().save(); + ASSERT_EQ(raftService.compute_raft_follower_priority(), raft_leader_priority); + + for (auto i = 1; i <= int(raft_priority_election_round_upper_limit); i++) { + HS_SETTINGS_FACTORY().modifiable_settings( + [i](auto& s) { s.consensus.max_wait_rounds_of_priority_election = i; }); + HS_SETTINGS_FACTORY().save(); + auto follower_priority = raftService.compute_raft_follower_priority(); + // Simulate nuraft algorithm + auto decayed_priority = raft_leader_priority; + for (auto j = 1; j <= i; j++) { + int gap = std::max((int)10, decayed_priority / 5); + decayed_priority = std::max(1, decayed_priority - gap); + } + LOGINFO("Follower priority={} decayed_priority={}", follower_priority, decayed_priority); + ASSERT_TRUE(follower_priority >= decayed_priority); + } + // Set back to default value + HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { s.consensus.max_wait_rounds_of_priority_election = 2; }); + HS_SETTINGS_FACTORY().save(); + g_helper->sync_for_cleanup_start(); +} + int main(int argc, char* argv[]) { int 
parsed_argc = argc; char** orig_argv = argv; @@ -1012,7 +588,6 @@ int main(int argc, char* argv[]) { // Snapshot and truncation tests needs num reserved to be 0 and distance 10. s.consensus.num_reserved_log_items = 0; - s.consensus.snapshot_freq_distance = 10; s.resource_limits.resource_audit_timer_ms = 0; // only reset when user specified the value for test; @@ -1030,7 +605,8 @@ int main(int argc, char* argv[]) { FLAGS_folly_global_cpu_executor_threads = 4; g_helper = std::make_unique< test_common::HSReplTestHelper >("test_raft_repl_dev", args, orig_argv); - g_helper->setup(); + // No spare replica's are created. Test cases in this file expects fixed number of replica's. + g_helper->setup(SISL_OPTIONS["replicas"].as< uint32_t >()); auto ret = RUN_ALL_TESTS(); g_helper->teardown(); diff --git a/src/tests/test_raft_repl_dev_dynamic.cpp b/src/tests/test_raft_repl_dev_dynamic.cpp new file mode 100644 index 000000000..4ae56a9c3 --- /dev/null +++ b/src/tests/test_raft_repl_dev_dynamic.cpp @@ -0,0 +1,460 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + * + *********************************************************************************/ +#include "test_common/raft_repl_test_base.hpp" +#include +#include "common/homestore_config.hpp" + +// Dynamic tests spawn spare replica's also which can be used to add and remove from a repl dev. +class ReplDevDynamicTest : public RaftReplDevTestBase { +private: + bool is_replica_num_in(const std::set< uint32_t >& replicas) { + // Check if the current replica process is in this set. + return replicas.count(g_helper->replica_num()) != 0 ? true : false; + } +}; + +TEST_F(ReplDevDynamicTest, ReplaceMember) { + LOGINFO("ReplaceMember test started replica={}", g_helper->replica_num()); + // Write some IO's, replace a member, validate all members data except which is out. + auto db = dbs_.back(); + auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >(); + auto num_members = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >(); + uint64_t num_io_entries = SISL_OPTIONS["num_io"].as< uint64_t >(); + + // Replace the last member in the group with index(num_replicas - 1) with a spare + // replica with index (num_replica). Member id's are 0,...,num_replicas-1, num_replicas,...,N + uint32_t member_out = num_replicas - 1; + uint32_t member_in = num_replicas; + + g_helper->sync_for_test_start(num_members); + if (g_helper->replica_num() < num_replicas) { + // With existing raft repl dev group, write IO's, validate and call replace_member on leader. 
+ LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); + this->write_on_leader(num_io_entries, true /* wait_for_commit */); + + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + std::this_thread::sleep_for(std::chrono::seconds(3)); + } else if (g_helper->replica_num() == member_in) { + LOGINFO("Wait for commits replica={}", g_helper->replica_num()); + wait_for_commits(num_io_entries); + } + + g_helper->sync_for_verify_start(num_members); + LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num()); + if (is_replica_num_in({0, 1, member_in})) { + // Skip the member which is going to be replaced. Validate data on all other replica's. + LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num()); + this->validate_data(); + } + g_helper->sync_for_verify_start(num_members); + LOGINFO("data synced, sync_for_verify_state replica={} ", g_helper->replica_num()); + + //wait for background reaper thread to trigger complete_replace_member + if (g_helper->replica_num() == member_out) { + // The out member will have the repl dev destroyed. + auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); + while (repl_dev && !repl_dev->is_destroyed()) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + auto& raft_repl_svc = dynamic_cast< RaftReplService& >(hs()->repl_service()); + raft_repl_svc.gc_repl_devs(); + LOGINFO("Waiting for repl dev to get destroyed on out member replica={}", g_helper->replica_num()); + } + LOGINFO("Repl dev destroyed on out member replica={}", g_helper->replica_num()); + } + + g_helper->sync_for_cleanup_start(num_members); + LOGINFO("ReplaceMember test done replica={}", g_helper->replica_num()); +} + +TEST_F(ReplDevDynamicTest, TwoMemberDown) { + LOGINFO("TwoMemberDown test started replica={}", g_helper->replica_num()); + + // Make two members down in a group and leader cant reach a quorum. + // We set the custom quorum size to 1 and call replace member. + // Leader should do some writes to validate it has reach quorum size. + LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); + auto db = dbs_.back(); + auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >(); + auto num_members = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >(); + + uint64_t num_io_entries = SISL_OPTIONS["num_io"].as< uint64_t >(); + + // Replace the last member in the group with index(num_replicas - 1) with a spare + // replica with index (num_replica). Member id's are 0,...,num_replicas-1, num_replicas,...,N + uint32_t member_out = num_replicas - 1; + uint32_t member_in = num_replicas; + + g_helper->sync_for_test_start(num_members); + + // Shutdown replica 1 and replica 2 to simulate two member down. + if (g_helper->replica_num() == 1) { + this->shutdown_replica(1); + LOGINFO("Shutdown replica 1"); + } + + if (g_helper->replica_num() == 2) { + this->shutdown_replica(2); + LOGINFO("Shutdown replica 2"); + } + + if (g_helper->replica_num() == 0) { + // Replace down replica 2 with spare replica 3 with commit quorum 1 + // so that leader can go ahead with replacing member. 
+ LOGINFO("Replace member started"); + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 1 /* commit quorum*/); + this->write_on_leader(num_io_entries, true /* wait_for_commit */); + LOGINFO("Leader completed num_io={}", num_io_entries); + } + + if (g_helper->replica_num() == member_in) { + wait_for_commits(num_io_entries); + LOGINFO("Member in got all commits"); + } + + if (is_replica_num_in({0, member_in})) { + // Validate data on leader replica 0 and replica 3 + LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num()); + this->validate_data(); + } + + if (g_helper->replica_num() == 1) { + LOGINFO("Start replica 1"); + db->set_zombie(); + this->start_replica(1); + } + if (g_helper->replica_num() == 2) { + LOGINFO("Start replica 2"); + db->set_zombie(); + this->start_replica(2); + } + + g_helper->sync_for_cleanup_start(num_members); + LOGINFO("TwoMemberDown test done replica={}", g_helper->replica_num()); +} + +TEST_F(ReplDevDynamicTest, OutMemberDown) { + // replica0(leader) and replica1 up, replica2 is down. Replace replica2 with replica3. + // replica0 should be able to baseline resync to replica4(new member). + // Write some IO's, replace a member, validate all members data except which is out. + LOGINFO("OutMemberDown test started replica={}", g_helper->replica_num()); + auto db = dbs_.back(); + auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >(); + auto num_members = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >(); + uint64_t num_io_entries = SISL_OPTIONS["num_io"].as< uint64_t >(); + + // Replace the last member in the group with index(num_replicas - 1) with a spare + // replica with index (num_replica). Member id's are 0,...,num_replicas-1, num_replicas,...,N + uint32_t member_out = num_replicas - 1; + uint32_t member_in = num_replicas; + + g_helper->sync_for_test_start(num_members); + + std::this_thread::sleep_for(std::chrono::seconds(3)); + if (g_helper->replica_num() == 0) { + // With existing raft repl dev group, write IO's, validate and call replace_member on leader. + LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); + this->write_on_leader(num_io_entries, true /* wait_for_commit */); + } + //shut down before replace member + this->shutdown_replica(2); + LOGINFO("Shutdown replica 2"); + + if (g_helper->replica_num() == 0) { + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + std::this_thread::sleep_for(std::chrono::seconds(3)); + } else if (g_helper->replica_num() == member_in) { + LOGINFO("Wait for commits replica={}", g_helper->replica_num()); + wait_for_commits(num_io_entries); + } + + g_helper->sync_for_verify_start(num_members); + LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num()); + if (is_replica_num_in({0, 1, member_in})) { + // Skip the member which is going to be replaced. Validate data on all other replica's. + LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num()); + this->validate_data(); + } + + // shutdown after becoming learner, in this case, the member_out won't remove replDev after restart. 
+    // this->shutdown_replica(2);
+    // LOGINFO("Shutdown replica 2");
+    // std::this_thread::sleep_for(std::chrono::seconds(2));
+
+    // data synced, waiting for the learner to be removed
+    LOGINFO("data synced, sync for completing replace member, replica={}", g_helper->replica_num());
+    g_helper->sync_for_verify_start(num_members);
+    // Since the out_member stopped, it cannot respond to the remove_srv request, so the first attempt will get a
+    // CANCELLED error; hence the wait here is longer than in other tests.
+    if (g_helper->replica_num() == 2) {
+        LOGINFO("Start replica 2");
+        this->start_replica(2);
+        // The out member will have the repl dev destroyed.
+        auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev());
+        while (repl_dev && !repl_dev->is_destroyed()) {
+            std::this_thread::sleep_for(std::chrono::seconds(1));
+            auto& raft_repl_svc = dynamic_cast< RaftReplService& >(hs()->repl_service());
+            raft_repl_svc.gc_repl_devs();
+            LOGINFO("Waiting for repl dev to get destroyed on out member replica={}", g_helper->replica_num());
+        }
+        LOGINFO("Repl dev destroyed on out member replica={}", g_helper->replica_num());
+        db->set_zombie();
+    }
+
+    g_helper->sync_for_cleanup_start(num_members);
+    LOGINFO("OutMemberDown test done replica={}", g_helper->replica_num());
+}
+
+TEST_F(ReplDevDynamicTest, LeaderReplace) {
+    // replica0(leader), replica1 and replica2 are up. Replace replica0(leader) with replica3.
+    // replica0 will yield leadership, another replica will become leader, and that leader
+    // will do a baseline resync to the new member.
+    // Write some IOs, replace a member, and validate data on all members except the one that is out.
+    LOGINFO("LeaderReplace test started replica={}", g_helper->replica_num());
+    auto db = dbs_.back();
+    auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >();
+    auto num_members = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >();
+    uint64_t num_io_entries = SISL_OPTIONS["num_io"].as< uint64_t >();
+
+    // Replace the leader in the group with index(0) with a spare
+    // replica with index (num_replica). Member ids are 0,...,num_replicas-1, num_replicas,...,N
+    uint32_t member_out = 0;
+    uint32_t member_in = num_replicas;
+
+    g_helper->sync_for_test_start(num_members);
+
+    if (g_helper->replica_num() == member_out) {
+        LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num());
+        // With existing raft repl dev group, write IOs, validate and call replace_member on leader.
+        this->write_on_leader(num_io_entries, true /* wait_for_commit */);
+
+        // Leader will return error NOT_LEADER and yield leadership, sleep and connect again
+        // to the new leader.
+        LOGINFO("Replace old leader");
+        replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 0,
+                       ReplServiceError::NOT_LEADER);
+        LOGINFO("Replace member leader yield done");
+    }
+    std::this_thread::sleep_for(std::chrono::seconds(3));
+    if (g_helper->replica_num() != member_in) {
+        replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in));
+        LOGINFO("Replace member old leader done");
+    }
+
+    if (g_helper->replica_num() == member_in) {
+        LOGINFO("Wait for commits replica={}", g_helper->replica_num());
+        wait_for_commits(num_io_entries);
+    }
+
+    g_helper->sync_for_verify_start(num_members);
+    if (is_replica_num_in({0, 1, member_in})) {
+        // Skip the member which is going to be replaced. Validate data on all other replicas.
+ LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num()); + this->validate_data(); + } + + g_helper->sync_for_verify_start(num_members); + LOGINFO("data synced, sync_for_verify_state replica={} ", g_helper->replica_num()); + if (g_helper->replica_num() == member_out) { + // The out member will have the repl dev destroyed. + auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); + while (repl_dev && !repl_dev->is_destroyed()) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + auto& raft_repl_svc = dynamic_cast< RaftReplService& >(hs()->repl_service()); + raft_repl_svc.gc_repl_devs(); + LOGINFO("Waiting for repl dev to get destroyed on out member replica={}", g_helper->replica_num()); + } + LOGINFO("Repl dev destroyed on out member replica={}", g_helper->replica_num()); + db->set_zombie(); + } + + g_helper->sync_for_cleanup_start(num_members); + LOGINFO("LeaderReplace test done replica={}", g_helper->replica_num()); +} + +TEST_F(ReplDevDynamicTest, OneMemberRestart) { + // replica0(leader) is up and replica1 is restated, replica2 is down. Replace replica2 with replica3. + // replica0 should be able to baseline resync to replica4(new member). + // Write some IO's, replace a member, validate all members data except which is out. + LOGINFO("OneMemberRestart test started replica={}", g_helper->replica_num()); + auto db = dbs_.back(); + auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >(); + auto num_members = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >(); + uint64_t num_io_entries = SISL_OPTIONS["num_io"].as< uint64_t >(); + + // Replace the last member in the group with index(num_replicas - 1) with a spare + // replica with index (num_replica). Member id's are 0,...,num_replicas-1, num_replicas,...,N + uint32_t member_out = num_replicas - 1; + uint32_t member_in = num_replicas; + + g_helper->sync_for_test_start(num_members); + if (g_helper->replica_num() == 1) { + LOGINFO("Restart replica 1, "); + this->restart_replica(15); + } + + if (g_helper->replica_num() == 0) { + // With existing raft repl dev group, write IO's, validate and call replace_member on leader. + LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); + this->write_on_leader(num_io_entries, true /* wait_for_commit */); + + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in)); + std::this_thread::sleep_for(std::chrono::seconds(3)); + } else if (g_helper->replica_num() == member_in) { + LOGINFO("Wait for commits replica={}", g_helper->replica_num()); + wait_for_commits(num_io_entries); + } + + g_helper->sync_for_verify_start(num_members); + LOGINFO("sync_for_verify_state replica={} ", g_helper->replica_num()); + if (is_replica_num_in({0, 1, member_in})) { + // Skip the member which is going to be replaced. Validate data on all other replica's. + LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num()); + this->validate_data(); + } + + g_helper->sync_for_verify_start(num_members); + LOGINFO("data synced, sync_for_verify_state replica={} ", g_helper->replica_num()); + if (g_helper->replica_num() == member_out) { + // The out member will have the repl dev destroyed. 
+ auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev()); + while (repl_dev && !repl_dev->is_destroyed()) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + auto& raft_repl_svc = dynamic_cast< RaftReplService& >(hs()->repl_service()); + raft_repl_svc.gc_repl_devs(); + LOGINFO("Waiting for repl dev to get destroyed on out member replica={}", g_helper->replica_num()); + } + LOGINFO("Repl dev destroyed on out member replica={}", g_helper->replica_num()); + } + + g_helper->sync_for_cleanup_start(num_members); + LOGINFO("OneMemberRestart test done replica={}", g_helper->replica_num()); +} + +TEST_F(ReplDevDynamicTest, ValidateRequest) { + LOGINFO("ValidateRequest test started replica={}", g_helper->replica_num()); + HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { + s.consensus.laggy_threshold = 0; + LOGINFO("setup consensus.laggy_threshold to {}", 0); + HS_SETTINGS_FACTORY().save(); + }); + + auto db = dbs_.back(); + auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >(); + auto num_members = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >(); + uint64_t num_io_entries = SISL_OPTIONS["num_io"].as< uint64_t >(); + + // Replace the last member in the group with index(num_replicas - 1) with a spare + // replica with index (num_replica). Member id's are 0,...,num_replicas-1, num_replicas,...,N + uint32_t member_out = num_replicas - 1; + uint32_t member_in = num_replicas; + + g_helper->sync_for_test_start(num_members); + + //shut down before replace member + this->shutdown_replica(1); + LOGINFO("Shutdown replica 1"); + + //wait for shutdown + std::this_thread::sleep_for(std::chrono::seconds(3)); + g_helper->sync_for_verify_start(num_members); + if (g_helper->replica_num() == 0) { + // With existing raft repl dev group, write IO's, validate and call replace_member on leader. + LOGINFO("Writing on leader num_io={} replica={}", num_io_entries, g_helper->replica_num()); + this->write_on_leader(num_io_entries, true /* wait_for_commit */); + } + g_helper->sync_for_verify_start(num_members); + if (g_helper->replica_num() == 0) { + // generate uuid + replica_id_t fake_member_out = boost::uuids::random_generator()(); + replica_id_t fake_member_in = boost::uuids::random_generator()(); + LOGINFO("test SERVER_NOT_FOUND"); + replace_member(db, fake_member_out, fake_member_in, 0, ReplServiceError::SERVER_NOT_FOUND); + LOGINFO("test replace_member already complete"); + replace_member(db, fake_member_out, g_helper->replica_id(0)); + LOGINFO("test QUORUM_NOT_MET", num_io_entries, g_helper->replica_num()); + replace_member(db, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 0, + ReplServiceError::QUORUM_NOT_MET); + } + + if (g_helper->replica_num() == 1) { + LOGINFO("Start replica 1"); + this->start_replica(1); + } + g_helper->sync_for_cleanup_start(num_members); + LOGINFO("ValidateRequest test done replica={}", g_helper->replica_num()); +} + +int main(int argc, char* argv[]) { + int parsed_argc = argc; + char** orig_argv = argv; + + // Save the args for replica use + std::vector< std::string > args; + for (int i = 0; i < argc; ++i) { + args.emplace_back(argv[i]); + } + + ::testing::InitGoogleTest(&parsed_argc, argv); + + SISL_OPTIONS_LOAD(parsed_argc, argv, logging, config, test_raft_repl_dev, iomgr, test_common_setup, + test_repl_common_setup); + + // + // Entire test suite assumes that once a replica takes over as leader, it stays until it is explicitly yielded. 
+ // Otherwise it is very hard to control or accurately test behavior. Hence we forcibly override the + // leadership_expiry time. + // + HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { + s.generic.repl_dev_cleanup_interval_sec = 1; + + // Disable implicit flush and timer. + s.logstore.flush_threshold_size = 0; + s.logstore.flush_timer_frequency_us = 0; + + // Snapshot and truncation tests needs num reserved to be 0 and distance 10. + s.consensus.num_reserved_log_items = 0; + s.resource_limits.resource_audit_timer_ms = 0; + + // only reset when user specified the value for test; + if (SISL_OPTIONS.count("snapshot_distance")) { + s.consensus.snapshot_freq_distance = SISL_OPTIONS["snapshot_distance"].as< uint32_t >(); + } + if (SISL_OPTIONS.count("num_raft_logs_resv")) { + s.resource_limits.raft_logstore_reserve_threshold = SISL_OPTIONS["num_raft_logs_resv"].as< uint32_t >(); + } + if (SISL_OPTIONS.count("res_mgr_audit_timer_ms")) { + s.resource_limits.resource_audit_timer_ms = SISL_OPTIONS["res_mgr_audit_timer_ms"].as< uint32_t >(); + } + }); + HS_SETTINGS_FACTORY().save(); + + FLAGS_folly_global_cpu_executor_threads = 4; + g_helper = std::make_unique< test_common::HSReplTestHelper >("test_raft_repl_dev_dynamic", args, orig_argv); + + // We spawn spare replica's also for dynamic repl dev tests. + auto total_replicas = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >(); + g_helper->setup(total_replicas); + + auto ret = RUN_ALL_TESTS(); + g_helper->teardown(); + + std::string str; + sisl::ObjCounterRegistry::foreach ([&str](const std::string& name, int64_t created, int64_t alive) { + fmt::format_to(std::back_inserter(str), "{}: created={} alive={}\n", name, created, alive); + }); + LOGINFO("Object Life Counter\n:{}", str); + + return ret; +} diff --git a/src/tests/test_scripts/CMakeLists.txt b/src/tests/test_scripts/CMakeLists.txt index e1b5ff78c..4bb54bad5 100644 --- a/src/tests/test_scripts/CMakeLists.txt +++ b/src/tests/test_scripts/CMakeLists.txt @@ -1,15 +1,4 @@ -file(COPY vol_test.py DESTINATION ${CMAKE_BINARY_DIR}/bin/scripts) -file(COPY home_blk_flip.py DESTINATION ${CMAKE_BINARY_DIR}/bin/scripts) -file(COPY home_blk_test.py DESTINATION ${CMAKE_BINARY_DIR}/bin/scripts) -file(COPY index_test.py DESTINATION ${CMAKE_BINARY_DIR}/bin/scripts) -file(COPY log_meta_test.py DESTINATION ${CMAKE_BINARY_DIR}/bin/scripts) -file(COPY data_test.py DESTINATION ${CMAKE_BINARY_DIR}/bin/scripts) -file(COPY long_running.py DESTINATION ${CMAKE_BINARY_DIR}/bin/scripts) - -#add_test(NAME TestVolRecovery COMMAND ${CMAKE_BINARY_DIR}/bin/scripts/vol_test.py --test_suits=recovery --dirpath=${CMAKE_BINARY_DIR}/bin/) -#SET_TESTS_PROPERTIES(TestVolRecovery PROPERTIES DEPENDS TestVol) - -#add_test(NAME PerfTestVol COMMAND perf_test_volume) -#add_test(NAME RecoveryVol COMMAND python vol_test.py) -#add_test(NAME CheckBtree COMMAND check_btree) - +file(COPY index_test.py DESTINATION ../test_scripts) +file(COPY log_meta_test.py DESTINATION ../test_scripts) +file(COPY data_test.py DESTINATION ../test_scripts) +file(COPY long_running.py DESTINATION ../test_scripts) diff --git a/src/tests/test_scripts/index_test.py b/src/tests/test_scripts/index_test.py index 4e4814ccb..b9e55a15e 100755 --- a/src/tests/test_scripts/index_test.py +++ b/src/tests/test_scripts/index_test.py @@ -20,11 +20,13 @@ def run_test(options, type): raise TestFailedError(f"Test failed for type {type}") print("Test completed") + def run_crash_test(options): - cmd_opts = 
f"--gtest_filter=IndexCrashTest/0.long_running_put_crash --gtest_break_on_failure --max_keys_in_node={options['max_keys_in_node']} --init_device={options['init_device']} {options['log_mods']} --run_time={options['run_time']} --num_entries={options['num_entries']} {options['dev_list']}" + cmd_opts = f"--gtest_filter=IndexCrashTest/0.long_running_put_crash --gtest_break_on_failure --log_mods=wbcache:trace --max_keys_in_node={options['max_keys_in_node']} --num_entries_per_rounds={options['num_entries_per_rounds']} --init_device={options['init_device']} {options['log_mods']} --run_time={options['run_time']} --num_entries={options['num_entries']} --num_rounds={options['num_rounds']} {options['dev_list']} " # print(f"Running test with options: {cmd_opts}") try: - subprocess.check_call(f"{options['dirpath']}test_index_crash_recovery {cmd_opts}", stderr=subprocess.STDOUT, shell=True) + subprocess.check_call(f"{options['dirpath']}test_index_crash_recovery {cmd_opts}", stderr=subprocess.STDOUT, + shell=True) except subprocess.CalledProcessError as e: print(f"Test failed: {e}") raise TestFailedError(f"Test failed for type {type}") @@ -49,7 +51,10 @@ def parse_arguments(): parser.add_argument('--dev_list', help='Device list', default='') parser.add_argument('--cleanup_after_shutdown', help='Cleanup after shutdown', type=bool, default=False) parser.add_argument('--init_device', help='Initialize device', type=bool, default=True) - parser.add_argument('--max_keys_in_node', help='Maximum num of keys in btree nodes', type=int, default=20) + parser.add_argument('--max_keys_in_node', help='Maximum num of keys in btree nodes', type=int, default=10) + parser.add_argument('--min_keys_in_node', help='Minimum num of keys in btree nodes', type=int, default=2) + parser.add_argument('--num_rounds', help='number of rounds for crash test', type=int, default=1000) + parser.add_argument('--num_entries_per_rounds', help='number of rounds for crash test', type=int, default=100) # Parse the known arguments and ignore any unknown arguments args, unknown = parser.parse_known_args() @@ -73,8 +78,7 @@ def long_runnig_index(options, type=0): def long_running_clean_shutdown(options, type=0): print("Long running clean shutdown started") - options['run_time'] = int(options['run_time']) // 10 # 20 minutes - + options['run_time'] = options['run_time'] // 10 try: run_test(options, type) options['init_device'] = False @@ -87,14 +91,42 @@ def long_running_clean_shutdown(options, type=0): raise print("Long running clean shutdown completed") + def long_running_crash_put(options): print("Long running crash put started") - options['num_entries'] = 20480 # 20K + options['num_entries'] = 1310720 # 1280K options['init_device'] = True + options['run_time'] = 14400 # 4 hours + options['preload_size'] = 1024 print(f"options: {options}") - run_crash_test(options) + run_crash_test(options, 'put', 0) print("Long running crash put completed") +def long_running_crash_remove(options): + print("Long running crash remove started") + options['num_entries'] = 1000 + options['init_device'] = True + options['run_time'] = 14400 # 4 hours + options['num_entries_per_rounds'] = 100 + options['min_keys_in_node'] = 2 + options['max_keys_in_node'] = 10 + print(f"options: {options}") + run_crash_test(options, 'remove', 0) + print("Long running crash put completed") + +def long_running_crash_put_remove(options): + print("Long running crash put_remove started") + options['num_entries'] = 2000 # 1280K + options['init_device'] = True + options['run_time'] = 14400 # 4 
hours + options['preload_size'] = 1024 + options['min_keys_in_node'] = 3 + options['max_keys_in_node'] = 10 + print(f"options: {options}") + run_crash_test(options, 'put_remove', 0) + print("Long running crash put_remove completed") + + def main(): options = parse_arguments() test_suite_name = options['test_suits'] @@ -112,6 +144,19 @@ def main(): def long_running(*args): options = parse_arguments() + long_runnig_index(options, 0) + long_running_clean_shutdown(options, 0) + long_runnig_index(options, 1) + long_running_clean_shutdown(options, 1) + for i in range(20): + print(f"Iteration {i + 1}") + long_running_crash_put_remove(options) + for i in range(50): + print(f"Iteration {i + 1}") + long_running_crash_remove(options) + for i in range(5): + print(f"Iteration {i + 1}") + long_running_crash_put(options) long_runnig_index(options) long_running_clean_shutdown(options) long_running_crash_put(options) diff --git a/src/tests/test_scripts/log_meta_test.py b/src/tests/test_scripts/log_meta_test.py index 5ffda0018..83c8f994f 100755 --- a/src/tests/test_scripts/log_meta_test.py +++ b/src/tests/test_scripts/log_meta_test.py @@ -85,7 +85,7 @@ def meta_nightly(options, addln_opts): subprocess.check_call(options.dirpath + "test_meta_blk_mgr " + cmd_opts + addln_opts, stderr=subprocess.STDOUT, shell=True) - cmd_opts = "--gtest_filter=VMetaBlkMgrTest.random_load_test --gtest_filter=VMetaBlkMgrTest.write_to_full_test --use_file=true" # write to file instead of real disk to save time; + cmd_opts = "--gtest_filter=VMetaBlkMgrTest.write_to_full_test --use_file=true" # write to file instead of real disk to save time; subprocess.check_call(options.dirpath + "test_meta_blk_mgr " + cmd_opts + addln_opts, stderr=subprocess.STDOUT, shell=True) diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index c2b2460b5..57247dad7 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -62,22 +62,15 @@ struct test_repl_req : public repl_req_ctx { sisl::byte_array header; sisl::byte_array key; sisl::sg_list write_sgs; - sisl::sg_list read_sgs; - MultiBlkId written_blkids; + std::vector< MultiBlkId > written_blkids; - test_repl_req() { - write_sgs.size = 0; - read_sgs.size = 0; - } + test_repl_req() { write_sgs.size = 0; } ~test_repl_req() { for (auto const& iov : write_sgs.iovs) { iomanager.iobuf_free(uintptr_cast(iov.iov_base)); } - - for (auto const& iov : read_sgs.iovs) { - iomanager.iobuf_free(uintptr_cast(iov.iov_base)); - } } + struct journal_header { uint32_t key_size; uint64_t key_pattern; @@ -96,8 +89,9 @@ class SoloReplDevTest : public testing::Test { Listener(SoloReplDevTest& test) : m_test{test} {} virtual ~Listener() = default; - void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, MultiBlkId const& blkids, - cintrusive< repl_req_ctx >& ctx) override { + void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, + std::vector< MultiBlkId > const& blkids, cintrusive< repl_req_ctx >& ctx) override { + LOGINFO("Received on_commit lsn={}", lsn); if (ctx == nullptr) { m_test.validate_replay(*repl_dev(), lsn, header, key, blkids); } else { @@ -110,10 +104,10 @@ class SoloReplDevTest : public testing::Test { AsyncReplResult<> create_snapshot(shared< snapshot_context > context) override { return make_async_success<>(); } - int read_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data) override { + int read_snapshot_obj(shared< snapshot_context > context, shared< snapshot_obj > snp_data) 
override { return 0; }
-        void write_snapshot_data(shared< snapshot_context > context, shared< snapshot_data > snp_data) override {}
+        void write_snapshot_obj(shared< snapshot_context > context, shared< snapshot_obj > snp_data) override {}
         bool apply_snapshot(shared< snapshot_context > context) override { return true; }
         shared< snapshot_context > last_snapshot() override { return nullptr; }
         void free_user_snp_ctx(void*& user_snp_ctx) override {}
@@ -125,7 +119,8 @@ class SoloReplDevTest : public testing::Test {
         void on_rollback(int64_t lsn, const sisl::blob& header, const sisl::blob& key,
                          cintrusive< repl_req_ctx >& ctx) override {}
 
-        ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size) override {
+        ReplResult< blk_alloc_hints > get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size,
+                                                          cintrusive< homestore::repl_req_ctx >& hs_ctx) override {
             return blk_alloc_hints{};
         }
 
@@ -135,7 +130,12 @@ class SoloReplDevTest : public testing::Test {
                       cintrusive< repl_req_ctx >& ctx) override {
             LOGINFO("Received error={} on repl_dev", enum_name(error));
         }
-        void on_destroy() override {}
+        void on_start_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) override {}
+        void on_complete_replace_member(const replica_member_info& member_out, const replica_member_info& member_in, trace_id_t tid) override {}
+        void on_destroy(const group_id_t& group_id) override {}
+        void notify_committed_lsn(int64_t lsn) override {}
+        void on_config_rollback(int64_t lsn) override {}
+        void on_no_space_left(repl_lsn_t lsn, chunk_num_t chunk_id) override {}
     };
 
     class Application : public ReplApplication {
@@ -151,6 +151,8 @@ class SoloReplDevTest : public testing::Test {
         shared< ReplDevListener > create_repl_dev_listener(uuid_t) override {
             return std::make_shared< Listener >(m_test);
         }
+        void destroy_repl_dev_listener(uuid_t) override {}
+        void on_repl_devs_init_completed() { LOGINFO("Repl dev init completed CB called"); }
         std::pair< std::string, uint16_t > lookup_peer(uuid_t uuid) const override { return std::make_pair("", 0u); }
         replica_id_t get_my_repl_id() const override { return hs_utils::gen_random_uuid(); }
     };
@@ -221,60 +223,116 @@ class SoloReplDevTest : public testing::Test {
         rdev->async_alloc_write(*req->header, req->key ? *req->key : sisl::blob{}, req->write_sgs, req);
     }
 
+    void async_write_data_and_journal(uint32_t key_size, uint64_t data_size, uint32_t max_size_per_iov) {
+        data_size = data_size == 0 ? g_block_size : data_size;
+        auto req = intrusive< test_repl_req >(new test_repl_req());
+        req->header = sisl::make_byte_array(sizeof(test_repl_req::journal_header));
+        auto hdr = r_cast< test_repl_req::journal_header* >(req->header->bytes());
+        hdr->key_size = key_size;
+        hdr->key_pattern = ((long long)rand() << 32) | rand();
+        hdr->data_size = data_size;
+        hdr->data_pattern = ((long long)rand() << 32) | rand();
+
+        if (key_size != 0) {
+            req->key = sisl::make_byte_array(key_size);
+            HSTestHelper::fill_data_buf(req->key->bytes(), key_size, hdr->key_pattern);
+        }
+
+        req->write_sgs = HSTestHelper::create_sgs(data_size, max_size_per_iov, hdr->data_pattern);
+
+        auto& rdev = (rand() % 2) ? m_repl_dev1 : m_repl_dev2;
+
+        auto const cap = hs()->repl_service().get_cap_stats();
+        LOGDEBUG("Before write, cap stats: used={} total={}", cap.used_capacity, cap.total_capacity);
+
+        std::vector< MultiBlkId > blkids;
+        blk_alloc_hints hints;
+        auto err = rdev->alloc_blks(data_size, hints, blkids);
+        RELEASE_ASSERT(!err, "Error during alloc_blks");
+        RELEASE_ASSERT(!blkids.empty(), "Empty blkids");
+
+        rdev->async_write(blkids, req->write_sgs).thenValue([this, rdev, blkids, data_size, req](auto&& err) {
+            RELEASE_ASSERT(!err, "Error during async_write");
+            rdev->async_write_journal(blkids, *req->header, req->key ? *req->key : sisl::blob{}, data_size, req);
+        });
+    }
+
     void validate_replay(ReplDev& rdev, int64_t lsn, sisl::blob const& header, sisl::blob const& key,
-                         MultiBlkId const& blkids) {
+                         std::vector< MultiBlkId > const& blkids) {
+        if (blkids.empty()) {
+            m_task_waiter.one_complete();
+            return;
+        }
+
         auto const jhdr = r_cast< test_repl_req::journal_header const* >(header.cbytes());
         HSTestHelper::validate_data_buf(key.cbytes(), key.size(), jhdr->key_pattern);
-
-        uint32_t size = blkids.blk_count() * g_block_size;
-        if (size) {
-            auto read_sgs = HSTestHelper::create_sgs(size, size);
-            LOGDEBUG("[{}] Validating replay of lsn={} blkid = {}", boost::uuids::to_string(rdev.group_id()), lsn,
-                     blkids.to_string());
-            rdev.async_read(blkids, read_sgs, size)
-                .thenValue([this, hdr = *jhdr, read_sgs, lsn, blkids, &rdev](auto&& err) {
-                    RELEASE_ASSERT(!err, "Error during async_read");
-                    HS_REL_ASSERT_EQ(hdr.data_size, read_sgs.size, "journal hdr data size mismatch with actual size");
-
-                    for (auto const& iov : read_sgs.iovs) {
-                        HSTestHelper::validate_data_buf(uintptr_cast(iov.iov_base), iov.iov_len, hdr.data_pattern);
-                        iomanager.iobuf_free(uintptr_cast(iov.iov_base));
-                    }
-                    LOGDEBUG("[{}] Replay of lsn={} blkid={} validated successfully",
-                             boost::uuids::to_string(rdev.group_id()), lsn, blkids.to_string());
-                    m_task_waiter.one_complete();
-                });
-        } else {
-            m_task_waiter.one_complete();
+        uint64_t total_io = blkids.size();
+        auto io_count = std::make_shared< std::atomic< uint64_t > >(0);
+        for (const auto& blkid : blkids) {
+            uint32_t size = blkid.blk_count() * g_block_size;
+            if (size) {
+                auto read_sgs = HSTestHelper::create_sgs(size, size);
+                LOGDEBUG("[{}] Validating replay of lsn={} blkid = {}", boost::uuids::to_string(rdev.group_id()), lsn,
+                         blkid.to_string());
+                rdev.async_read(blkid, read_sgs, size)
+                    .thenValue([this, io_count, total_io, hdr = *jhdr, read_sgs, lsn, blkid, &rdev](auto&& err) {
+                        RELEASE_ASSERT(!err, "Error during async_read");
+                        // HS_REL_ASSERT_EQ(hdr.data_size, read_sgs.size,
+                        //                  "journal hdr data size mismatch with actual size");
+
+                        for (auto const& iov : read_sgs.iovs) {
+                            HSTestHelper::validate_data_buf(uintptr_cast(iov.iov_base), iov.iov_len, hdr.data_pattern);
+                            iomanager.iobuf_free(uintptr_cast(iov.iov_base));
+                        }
+                        LOGDEBUG("[{}] Replay of lsn={} blkid={} validated successfully",
+                                 boost::uuids::to_string(rdev.group_id()), lsn, blkid.to_string());
+
+                        io_count->fetch_add(1);
+                        if (*io_count == total_io) { m_task_waiter.one_complete(); }
+                    });
+            } else {
+                m_task_waiter.one_complete();
+            }
         }
     }
 
     void on_write_complete(ReplDev& rdev, intrusive< test_repl_req > req) {
-        // If we did send some data to the repl_dev, validate it by doing async_read
-        if (req->write_sgs.size != 0) {
-            req->read_sgs = HSTestHelper::create_sgs(req->write_sgs.size, req->write_sgs.size);
-
-            auto const cap = hs()->repl_service().get_cap_stats();
-            LOGDEBUG("Write complete with cap stats: used={} total={}", cap.used_capacity, cap.total_capacity);
-
-            rdev.async_read(req->written_blkids, req->read_sgs, req->read_sgs.size)
-                .thenValue([this, &rdev, req](auto&& err) {
-                    RELEASE_ASSERT(!err, "Error during async_read");
-
-                    LOGDEBUG("[{}] Write complete with lsn={} for size={} blkids={}",
-                             boost::uuids::to_string(rdev.group_id()), req->lsn(), req->write_sgs.size,
-                             req->written_blkids.to_string());
-                    auto hdr = r_cast< test_repl_req::journal_header* >(req->header->bytes());
-                    HS_REL_ASSERT_EQ(hdr->data_size, req->read_sgs.size,
-                                     "journal hdr data size mismatch with actual size");
-
-                    for (auto const& iov : req->read_sgs.iovs) {
-                        HSTestHelper::validate_data_buf(uintptr_cast(iov.iov_base), iov.iov_len, hdr->data_pattern);
-                    }
-                    m_io_runner.next_task();
-                });
-        } else {
+        if (req->written_blkids.empty()) {
             m_io_runner.next_task();
+            return;
+        }
+
+        // If we did send some data to the repl_dev, validate it by doing async_read
+        auto io_count = std::make_shared< std::atomic< uint64_t > >(0);
+        for (const auto blkid : req->written_blkids) {
+            if (req->write_sgs.size != 0) {
+                auto const cap = hs()->repl_service().get_cap_stats();
+                LOGDEBUG("Write complete with cap stats: used={} total={}", cap.used_capacity, cap.total_capacity);
+
+                auto sgs_size = blkid.blk_count() * g_block_size;
+                auto read_sgs = HSTestHelper::create_sgs(sgs_size, sgs_size);
+                rdev.async_read(blkid, read_sgs, read_sgs.size)
+                    .thenValue([this, io_count, blkid, &rdev, sgs_size, read_sgs, req](auto&& err) {
+                        RELEASE_ASSERT(!err, "Error during async_read");
+
+                        LOGINFO("[{}] Write complete with lsn={} for size={} blkid={}",
+                                boost::uuids::to_string(rdev.group_id()), req->lsn(), sgs_size, blkid.to_string());
+                        auto hdr = r_cast< test_repl_req::journal_header* >(req->header->bytes());
+                        // HS_REL_ASSERT_EQ(hdr->data_size, read_sgs.size,
+                        //                  "journal hdr data size mismatch with actual size");
+
+                        for (auto const& iov : read_sgs.iovs) {
+                            LOGDEBUG("Read data blkid={} len={} data={}", blkid.to_integer(), iov.iov_len,
+                                     *(uint64_t*)iov.iov_base);
+                            HSTestHelper::validate_data_buf(uintptr_cast(iov.iov_base), iov.iov_len, hdr->data_pattern);
+                            iomanager.iobuf_free(uintptr_cast(iov.iov_base));
+                        }
+                        io_count->fetch_add(1);
+                        if (*io_count == req->written_blkids.size()) { m_io_runner.next_task(); }
+                    });
+            } else {
+                m_io_runner.next_task();
+            }
         }
     }
 };
@@ -295,7 +353,9 @@ TEST_F(SoloReplDevTest, TestRandomSizedDataBlock) {
         uint32_t key_size = rand() % 512 + 8;
         this->write_io(key_size, nblks * g_block_size, g_block_size);
     });
+
     this->m_io_runner.execute().get();
+    LOGINFO("Step 2: Restart homestore and validate replay data.", g_block_size);
     this->m_task_waiter.start([this]() { this->restart(); }).get();
 }
 
@@ -303,6 +363,20 @@ TEST_F(SoloReplDevTest, TestHeaderOnly) {
     LOGINFO("Step 1: run on worker threads to schedule write");
     this->m_io_runner.set_task([this]() { this->write_io(0u, 0u, g_block_size); });
     this->m_io_runner.execute().get();
+    LOGINFO("Step 2: Restart homestore and validate replay data.", g_block_size);
+    this->m_task_waiter.start([this]() { this->restart(); }).get();
+}
+
+TEST_F(SoloReplDevTest, TestAsyncWriteJournal) {
+    LOGINFO("Step 1: run on worker threads to schedule write for random bytes ranging {}-{}.", 0, 1 * Mi);
+    this->m_io_runner.set_task([this]() {
+        uint32_t nblks = rand() % ((1 * Mi) / g_block_size);
+        uint32_t key_size = rand() % 512 + 8;
+        this->async_write_data_and_journal(key_size, nblks * g_block_size, g_block_size);
+    });
+
+    this->m_io_runner.execute().get();
+    LOGINFO("Step 2: Restart homestore and validate replay data.", g_block_size);
     this->m_task_waiter.start([this]() { this->restart(); }).get();
 }