From d21241ed776fd00a19fbef60edeed9ebf17361ef Mon Sep 17 00:00:00 2001 From: Anthony Vardaro Date: Thu, 25 Dec 2025 04:13:14 +0000 Subject: [PATCH] Add option for bypassing directory scans in read-only mode --- db/c.cc | 10 +++++ db/db_impl/db_impl_open.cc | 7 +++- db/db_test2.cc | 79 ++++++++++++++++++++++++++++++++++++++ include/rocksdb/c.h | 6 +++ include/rocksdb/options.h | 13 +++++++ java/rocksjni/options.cc | 24 ++++++++++++ options/db_options.cc | 7 ++++ options/db_options.h | 1 + options/options_helper.cc | 2 + test_util/testutil.h | 17 ++++++++ 10 files changed, 164 insertions(+), 2 deletions(-) diff --git a/db/c.cc b/db/c.cc index 9f058c55aba4..c7ba85acf06d 100644 --- a/db/c.cc +++ b/db/c.cc @@ -4404,6 +4404,16 @@ unsigned char rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open( return opt->rep.skip_checking_sst_file_sizes_on_db_open; } +void rocksdb_options_set_skip_directory_scan_on_readonly_db_open( + rocksdb_options_t* opt, unsigned char val) { + opt->rep.skip_directory_scan_on_readonly_db_open = val; +} + +unsigned char rocksdb_options_get_skip_directory_scan_on_readonly_db_open( + rocksdb_options_t* opt) { + return opt->rep.skip_directory_scan_on_readonly_db_open; +} + /* Blob Options Settings */ void rocksdb_options_set_enable_blob_files(rocksdb_options_t* opt, unsigned char val) { diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index cccc3ea2c708..c03b5c9e8e43 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -723,7 +723,9 @@ Status DBImpl::Recover( // attention to it in case we are recovering a database // produced by an older version of rocksdb. 
auto wal_dir = immutable_db_options_.GetWalDir(); - if (!immutable_db_options_.best_efforts_recovery) { + if (!immutable_db_options_.best_efforts_recovery && + !(read_only && + immutable_db_options_.skip_directory_scan_on_readonly_db_open)) { IOOptions io_opts; io_opts.do_not_recurse = true; s = immutable_db_options_.fs->GetChildren( @@ -824,7 +826,8 @@ Status DBImpl::Recover( } } - if (read_only) { + if (read_only && + !immutable_db_options_.skip_directory_scan_on_readonly_db_open) { // If we are opening as read-only, we need to update options_file_number_ // to reflect the most recent OPTIONS file. It does not matter for regular // read-write db instance because options_file_number_ will later be diff --git a/db/db_test2.cc b/db/db_test2.cc index 33da1ffaf12f..5f126bb97f6a 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -15,6 +15,7 @@ #include "db/db_test_util.h" #include "db/read_callback.h" #include "db/version_edit.h" +#include "env/composite_env_wrapper.h" #include "env/fs_readonly.h" #include "options/options_helper.h" #include "port/port.h" @@ -94,6 +95,84 @@ TEST_F(DBTest2, OpenForReadOnlyWithColumnFamilies) { ASSERT_NOK(env_->FileExists(dbname)); } +TEST_F(DBTest2, SkipDirectoryScanOnReadOnlyOpen) { + Options options = CurrentOptions(); + options.create_if_missing = true; + DestroyAndReopen(options); + ASSERT_OK(Put("key1", "value1")); + ASSERT_OK(Put("key2", "value2")); + ASSERT_OK(Flush()); + Close(); + + auto base_fs = env_->GetFileSystem(); + auto no_readdir_fs = std::make_shared(base_fs); + std::unique_ptr custom_env(new CompositeEnvWrapper(env_, no_readdir_fs)); + + Options ro_options = CurrentOptions(); + ro_options.env = custom_env.get(); + DB* db_ptr = nullptr; + ASSERT_NOK(DB::OpenForReadOnly(ro_options, dbname_, &db_ptr)); + + // getdents64() should be skipped. 
+ ro_options.skip_directory_scan_on_readonly_db_open = true; + ASSERT_OK(DB::OpenForReadOnly(ro_options, dbname_, &db_ptr)); + + std::string value; + ASSERT_OK(db_ptr->Get(ReadOptions(), "key1", &value)); + ASSERT_EQ("value1", value); + ASSERT_OK(db_ptr->Get(ReadOptions(), "key2", &value)); + ASSERT_EQ("value2", value); + + delete db_ptr; +} + +TEST_F(DBTest2, SkipDirectoryScanUnflushedDataNotVisible) { + Options options = CurrentOptions(); + + options.create_if_missing = true; + options.write_buffer_size = 64 << 20; + options.max_write_buffer_number = 10; + options.disable_auto_compactions = true; + DestroyAndReopen(options); + + // Write a key, then flush. Write another key, don't flush to make + // sure it's resident in the WAL. + ASSERT_OK(Put("flushed_key", "flushed_value")); + ASSERT_OK(Flush()); + + WriteOptions wo; + wo.disableWAL = false; + ASSERT_OK(db_->Put(wo, "unflushed_key", "unflushed_value")); + + Close(); + + // Open read-only with skip_directory_scan_on_readonly_db_open = true. + // The unflushed data should not be visible (WAL not replayed). + Options ro_options = CurrentOptions(); + ro_options.skip_directory_scan_on_readonly_db_open = true; + DB* db_ptr = nullptr; + ASSERT_OK(DB::OpenForReadOnly(ro_options, dbname_, &db_ptr)); + + std::string value; + ASSERT_OK(db_ptr->Get(ReadOptions(), "flushed_key", &value)); + ASSERT_EQ("flushed_value", value); + ASSERT_TRUE(db_ptr->Get(ReadOptions(), "unflushed_key", &value).IsNotFound()); + + delete db_ptr; + + // Set skip_directory_scan_on_readonly_db_open to false, WAL should be found + // and replayed. 
+ ro_options.skip_directory_scan_on_readonly_db_open = false; + ASSERT_OK(DB::OpenForReadOnly(ro_options, dbname_, &db_ptr)); + + ASSERT_OK(db_ptr->Get(ReadOptions(), "flushed_key", &value)); + ASSERT_EQ("flushed_value", value); + ASSERT_OK(db_ptr->Get(ReadOptions(), "unflushed_key", &value)); + ASSERT_EQ("unflushed_value", value); + + delete db_ptr; +} + class PartitionedIndexTestListener : public EventListener { public: void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override { diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index ffb5583e4aca..ae6898880279 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -1621,6 +1621,12 @@ rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open( extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open( rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_skip_directory_scan_on_readonly_db_open( + rocksdb_options_t* opt, unsigned char val); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_skip_directory_scan_on_readonly_db_open( + rocksdb_options_t* opt); /* Blob Options Settings */ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_blob_files( diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index d8acfe8f7175..0b5bf0e81431 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1373,6 +1373,19 @@ struct DBOptions { // Default: false bool skip_checking_sst_file_sizes_on_db_open = false; + // If true, skip directory listing operations (readdir/getdents64) during + // read-only database open. Use this for file systems that do not support + // directory listing. + // + // When enabled, the WAL directory is not scanned, so WAL files are never + // replayed: writes that were not flushed to SST files are not visible, even + // if the database was closed cleanly. Only flushed data can be read. + // + // Only affects DB::OpenForReadOnly(); ignored for read-write opens. 
+ // + // Default: false + bool skip_directory_scan_on_readonly_db_open = false; + // Recovery mode to control the consistency while replaying WAL // Default: kPointInTimeRecovery WALRecoveryMode wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index 2bb07cf45828..db1e6d4d9891 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -1983,6 +1983,30 @@ jboolean Java_org_rocksdb_Options_skipCheckingSstFileSizesOnDbOpen( return static_cast(opt->skip_checking_sst_file_sizes_on_db_open); } +/* + * Class: org_rocksdb_Options + * Method: setSkipDirectoryScanOnReadOnlyDbOpen + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setSkipDirectoryScanOnReadOnlyDbOpen( + JNIEnv*, jclass, jlong jhandle, + jboolean jskip_directory_scan_on_read_only_db_open) { + auto* opt = reinterpret_cast(jhandle); + opt->skip_directory_scan_on_readonly_db_open = + static_cast(jskip_directory_scan_on_read_only_db_open); +} + +/* + * Class: org_rocksdb_Options + * Method: skipDirectoryScanOnReadOnlyDbOpen + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_skipDirectoryScanOnReadOnlyDbOpen( + JNIEnv*, jclass, jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->skip_directory_scan_on_readonly_db_open); +} + /* * Class: org_rocksdb_Options * Method: setWalRecoveryMode diff --git a/options/db_options.cc b/options/db_options.cc index dfacea8e5b22..5c188931eff0 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -263,6 +263,11 @@ static std::unordered_map skip_checking_sst_file_sizes_on_db_open), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"skip_directory_scan_on_readonly_db_open", + {offsetof(struct ImmutableDBOptions, + skip_directory_scan_on_readonly_db_open), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, {"new_table_reader_for_compaction_inputs", {0, OptionType::kBoolean, 
OptionVerificationType::kDeprecated, OptionTypeFlags::kNone}}, @@ -769,6 +774,8 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) skip_stats_update_on_db_open(options.skip_stats_update_on_db_open), skip_checking_sst_file_sizes_on_db_open( options.skip_checking_sst_file_sizes_on_db_open), + skip_directory_scan_on_readonly_db_open( + options.skip_directory_scan_on_readonly_db_open), wal_recovery_mode(options.wal_recovery_mode), allow_2pc(options.allow_2pc), row_cache(options.row_cache), diff --git a/options/db_options.h b/options/db_options.h index ef8607d8bba1..68e7081c085a 100644 --- a/options/db_options.h +++ b/options/db_options.h @@ -71,6 +71,7 @@ struct ImmutableDBOptions { uint64_t write_thread_slow_yield_usec; bool skip_stats_update_on_db_open; bool skip_checking_sst_file_sizes_on_db_open; + bool skip_directory_scan_on_readonly_db_open; WALRecoveryMode wal_recovery_mode; bool allow_2pc; std::shared_ptr row_cache; diff --git a/options/options_helper.cc b/options/options_helper.cc index e5622d0a3238..fb4e998eb8df 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -151,6 +151,8 @@ void BuildDBOptions(const ImmutableDBOptions& immutable_db_options, immutable_db_options.skip_stats_update_on_db_open; options.skip_checking_sst_file_sizes_on_db_open = immutable_db_options.skip_checking_sst_file_sizes_on_db_open; + options.skip_directory_scan_on_readonly_db_open = + immutable_db_options.skip_directory_scan_on_readonly_db_open; options.wal_recovery_mode = immutable_db_options.wal_recovery_mode; options.allow_2pc = immutable_db_options.allow_2pc; options.row_cache = immutable_db_options.row_cache; diff --git a/test_util/testutil.h b/test_util/testutil.h index 3bd97ef14b76..623f5f4a4bdb 100644 --- a/test_util/testutil.h +++ b/test_util/testutil.h @@ -744,6 +744,23 @@ class StringFS : public FileSystemWrapper { std::unordered_map files_; }; +// Filesystem wrapper that rejects directory listing operations (GetChildren). 
+// Used for testing skip_directory_scan_on_readonly_db_open option. +class NoReaddirFS : public FileSystemWrapper { + public: + explicit NoReaddirFS(const std::shared_ptr& base) + : FileSystemWrapper(base) {} + + static const char* kClassName() { return "NoReaddirFS"; } + const char* Name() const override { return kClassName(); } + + IOStatus GetChildren(const std::string& /*dir*/, const IOOptions& /*opts*/, + std::vector* /*result*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported("Directory listing not supported"); + } +}; + // A compressor that essentially implements a custom compression algorithm // by leveraging an existing compression algorithm and putting a custom header // on it to detect any attempts to decompress it with the wrong compression