From 4838b2c79a95c8e5029690cedfaa414744c59b66 Mon Sep 17 00:00:00 2001
From: Eric O
Date: Wed, 1 Oct 2025 15:21:26 -0400
Subject: [PATCH 1/3] DSM-148: Create atc:aws:restore_archived_objects rake
 task; also add an atc:aws:list_file_extensions convenience task for analysis
 related to the atc:aws:restore_archived_objects task

---
 lib/tasks/aws.rake | 173 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 173 insertions(+)

diff --git a/lib/tasks/aws.rake b/lib/tasks/aws.rake
index 6163c94..dedb712 100644
--- a/lib/tasks/aws.rake
+++ b/lib/tasks/aws.rake
@@ -2,6 +2,179 @@
 namespace :atc do
   namespace :aws do
+    # Calls the S3_CLIENT#list_objects_v2 method multiple times to automatically page through all results.
+    # The S3_CLIENT#list_objects_v2 method returns up to 1000 results per call, along with a continuation
+    # token that can be passed to subsequent calls to retrieve the next page of results. This method wraps
+    # that paging functionality.
+    def auto_paginating_list_object_v2(list_objects_v2_opts)
+      next_continuation_token = nil
+
+      loop do
+        result_object = S3_CLIENT.list_objects_v2(list_objects_v2_opts.merge({
+          continuation_token: next_continuation_token
+        }))
+
+        # Yield each object from the page that was just fetched.
+        result_object.contents.each do |object|
+          yield object
+        end
+
+        next_continuation_token = result_object.next_continuation_token
+        break if next_continuation_token.nil?
+      end
+    end
+
+
+    desc 'For the given bucket_name and key_prefix, iterates over objects and generates a list of their file extensions and counts'
+    task list_file_extensions: :environment do
+      bucket_name = ENV['bucket_name']
+      key_prefix = ENV['key_prefix']
+
+      extension_counts = {}
+
+      auto_paginating_list_object_v2({
+        bucket: bucket_name,
+        prefix: key_prefix
+      }) do |object|
+        ext = File.extname(object.key)
+        extension_counts[ext] ||= 0
+        extension_counts[ext] += 1
+      end
+
+      # Sort the extensions by count, descending.
+      extension_counts.to_a.sort_by { |pair| pair[1] }.reverse.each do |pair|
+        puts "#{pair[0]}: #{pair[1]}"
+      end
+    end
+
+    desc 'For the given bucket_name and key_prefix, iterates over objects in Intelligent Tiering and restores them '\
+         'if they have already transitioned to the Archive Access tier.'
+    task restore_archived_objects: :environment do
+      bucket_name = ENV['bucket_name']
+      key_prefix = ENV['key_prefix']
+      key_suffix_filter = ENV['key_suffix_filter']
+      dry_run = ENV['dry_run'] == 'true'
+
+      puts ""
+
+      puts "This is a dry run because dry_run=true has been set. No objects will actually be restored during this run.\n\n" if dry_run
+
+      if key_suffix_filter.present?
+        puts "Searching for objects (and filtering on objects with keys that end with \"#{key_suffix_filter}\")...\n\n"
+      else
+        puts "Searching for objects...\n\n"
+      end
+      number_of_intelligent_tiering_object_restoration_requests_submitted = 0
+      number_of_intelligent_tiering_objects_with_restoration_in_progress = 0
+      number_of_intelligent_tiering_objects_already_available = 0
+      number_of_non_intelligent_tiering_objects_skipped = 0
+      number_of_objects_skipped_based_on_key_suffix_filter = 0
+      errors_encountered = []
+
+      puts "--------------------"
+      puts "Results:"
+      auto_paginating_list_object_v2({
+        bucket: bucket_name,
+        prefix: key_prefix
+      }) do |object|
+        object_key = object.key
+        storage_class = object.storage_class
+
+        if storage_class == 'INTELLIGENT_TIERING'
+          if key_suffix_filter.present? && !object_key.end_with?(key_suffix_filter)
+            number_of_objects_skipped_based_on_key_suffix_filter += 1
+            next
+          end
+
+          begin
+            S3_CLIENT.restore_object({
+              bucket: bucket_name,
+              key: object_key,
+              # For an object in the Intelligent-Tiering Archive Access tier, we just pass an empty hash here.
+              # No further configuration is needed.
+              restore_request: {}
+            }) unless dry_run
+            number_of_intelligent_tiering_object_restoration_requests_submitted += 1
+          rescue Aws::S3::Errors::ServiceError => e
+            if e.message.include?("Restore is not allowed for the object's current storage class")
+              # If we got here, that means that this object was already restored and doesn't need to be restored again
+              # because it is available. We'll silently ignore this error.
+              number_of_intelligent_tiering_objects_already_available += 1
+            elsif e.message.include?("Object restore is already in progress")
+              # If we got here, that means that this object's restoration is already in progress and we do not need to
+              # initiate another restoration request. We'll silently ignore this error.
+              number_of_intelligent_tiering_objects_with_restoration_in_progress += 1
+            else
+              errors_encountered << "An unexpected error occurred while attempting to restore #{object_key}: #{e.message}"
+            end
+          end
+        else
+          number_of_non_intelligent_tiering_objects_skipped += 1
+        end
+
+      end
+
+      if dry_run
+        puts "Number of intelligent tiering object restoration requests that would have been made (if this wasn't a dry run): #{number_of_intelligent_tiering_object_restoration_requests_submitted}"
+      else
+        puts "Number of intelligent tiering object restoration requests submitted: #{number_of_intelligent_tiering_object_restoration_requests_submitted}"
+        puts "Number of intelligent tiering objects with restoration in progress: #{number_of_intelligent_tiering_objects_with_restoration_in_progress}"
+        puts "Number of intelligent tiering objects already available: #{number_of_intelligent_tiering_objects_already_available}"
+      end
+      puts "Number of objects skipped based on key_suffix_filter: #{number_of_objects_skipped_based_on_key_suffix_filter}"
+      puts "Number of non-intelligent tiering objects skipped: #{number_of_non_intelligent_tiering_objects_skipped}"
+      puts "\nReminder: After restoration has been initiated, it will take 3-5 hours until the files are available for download. "\
+           "The current time is #{Time.current}, so the files should be available after #{Time.current + 5.hours}."
+      puts "--------------------"
+      puts "Errors: " + (errors_encountered.empty? ? 'None' : "\n#{errors_encountered.join("\n")}")
+
+      # pids.each_with_index do |pid|
+      #   print "Checking #{pid}..."
+      #   dobj = DigitalObject::Base.find(pid)
+      #   fobj = dobj.fedora_object
+      #   storage_object = Hyacinth::Storage.storage_object_for(fobj.datastreams['content'].dsLocation)
+      #   if storage_object.is_a?(Hyacinth::Storage::S3Object)
+      #     # NOTE: storage_object.s3_object.restore will return nil if the object has not been restored yet,
+      #     # but it will return a string if a restore operation has already been run on the object and it is
+      #     # in the process of being restored.
+      #     if storage_object.s3_object.archive_status == 'ARCHIVE_ACCESS'
+      #       if storage_object.s3_object.restore.nil?
+      #         puts "Need to restore object at: #{storage_object.location_uri}"
+      #         puts "---> Restoring archived object..."
+      #         bucket_name = storage_object.s3_object.bucket_name
+      #         key = storage_object.s3_object.key
+      #         # Make sure that bucket_name and key aren't blank. They shouldn't ever be blank at this point in the
+      #         # code, but we want to make sure not to call restore if either of them somehow is blank.
+      #         raise if bucket_name.blank? || key.blank?
+
+      #         begin
+      #           restore_object_response = storage_object.s3_object.restore_object({
+      #             bucket: bucket_name,
+      #             key: key,
+      #             # For an object in the Intelligent-Tiering Archive Access tier, we just pass an empty hash here.
+      #             # No further configuration is needed.
+      #             restore_request: {}
+      #           })
+      #           puts "---> Object restoration request submitted! The object should be available within 3-5 hours."
+      #         rescue Aws::S3::Errors::ServiceError => e
+      #           puts "---> An unexpected error occurred while attempting to restore the object."
+      #         end
+      #       else
+      #         puts "---> A restore request has already been made for this object and restoration is in progress: #{storage_object.s3_object.restore}"
+      #       end
+      #     else
+      #       puts "---> Object is not currently in ARCHIVE_ACCESS state, so we will not make any changes."
+      #     end
+
+      #     # puts "Do we need to restore this object?"
+      #   elsif storage_object.is_a?(Hyacinth::Storage::FileObject)
+      #     puts "No need to restore this object because it's available on the local filesystem."
+      #   else
+      #     puts "Ignoring unknown object type: #{storage_object.class.name}"
+      #   end
+      # end
+    end
 
     desc 'Run a fixity check using a remote CheckPlease app deployment.'
     task fixity_check: :environment do
       bucket_name = ENV['bucket_name']
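Both tasks above are driven entirely by ENV variables. Example invocations (the bucket and prefix
values here are illustrative placeholders, not real locations):

    # Illustrative placeholder values -- substitute a real bucket and prefix.
    bundle exec rake atc:aws:list_file_extensions bucket_name=example-bucket key_prefix=path/to/files
    bundle exec rake atc:aws:restore_archived_objects bucket_name=example-bucket \
      key_prefix=path/to/files key_suffix_filter=.tif dry_run=true

Running with dry_run=true first is the safer workflow: it reports how many restoration requests would
be submitted without ever calling restore_object, so a key_prefix/key_suffix_filter combination can be
sanity-checked before the real run.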
From 21d37fc81218ce3593bbd56c9fa6c153d3a5d339 Mon Sep 17 00:00:00 2001
From: Eric O
Date: Wed, 1 Oct 2025 15:22:31 -0400
Subject: [PATCH 2/3] Update capistrano and sshkit gems

---
 Gemfile          |  2 +-
 Gemfile.lock     | 27 +++++++++++++++------------
 config/deploy.rb |  2 +-
 3 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/Gemfile b/Gemfile
index f3060f5..443cf3c 100644
--- a/Gemfile
+++ b/Gemfile
@@ -82,7 +82,7 @@ end
 
 group :development do
   # Use Capistrano for deployment
-  gem 'capistrano', '~> 3.18.0', require: false
+  gem 'capistrano', '~> 3.19.0', require: false
   gem 'capistrano-cul', require: false
   gem 'capistrano-passenger', '~> 0.1', require: false
   gem 'capistrano-rails', '~> 1.4', require: false
diff --git a/Gemfile.lock b/Gemfile.lock
index 7684236..8a6036a 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -77,7 +77,7 @@ GEM
       tzinfo (~> 2.0)
     addressable (2.8.6)
       public_suffix (>= 2.0.2, < 6.0)
-    airbrussh (1.5.1)
+    airbrussh (1.5.3)
       sshkit (>= 1.6.1, != 1.7.0)
     ast (2.4.2)
     aws-crt (0.2.0-arm64-darwin)
@@ -100,7 +100,7 @@ GEM
       aws-sigv4 (~> 1.8)
     aws-sigv4 (1.8.0)
       aws-eventstream (~> 1, >= 1.0.2)
-    base64 (0.2.0)
+    base64 (0.3.0)
     bcrypt (3.1.20)
     bcrypt_pbkdf (1.1.0)
     best_type (1.0.0)
@@ -110,7 +110,7 @@ GEM
     bootsnap (1.18.3)
       msgpack (~> 1.2)
     builder (3.2.4)
-    capistrano (3.18.1)
+    capistrano (3.19.2)
       airbrussh (>= 1.0.0)
       i18n
       rake (>= 10.0.0)
@@ -139,7 +139,7 @@ GEM
       rack-test (>= 0.6.3)
       regexp_parser (>= 1.5, < 3.0)
       xpath (~> 3.2)
-    concurrent-ruby (1.2.3)
+    concurrent-ruby (1.3.5)
     connection_pool (2.4.1)
     crack (1.0.0)
       bigdecimal
@@ -216,7 +216,7 @@ GEM
     hashdiff (1.1.0)
     hashie (5.0.0)
     httpclient (2.8.3)
-    i18n (1.14.4)
+    i18n (1.14.7)
       concurrent-ruby (~> 1.0)
     importmap-rails (2.0.1)
       actionpack (>= 6.0.0)
@@ -234,6 +234,7 @@ GEM
     jwt (2.8.1)
       base64
     language_server-protocol (3.17.0.3)
+    logger (1.7.0)
     loofah (2.22.0)
       crass (~> 1.0.2)
       nokogiri (>= 1.12.0)
@@ -254,7 +255,7 @@ GEM
     multi_json (1.15.0)
     mustermann (3.0.0)
       ruby2_keywords (~> 0.0.1)
-    mutex_m (0.2.0)
+    mutex_m (0.3.0)
     mysql2 (0.5.6)
     net-http (0.4.1)
       uri
@@ -265,13 +266,13 @@ GEM
       net-protocol
     net-protocol (0.2.2)
       timeout
-    net-scp (4.0.0)
+    net-scp (4.1.0)
       net-ssh (>= 2.6.5, < 8.0.0)
     net-sftp (4.0.0)
       net-ssh (>= 5.0.0, < 8.0.0)
     net-smtp (0.5.0)
       net-protocol
-    net-ssh (7.2.1)
+    net-ssh (7.3.0)
     nio4r (2.7.1)
     nokogiri (1.16.3-arm64-darwin)
       racc (~> 1.4)
@@ -286,6 +287,7 @@ GEM
       omniauth (>= 2.0)
     orm_adapter (0.5.0)
     os (1.1.4)
+    ostruct (0.6.3)
     parallel (1.25.1)
     parser (3.3.4.0)
       ast (~> 2.4.1)
@@ -337,7 +339,7 @@ GEM
       thor (~> 1.0, >= 1.2.2)
       zeitwerk (~> 2.6)
     rainbow (3.1.1)
-    rake (13.1.0)
+    rake (13.3.0)
     rdoc (6.6.3.1)
       psych (>= 4.0.0)
     redis (4.8.1)
@@ -453,12 +455,13 @@ GEM
       sprockets (>= 3.0.0)
     sqlite3 (1.7.3-arm64-darwin)
     sqlite3 (1.7.3-x86_64-linux)
-    sshkit (1.22.1)
+    sshkit (1.24.0)
       base64
-      mutex_m
+      logger
       net-scp (>= 1.1.2)
       net-sftp (>= 2.1.2)
       net-ssh (>= 2.8.0)
+      ostruct
     stimulus-rails (1.3.3)
       railties (>= 6.0.0)
     stringex (2.8.6)
@@ -509,7 +512,7 @@ DEPENDENCIES
   aws-sdk-s3 (~> 1)
   best_type (~> 1.0)
   bootsnap
-  capistrano (~> 3.18.0)
+  capistrano (~> 3.19.0)
   capistrano-cul
   capistrano-passenger (~> 0.1)
   capistrano-rails (~> 1.4)
diff --git a/config/deploy.rb b/config/deploy.rb
index 50ead39..76107cd 100644
--- a/config/deploy.rb
+++ b/config/deploy.rb
@@ -1,7 +1,7 @@
 # frozen_string_literal: true
 
 # config valid for current version and patch releases of Capistrano
-lock '~> 3.18.0'
+lock '~> 3.19.0'
 
 # Until we retire all old CentOS VMs, we need to set the rvm_custom_path because rvm is installed
 # in a non-standard location for our AlmaLinux VMs. This is because our service accounts need to

From 68fb7f182753ec3bd000e111c0fd72f541e81078 Mon Sep 17 00:00:00 2001
From: Eric O
Date: Wed, 1 Oct 2025 17:13:46 -0400
Subject: [PATCH 3/3] Relocate print statements in rake task; cleanup

---
 lib/tasks/aws.rake | 52 +++------------------------------------------
 1 file changed, 3 insertions(+), 49 deletions(-)

diff --git a/lib/tasks/aws.rake b/lib/tasks/aws.rake
index dedb712..d1bd04d 100644
--- a/lib/tasks/aws.rake
+++ b/lib/tasks/aws.rake
@@ -71,8 +71,6 @@ namespace :atc do
       number_of_objects_skipped_based_on_key_suffix_filter = 0
       errors_encountered = []
 
-      puts "--------------------"
-      puts "Results:"
       auto_paginating_list_object_v2({
         bucket: bucket_name,
         prefix: key_prefix
@@ -111,9 +109,11 @@ namespace :atc do
         else
           number_of_non_intelligent_tiering_objects_skipped += 1
         end
-
       end
 
+      puts "--------------------"
+      puts "Results:"
+
       if dry_run
         puts "Number of intelligent tiering object restoration requests that would have been made (if this wasn't a dry run): #{number_of_intelligent_tiering_object_restoration_requests_submitted}"
       else
@@ -127,52 +127,6 @@ namespace :atc do
            "The current time is #{Time.current}, so the files should be available after #{Time.current + 5.hours}."
       puts "--------------------"
       puts "Errors: " + (errors_encountered.empty? ? 'None' : "\n#{errors_encountered.join("\n")}")
-
-      # pids.each_with_index do |pid|
-      #   print "Checking #{pid}..."
-      #   dobj = DigitalObject::Base.find(pid)
-      #   fobj = dobj.fedora_object
-      #   storage_object = Hyacinth::Storage.storage_object_for(fobj.datastreams['content'].dsLocation)
-      #   if storage_object.is_a?(Hyacinth::Storage::S3Object)
-      #     # NOTE: storage_object.s3_object.restore will return nil if the object has not been restored yet,
-      #     # but it will return a string if a restore operation has already been run on the object and it is
-      #     # in the process of being restored.
-      #     if storage_object.s3_object.archive_status == 'ARCHIVE_ACCESS'
-      #       if storage_object.s3_object.restore.nil?
-      #         puts "Need to restore object at: #{storage_object.location_uri}"
-      #         puts "---> Restoring archived object..."
-      #         bucket_name = storage_object.s3_object.bucket_name
-      #         key = storage_object.s3_object.key
-      #         # Make sure that bucket_name and key aren't blank. They shouldn't ever be blank at this point in the
-      #         # code, but we want to make sure not to call restore if either of them somehow is blank.
-      #         raise if bucket_name.blank? || key.blank?
-
-      #         begin
-      #           restore_object_response = storage_object.s3_object.restore_object({
-      #             bucket: bucket_name,
-      #             key: key,
-      #             # For an object in the Intelligent-Tiering Archive Access tier, we just pass an empty hash here.
-      #             # No further configuration is needed.
-      #             restore_request: {}
-      #           })
-      #           puts "---> Object restoration request submitted! The object should be available within 3-5 hours."
-      #         rescue Aws::S3::Errors::ServiceError => e
-      #           puts "---> An unexpected error occurred while attempting to restore the object."
-      #         end
-      #       else
-      #         puts "---> A restore request has already been made for this object and restoration is in progress: #{storage_object.s3_object.restore}"
-      #       end
-      #     else
-      #       puts "---> Object is not currently in ARCHIVE_ACCESS state, so we will not make any changes."
-      #     end
-
-      #     # puts "Do we need to restore this object?"
-      #   elsif storage_object.is_a?(Hyacinth::Storage::FileObject)
-      #     puts "No need to restore this object because it's available on the local filesystem."
-      #   else
-      #     puts "Ignoring unknown object type: #{storage_object.class.name}"
-      #   end
-      # end
     end
 
     desc 'Run a fixity check using a remote CheckPlease app deployment.'
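After the 3-5 hour restoration window, an object's status can be spot-checked from a Rails console.
A minimal sketch, assuming the same S3_CLIENT constant the tasks use (the bucket and key values are
placeholders):

    # Placeholder bucket/key values. Per the NOTE in the commented-out code removed above,
    # #restore returns nil when no restore has been requested, and a string (e.g.
    # 'ongoing-request="true"') while a restore operation is still in progress.
    response = S3_CLIENT.head_object(bucket: 'example-bucket', key: 'path/to/files/example.tif')
    response.archive_status # => "ARCHIVE_ACCESS" while the object is still archived
    response.restore        # => nil, or a string while restoration is in progress

head_object is a standard Aws::S3::Client call, so this check needs no dependencies beyond what the
tasks already use.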