From 693361d4a7efc6f30090a3d8a62bb1a988721610 Mon Sep 17 00:00:00 2001
From: Brett Boston
Date: Wed, 3 Dec 2025 16:30:40 -0800
Subject: [PATCH 1/8] Add apply load script

---
 scripts/ApplyLoad.py                    | 192 ++++++++++++++++++++++++
 scripts/apply_load/max-sac-template.cfg |  41 +++++
 2 files changed, 233 insertions(+)
 create mode 100755 scripts/ApplyLoad.py
 create mode 100644 scripts/apply_load/max-sac-template.cfg

diff --git a/scripts/ApplyLoad.py b/scripts/ApplyLoad.py
new file mode 100755
index 0000000000..1d35986696
--- /dev/null
+++ b/scripts/ApplyLoad.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python3
+
+import os
+import subprocess
+import tempfile
+import time
+
+# Instance type to use. Matches SDF validator instance type.
+# c5d instances include a local NVMe drive (NVME_DRIVE below).
+INSTANCE_TYPE = 'c5d.2xlarge'
+
+# Key pair name and file for SSH access
+# TODO: Fill these in with the proper values
+KEY_NAME = 'TODO'
+KEY_FILE = 'TODO.pem'
+
+# Directory containing helper files for this script
+APPLY_LOAD_SCRIPT_DIR = os.path.join(os.path.dirname(__file__), "apply_load")
+
+# Path to the max SAC template configuration file
+MAX_SAC_TEMPLATE = os.path.join(APPLY_LOAD_SCRIPT_DIR, "max-sac-template.cfg")
+
+# Path to the ephemeral NVMe drive on the AWS instance
+NVME_DRIVE = "/dev/nvme1n1"
+
+# Number of SSH connection retries before giving up
+SSH_RETRIES = 10
+
+def run(command, exit_on_fail=True):
+    """ Run a command, echoing it first. Exits on failure unless
+    exit_on_fail is False. Returns True on success. """
+    print(f"Running: {command}")
+    res = os.system(command)
+    if res != 0:
+        print(f"Command '{command}' failed with exit code {res}")
+        if exit_on_fail:
+            exit(1)
+        return False
+    return True
+
+def run_capture_output(command):
+    """ Run a command and exit if it fails. Returns the command's output. """
+    try:
+        return subprocess.check_output(command)
+    except subprocess.CalledProcessError as e:
+        print(f"Command '{command}' failed with exit code {e.returncode}")
+        exit(1)
+
+# TODO: If anything fails AFTER starting the instance, we should terminate it.
+# That could be done in this script, or in the Jenkinsfile that calls this
+# script.
+def start_ec2_instance(ami, region, security_group):
+    """ Start an EC2 instance and return its instance id """
+    print("Starting EC2 instance...")
+    cmd = ["aws", "ec2", "run-instances", "--image-id", ami,
+           "--instance-type", INSTANCE_TYPE,
+           "--security-groups", security_group,
+           "--key-name", KEY_NAME, "--query", "Instances[0].InstanceId",
+           "--output", "text", "--region", region]
+    instance_id = run_capture_output(cmd).decode().strip()
+    print("Started EC2 instance with ID:", instance_id)
+
+    # Wait for instance to be running
+    print("Waiting for instance to be in 'running' state...")
+    run(f"aws ec2 wait instance-running --instance-ids {instance_id} "
+        f"--region {region}")
+    return instance_id
+
+def install_script_on_instance(instance_id, region):
+    """ Install this script on the given EC2 instance. """
+    # Get the instance's public IP address
+    # TODO: remove region
+    ip = run_capture_output(
+        ["aws", "ec2", "describe-instances", "--instance-ids", instance_id,
+         "--query", "Reservations[0].Instances[0].PublicIpAddress",
+         "--output", "text", "--region", region]).decode().strip()
+    print("Instance public IP:", ip)
+
+    # Wait for SSH to be available
+    print("Checking SSH availability...")
+    for i in range(SSH_RETRIES):
+        res = run(f"ssh -o StrictHostKeyChecking=no -i {KEY_FILE} "
+                  "-o ConnectTimeout=5 "
+                  f"ubuntu@{ip} 'true'",
+                  exit_on_fail=(i == SSH_RETRIES - 1))
+        if res:
+            break
+        sleep_duration = 10
+        print(f"SSH not available yet, retrying ({i+1}/{SSH_RETRIES}) in "
+              f"{sleep_duration} seconds...")
+        time.sleep(sleep_duration)
+    print("SSH is available.")
+
+    # Copy this script and the apply-load directory to the instance
+    scp_base = f"scp -i {KEY_FILE} -o StrictHostKeyChecking=no"
+    dest = f"ubuntu@{ip}:"
+    run(f"{scp_base} {__file__} {dest}")
+    run(f"{scp_base} -r {APPLY_LOAD_SCRIPT_DIR} {dest}")
+
+    return ip
+
+def local_aws_init():
+    """ Initialize an AWS instance for running apply-load from the instance
+    itself. """
+    # Mount the ephemeral NVMe drive so that Docker uses it for storage
+    run(f"sudo mkfs.ext4 {NVME_DRIVE}")
+    run("sudo mkdir -p /var/lib/docker")
+    run(f"sudo mount {NVME_DRIVE} /var/lib/docker")
+
+    # Install docker
+    run("sudo apt-get update")
+    run("sudo apt-get install -y docker.io")
+
+    # Allow regular ubuntu use to run docker commands
+    run("sudo usermod -aG docker ubuntu")
+
+def aws_init(ami, region, security_group):
+    """ Create and initialize an AWS instance for running apply-load. """
+    # Start instance
+    instance_id = start_ec2_instance(ami, region, security_group)
+
+    # Install this script on the instance
+    ip = install_script_on_instance(instance_id, region)
+
+    # Remotely invoke local-aws-init on the instance
+    run(f"ssh -o StrictHostKeyChecking=no -i {KEY_FILE} "
+        f"ubuntu@{ip} 'python3 ApplyLoad.py local-aws-init'")
+
+    # Print instance id and ip for Jenkins to store
+    print(f"{instance_id},{ip}")
+
+def run_max_sac(cfg, image, iops):
+    """ Run apply-load in max SAC TPS mode with the given config file. """
+    with tempfile.NamedTemporaryFile() as cfg_out:
+        cfg_out.write(cfg.encode())
+        cfg_out.flush()
+        iops_cmd = (f"--device-write-iops {NVME_DRIVE}:{iops} "
+                    f"--device-read-iops {NVME_DRIVE}:{iops}"
+                    if iops is not None else "")
+        run(f"docker run --rm -v {cfg_out.name}:/config.cfg {iops_cmd} {image} "
+            "apply-load --mode max-sac-tps --console --conf /config.cfg")
+
+def generate_cfg(clusters, batch_size):
+    """ Generate a configuration file for max SAC TPS mode. """
+    with open(MAX_SAC_TEMPLATE, "r") as template_file:
+        template = template_file.read()
+    return template.format(clusters=clusters, batch_size=batch_size)
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="Helper script to run apply-load tests on AWS")
+
+    subparsers = parser.add_subparsers(dest="mode", required=True)
+
+    aws_init = subparsers.add_parser(
+        "aws-init",
+        help="Create and initialize an AWS instance for running apply-load.")
+    aws_init.add_argument(
+        "--ubuntu-ami", type=str,
+        help="AMI ID to use. Must be an Ubuntu image.")
+    aws_init.add_argument("--region", type=str, help="AWS region to use.")
+    aws_init.add_argument("--security-group", type=str,
+                          help="AWS security group to use.")
+
+    subparsers.add_parser(
+        "local-aws-init",
+        help="Initialize the local AWS instance for running apply-load.")
+
+    run_max_sac_parser = subparsers.add_parser(
+        "max-sac", help="Run apply-load in max SAC TPS mode.")
+    run_max_sac_parser.add_argument(
+        "--image", type=str, required=True, help="Docker image to use.")
+    run_max_sac_parser.add_argument(
+        "--clusters", type=int, required=True,
+        help="Number of transaction clusters (threads).")
+    run_max_sac_parser.add_argument(
+        "--batch-size", type=int, required=True,
+        help="Batch size for transactions.")
+    run_max_sac_parser.add_argument(
+        "--iops", type=int, required=False, help="IOPS limit for the disk.")
+    args = parser.parse_args()
+
+    if args.mode == "aws-init":
+        aws_init(args.ubuntu_ami, args.region, args.security_group)
+    elif args.mode == "local-aws-init":
+        local_aws_init()
+    elif args.mode == "max-sac":
+        cfg = generate_cfg(args.clusters, args.batch_size)
+        run_max_sac(cfg, args.image, args.iops)
+    else:
+        print(f"Unknown mode: {args.mode}")
+        exit(1)
\ No newline at end of file
diff --git a/scripts/apply_load/max-sac-template.cfg b/scripts/apply_load/max-sac-template.cfg
new file mode 100644
index 0000000000..93217c31a2
--- /dev/null
+++ b/scripts/apply_load/max-sac-template.cfg
@@ -0,0 +1,41 @@
+# This is an example Stellar Core configuration for the load generation
+# (apply-load) tool, used to test the theoretical max SAC (Stellar Asset
+# Contract) transfer TPS via binary search (measured on apply time only).
+
+# Run stellar-core with this configuration via `./stellar-core apply-load --mode max-sac-tps`
+
+# Enable load generation
+ARTIFICIALLY_GENERATE_LOAD_FOR_TESTING=true
+
+# Diagnostic events should generally be disabled, but can be enabled for debugging
+ENABLE_SOROBAN_DIAGNOSTIC_EVENTS = false
+
+# The only relevant network configuration parameter: the number of transaction
+# clusters, which are mapped onto the transaction execution threads.
+APPLY_LOAD_LEDGER_MAX_DEPENDENT_TX_CLUSTERS = {clusters}
+
+# Number of payments to batch in a single transaction, similar to how
+# operations are batched for 'classic' transactions.
+APPLY_LOAD_BATCH_SAC_COUNT = {batch_size}
+
+# Number of ledgers to close for every iteration of the search.
+APPLY_LOAD_NUM_LEDGERS = 50
+
+# Disable bucket list pre-generation as it's not necessary for this mode.
+# Zeroing all of the APPLY_LOAD_BL_* settings below skips building a
+# synthetic bucket list before the measured ledgers are closed.
+APPLY_LOAD_BL_SIMULATED_LEDGERS = 0
+APPLY_LOAD_BL_WRITE_FREQUENCY = 0
+APPLY_LOAD_BL_BATCH_SIZE = 0
+APPLY_LOAD_BL_LAST_BATCH_SIZE = 0
+APPLY_LOAD_BL_LAST_BATCH_LEDGERS = 0
+
+# Minimal core config boilerplate
+
+RUN_STANDALONE=true
+NODE_IS_VALIDATOR=true
+UNSAFE_QUORUM=true
+NETWORK_PASSPHRASE="Standalone Network ; February 2017"
+NODE_SEED="SDQVDISRYN2JXBS7ICL7QJAEKB3HWBJFP2QECXG7GZICAHBK4UNJCWK2 self"
+
+[QUORUM_SET]
+THRESHOLD_PERCENT=100
+VALIDATORS=["$self"]

From d6dd27c3cb52faa5deda3feff5ebeb328941ab37 Mon Sep 17 00:00:00 2001
From: Brett Boston
Date: Wed, 7 Jan 2026 15:57:35 -0800
Subject: [PATCH 2/8] Keypair generation

---
 scripts/ApplyLoad.py | 18 ++++++++++++++++--
 src/rust/soroban/p25 |  2 +-
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/scripts/ApplyLoad.py b/scripts/ApplyLoad.py
index 1d35986696..a25c0cb3d6 100755
--- a/scripts/ApplyLoad.py
+++ b/scripts/ApplyLoad.py
@@ -10,8 +10,8 @@
 
 # Key pair name and file for SSH access
-# TODO: Fill these in with the proper values
-KEY_NAME = 'TODO'
-KEY_FILE = 'TODO.pem'
+KEY_NAME = 'max-sac-test-key'
+KEY_FILE = 'max-sac-test-key.pem'
 
 # Directory containing helper files for this script
 APPLY_LOAD_SCRIPT_DIR = os.path.join(os.path.dirname(__file__), "apply_load")
@@ -44,6 +44,17 @@ def run_capture_output(command):
         print(f"Command '{command}' failed with exit code {e.returncode}")
         exit(1)
 
+def create_key_pair(region):
+    """ Create an EC2 key pair and save the private key to KEY_FILE. """
+    print("Creating EC2 key pair...")
+    cmd = ["aws", "ec2", "create-key-pair", "--key-name", KEY_NAME,
+           "--query", "KeyMaterial", "--output", "text", "--region", region]
+    private_key = run_capture_output(cmd).decode().strip()
+    with open(KEY_FILE, "w") as key_file:
+        key_file.write(private_key)
+    os.chmod(KEY_FILE, 0o400)
+    print(f"Saved private key to {KEY_FILE}")
+
 # TODO: If anything fails AFTER starting the instance, we should terminate it.
 # That could be done in this script, or in the Jenkinsfile that calls this
 # script.
@@ -114,6 +125,9 @@ def local_aws_init():
 
 def aws_init(ami, region, security_group):
     """ Create and initialize an AWS instance for running apply-load. """
+    # Create key pair
+    create_key_pair(region)
+
     # Start instance
     instance_id = start_ec2_instance(ami, region, security_group)
 
diff --git a/src/rust/soroban/p25 b/src/rust/soroban/p25
index 9f00199751..d2ff024b72 160000
--- a/src/rust/soroban/p25
+++ b/src/rust/soroban/p25
@@ -1 +1 @@
-Subproject commit 9f00199751db5f516517049ab9af952b0a26725a
+Subproject commit d2ff024b72f7f3f75737402ac74ca5d0093a4690

From e14478f0d7435b21b6b122e4e4222b6e4f7f5094 Mon Sep 17 00:00:00 2001
From: Brett Boston
Date: Wed, 7 Jan 2026 16:06:04 -0800
Subject: [PATCH 3/8] Fix name shadowing

---
 scripts/ApplyLoad.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/ApplyLoad.py b/scripts/ApplyLoad.py
index a25c0cb3d6..6e623996b1 100755
--- a/scripts/ApplyLoad.py
+++ b/scripts/ApplyLoad.py
@@ -166,14 +166,14 @@ def generate_cfg(clusters, batch_size):
 
     subparsers = parser.add_subparsers(dest="mode", required=True)
 
-    aws_init = subparsers.add_parser(
+    aws_init_parser = subparsers.add_parser(
         "aws-init",
         help="Create and initialize an AWS instance for running apply-load.")
-    aws_init.add_argument(
+    aws_init_parser.add_argument(
         "--ubuntu-ami", type=str,
         help="AMI ID to use. Must be an Ubuntu image.")
-    aws_init.add_argument("--region", type=str, help="AWS region to use.")
-    aws_init.add_argument("--security-group", type=str,
+    aws_init_parser.add_argument("--region", type=str, help="AWS region to use.")
+    aws_init_parser.add_argument("--security-group", type=str,
                           help="AWS security group to use.")
 
     subparsers.add_parser(

From 2fc0bf370553c143aaa81c73bb4a1f6ea0a70be9 Mon Sep 17 00:00:00 2001
From: Traver Tischio
Date: Mon, 2 Feb 2026 15:27:14 -0500
Subject: [PATCH 4/8] Add tags to keypair and instance and name keypair with
 datetime

---
 scripts/ApplyLoad.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/scripts/ApplyLoad.py b/scripts/ApplyLoad.py
index 6e623996b1..cec7ec3cab 100755
--- a/scripts/ApplyLoad.py
+++ b/scripts/ApplyLoad.py
@@ -4,14 +4,14 @@
 import subprocess
 import tempfile
 import time
+from datetime import datetime
 
 # Instance type to use. Matches SDF validator instance type.
 # c5d instances include a local NVMe drive (NVME_DRIVE below).
 INSTANCE_TYPE = 'c5d.2xlarge'
 
 # Key pair name and file for SSH access
-KEY_NAME = 'max-sac-test-key'
-KEY_FILE = 'max-sac-test-key.pem'
+KEY_NAME = f'max-sac-test-key-{datetime.now().strftime("%Y-%m-%d-%H-%M-%S")}'
+KEY_FILE = f'{KEY_NAME}.pem'
 
 # Directory containing helper files for this script
 APPLY_LOAD_SCRIPT_DIR = os.path.join(os.path.dirname(__file__), "apply_load")
@@ -48,6 +48,8 @@ def create_key_pair(region):
     """ Create an EC2 key pair and save the private key to KEY_FILE. """
     print("Creating EC2 key pair...")
     cmd = ["aws", "ec2", "create-key-pair", "--key-name", KEY_NAME,
+           "--tag-specifications",
+           "ResourceType=key-pair,Tags=[{Key=test,Value=max-sac-tps},{Key=ManagedBy,Value=ApplyLoadScript}]",
            "--query", "KeyMaterial", "--output", "text", "--region", region]
     private_key = run_capture_output(cmd).decode().strip()
     with open(KEY_FILE, "w") as key_file:
@@ -64,7 +66,10 @@ def start_ec2_instance(ami, region, security_group):
     cmd = ["aws", "ec2", "run-instances", "--image-id", ami,
            "--instance-type", INSTANCE_TYPE,
            "--security-groups", security_group,
-           "--key-name", KEY_NAME, "--query", "Instances[0].InstanceId",
+           "--key-name", KEY_NAME,
+           "--tag-specifications",
+           "ResourceType=instance,Tags=[{Key=test,Value=max-sac-tps},{Key=ManagedBy,Value=ApplyLoadScript}]",
+           "--query", "Instances[0].InstanceId",
            "--output", "text", "--region", region]
     instance_id = run_capture_output(cmd).decode().strip()
     print("Started EC2 instance with ID:", instance_id)
@@ -203,4 +208,4 @@ def generate_cfg(clusters, batch_size):
         run_max_sac(cfg, args.image, args.iops)
     else:
         print(f"Unknown mode: {args.mode}")
-        exit(1)
\ No newline at end of file
+        exit(1)

From b6c097a68a080f0fd3b7c1563e3eb6c6683fac6d Mon Sep 17 00:00:00 2001
From: Traver Tischio
Date: Mon, 9 Feb 2026 15:08:54 -0500
Subject: [PATCH 5/8] Use ssm in applyload script

---
 scripts/ApplyLoad.py | 162 ++++++++++++++++++++++++++++---------------
 1 file changed, 106 insertions(+), 56 deletions(-)

diff --git a/scripts/ApplyLoad.py b/scripts/ApplyLoad.py
index cec7ec3cab..325c639de1 100755
--- a/scripts/ApplyLoad.py
+++ b/scripts/ApplyLoad.py
@@ -5,14 +5,11 @@
 import tempfile
 import time
 from datetime import datetime
+import json
 
 # Instance type to use. Matches SDF validator instance type.
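 # c5d instances include a local NVMe drive (NVME_DRIVE below).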
 INSTANCE_TYPE = 'c5d.2xlarge'
 
-# Key pair name and file for SSH access
-KEY_NAME = f'max-sac-test-key-{datetime.now().strftime("%Y-%m-%d-%H-%M-%S")}'
-KEY_FILE = f'{KEY_NAME}.pem'
-
 # Directory containing helper files for this script
 APPLY_LOAD_SCRIPT_DIR = os.path.join(os.path.dirname(__file__), "apply_load")
 
@@ -44,29 +41,16 @@ def run_capture_output(command):
         print(f"Command '{command}' failed with exit code {e.returncode}")
         exit(1)
 
-def create_key_pair(region):
-    """ Create an EC2 key pair and save the private key to KEY_FILE. """
-    print("Creating EC2 key pair...")
-    cmd = ["aws", "ec2", "create-key-pair", "--key-name", KEY_NAME,
-           "--tag-specifications",
-           "ResourceType=key-pair,Tags=[{Key=test,Value=max-sac-tps},{Key=ManagedBy,Value=ApplyLoadScript}]",
-           "--query", "KeyMaterial", "--output", "text", "--region", region]
-    private_key = run_capture_output(cmd).decode().strip()
-    with open(KEY_FILE, "w") as key_file:
-        key_file.write(private_key)
-    os.chmod(KEY_FILE, 0o400)
-    print(f"Saved private key to {KEY_FILE}")
-
 # TODO: If anything fails AFTER starting the instance, we should terminate it.
 # That could be done in this script, or in the Jenkinsfile that calls this
 # script.
-def start_ec2_instance(ami, region, security_group):
+def start_ec2_instance(ami, region, security_group, iam_instance_profile):
     """ Start an EC2 instance and return its instance id """
     print("Starting EC2 instance...")
     cmd = ["aws", "ec2", "run-instances", "--image-id", ami,
            "--instance-type", INSTANCE_TYPE,
            "--security-groups", security_group,
-           "--key-name", KEY_NAME,
+           "--iam-instance-profile", f"Name={iam_instance_profile}",
            "--tag-specifications",
            "ResourceType=instance,Tags=[{Key=test,Value=max-sac-tps},{Key=ManagedBy,Value=ApplyLoadScript}]",
            "--query", "Instances[0].InstanceId",
@@ -80,38 +64,104 @@ def start_ec2_instance(ami, region, security_group):
             f"--region {region}")
     return instance_id
 
-def install_script_on_instance(instance_id, region):
-    """ Install this script on the given EC2 instance. """
-    # Get the instance's public IP address
-    # TODO: remove region
-    ip = run_capture_output(
-        ["aws", "ec2", "describe-instances", "--instance-ids", instance_id,
-         "--query", "Reservations[0].Instances[0].PublicIpAddress",
-         "--output", "text", "--region", region]).decode().strip()
-    print("Instance public IP:", ip)
-
-    # Wait for SSH to be available
-    print("Checking SSH availability...")
+def wait_for_ssm_agent(instance_id, region):
+    """ Wait for SSM agent to be ready on the instance """
+    print("Waiting for SSM agent to be ready...")
     for i in range(SSH_RETRIES):
-        res = run(f"ssh -o StrictHostKeyChecking=no -i {KEY_FILE} "
-                  "-o ConnectTimeout=5 "
-                  f"ubuntu@{ip} 'true'",
-                  exit_on_fail=(i == SSH_RETRIES - 1))
-        if res:
-            break
+        try:
+            cmd = ["aws", "ssm", "describe-instance-information",
+                   "--instance-information-filter-list",
+                   f"key=InstanceIds,valueSet={instance_id}",
+                   "--region", region]
+            output = run_capture_output(cmd).decode().strip()
+            info = json.loads(output)
+            if info.get("InstanceInformationList"):
+                print("SSM agent is ready.")
+                return True
+        except Exception as e:
+            pass
+
         sleep_duration = 10
-        print(f"SSH not available yet, retrying ({i+1}/{SSH_RETRIES}) in "
+        print(f"SSM agent not ready yet, retrying ({i+1}/{SSH_RETRIES}) in "
              f"{sleep_duration} seconds...")
         time.sleep(sleep_duration)
-    print("SSH is available.")
 
-    # Copy this script and the apply-load directory to the instance
-    scp_base = f"scp -i {KEY_FILE} -o StrictHostKeyChecking=no"
-    dest = f"ubuntu@{ip}:"
-    run(f"{scp_base} {__file__} {dest}")
-    run(f"{scp_base} -r {APPLY_LOAD_SCRIPT_DIR} {dest}")
+    print("ERROR: SSM agent failed to become ready")
+    exit(1)
+
+def run_ssm_command(instance_id, region, command):
+    """ Run a command on an EC2 instance via SSM """
+    print(f"Running SSM command on {instance_id}: {command}")
+
+    # Send command
+    cmd = ["aws", "ssm", "send-command",
+           "--instance-ids", instance_id,
+           "--document-name", "AWS-RunShellScript",
+           "--parameters", f"commands=['{command}']",
+           "--region", region,
+           "--query", "Command.CommandId",
+           "--output", "text"]
+    command_id = run_capture_output(cmd).decode().strip()
+
+    # Wait for command to complete
+    print(f"Waiting for command {command_id} to complete...")
+    for i in range(30):
+        time.sleep(5)
+        cmd = ["aws", "ssm", "get-command-invocation",
+               "--command-id", command_id,
+               "--instance-id", instance_id,
+               "--region", region,
+               "--query", "Status",
+               "--output", "text"]
+        try:
+            status = run_capture_output(cmd).decode().strip()
+            if status in ["Success", "Failed", "Cancelled", "TimedOut"]:
+                # Get output
+                cmd = ["aws", "ssm", "get-command-invocation",
+                       "--command-id", command_id,
+                       "--instance-id", instance_id,
+                       "--region", region]
+                output = run_capture_output(cmd).decode().strip()
+                result = json.loads(output)
+                print("Command output:", result.get("StandardOutputContent", ""))
+                if result.get("StandardErrorContent"):
+                    print("Command error:", result.get("StandardErrorContent", ""))
+                return status == "Success"
+        except Exception as e:
+            pass
+
+    print("ERROR: Command timed out")
+    return False
+
+def copy_files_to_instance(instance_id, region):
+    """ Copy files to the instance using SSM and S3 """
+    # Create a temporary S3 bucket or use existing one
+    # For simplicity, we'll embed the script content in SSM commands
+    print("Copying files to instance via SSM...")
+
+    # Read this script
+    with open(__file__, 'r') as f:
+        script_content = f.read()
+
+    # Write script to instance
+    escaped_content = script_content.replace("'", "'\\''")
+    run_ssm_command(instance_id, region,
+        f"cat > /home/ubuntu/ApplyLoad.py << 'EOF'\n{script_content}\nEOF")
+
+    # Copy apply_load directory files
+    # This is simplified - in production, use S3 or create a tarball
+    run_ssm_command(instance_id, region,
+        f"mkdir -p /home/ubuntu/apply_load")
 
-    return ip
+def install_script_on_instance(instance_id, region):
+    """ Install this script on the given EC2 instance via SSM. """
+    # Wait for SSM agent to be ready
+    wait_for_ssm_agent(instance_id, region)
+
+    # Copy files
+    copy_files_to_instance(instance_id, region)
+
+    return instance_id
 
 def local_aws_init():
     """ Initialize an AWS instance for running apply-load from the instance
@@ -128,23 +178,20 @@ def local_aws_init():
 
     # Allow regular ubuntu use to run docker commands
     run("sudo usermod -aG docker ubuntu")
 
-def aws_init(ami, region, security_group):
+def aws_init(ami, region, security_group, iam_instance_profile):
     """ Create and initialize an AWS instance for running apply-load. """
-    # Create key pair
-    create_key_pair(region)
-
     # Start instance
-    instance_id = start_ec2_instance(ami, region, security_group)
+    instance_id = start_ec2_instance(ami, region, security_group, iam_instance_profile)
 
     # Install this script on the instance
-    ip = install_script_on_instance(instance_id, region)
+    install_script_on_instance(instance_id, region)
 
     # Remotely invoke local-aws-init on the instance
-    run(f"ssh -o StrictHostKeyChecking=no -i {KEY_FILE} "
-        f"ubuntu@{ip} 'python3 ApplyLoad.py local-aws-init'")
+    run_ssm_command(instance_id, region,
+        "cd /home/ubuntu && python3 ApplyLoad.py local-aws-init")
 
-    # Print instance id and ip for Jenkins to store
-    print(f"{instance_id},{ip}")
+    # Print instance id for Jenkins to store
+    print(f"{instance_id}")
 
 def run_max_sac(cfg, image, iops):
     """ Run apply-load in max SAC TPS mode with the given config file. """
@@ -180,6 +227,8 @@ def generate_cfg(clusters, batch_size):
     aws_init_parser.add_argument("--region", type=str, help="AWS region to use.")
     aws_init_parser.add_argument("--security-group", type=str,
                           help="AWS security group to use.")
+    aws_init_parser.add_argument("--iam-instance-profile", type=str,
+                          help="IAM instance profile to use.")
 
     subparsers.add_parser(
         "local-aws-init",
@@ -200,7 +249,8 @@ def generate_cfg(clusters, batch_size):
     args = parser.parse_args()
 
     if args.mode == "aws-init":
-        aws_init(args.ubuntu_ami, args.region, args.security_group)
+        aws_init(args.ubuntu_ami, args.region, args.security_group,
+                 args.iam_instance_profile)
     elif args.mode == "local-aws-init":
         local_aws_init()
     elif args.mode == "max-sac":

From ca975927f3d17e53276325852744eaa58804c8b8 Mon Sep 17 00:00:00 2001
From: Traver Tischio
Date: Mon, 9 Feb 2026 15:59:12 -0500
Subject: [PATCH 6/8] use the ssm user dir

---
 scripts/ApplyLoad.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/scripts/ApplyLoad.py b/scripts/ApplyLoad.py
index 325c639de1..e924d144e6 100755
--- a/scripts/ApplyLoad.py
+++ b/scripts/ApplyLoad.py
@@ -16,6 +16,9 @@
 # Path to the max SAC template configuration file
 MAX_SAC_TEMPLATE = os.path.join(APPLY_LOAD_SCRIPT_DIR, "max-sac-template.cfg")
 
+# User directory on the instance
+USER_DIR = "/home/ssm-user"
+
 # Path to the ephemeral NVMe drive on the AWS instance
 NVME_DRIVE = "/dev/nvme1n1"
 
@@ -146,12 +149,12 @@ def copy_files_to_instance(instance_id, region):
 
     # Write script to instance
     escaped_content = script_content.replace("'", "'\\''")
     run_ssm_command(instance_id, region,
-        f"cat > /home/ubuntu/ApplyLoad.py << 'EOF'\n{script_content}\nEOF")
+        f"cat > {USER_DIR}/ApplyLoad.py << 'EOF'\n{script_content}\nEOF")
 
     # Copy apply_load directory files
     # This is simplified - in production, use S3 or create a tarball
     run_ssm_command(instance_id, region,
-        f"mkdir -p /home/ubuntu/apply_load")
+        f"mkdir -p {USER_DIR}/apply_load")
 
 def install_script_on_instance(instance_id, region):
     """ Install this script on the given EC2 instance via SSM. """
@@ -188,7 +191,7 @@ def aws_init(ami, region, security_group, iam_instance_profile):
 
     # Remotely invoke local-aws-init on the instance
     run_ssm_command(instance_id, region,
-        "cd /home/ubuntu && python3 ApplyLoad.py local-aws-init")
+        f"cd {USER_DIR} && python3 ApplyLoad.py local-aws-init")

From 8676aaaa0cb4d79812ba7d1983a6bbb3d1f4fa9c Mon Sep 17 00:00:00 2001
From: Traver Tischio
Date: Mon, 9 Feb 2026 16:46:51 -0500
Subject: [PATCH 7/8] use s3 with ssm for scp

---
 scripts/ApplyLoad.py | 54 +++++++++++++++++++++++++++-----------------
 1 file changed, 33 insertions(+), 21 deletions(-)

diff --git a/scripts/ApplyLoad.py b/scripts/ApplyLoad.py
index e924d144e6..4e9059e01a 100755
--- a/scripts/ApplyLoad.py
+++ b/scripts/ApplyLoad.py
@@ -1,11 +1,11 @@
 #!/usr/bin/env python3
 
+import json
 import os
 import subprocess
 import tempfile
 import time
 from datetime import datetime
-import json
 
 # Instance type to use. Matches SDF validator instance type.
 INSTANCE_TYPE = 'c5d.2xlarge'
 
@@ -136,33 +136,43 @@ def run_ssm_command(instance_id, region, command):
     print("ERROR: Command timed out")
     return False
 
-def copy_files_to_instance(instance_id, region):
+def copy_file_via_s3(instance_id, region, local_file, remote_path, s3_bucket):
+    """ Copy a file to an EC2 instance via S3 and SSM """
+    # Upload file to S3
+    s3_key = f"tmp/{os.path.basename(local_file)}"
+    print(f"Uploading {local_file} to s3://{s3_bucket}/{s3_key}...")
+    run(f"aws s3 cp {local_file} s3://{s3_bucket}/{s3_key} --region {region}")
+
+    # Download from S3 on the instance
+    print(f"Downloading file to instance at {remote_path}...")
+    download_cmd = f"aws s3 cp s3://{s3_bucket}/{s3_key} {remote_path}"
+    run_ssm_command(instance_id, region, download_cmd)
+
+    # Clean up S3 file
+    run(f"aws s3 rm s3://{s3_bucket}/{s3_key} --region {region}")
+
+def copy_files_to_instance(instance_id, region, s3_bucket):
     """ Copy files to the instance using SSM and S3 """
-    # Create a temporary S3 bucket or use existing one
-    # For simplicity, we'll embed the script content in SSM commands
-    print("Copying files to instance via SSM...")
+    print("Copying files to instance via S3...")
 
-    # Read this script
-    with open(__file__, 'r') as f:
-        script_content = f.read()
+    # Copy this script
+    copy_file_via_s3(instance_id, region, __file__,
+                     f"{USER_DIR}/ApplyLoad.py", s3_bucket)
 
-    # Write script to instance
-    escaped_content = script_content.replace("'", "'\\''")
-    run_ssm_command(instance_id, region,
-        f"cat > {USER_DIR}/ApplyLoad.py << 'EOF'\n{script_content}\nEOF")
+    # Copy config template
+    copy_file_via_s3(instance_id, region, MAX_SAC_TEMPLATE,
+                     f"{USER_DIR}/apply_load/max-sac-template.cfg", s3_bucket)
 
-    # Copy apply_load directory files
-    # This is simplified - in production, use S3 or create a tarball
-    run_ssm_command(instance_id, region,
-        f"mkdir -p {USER_DIR}/apply_load")
+    # Create directory structure
+    run_ssm_command(instance_id, region, f"mkdir -p {USER_DIR}/apply_load")
 
-def install_script_on_instance(instance_id, region):
+def install_script_on_instance(instance_id, region, s3_bucket):
     """ Install this script on the given EC2 instance via SSM. """
     # Wait for SSM agent to be ready
     wait_for_ssm_agent(instance_id, region)
 
     # Copy files
-    copy_files_to_instance(instance_id, region)
+    copy_files_to_instance(instance_id, region, s3_bucket)
 
     return instance_id
 
@@ -181,13 +191,13 @@ def local_aws_init():
 
     # Allow regular ubuntu use to run docker commands
     run("sudo usermod -aG docker ubuntu")
 
-def aws_init(ami, region, security_group, iam_instance_profile):
+def aws_init(ami, region, security_group, iam_instance_profile, s3_bucket):
     """ Create and initialize an AWS instance for running apply-load. """
     # Start instance
     instance_id = start_ec2_instance(ami, region, security_group, iam_instance_profile)
 
     # Install this script on the instance
-    install_script_on_instance(instance_id, region)
+    install_script_on_instance(instance_id, region, s3_bucket)
 
     # Remotely invoke local-aws-init on the instance
     run_ssm_command(instance_id, region,
@@ -232,6 +242,8 @@ def generate_cfg(clusters, batch_size):
                           help="AWS security group to use.")
     aws_init_parser.add_argument("--iam-instance-profile", type=str,
                           help="IAM instance profile to use.")
+    aws_init_parser.add_argument("--s3-bucket", type=str,
+                          help="S3 bucket to use for file transfer.")
 
     subparsers.add_parser(
         "local-aws-init",
@@ -253,7 +265,7 @@ def generate_cfg(clusters, batch_size):
 
     if args.mode == "aws-init":
         aws_init(args.ubuntu_ami, args.region, args.security_group,
-                 args.iam_instance_profile)
+                 args.iam_instance_profile, args.s3_bucket)
     elif args.mode == "local-aws-init":
         local_aws_init()
     elif args.mode == "max-sac":

From 48f0a32e923fdd764dff0e1ff86351524cf5a019 Mon Sep 17 00:00:00 2001
From: Traver Tischio
Date: Mon, 9 Feb 2026 17:33:29 -0500
Subject: [PATCH 8/8] fix script

---
 scripts/ApplyLoad.py | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/scripts/ApplyLoad.py b/scripts/ApplyLoad.py
index 4e9059e01a..99fdfdfcde 100755
--- a/scripts/ApplyLoad.py
+++ b/scripts/ApplyLoad.py
@@ -17,7 +17,7 @@
 MAX_SAC_TEMPLATE = os.path.join(APPLY_LOAD_SCRIPT_DIR, "max-sac-template.cfg")
 
 # User directory on the instance
-USER_DIR = "/home/ssm-user"
+USER_DIR = "/home/ubuntu"
 
 # Path to the ephemeral NVMe drive on the AWS instance
 NVME_DRIVE = "/dev/nvme1n1"
 
@@ -159,6 +159,9 @@ def copy_files_to_instance(instance_id, region, s3_bucket):
     copy_file_via_s3(instance_id, region, __file__,
                      f"{USER_DIR}/ApplyLoad.py", s3_bucket)
 
+    run_ssm_command(instance_id, region,
+                    f"chmod +x {USER_DIR}/ApplyLoad.py")
+
     # Copy config template
     copy_file_via_s3(instance_id, region, MAX_SAC_TEMPLATE,
                      f"{USER_DIR}/apply_load/max-sac-template.cfg", s3_bucket)
@@ -171,24 +174,38 @@ def install_script_on_instance(instance_id, region, s3_bucket):
     # Wait for SSM agent to be ready
     wait_for_ssm_agent(instance_id, region)
 
+    # Install the awscli on the instance
+    install_awscli(instance_id, region)
+
     # Copy files
     copy_files_to_instance(instance_id, region, s3_bucket)
 
     return instance_id
 
+def install_awscli(instance_id, region):
+    """Install the AWS CLI on an Ubuntu machine."""
+    print("Installing AWS CLI...")
+
+    install_cmd = """
+    apt-get update && apt-get install -y unzip curl && \
+    curl \"https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip\" -o awscliv2.zip && \
+    unzip -q awscliv2.zip && ./aws/install && \
+    rm -rf aws awscliv2.zip
+    """
+
+    run_ssm_command(instance_id, region, install_cmd)
+    print("AWS CLI installation complete.")
+
 def local_aws_init():
     """ Initialize an AWS instance for running apply-load from the instance
     itself. """
     # Mount the ephemeral NVMe drive so that Docker uses it for storage
-    run(f"sudo mkfs.ext4 {NVME_DRIVE}")
-    run("sudo mkdir -p /var/lib/docker")
-    run(f"sudo mount {NVME_DRIVE} /var/lib/docker")
+    run(f"sudo mkfs.ext4 {NVME_DRIVE} && sudo mkdir -p /var/lib/docker && sudo mount {NVME_DRIVE} /var/lib/docker")
 
     # Install docker
-    run("sudo apt-get update")
-    run("sudo apt-get install -y docker.io")
+    run("sudo apt-get update && sudo apt-get install -y docker.io")
 
-    # Allow regular ubuntu use to run docker commands
+    # Allow regular ubuntu user to run docker commands
     run("sudo usermod -aG docker ubuntu")
 
 def aws_init(ami, region, security_group, iam_instance_profile, s3_bucket):