From 87330be623e6f563a2611432d20951ea4ecf9181 Mon Sep 17 00:00:00 2001 From: Crambor Date: Sun, 17 Mar 2024 18:55:14 +0000 Subject: [PATCH 1/6] fix: #59 directly check for hq binary in hpc directory --- hpc/LoadBalancer.cpp | 15 +++++++++++++-- hpc/LoadBalancer.hpp | 6 +++--- hpc/hq_scripts/allocation_queue.sh | 2 +- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/hpc/LoadBalancer.cpp b/hpc/LoadBalancer.cpp index c23bdff3..b8395830 100644 --- a/hpc/LoadBalancer.cpp +++ b/hpc/LoadBalancer.cpp @@ -25,15 +25,19 @@ void clear_url(std::string directory) { } void launch_hq_with_alloc_queue() { - std::system("hq server stop &> /dev/null"); + std::system("./hq server stop &> /dev/null"); - std::system("hq server start &"); + std::system("./hq server start &"); sleep(1); // Workaround: give the HQ server enough time to start. // Create HQ allocation queue std::system("hq_scripts/allocation_queue.sh"); } +bool file_exists(const std::string& path) { + return std::filesystem::exists(path); +} + const std::vector get_model_names() { // Don't start a client, always use the default job submission script. 
HyperQueueJob hq_job("", false, true); @@ -49,6 +53,13 @@ int main(int argc, char *argv[]) create_directory_if_not_existing("sub-jobs"); clear_url("urls"); + // Check if the hq binary exists + std::string hq_binary_path = "./hq"; + if (!file_exists(hq_binary_path)) { + std::cerr << "Error: hq binary does not exist at " << hq_binary_path << std::endl; + return 1; + } + launch_hq_with_alloc_queue(); // Read environment variables for configuration diff --git a/hpc/LoadBalancer.hpp b/hpc/LoadBalancer.hpp index 20d877d2..8a4e433d 100644 --- a/hpc/LoadBalancer.hpp +++ b/hpc/LoadBalancer.hpp @@ -90,7 +90,7 @@ class HyperQueueJob ~HyperQueueJob() { // Cancel the SLURM job - std::system(("hq job cancel " + job_id).c_str()); + std::system(("./hq job cancel " + job_id).c_str()); // Delete the url text file std::system(("rm ./urls/url-" + job_id + ".txt").c_str()); @@ -113,7 +113,7 @@ class HyperQueueJob const std::filesystem::path submission_script_generic("job.sh"); const std::filesystem::path submission_script_model_specific("job_" + model_name + ".sh"); - std::string hq_command = "hq submit --output-mode=quiet "; + std::string hq_command = "./hq submit --output-mode=quiet "; hq_command += "--priority=" + std::to_string(job_count) + " "; if (std::filesystem::exists(submission_script_dir / submission_script_model_specific) && !force_default_submission_script) { @@ -154,7 +154,7 @@ class HyperQueueJob // state = ["WAITING", "RUNNING", "FINISHED", "CANCELED"] bool waitForHQJobState(const std::string &job_id, const std::string &state) { - const std::string command = "hq job info " + job_id + " | grep State | awk '{print $4}'"; + const std::string command = "./hq job info " + job_id + " | grep State | awk '{print $4}'"; // std::cout << "Checking runtime: " << command << std::endl; std::string job_status; diff --git a/hpc/hq_scripts/allocation_queue.sh b/hpc/hq_scripts/allocation_queue.sh index ae13b8d1..6e48073e 100755 --- a/hpc/hq_scripts/allocation_queue.sh +++ 
b/hpc/hq_scripts/allocation_queue.sh @@ -4,7 +4,7 @@ # hq worker start & -hq alloc add slurm --time-limit 10m \ +./hq alloc add slurm --time-limit 10m \ --idle-timeout 3m \ --backlog 1 \ --workers-per-alloc 1 \ From 51cec68788b0d5a1b0efbb2860c636327c203e95 Mon Sep 17 00:00:00 2001 From: Crambor Date: Sun, 17 Mar 2024 18:58:23 +0000 Subject: [PATCH 2/6] ci: #52 initial steps for bundled release --- .github/workflows/hpc-load-balancer.yml | 44 +++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 .github/workflows/hpc-load-balancer.yml diff --git a/.github/workflows/hpc-load-balancer.yml b/.github/workflows/hpc-load-balancer.yml new file mode 100644 index 00000000..cc3a1eb4 --- /dev/null +++ b/.github/workflows/hpc-load-balancer.yml @@ -0,0 +1,44 @@ +name: hpc-load-balancer + +on: + push: + pull_request: + branches: + - 'main' + + +jobs: + + build-and-setup: + runs-on: ubuntu-latest + container: ubuntu:latest + + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Dependencies + run: | + apt update && DEBIAN_FRONTEND="noninteractive" apt install -y g++ make wget curl tar + + - name: Build load balancer binary + run: | + cd hpc && make build-load-balancer + + - name: Download and setup hq binary + run: | + url=$(curl -sL https://api.github.com/repos/It4innovations/hyperqueue/releases/latest | \ + grep -o "\"browser_download_url\": \"https://[^\"]*-linux-x64.tar.gz\"" | \ + cut -d '"' -f 4) + if [ -z "$url" ]; then + echo "Error: URL not found" + exit 1 + fi + + version=$(echo $url | grep -o 'v[0-9]*\.[0-9]*\.[0-9]*') + echo "Version: $version" + + filename="hq-${version}-linux-x64.tar.gz" + wget $url -O $filename + tar xzf $filename + From 6f3d738c2802f4f0b349d81babf63561b964dd57 Mon Sep 17 00:00:00 2001 From: Crambor Date: Sun, 17 Mar 2024 19:11:28 +0000 Subject: [PATCH 3/6] fix(ci): accidentally grepped all occurrences --- .github/workflows/hpc-load-balancer.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff
--git a/.github/workflows/hpc-load-balancer.yml b/.github/workflows/hpc-load-balancer.yml index cc3a1eb4..5a2cb5c7 100644 --- a/.github/workflows/hpc-load-balancer.yml +++ b/.github/workflows/hpc-load-balancer.yml @@ -27,7 +27,7 @@ jobs: - name: Download and setup hq binary run: | - url=$(curl -sL https://api.github.com/repos/It4innovations/hyperqueue/releases/latest | \ + url=$(curl -sSL https://api.github.com/repos/It4innovations/hyperqueue/releases/latest | \ grep -o "\"browser_download_url\": \"https://[^\"]*-linux-x64.tar.gz\"" | \ cut -d '"' -f 4) if [ -z "$url" ]; then @@ -35,10 +35,10 @@ jobs: exit 1 fi - version=$(echo $url | grep -o 'v[0-9]*\.[0-9]*\.[0-9]*') + version=$(echo $url | grep -o 'v[0-9]*\.[0-9]*\.[0-9]*' | head -1) echo "Version: $version" filename="hq-${version}-linux-x64.tar.gz" - wget $url -O $filename + wget -q $url -O $filename tar xzf $filename From 4648e3813326b8b57d9f0e442ef3d983900ae5e9 Mon Sep 17 00:00:00 2001 From: Crambor Date: Sun, 17 Mar 2024 19:13:35 +0000 Subject: [PATCH 4/6] chore(ci): QoL silencing on apt --- .github/workflows/hpc-load-balancer.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/hpc-load-balancer.yml b/.github/workflows/hpc-load-balancer.yml index 5a2cb5c7..37b5657b 100644 --- a/.github/workflows/hpc-load-balancer.yml +++ b/.github/workflows/hpc-load-balancer.yml @@ -19,7 +19,7 @@ jobs: - name: Dependencies run: | - apt update && DEBIAN_FRONTEND="noninteractive" apt install -y g++ make wget curl tar + apt update -qq && DEBIAN_FRONTEND="noninteractive" apt install -yq g++ make wget curl tar - name: Build load balancer binary run: | From 28ce6f16bd39653c067e50b2e486c8dc96cf9ee8 Mon Sep 17 00:00:00 2001 From: Crambor Date: Sun, 17 Mar 2024 19:22:02 +0000 Subject: [PATCH 5/6] fix(ci): remove unnecessary version checking logic --- .github/workflows/hpc-load-balancer.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git 
a/.github/workflows/hpc-load-balancer.yml b/.github/workflows/hpc-load-balancer.yml index 37b5657b..90ace9bf 100644 --- a/.github/workflows/hpc-load-balancer.yml +++ b/.github/workflows/hpc-load-balancer.yml @@ -35,10 +35,8 @@ jobs: exit 1 fi - version=$(echo $url | grep -o 'v[0-9]*\.[0-9]*\.[0-9]*' | head -1) - echo "Version: $version" - - filename="hq-${version}-linux-x64.tar.gz" + filename="hq-linux-x64.tar.gz" wget -q $url -O $filename tar xzf $filename + ./hq --version From db74b1703681b2230cf17c594f26fb0ee2e80e9c Mon Sep 17 00:00:00 2001 From: crambor Date: Sun, 24 Mar 2024 16:44:33 +0200 Subject: [PATCH 6/6] fix: #59 add hq binary check in allocation queue script --- hpc/hq_scripts/allocation_queue.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/hpc/hq_scripts/allocation_queue.sh b/hpc/hq_scripts/allocation_queue.sh index 6e48073e..35e4783f 100755 --- a/hpc/hq_scripts/allocation_queue.sh +++ b/hpc/hq_scripts/allocation_queue.sh @@ -1,8 +1,12 @@ #! /bin/bash # Note: For runs on systems without SLURM, replace the slurm allocator by -# hq worker start & +# ./hq worker start & +if [[ ! -f "./hq" ]]; then + echo "Error: hq binary does not exist at ./hq" + exit 1 +fi ./hq alloc add slurm --time-limit 10m \ --idle-timeout 3m \