From 3b204ab73bfaded8a80a0dfbebeac014ecfd8bbd Mon Sep 17 00:00:00 2001 From: bio-boris Date: Tue, 28 Oct 2025 12:38:34 -0500 Subject: [PATCH 01/22] Add manual build and push workflow --- .github/workflows/manual-build.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 .github/workflows/manual-build.yml diff --git a/.github/workflows/manual-build.yml b/.github/workflows/manual-build.yml new file mode 100644 index 0000000..bbdaea7 --- /dev/null +++ b/.github/workflows/manual-build.yml @@ -0,0 +1,17 @@ +--- +name: Manual Build & Push +on: + workflow_dispatch: + inputs: + platforms: + description: 'The platforms for which the Docker image should be built. If not specified, defaults to linux/amd64.' + required: false + default: 'linux/amd64' +jobs: + build-push: + uses: kbase/.github/.github/workflows/reusable_build-push.yml@main + with: + name: '${{ github.event.repository.name }}-develop' + tags: br-${{ github.ref_name }} + platforms: ${{ github.event.inputs.platforms }} + secrets: inherit From bd48733c19c10c2dfdff648b1691d44f445dde8c Mon Sep 17 00:00:00 2001 From: bio-boris Date: Tue, 28 Oct 2025 13:47:30 -0500 Subject: [PATCH 02/22] Update build --- Dockerfile | 31 ++++--------------------------- 1 file changed, 4 insertions(+), 27 deletions(-) diff --git a/Dockerfile b/Dockerfile index ce9915c..77778cc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,18 +1,13 @@ -FROM centos:7 -ENV container docker +FROM htcondor/base:25.0.1-el9 # Get commonly used utilities -RUN yum -y update && yum update -y systemd && yum -y install -y epel-release wget which git deltarpm gcc libcgroup libcgroup-tools stress-ng tmpwatch +RUN yum -y update && yum update -y systemd && yum -y install -y epel-release wget which git gcc stress-ng tmpwatch bzip2 # Install docker binaries RUN yum install -y yum-utils device-mapper-persistent-data lvm2 && yum-config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo && yum install -y docker-ce -# Get 
Java -RUN yum install -y java-11-openjdk java-11-openjdk-devel openjdk-11-jdk-headless -#Install Python3 and Libraries (source /root/miniconda/bin/activate) -RUN yum install -y bzip2 \ -&& wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh \ +RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh \ && bash ~/miniconda.sh -b -p /miniconda \ && export PATH="/miniconda/bin:$PATH" @@ -29,11 +24,6 @@ RUN wget -N https://github.com/kbase/dockerize/raw/master/dockerize-linux-amd64- # Also add the user to the groups that map to "docker" on Linux and "daemon" on Mac RUN usermod -a -G 0 kbase && usermod -a -G 999 kbase -# Install HTCondor -RUN cd /etc/yum.repos.d && \ -wget http://research.cs.wisc.edu/htcondor/yum/repo.d/htcondor-development-rhel7.repo && \ -wget http://research.cs.wisc.edu/htcondor/yum/RPM-GPG-KEY-HTCondor && \ -rpm --import RPM-GPG-KEY-HTCondor && yum -y install condor #ADD DIRS RUN mkdir -p /var/run/condor && mkdir -p /var/log/condor && mkdir -p /var/lock/condor && mkdir -p /var/lib/condor/execute @@ -47,14 +37,9 @@ ARG BRANCH=develop RUN rm -rf /var/cache/yum ENV PATH /miniconda/bin:$PATH -# RUN \ -# git clone https://github.com/scanon/JobRunner && \ -# cd JobRunner && git checkout setup && \ -# pip install -r requirements.txt && \ -# python ./setup.py install && cd .. 
&& rm -rf JobRunner + RUN wget https://raw.githubusercontent.com/kbase/JobRunner/master/requirements.txt && pip install -r requirements.txt && rm requirements.txt -#auth_service_url=https://appdev.kbase.us/services/auth/api/legacy/KBase/Sessions/Login COPY --chown=kbase deployment/ /kb/deployment/ @@ -62,14 +47,6 @@ RUN /kb/deployment/bin/install_python_dependencies.sh ENV KB_DEPLOYMENT_CONFIG /kb/deployment/conf/deployment.cfg -# The BUILD_DATE value seem to bust the docker cache when the timestamp changes, move to -# the end -LABEL org.label-schema.build-date=$BUILD_DATE \ - org.label-schema.vcs-url="https://github.com/kbase/condor-worker.git" \ - org.label-schema.vcs-ref=$VCS_REF \ - org.label-schema.schema-version="1.0.0" \ - us.kbase.vcs-branch=$BRANCH \ - maintainer="Steve Chan sychan@lbl.gov" ENTRYPOINT [ "/kb/deployment/bin/dockerize" ] CMD [ "-template", "/kb/deployment/conf/.templates/deployment.cfg.templ:/kb/deployment/conf/deployment.cfg", \ From 34d3952cf12f11241672923238f91ca0e9ffdefd Mon Sep 17 00:00:00 2001 From: bio-boris Date: Tue, 28 Oct 2025 13:48:26 -0500 Subject: [PATCH 03/22] Create pr_build.yml --- .github/workflows/pr_build.yml | 48 ++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 .github/workflows/pr_build.yml diff --git a/.github/workflows/pr_build.yml b/.github/workflows/pr_build.yml new file mode 100644 index 0000000..5592521 --- /dev/null +++ b/.github/workflows/pr_build.yml @@ -0,0 +1,48 @@ +--- +name: Pull Request Build, Tag, & Push +on: + pull_request: + branches: + - develop + - main + - master + types: + - opened + - reopened + - synchronize + - closed +jobs: + build-develop-open: + if: github.base_ref == 'develop' && github.event.pull_request.merged == false + uses: kbase/.github/.github/workflows/reusable_build.yml@main + with: + platforms: "linux/amd64" + secrets: inherit + build-develop-merge: + if: github.base_ref == 'develop' && github.event.pull_request.merged == true + uses: 
kbase/.github/.github/workflows/reusable_build-push.yml@main + with: + name: '${{ github.event.repository.name }}-develop' + tags: pr-${{ github.event.number }},latest + platforms: "linux/amd64" + secrets: inherit + build-main-open: + if: (github.base_ref == 'main' || github.base_ref == 'master') && github.event.pull_request.merged == false + uses: kbase/.github/.github/workflows/reusable_build-push.yml@main + with: + name: '${{ github.event.repository.name }}' + tags: pr-${{ github.event.number }} + platforms: "linux/amd64" + secrets: inherit + build-main-merge: + if: (github.base_ref == 'main' || github.base_ref == 'master') && github.event.pull_request.merged == true + uses: kbase/.github/.github/workflows/reusable_build-push.yml@main + with: + name: '${{ github.event.repository.name }}' + tags: pr-${{ github.event.number }},latest-rc + platforms: "linux/amd64" + secrets: inherit + trivy-scans: + if: (github.base_ref == 'develop' || github.base_ref == 'main' || github.base_ref == 'master' ) && github.event.pull_request.merged == false + uses: kbase/.github/.github/workflows/reusable_trivy-scans.yml@main + secrets: inherit From 4d462343346a8ae0dba2fd97d2155913fc24495f Mon Sep 17 00:00:00 2001 From: bio-boris Date: Tue, 28 Oct 2025 13:48:49 -0500 Subject: [PATCH 04/22] Add GitHub Actions workflow for release process --- .github/workflows/release-main.yml | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 .github/workflows/release-main.yml diff --git a/.github/workflows/release-main.yml b/.github/workflows/release-main.yml new file mode 100644 index 0000000..095a92f --- /dev/null +++ b/.github/workflows/release-main.yml @@ -0,0 +1,26 @@ +--- +name: Release - Build & Push Image +on: + release: + branches: + - main + - master + types: [ published ] +jobs: + check-source-branch: + uses: kbase/.github/.github/workflows/reusable_validate-branch.yml@main + with: + build_branch: '${{ github.event.release.target_commitish }}' + 
validate-release-tag: + needs: check-source-branch + uses: kbase/.github/.github/workflows/reusable_validate-release-tag.yml@main + with: + release_tag: '${{ github.event.release.tag_name }}' + build-push: + needs: validate-release-tag + uses: kbase/.github/.github/workflows/reusable_build-push.yml@main + with: + name: '${{ github.event.repository.name }}' + tags: '${{ github.event.release.tag_name }},latest' + platforms: "linux/amd64" + secrets: inherit From 00ec2666dfe69a592110d130583b9d3992e37680 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Tue, 28 Oct 2025 14:17:49 -0500 Subject: [PATCH 05/22] Update build --- Dockerfile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 77778cc..47665a5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,7 +12,7 @@ RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh - && export PATH="/miniconda/bin:$PATH" # Add kbase user and set up directories -RUN useradd -c "KBase user" -rd /kb/deployment/ -u 998 -s /bin/bash kbase && \ +RUN useradd -c "KBase user" -rd /kb/deployment/ -u 1000 -s /bin/bash kbase && \ mkdir -p /kb/deployment/bin && \ mkdir -p /kb/deployment/jettybase/logs/ && \ touch /kb/deployment/jettybase/logs/request.log && \ @@ -21,8 +21,7 @@ RUN useradd -c "KBase user" -rd /kb/deployment/ -u 998 -s /bin/bash kbase && \ #INSTALL DOCKERIZE RUN wget -N https://github.com/kbase/dockerize/raw/master/dockerize-linux-amd64-v0.6.1.tar.gz && tar xvzf dockerize-linux-amd64-v0.6.1.tar.gz && cp dockerize /kb/deployment/bin && rm dockerize* -# Also add the user to the groups that map to "docker" on Linux and "daemon" on Mac -RUN usermod -a -G 0 kbase && usermod -a -G 999 kbase + #ADD DIRS @@ -39,7 +38,8 @@ RUN rm -rf /var/cache/yum ENV PATH /miniconda/bin:$PATH -RUN wget https://raw.githubusercontent.com/kbase/JobRunner/master/requirements.txt && pip install -r requirements.txt && rm requirements.txt +RUN pip install requests sanic==21.12.2 docker==7.0.0 + 
COPY --chown=kbase deployment/ /kb/deployment/ From f1631a712f7d85b06ad5168a1a100b4154c9e29e Mon Sep 17 00:00:00 2001 From: bio-boris Date: Tue, 28 Oct 2025 14:18:20 -0500 Subject: [PATCH 06/22] Update build --- Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 47665a5..3511bba 100644 --- a/Dockerfile +++ b/Dockerfile @@ -35,7 +35,7 @@ ARG BRANCH=develop # Maybe you want: rm -rf /var/cache/yum, to also free up space taken by orphaned data from disabled or removed repos RUN rm -rf /var/cache/yum -ENV PATH /miniconda/bin:$PATH +ENV PATH=/miniconda/bin:$PATH RUN pip install requests sanic==21.12.2 docker==7.0.0 @@ -45,7 +45,7 @@ COPY --chown=kbase deployment/ /kb/deployment/ RUN /kb/deployment/bin/install_python_dependencies.sh -ENV KB_DEPLOYMENT_CONFIG /kb/deployment/conf/deployment.cfg +ENV KB_DEPLOYMENT_CONFIG=/kb/deployment/conf/deployment.cfg ENTRYPOINT [ "/kb/deployment/bin/dockerize" ] From b8b703addc269640bec54a00432fd545906e2d14 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Tue, 28 Oct 2025 15:43:08 -0500 Subject: [PATCH 07/22] Docker --- deployment/bin/cron/health_check.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deployment/bin/cron/health_check.py b/deployment/bin/cron/health_check.py index bc5c6db..d097f2e 100755 --- a/deployment/bin/cron/health_check.py +++ b/deployment/bin/cron/health_check.py @@ -140,7 +140,7 @@ def test_docker_socket(): socket_gid = os.stat(socket).st_gid # TODO FIX THIS TEST.. 
GROUPS ARE NOT BEING CORRECTLY SET INSIDE THE DOCKER CONTAINER - gids = [999, 996, 995, 987] + gids = [1000, 999, 996, 995, 987] if socket_gid in gids: return @@ -155,7 +155,7 @@ def test_docker_socket2(): Check to see if the nobody user has access to the docker socket """ dc = docker.from_env() - if len(dc.containers.list()) < 1: + if not dc.ping(): message = f"Cannot access docker socket" exit_unsuccessfully(message) From 30c6d9f320a36be70027b80224affd77d5217ae6 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Tue, 28 Oct 2025 16:14:30 -0500 Subject: [PATCH 08/22] Update health check --- Dockerfile | 2 +- deployment/bin/cron/health_check.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 3511bba..38dc294 100644 --- a/Dockerfile +++ b/Dockerfile @@ -38,7 +38,7 @@ RUN rm -rf /var/cache/yum ENV PATH=/miniconda/bin:$PATH -RUN pip install requests sanic==21.12.2 docker==7.0.0 +RUN pip install requests sanic==21.12.2 docker==7.1.0 COPY --chown=kbase deployment/ /kb/deployment/ diff --git a/deployment/bin/cron/health_check.py b/deployment/bin/cron/health_check.py index d097f2e..de7b464 100755 --- a/deployment/bin/cron/health_check.py +++ b/deployment/bin/cron/health_check.py @@ -154,12 +154,15 @@ def test_docker_socket2(): """ Check to see if the nobody user has access to the docker socket """ - dc = docker.from_env() - if not dc.ping(): + try: + dc = docker.DockerClient(base_url='unix:///var/run/docker.sock') + dc.ping() + except Exception: message = f"Cannot access docker socket" exit_unsuccessfully(message) + def test_world_writeable(): """ Check to see if /mnt/awe/condor is writeable From f65563e8889218270f4e5f4aec5bcb39e8a0c0de Mon Sep 17 00:00:00 2001 From: bio-boris Date: Tue, 28 Oct 2025 16:53:31 -0500 Subject: [PATCH 09/22] udpate suffix --- deployment/bin/cron/health_check.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/deployment/bin/cron/health_check.py 
b/deployment/bin/cron/health_check.py index de7b464..105bf66 100755 --- a/deployment/bin/cron/health_check.py +++ b/deployment/bin/cron/health_check.py @@ -33,8 +33,10 @@ def send_slack_message(message: str): debug = False -scratch = os.environ.get("CONDOR_SUBMIT_WORKDIR", "/cdr") -scratch += os.environ.get("EXECUTE_SUFFIX", "") +workdir = os.environ.get("CONDOR_SUBMIT_WORKDIR", "/cdr") +suffix = os.environ.get("EXECUTE_SUFFIX", "") +scratch = f"{workdir}/{suffix}" + check_condor_starter_health = ( os.environ.get("CHECK_CONDOR_STARTER_HEALTH", "true").lower() == "true" ) From d4cbb4f3097aaad7436573b5ef43c45346c3cbca Mon Sep 17 00:00:00 2001 From: bio-boris Date: Tue, 28 Oct 2025 17:03:26 -0500 Subject: [PATCH 10/22] Update health check --- deployment/bin/cron/health_check.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/deployment/bin/cron/health_check.py b/deployment/bin/cron/health_check.py index 105bf66..1b27ea9 100755 --- a/deployment/bin/cron/health_check.py +++ b/deployment/bin/cron/health_check.py @@ -208,8 +208,7 @@ def checkEndpoints(): Check auth/njs/catalog/ws """ - services = { - f"{endpoint}/auth": {}, + post_services = { f"{endpoint}/catalog": { "method": "Catalog.status", "version": "1.1", @@ -223,10 +222,14 @@ def checkEndpoints(): "params": [], }, } + get_services = {f"{endpoint}/auth": {}} - for service in services: + for service in {**post_services, **get_services}: try: - response = requests.post(url=service, json=services[service], timeout=30) + if service in post_services: + response = requests.post(url=service, json=post_services[service], timeout=30) + else: + response = requests.get(url=service, timeout=30) if response.status_code != 200: message = f"{service} is not available." 
exit_unsuccessfully(message) @@ -235,6 +238,7 @@ def checkEndpoints(): exit_unsuccessfully(message) + def main(): try: # send_slack_message(f"Job HEALTH_CHECK is beginning at {datetime.datetime.now()}") From e5063e84f9c0a3937bf3ca36c5c40fd89654cce3 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Tue, 28 Oct 2025 17:44:47 -0500 Subject: [PATCH 11/22] Update cronjobs --- deployment/bin/cron/container_reaper.py | 229 ++++++++---------- deployment/bin/cron/container_reaper_ee2.py | 154 ------------ .../bin/cron/delete_exited_containers.py | 28 ++- deployment/bin/install_python_dependencies.sh | 6 - deployment/bin/misc/java_stats.sh | 10 - deployment/bin/misc/jshell-wrapper | 7 - deployment/condor_config.local.jinja | 0 .../conf/.templates/cronjobs.config.templ | 36 ++- .../deployment.cfg.templ | 0 .../limitBigMemSlots.templ | 0 .../shared_port_config.templ | 0 .../start_server.sh.templ | 0 12 files changed, 133 insertions(+), 337 deletions(-) delete mode 100755 deployment/bin/cron/container_reaper_ee2.py delete mode 100755 deployment/bin/install_python_dependencies.sh delete mode 100755 deployment/bin/misc/java_stats.sh delete mode 100755 deployment/bin/misc/jshell-wrapper create mode 100644 deployment/condor_config.local.jinja rename deployment/conf/{.templates => legacy}/deployment.cfg.templ (100%) rename deployment/conf/{.templates => legacy}/limitBigMemSlots.templ (100%) rename deployment/conf/{.templates => legacy}/shared_port_config.templ (100%) rename deployment/conf/{.templates => legacy}/start_server.sh.templ (100%) diff --git a/deployment/bin/cron/container_reaper.py b/deployment/bin/cron/container_reaper.py index 8a6f948..522fadd 100755 --- a/deployment/bin/cron/container_reaper.py +++ b/deployment/bin/cron/container_reaper.py @@ -1,163 +1,140 @@ #!/miniconda/bin/python -import datetime -import fnmatch +""" +This script is automatically run by the condor cronjob periodically +in order to clean up containers > 7 days or running without a starter +Required env 
vars are +# CONTAINER_REAPER_ENDPOINTS - A comma separated list of EE2 endpoints to manage containers for +# DELETE_ABANDONED_CONTAINERS - Set to true to enable the container reaper +# SLACK_WEBHOOK_URL - The slack webhook url to send messages to +""" + import json -import logging import os import socket +import subprocess +import time +from datetime import datetime, timedelta +from typing import Set import docker -import psutil import requests -from clients.NarrativeJobServiceClient import NarrativeJobService - -from typing import List, Dict - -slack_key = os.environ.get("SLACK_WEBHOOK_KEY", None) -# ee_notifications_channel -webhook_url = os.environ.get("SLACK_WEBHOOK_URL", None) - -kill = os.environ.get("DELETE_ABANDONED_CONTAINERS", "false") -if kill.lower() == "true": - kill = True -else: - kill = False - -njs_endpoint_url = os.environ.get("NJS_ENDPOINT", None) - -if njs_endpoint_url is None: - raise Exception("NJS Endpoint not set") - -hostname = socket.gethostname() -dc = docker.from_env() - - -def find_dockerhub_jobs() -> Dict: - # send_slack_message(f"Job CONTAINER_REAPER is FINDING DOCKERHUB JOBS at {datetime.datetime.now()}") - - try: - all_containers = dc.containers - list = all_containers.list() - except Exception as e: - send_slack_message(str(e) + hostname) - - job_containers = {} - - for container in list: - cnt_id = container.id - try: - cnt = all_containers.get(cnt_id) - labels = cnt.labels - if "condor_id" in labels.keys() and "njs_endpoint" in labels.keys(): - labels["image"] = cnt.image - job_containers[cnt_id] = labels - except Exception as e: - logging.error(f"Container {cnt_id} doesn't exist anymore") - logging.error(e) - - return job_containers - - -def find_running_jobs(ps_name: str): - # send_slack_message(f"Job CONTAINER_REAPER is FINDING RUNNING JOBS at {datetime.datetime.now()}") - - "Return a list of processes matching 'name'." 
- ls = [] - for p in psutil.process_iter(attrs=["name", "cmdline"]): - if ps_name in p.info["cmdline"]: - ls.append(p.info["cmdline"][-2]) - return ls +from docker.models.containers import Container def send_slack_message(message: str): """ - :param message: Escaped Message to send to slack - :return: """ - + webhook_url = os.environ.get("SLACK_WEBHOOK_URL", None) slack_data = {"text": message} - response = requests.post( + requests.post( webhook_url, data=json.dumps(slack_data), headers={"Content-Type": "application/json"}, ) -def notify_slack(cnt_id: str, labels: dict(), running_job_ids: List): - now = datetime.datetime.now() +def filter_containers_by_time(potential_containers, days=0, minutes=0): + filtered_containers = [] + seven_days_ago = datetime.now() - timedelta(days=days, minutes=minutes) - job_id = labels.get("job_id", None) - # app_id = labels['app_id'] - app_name = labels.get("app_name", None) - method_name = labels.get("method_name", None) - condor_id = labels.get("condor_id", None) - username = labels.get("user_name", None) + for old_container in potential_containers: + # Do we need to catch the chance that there is no created attribute? 
+ created_time_str = old_container.attrs['Created'][:26] + created_time = datetime.fromisoformat(created_time_str) + if created_time <= seven_days_ago: + filtered_containers.append(old_container) + return filtered_containers - msg = f"cnt_id:{cnt_id} job_id:{job_id} condor_id:{condor_id} for {username} not in running_job_ids {running_job_ids} ({now}) hostname:({hostname}) app:{app_name} method:{method_name} (kill = {kill}) " - send_slack_message(msg) +def get_running_time_message(container, title=""): + image_name = container.attrs['Config']['Image'] + if "kbase" in image_name: + image_name = image_name.split(":")[1] + user_name = container.attrs['Config']['Labels'].get('user_name') -# @deprecated for EVENTLOG -def notify_user(cnt_id: str, labels: Dict): - username = labels.get("user_name", None) - job_id = labels.get("job_id", None) - # TODO add this to a configuration somewhere or ENV variable - job_directory = f"/mnt/awe/condor/{username}/{job_id}" + total_running_time = datetime.now() - datetime.fromisoformat(container.attrs['Created'][:26]) + days = total_running_time.days + hours = total_running_time.seconds // 3600 - print("About to notify") - print(labels) + formatted_running_time = f"{days}D:{hours}H" + return f"{title}:{hostname} {image_name}:{user_name}:{formatted_running_time}" - env_files = [] - for file in os.listdir(job_directory): - if fnmatch.fnmatch(file, "env_*"): - env_files.append(file) +def remove_with_backoff(container,message,backoff=30): + try: + container.stop() + time.sleep(backoff) # Wait for backoff period before attempting to remove + container.remove() + except Exception as e: + # Not much we can do here, just hope that the next pass will remove it + pass +def reap_containers_running_more_than_7_days(potential_containers: Set[Container]): + old_containers = filter_containers_by_time(potential_containers, days=7) - print(env_files) - env_filepath = env_files[0] - if os.path.isfile(env_filepath): - with open(env_filepath, "r") as 
content_file: - content = content_file.readlines() + if old_containers: + for old_container in old_containers: + message = get_running_time_message(old_container, title="reaper7daylimit") + send_slack_message(message) + remove_with_backoff(old_container, message) - token = None - for line in content: - if "KB_AUTH_TOKEN" in line: - token = line.split("=")[1] - if token: - njs = NarrativeJobService(token=token, url=njs_endpoint_url) - status = njs.check_job(job_id) - print(status) +def reap_containers_when_there_is_no_starter(potential_containers: Set[Container]): + """ + This function will reap containers that are running but have no starter, and have been running for 30 mins + """ + condor_starter = check_for_condor_starter() + if condor_starter: + return -def kill_docker_container(cnt_id: str): - if kill is True: - cnt = dc.containers.get(cnt_id) - cnt.kill() - else: - pass + runaway_containers = filter_containers_by_time(potential_containers, minutes=30) + if runaway_containers: + for runaway_container in runaway_containers: + message = get_running_time_message(runaway_container, title="reaper_no_starter") + send_slack_message(message) + remove_with_backoff(container,message) -def kill_dead_jobs(running_jobs: List, docker_processes: Dict): - # send_slack_message(f"Job CONTAINER_REAPER is KILLING DEAD JOBS at {datetime.datetime.now()}") - for cnt_id in docker_processes: - labels = docker_processes[cnt_id] - job_id = labels.get("job_id", None) - if job_id not in running_jobs: - if kill is True: - kill_docker_container(cnt_id) - notify_slack(cnt_id, labels, running_jobs) +def check_for_condor_starter(): + result = subprocess.run("ps -ef | grep '[c]ondor_starter'", shell=True, stdout=subprocess.PIPE, text=True) + count = len(result.stdout.strip().split('\n')) if result.stdout.strip() else 0 + return count > 0 if __name__ == "__main__": - try: - # send_slack_message(f"Job CONTAINER_REAPER is beginning at {datetime.datetime.now()}") - name = 
"us.kbase.narrativejobservice.sdkjobs.SDKLocalMethodRunner" + """ + PDSH_SSH_ARGS_APPEND="-o StrictHostKeyChecking=no -q" pdsh -w rancher@km[2-28]-p "docker ps | grep kbase| grep days" | sort -V | grep -v worker + """ - running_java_jobs = find_running_jobs(name) - docker_jobs = find_dockerhub_jobs() - kill_dead_jobs(running_java_jobs, docker_jobs) - # send_slack_message(f"Job CONTAINER_REAPER is ENDING at {datetime.datetime.now()}") - except Exception as e: - send_slack_message(f"FAILURE on {hostname}" + str(e.with_traceback())) - logging.error(e.with_traceback()) + CONTAINER_REAPER_ENDPOINTS = os.environ.get("CONTAINER_REAPER_ENDPOINTS", "").split(",") + DELETE_ABANDONED_CONTAINERS = os.environ.get("DELETE_ABANDONED_CONTAINERS", "false").lower() == "true" + + if not DELETE_ABANDONED_CONTAINERS: + exit("DELETE_ABANDONED_CONTAINERS is not set to true") + if not CONTAINER_REAPER_ENDPOINTS or CONTAINER_REAPER_ENDPOINTS == [""]: + exit("No CONTAINER_REAPER_ENDPOINTS set, unsure where to manage containers") + + hostname = socket.gethostname() + dc = docker.from_env() + + # Define the filters to specify that you are searching for only your specific containers in a multi worker environment + # Also add user_name as a filter to make sure you aren't killing containers that happen to have EE2_ENDPOINT set, + # The chances of EE2_endpoint and user_name as labels on a container should be very small. 
+ # CONTAINER_REAPER_ENDPOINTS = ["https://kbase.us/services/ee2", "https://appdev.kbase.us/services/ee2", "https://services.kbase.us/services/ee2/"] + unique_containers = set() + filters = {} + for endpoint in CONTAINER_REAPER_ENDPOINTS: + + filters.update({ + "status": "running", + "label": [ + f"ee2_endpoint={endpoint.strip()}", + "user_name" + ] + }) + containers = dc.containers.list(filters=filters) + for container in containers: + unique_containers.add(container) + + reap_containers_running_more_than_7_days(potential_containers=unique_containers) + reap_containers_when_there_is_no_starter(potential_containers=unique_containers) diff --git a/deployment/bin/cron/container_reaper_ee2.py b/deployment/bin/cron/container_reaper_ee2.py deleted file mode 100755 index d93ca77..0000000 --- a/deployment/bin/cron/container_reaper_ee2.py +++ /dev/null @@ -1,154 +0,0 @@ -#!/miniconda/bin/python -import datetime -import json -import logging -import os -import socket -from typing import List, Dict - -import docker -from docker.models.containers import Container -import psutil -import requests - -logging.basicConfig(level=logging.INFO) - -slack_key = os.environ.get("SLACK_WEBHOOK_KEY", None) -# ee_notifications_channel -webhook_url = os.environ.get("SLACK_WEBHOOK_URL", None) - -kill = os.environ.get("DELETE_ABANDONED_CONTAINERS", "false") -if kill.lower() == "true": - kill = True -else: - kill = False - -ee2_endpoint_url = os.environ.get("EE2_ENDPOINT", None) - -if ee2_endpoint_url is None: - raise Exception("EE2 Endpoint not set") - -hostname = socket.gethostname() -dc = docker.from_env() - - -def find_dockerhub_jobs() -> Dict: - # send_slack_message(f"Job CONTAINER_REAPER is FINDING DOCKERHUB JOBS at {datetime.datetime.now()}") - - try: - all_containers = dc.containers - container_list = all_containers.list() - except Exception as e: - send_slack_message(str(e) + hostname) - raise e - - job_containers = {} - - for container in container_list: - cnt_id = container.id - try: 
- cnt = all_containers.get(cnt_id) - labels = cnt.labels - label_keys = labels.keys() - if ( - "condor_id" in label_keys - and "ee2_endpoint" in label_keys - and "worker_hostname" in label_keys - ): - if ( - labels.get("worker_hostname") == hostname - and labels.get("ee2_endpoint") == ee2_endpoint_url - ): - labels["image"] = cnt.image - job_containers[cnt_id] = labels - except Exception as e: - logging.error(f"Container {cnt_id} doesn't exist anymore") - logging.error(e) - - return job_containers - - -def find_running_jobs(): - "Return a list of job ids from running job processes. Since python procs have multiple entries, keep only 1 version" - # send_slack_message(f"Job CONTAINER_REAPER is FINDING RUNNING JOBS at {datetime.datetime.now()}") - ls = [] - for p in psutil.process_iter(attrs=["name", "cmdline"]): - if ( - "/miniconda/bin/python" in p.info["cmdline"] - and "./jobrunner.py" in p.info["cmdline"] - ): - ls.append(p.info["cmdline"][-2]) - return list(set(ls)) - - -def send_slack_message(message: str): - """ - - :param message: Escaped Message to send to slack - :return: - """ - - slack_data = {"text": message} - response = requests.post( - webhook_url, - data=json.dumps(slack_data), - headers={"Content-Type": "application/json"}, - ) - - -def notify_slack(cnt_id: str, labels: dict(), running_job_ids: List): - now = datetime.datetime.now() - - job_id = labels.get("job_id", None) - # app_id = labels['app_id'] - app_name = labels.get("app_name", None) - method_name = labels.get("method_name", None) - condor_id = labels.get("condor_id", None) - username = labels.get("user_name", None) - - msg = f"cnt_id:{cnt_id} job_id:{job_id} condor_id:{condor_id} for {username} not in running_job_ids {running_job_ids} ({now}) hostname:({hostname}) app:{app_name} method:{method_name} (kill = {kill}) " - send_slack_message(msg) - - -def kill_docker_container(cnt_id: str): - """ - Kill a docker container. The job finish script should clean up after itself. 
- :param cnt_id: The container to kill/remove - """ - if kill is True: - cnt = dc.containers.get(cnt_id) # type: Container - try: - cnt.kill() - except Exception: - try: - cnt.remove(force=True) - except Exception: - send_slack_message(f"Couldn't delete {cnt_id} on {hostname}") - - -def kill_dead_jobs(running_jobs: List, docker_processes: Dict): - """ - Check whether there are runaway docker containers - :param running_jobs: A list of condor jobs gathered from the starter scripts - :param docker_processes: A list of docker containers - """ - # send_slack_message(f"Job CONTAINER_REAPER is KILLING DEAD JOBS at {datetime.datetime.now()}") - for cnt_id in docker_processes: - labels = docker_processes[cnt_id] - job_id = labels.get("job_id", None) - if job_id not in running_jobs: - notify_slack(cnt_id, labels, running_jobs) - if kill is True: - kill_docker_container(cnt_id) - - -if __name__ == "__main__": - try: - # send_slack_message(f"Job CONTAINER_REAPER is beginning at {datetime.datetime.now()}") - locally_running_jobrunners = find_running_jobs() - docker_jobs = find_dockerhub_jobs() - kill_dead_jobs(locally_running_jobrunners, docker_jobs) - # send_slack_message(f"Job CONTAINER_REAPER is ENDING at {datetime.datetime.now()}") - except Exception as e: - send_slack_message(f"FAILURE on {hostname}" + str(e)) - logging.error(str(e)) diff --git a/deployment/bin/cron/delete_exited_containers.py b/deployment/bin/cron/delete_exited_containers.py index 60c9ca5..7dfb2e3 100755 --- a/deployment/bin/cron/delete_exited_containers.py +++ b/deployment/bin/cron/delete_exited_containers.py @@ -1,17 +1,18 @@ #!/miniconda/bin/python -import os +# This script is automatically run by the condor cronjob periodically +# in order to clean up exited docker containers. 
import json -import requests -import docker +import os import socket -import datetime + +import docker +import requests def send_slack_message(message: str): """ :param message: Escaped Message to send to slack """ - # ee_notifications_channel webhook_url = os.environ.get("SLACK_WEBHOOK_URL", None) slack_data = {"text": message} requests.post( @@ -22,14 +23,15 @@ def send_slack_message(message: str): if __name__ == "__main__": - # send_slack_message(f"Job DELETE_EXITED is beginning at {datetime.datetime.now()}") hostname = socket.gethostname() dc = docker.from_env() ec = dc.containers.list(filters={"status": "exited"}) - count = len(ec) - - if count > 0: - dc.containers.prune() - send_slack_message(f"Deleted {count} stopped containers on {hostname}") - - # send_slack_message(f"Job DELETE_EXITED is ENDING at {datetime.datetime.now()}") + kbase_containers = [c for c in ec if "kbase" in c.attrs["Config"]["Image"]] + container_image_names = [c.attrs["Config"]["Image"] for c in kbase_containers] + if kbase_containers: + for container in kbase_containers: + container.remove() + debug_mode = os.environ.get("DEBUG", "false").lower() == "true" + if debug_mode: + send_slack_message( + f"Deleted {len(kbase_containers)} `exited` containers with 'kbase' in image name on {hostname}: {container_image_names}") diff --git a/deployment/bin/install_python_dependencies.sh b/deployment/bin/install_python_dependencies.sh deleted file mode 100755 index 401325c..0000000 --- a/deployment/bin/install_python_dependencies.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env bash - -#Install Python3 Libraries -#TODO Requirements.txt -source /miniconda/bin/activate -pip install requests docker slackclient htcondor psutil lockfile diff --git a/deployment/bin/misc/java_stats.sh b/deployment/bin/misc/java_stats.sh deleted file mode 100755 index aed096e..0000000 --- a/deployment/bin/misc/java_stats.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env jshell-wrapper - -System.out.println("Available 
Processors"); -System.out.println(Runtime.getRuntime().availableProcessors()); - -System.out.println("Free Memory"); -System.out.println(Runtime.getRuntime().freeMemory() + " " + Runtime.getRuntime().freeMemory() / 1000000000 + "G"); - -System.out.println("Max Memory"); -System.out.println(Runtime.getRuntime().maxMemory() + " " + Runtime.getRuntime().maxMemory() / 1000000000 + "G"); diff --git a/deployment/bin/misc/jshell-wrapper b/deployment/bin/misc/jshell-wrapper deleted file mode 100755 index ab5ad0f..0000000 --- a/deployment/bin/misc/jshell-wrapper +++ /dev/null @@ -1,7 +0,0 @@ -TMP=`mktemp` -tail -n +2 $@ >> $TMP -echo "/exit" >> $TMP -$JAVA_HOME/bin/jshell -q --execution local $TMP -rm $TMP - -#put this file in /usr/local/bin/ or somewhere in your $PATH diff --git a/deployment/condor_config.local.jinja b/deployment/condor_config.local.jinja new file mode 100644 index 0000000..e69de29 diff --git a/deployment/conf/.templates/cronjobs.config.templ b/deployment/conf/.templates/cronjobs.config.templ index b0cbb03..f490f93 100644 --- a/deployment/conf/.templates/cronjobs.config.templ +++ b/deployment/conf/.templates/cronjobs.config.templ @@ -1,32 +1,26 @@ -# SLACK_WEBHOOK_KEY={{ .Env.SLACK_WEBHOOK_KEY }} - -# startd hook to check if node is healthy +# This checks if the node is healthy and reports to slack if it is not. 
Sets NODE_IS_HEALTHY to True or False STARTD_CRON_NodeHealth_EXECUTABLE = /kb/deployment/bin/cron/health_check.py STARTD_CRON_NodeHealth_PERIOD = 6m STARTD_CRON_NodeHealth_MODE = Periodic STARTD_CRON_NodeHealth_RECONFIG_RERUN = True -STARTD_CRON_NodeHealth_ENV = "SLACK_WEBHOOK_URL={{ .Env.SLACK_WEBHOOK_URL }} SERVICE_ENDPOINT={{ .Env.SERVICE_ENDPOINT }} CONDOR_SUBMIT_WORKDIR={{ .Env.CONDOR_SUBMIT_WORKDIR }} DOCKER_CACHE={{ .Env.DOCKER_CACHE }} DELETE_ABANDONED_CONTAINERS={{ .Env.DELETE_ABANDONED_CONTAINERS }}" +STARTD_CRON_NodeHealth_ENV = "SLACK_WEBHOOK_URL={{ .Env.SLACK_WEBHOOK_URL }} SERVICE_ENDPOINT={{ .Env.SERVICE_ENDPOINT }} CONDOR_SUBMIT_WORKDIR={{ .Env.CONDOR_SUBMIT_WORKDIR }} DOCKER_CACHE={{ .Env.DOCKER_CACHE }} EXECUTE_SUFFIX={{ .Env.EXECUTE_SUFFIX }} CHECK_CONDOR_STARTER_HEALTH={{ .Env.CHECK_CONDOR_STARTER_HEALTH }} " + -# startd hook to delete exited containers +# startd hook to delete exited containers (Might want to leave this longer for debugging) STARTD_CRON_DeleteExitedContainers_EXECUTABLE = /kb/deployment/bin/cron/delete_exited_containers.py -STARTD_CRON_DeleteExitedContainers_PERIOD = 10m +STARTD_CRON_DeleteExitedContainers_PERIOD = 30m STARTD_CRON_DeleteExitedContainers_MODE = Periodic STARTD_CRON_DeleteExitedContainers_RECONFIG_RERUN = True STARTD_CRON_DeleteExitedContainers_ENV = "SLACK_WEBHOOK_URL={{ .Env.SLACK_WEBHOOK_URL }}" -# startd hook to delete abandoned containers -STARTD_CRON_ReapAbandondedContainers_EXECUTABLE = /kb/deployment/bin/cron/container_reaper.py -STARTD_CRON_ReapAbandondedContainers_PERIOD = 6m -STARTD_CRON_ReapAbandondedContainers_MODE = Periodic -STARTD_CRON_ReapAbandondedContainers_RECONFIG_RERUN = True -STARTD_CRON_ReapAbandondedContainers_ENV = "SLACK_WEBHOOK_URL={{ .Env.SLACK_WEBHOOK_URL }} SERVICE_ENDPOINT={{ .Env.SERVICE_ENDPOINT }} CONDOR_SUBMIT_WORKDIR={{ .Env.CONDOR_SUBMIT_WORKDIR }} DOCKER_CACHE={{ .Env.DOCKER_CACHE }} DELETE_ABANDONED_CONTAINERS={{ .Env.DELETE_ABANDONED_CONTAINERS }}" -# startd hook to delete 
abandoned containers -STARTD_CRON_ReapAbandondedContainersEE2_EXECUTABLE = /kb/deployment/bin/cron/container_reaper_ee2.py -STARTD_CRON_ReapAbandondedContainersEE2_PERIOD = 6m -STARTD_CRON_ReapAbandondedContainersEE2_MODE = Periodic -STARTD_CRON_ReapAbandondedContainersEE2_RECONFIG_RERUN = True -STARTD_CRON_ReapAbandondedContainersEE2_ENV = "EE2_ENDPOINT={{ .Env.EE2_ENDPOINT }} SLACK_WEBHOOK_URL={{ .Env.SLACK_WEBHOOK_URL }} SERVICE_ENDPOINT={{ .Env.SERVICE_ENDPOINT }} CONDOR_SUBMIT_WORKDIR={{ .Env.CONDOR_SUBMIT_WORKDIR }} DOCKER_CACHE={{ .Env.DOCKER_CACHE }} DELETE_ABANDONED_CONTAINERS={{ .Env.DELETE_ABANDONED_CONTAINERS }}" +# Container Reaper Version 2024 +STARTD_CRON_ContainerReaper_EXECUTABLE = /kb/deployment/bin/cron/container_reaper.py +STARTD_CRON_ContainerReaper_PERIOD = 6m +STARTD_CRON_ContainerReaper_MODE = Periodic +STARTD_CRON_ContainerReaper_RECONFIG_RERUN = True +STARTD_CRON_ContainerReaper_ENV = "SLACK_WEBHOOK_URL={{ .Env.SLACK_WEBHOOK_URL }} CONTAINER_REAPER_ENDPOINTS={{ .Env.CONTAINER_REAPER_ENDPOINTS }} DELETE_ABANDONED_CONTAINERS={{ .Env.DELETE_ABANDONED_CONTAINERS }}" + # Tmpwatch $CONDOR_SUBMIT_WORKDIR STARTD_CRON_ManageCondorSubmitWorkdir_EXECUTABLE = /usr/sbin/tmpwatch @@ -36,12 +30,12 @@ STARTD_CRON_ManageCondorSubmitWorkdir_MODE = Periodic STARTD_CRON_ManageCondorSubmitWorkdir_RECONFIG_RERUN = True STARTD_CRON_ManageCondorSubmitWorkdir_ENV = "CONDOR_SUBMIT_WORKDIR={{ .Env.CONDOR_SUBMIT_WORKDIR }} " -# Prune docker every 14 days.. 
This works right now, but need to redirect to a script +# Prune docker every 14 days STARTD_CRON_ManageVarLibDocker_EXECUTABLE = /usr/bin/docker STARTD_CRON_ManageVarLibDocker_ARGS = system prune -a -f STARTD_CRON_ManageVarLibDocker_PERIOD = 336h STARTD_CRON_ManageVarLibDocker_MODE = Periodic -STARTD_CRON_ManageCondorSubmitWorkdir_RECONFIG_RERUN = True +STARTD_CRON_ManageVarLibDocker_RECONFIG_RERUN = True -STARTD_CRON_JOBLIST = NodeHealth ReapAbandondedContainersEE2 ManageVarLibDocker ManageCondorSubmitWorkdir +STARTD_CRON_JOBLIST = NodeHealth ContainerReaper ManageVarLibDocker ManageCondorSubmitWorkdir DeleteExitedContainers # STARTD_CRON_AUTOPUBLISH = If_Changed diff --git a/deployment/conf/.templates/deployment.cfg.templ b/deployment/conf/legacy/deployment.cfg.templ similarity index 100% rename from deployment/conf/.templates/deployment.cfg.templ rename to deployment/conf/legacy/deployment.cfg.templ diff --git a/deployment/conf/.templates/limitBigMemSlots.templ b/deployment/conf/legacy/limitBigMemSlots.templ similarity index 100% rename from deployment/conf/.templates/limitBigMemSlots.templ rename to deployment/conf/legacy/limitBigMemSlots.templ diff --git a/deployment/conf/.templates/shared_port_config.templ b/deployment/conf/legacy/shared_port_config.templ similarity index 100% rename from deployment/conf/.templates/shared_port_config.templ rename to deployment/conf/legacy/shared_port_config.templ diff --git a/deployment/conf/.templates/start_server.sh.templ b/deployment/conf/legacy/start_server.sh.templ similarity index 100% rename from deployment/conf/.templates/start_server.sh.templ rename to deployment/conf/legacy/start_server.sh.templ From e7b275b462982113d46fb70810f0112c74e7b371 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Tue, 28 Oct 2025 17:49:06 -0500 Subject: [PATCH 12/22] Update deps --- Dockerfile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 38dc294..f675beb 100644 --- a/Dockerfile +++ 
b/Dockerfile @@ -38,12 +38,11 @@ RUN rm -rf /var/cache/yum ENV PATH=/miniconda/bin:$PATH -RUN pip install requests sanic==21.12.2 docker==7.1.0 +RUN pip install requests slackclient psutil sanic==21.12.2 docker==7.1.0 COPY --chown=kbase deployment/ /kb/deployment/ -RUN /kb/deployment/bin/install_python_dependencies.sh ENV KB_DEPLOYMENT_CONFIG=/kb/deployment/conf/deployment.cfg From 79e1916986c7bdf9f01ac8b2bdf238ff65486945 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Tue, 28 Oct 2025 18:09:06 -0500 Subject: [PATCH 13/22] Last reqs --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index f675beb..e77cf57 100644 --- a/Dockerfile +++ b/Dockerfile @@ -38,7 +38,7 @@ RUN rm -rf /var/cache/yum ENV PATH=/miniconda/bin:$PATH -RUN pip install requests slackclient psutil sanic==21.12.2 docker==7.1.0 +RUN pip install requests websockets slackclient psutil sanic==21.12.2 docker==7.1.0 COPY --chown=kbase deployment/ /kb/deployment/ From e97865342d6273cbf2c34e22899dc9fa4bd67184 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Tue, 28 Oct 2025 18:17:24 -0500 Subject: [PATCH 14/22] Last reqs --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index e77cf57..7fd04d6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -38,7 +38,7 @@ RUN rm -rf /var/cache/yum ENV PATH=/miniconda/bin:$PATH -RUN pip install requests websockets slackclient psutil sanic==21.12.2 docker==7.1.0 +RUN pip install requests websockets==10.0 slackclient psutil sanic==21.12.2 docker==7.1.0 COPY --chown=kbase deployment/ /kb/deployment/ From 5adce79ce0229a6d8bdf88da6ea34fa237930eb0 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Wed, 29 Oct 2025 11:26:54 -0500 Subject: [PATCH 15/22] Fix docker permissions --- Dockerfile | 2 ++ deployment/bin/start-condor.sh | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 7fd04d6..e2b86fb 100644 --- a/Dockerfile +++ b/Dockerfile @@ 
-18,6 +18,8 @@ RUN useradd -c "KBase user" -rd /kb/deployment/ -u 1000 -s /bin/bash kbase && \ touch /kb/deployment/jettybase/logs/request.log && \ chown -R kbase /kb/deployment +RUN usermod -aG docker kbase + #INSTALL DOCKERIZE RUN wget -N https://github.com/kbase/dockerize/raw/master/dockerize-linux-amd64-v0.6.1.tar.gz && tar xvzf dockerize-linux-amd64-v0.6.1.tar.gz && cp dockerize /kb/deployment/bin && rm dockerize* diff --git a/deployment/bin/start-condor.sh b/deployment/bin/start-condor.sh index 7630a77..6a15411 100755 --- a/deployment/bin/start-condor.sh +++ b/deployment/bin/start-condor.sh @@ -4,7 +4,7 @@ # condor pool password if [ "$GROUPMOD_DOCKER" ] ; then - groupmod -g $GROUPMOD_DOCKER docker + groupmod -o -g $GROUPMOD_DOCKER docker fi if [ "$POOL_PASSWORD" ] ; then @@ -16,6 +16,7 @@ if [ "$SET_NOBODY_USER_GUID" ] ; then usermod -a -G "$SET_NOBODY_USER_GUID" condor # For backwards compatibility for directories already created by the kbase user usermod -a -G "kbase" nobody + usermod -a -G "docker" nobody fi if [ "$SET_NOBODY_USER_UID" ] ; then From 1e414f015790a0ca7b7c4cde2dc5f534b0e6109d Mon Sep 17 00:00:00 2001 From: bio-boris Date: Wed, 29 Oct 2025 11:48:30 -0500 Subject: [PATCH 16/22] Fix docker permissions --- Dockerfile | 2 -- deployment/bin/start-condor.sh | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index e2b86fb..7fd04d6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,8 +18,6 @@ RUN useradd -c "KBase user" -rd /kb/deployment/ -u 1000 -s /bin/bash kbase && \ touch /kb/deployment/jettybase/logs/request.log && \ chown -R kbase /kb/deployment -RUN usermod -aG docker kbase - #INSTALL DOCKERIZE RUN wget -N https://github.com/kbase/dockerize/raw/master/dockerize-linux-amd64-v0.6.1.tar.gz && tar xvzf dockerize-linux-amd64-v0.6.1.tar.gz && cp dockerize /kb/deployment/bin && rm dockerize* diff --git a/deployment/bin/start-condor.sh b/deployment/bin/start-condor.sh index 6a15411..93144a9 100755 --- 
a/deployment/bin/start-condor.sh +++ b/deployment/bin/start-condor.sh @@ -5,6 +5,7 @@ if [ "$GROUPMOD_DOCKER" ] ; then groupmod -o -g $GROUPMOD_DOCKER docker + usermod -aG docker kbase fi if [ "$POOL_PASSWORD" ] ; then From 347ad7f8313486e294fd1f6a37bb595934525222 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Wed, 29 Oct 2025 12:22:00 -0500 Subject: [PATCH 17/22] Update health check --- deployment/bin/cron/health_check.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/deployment/bin/cron/health_check.py b/deployment/bin/cron/health_check.py index 1b27ea9..8b28b00 100755 --- a/deployment/bin/cron/health_check.py +++ b/deployment/bin/cron/health_check.py @@ -159,8 +159,12 @@ def test_docker_socket2(): try: dc = docker.DockerClient(base_url='unix:///var/run/docker.sock') dc.ping() - except Exception: - message = f"Cannot access docker socket" + except Exception as e : + whoami = subprocess.check_output("whoami", shell=True).decode().strip() + my_groups = subprocess.check_output("groups", shell=True).decode().strip() + ggid = os.getgid() + uid = os.getuid() + message = f"Cannot access docker socket {e} user={whoami} groups={my_groups} uid={uid} gid={ggid}" exit_unsuccessfully(message) From 416f28bed62af6a6029c850cba4edc9ea0a38d63 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Wed, 29 Oct 2025 12:29:15 -0500 Subject: [PATCH 18/22] Update docker perms --- deployment/bin/start-condor.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/deployment/bin/start-condor.sh b/deployment/bin/start-condor.sh index 93144a9..09ee44d 100755 --- a/deployment/bin/start-condor.sh +++ b/deployment/bin/start-condor.sh @@ -6,6 +6,7 @@ if [ "$GROUPMOD_DOCKER" ] ; then groupmod -o -g $GROUPMOD_DOCKER docker usermod -aG docker kbase + usermod -aG docker condor fi if [ "$POOL_PASSWORD" ] ; then From 13d821d0d142fd3147305570bfc1cf7e07d8176f Mon Sep 17 00:00:00 2001 From: bio-boris Date: Wed, 29 Oct 2025 12:29:52 -0500 Subject: [PATCH 19/22] Update docker perms --- 
deployment/bin/start-condor.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deployment/bin/start-condor.sh b/deployment/bin/start-condor.sh index 09ee44d..5caf234 100755 --- a/deployment/bin/start-condor.sh +++ b/deployment/bin/start-condor.sh @@ -5,8 +5,8 @@ if [ "$GROUPMOD_DOCKER" ] ; then groupmod -o -g $GROUPMOD_DOCKER docker - usermod -aG docker kbase - usermod -aG docker condor + usermod -aG docker kbase # for jobs running as kbase user/nobody user + usermod -aG docker condor # for condor cronjobs fi if [ "$POOL_PASSWORD" ] ; then From 433357bc29ba944a728a0bf5df9099358f881532 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Wed, 29 Oct 2025 12:30:15 -0500 Subject: [PATCH 20/22] Update docker perms --- deployment/bin/start-condor.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deployment/bin/start-condor.sh b/deployment/bin/start-condor.sh index 5caf234..3154f9c 100755 --- a/deployment/bin/start-condor.sh +++ b/deployment/bin/start-condor.sh @@ -14,9 +14,10 @@ if [ "$POOL_PASSWORD" ] ; then fi if [ "$SET_NOBODY_USER_GUID" ] ; then + # For file permissions usermod -a -G "$SET_NOBODY_USER_GUID" nobody usermod -a -G "$SET_NOBODY_USER_GUID" condor -# For backwards compatibility for directories already created by the kbase user + # For backwards compatibility for directories already created by the kbase user usermod -a -G "kbase" nobody usermod -a -G "docker" nobody fi From b8d2994e4545236bf7cbb46358ccb93dd2f668bd Mon Sep 17 00:00:00 2001 From: bio-boris Date: Fri, 31 Oct 2025 16:51:20 -0500 Subject: [PATCH 21/22] Add ownership change for submit workdir --- deployment/bin/start-condor.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/deployment/bin/start-condor.sh b/deployment/bin/start-condor.sh index 3154f9c..0ca6817 100755 --- a/deployment/bin/start-condor.sh +++ b/deployment/bin/start-condor.sh @@ -35,6 +35,7 @@ if [ "$CONDOR_SUBMIT_WORKDIR" ] ; then chmod 01777 "$CONDOR_SUBMIT_WORKDIR/logs" chmod 01777
"$CONDOR_SUBMIT_WORKDIR/${EXECUTE_SUFFIX}/logs" chmod 01777 "$CONDOR_SUBMIT_WORKDIR/${EXECUTE_SUFFIX}/../logs" + chown condor "$CONDOR_SUBMIT_WORKDIR/${EXECUTE_SUFFIX}" else mkdir -p "/cdr/${EXECUTE_SUFFIX}" chmod 01777 "/cdr/${EXECUTE_SUFFIX}" From 945fc193967f456d657ac9066b59e0d5eaffe364 Mon Sep 17 00:00:00 2001 From: bio-boris Date: Fri, 9 Jan 2026 12:13:05 -0600 Subject: [PATCH 22/22] Add 'uv' to pip install command in Dockerfile --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 7fd04d6..5ea09dc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -38,7 +38,7 @@ RUN rm -rf /var/cache/yum ENV PATH=/miniconda/bin:$PATH -RUN pip install requests websockets==10.0 slackclient psutil sanic==21.12.2 docker==7.1.0 +RUN pip install --no-cache-dir uv requests websockets==10.0 slackclient psutil sanic==21.12.2 docker==7.1.0 COPY --chown=kbase deployment/ /kb/deployment/