diff --git a/.github/workflows/manual-build.yml b/.github/workflows/manual-build.yml index 944f903..bbdaea7 100644 --- a/.github/workflows/manual-build.yml +++ b/.github/workflows/manual-build.yml @@ -1,11 +1,17 @@ --- name: Manual Build & Push on: - workflow_dispatch: + workflow_dispatch: + inputs: + platforms: + description: 'The platforms for which the Docker image should be built. If not specified, defaults to linux/amd64.' + required: false + default: 'linux/amd64' jobs: build-push: uses: kbase/.github/.github/workflows/reusable_build-push.yml@main with: name: '${{ github.event.repository.name }}-develop' tags: br-${{ github.ref_name }} + platforms: ${{ github.event.inputs.platforms }} secrets: inherit diff --git a/.github/workflows/pr_build.yml b/.github/workflows/pr_build.yml new file mode 100644 index 0000000..5592521 --- /dev/null +++ b/.github/workflows/pr_build.yml @@ -0,0 +1,48 @@ +--- +name: Pull Request Build, Tag, & Push +on: + pull_request: + branches: + - develop + - main + - master + types: + - opened + - reopened + - synchronize + - closed +jobs: + build-develop-open: + if: github.base_ref == 'develop' && github.event.pull_request.merged == false + uses: kbase/.github/.github/workflows/reusable_build.yml@main + with: + platforms: "linux/amd64" + secrets: inherit + build-develop-merge: + if: github.base_ref == 'develop' && github.event.pull_request.merged == true + uses: kbase/.github/.github/workflows/reusable_build-push.yml@main + with: + name: '${{ github.event.repository.name }}-develop' + tags: pr-${{ github.event.number }},latest + platforms: "linux/amd64" + secrets: inherit + build-main-open: + if: (github.base_ref == 'main' || github.base_ref == 'master') && github.event.pull_request.merged == false + uses: kbase/.github/.github/workflows/reusable_build-push.yml@main + with: + name: '${{ github.event.repository.name }}' + tags: pr-${{ github.event.number }} + platforms: "linux/amd64" + secrets: inherit + build-main-merge: + if: (github.base_ref == 'main' || github.base_ref == 'master') && github.event.pull_request.merged == true + uses: kbase/.github/.github/workflows/reusable_build-push.yml@main + with: + name: '${{ github.event.repository.name }}' + tags: pr-${{ github.event.number }},latest-rc + platforms: "linux/amd64" + secrets: inherit + trivy-scans: + if: (github.base_ref == 'develop' || github.base_ref == 'main' || github.base_ref == 'master' ) && github.event.pull_request.merged == false + uses: kbase/.github/.github/workflows/reusable_trivy-scans.yml@main + secrets: inherit diff --git a/.github/workflows/release-main.yml b/.github/workflows/release-main.yml new file mode 100644 index 0000000..095a92f --- /dev/null +++ b/.github/workflows/release-main.yml @@ -0,0 +1,26 @@ +--- +name: Release - Build & Push Image +on: + release: + branches: + - main + - master + types: [ published ] +jobs: + check-source-branch: + uses: kbase/.github/.github/workflows/reusable_validate-branch.yml@main + with: + build_branch: '${{ github.event.release.target_commitish }}' + validate-release-tag: + needs: check-source-branch + uses: kbase/.github/.github/workflows/reusable_validate-release-tag.yml@main + with: + release_tag: '${{ github.event.release.tag_name }}' + build-push: + needs: validate-release-tag + uses: kbase/.github/.github/workflows/reusable_build-push.yml@main + with: + name: '${{ github.event.repository.name }}' + tags: '${{ github.event.release.tag_name }},latest' + platforms: "linux/amd64" + secrets: inherit diff --git a/Dockerfile b/Dockerfile index 
b364dee..eba490b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,23 +1,18 @@ -FROM htcondor/execute:lts-el8 -ENV container docker +FROM htcondor/base:25.0.1-el9 -# Ge$t commonly used utilities -RUN yum -y update && yum upgrade -y -RUN yum install -y drpm -RUN yum -y install -y epel-release wget which git gcc libcgroup libcgroup-tools stress-ng tmpwatch +# Get commonly used utilities +RUN yum -y update && yum update -y systemd && yum -y install -y epel-release wget which git gcc stress-ng tmpwatch bzip2 # Install docker binaries RUN yum install -y yum-utils device-mapper-persistent-data lvm2 && yum-config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo && yum install -y docker-ce -#Install Python3 and Libraries (source /root/miniconda/bin/activate) -RUN yum install -y bzip2 \ -&& wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh \ +RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh \ && bash ~/miniconda.sh -b -p /miniconda \ && export PATH="/miniconda/bin:$PATH" # Add kbase user and set up directories -RUN useradd -c "KBase user" -rd /kb/deployment/ -u 998 -s /bin/bash kbase && \ +RUN useradd -c "KBase user" -rd /kb/deployment/ -u 1000 -s /bin/bash kbase && \ mkdir -p /kb/deployment/bin && \ mkdir -p /kb/deployment/jettybase/logs/ && \ touch /kb/deployment/jettybase/logs/request.log && \ @@ -26,8 +21,8 @@ RUN useradd -c "KBase user" -rd /kb/deployment/ -u 998 -s /bin/bash kbase && \ #INSTALL DOCKERIZE RUN wget -N https://github.com/kbase/dockerize/raw/master/dockerize-linux-amd64-v0.6.1.tar.gz && tar xvzf dockerize-linux-amd64-v0.6.1.tar.gz && cp dockerize /kb/deployment/bin && rm dockerize* -# Also add the user to the groups that map to "docker" on Linux and "daemon" on Mac -RUN usermod -a -G 0 kbase && usermod -a -G 999 kbase + + #ADD DIRS RUN mkdir -p /var/run/condor && mkdir -p /var/log/condor && mkdir -p /var/lock/condor && mkdir -p /var/lib/condor/execute @@ -35,21 +30,17 @@ RUN mkdir -p /var/run/condor && mkdir -p /var/log/condor && mkdir -p /var/lock/c # Maybe you want: rm -rf /var/cache/yum, to also free up space taken by orphaned data from disabled or removed repos RUN rm -rf /var/cache/yum +ENV PATH=/miniconda/bin:$PATH + + +RUN pip install uv requests websockets==10.0 slackclient psutil sanic==21.12.2 docker==7.1.0 + + COPY --chown=kbase deployment/ /kb/deployment/ -# Install dependencies for JobRunner -ENV PATH /miniconda/bin:$PATH -RUN wget https://raw.githubusercontent.com/kbase/JobRunner/master/requirements.txt && pip install -r requirements.txt && rm requirements.txt -RUN /kb/deployment/bin/install_python_dependencies.sh - -# The BUILD_DATE value seem to bust the docker cache when the timestamp changes, move to -# the end -LABEL org.label-schema.build-date=$BUILD_DATE \ - org.label-schema.vcs-url="https://github.com/kbase/condor-worker.git" \ - org.label-schema.vcs-ref=$VCS_REF \ - org.label-schema.schema-version="1.0.0" \ - us.kbase.vcs-branch=$BRANCH \ - maintainer="Steve Chan sychan@lbl.gov" + +ENV KB_DEPLOYMENT_CONFIG=/kb/deployment/conf/deployment.cfg + ENTRYPOINT [ "/kb/deployment/bin/dockerize" ] CMD [ "-template", "/kb/deployment/conf/.templates/deployment.cfg.templ:/kb/deployment/conf/deployment.cfg", \ diff --git a/deployment/bin/cron/container_reaper.py b/deployment/bin/cron/container_reaper.py index 8a6f948..522fadd 100755 --- a/deployment/bin/cron/container_reaper.py +++ b/deployment/bin/cron/container_reaper.py @@ -1,163 +1,140 @@ 
#!/miniconda/bin/python -import datetime -import fnmatch +""" +This script is automatically run by the condor cronjob periodically +in order to clean up containers > 7 days or running without a starter +Required env vars are +# CONTAINER_REAPER_ENDPOINTS - A comma separated list of EE2 endpoints to manage containers for +# DELETE_ABANDONED_CONTAINERS - Set to true to enable the container reaper +# SLACK_WEBHOOK_URL - The slack webhook url to send messages to +""" + import json -import logging import os import socket +import subprocess +import time +from datetime import datetime, timedelta +from typing import Set import docker -import psutil import requests -from clients.NarrativeJobServiceClient import NarrativeJobService - -from typing import List, Dict - -slack_key = os.environ.get("SLACK_WEBHOOK_KEY", None) -# ee_notifications_channel -webhook_url = os.environ.get("SLACK_WEBHOOK_URL", None) - -kill = os.environ.get("DELETE_ABANDONED_CONTAINERS", "false") -if kill.lower() == "true": - kill = True -else: - kill = False - -njs_endpoint_url = os.environ.get("NJS_ENDPOINT", None) - -if njs_endpoint_url is None: - raise Exception("NJS Endpoint not set") - -hostname = socket.gethostname() -dc = docker.from_env() - - -def find_dockerhub_jobs() -> Dict: - # send_slack_message(f"Job CONTAINER_REAPER is FINDING DOCKERHUB JOBS at {datetime.datetime.now()}") - - try: - all_containers = dc.containers - list = all_containers.list() - except Exception as e: - send_slack_message(str(e) + hostname) - - job_containers = {} - - for container in list: - cnt_id = container.id - try: - cnt = all_containers.get(cnt_id) - labels = cnt.labels - if "condor_id" in labels.keys() and "njs_endpoint" in labels.keys(): - labels["image"] = cnt.image - job_containers[cnt_id] = labels - except Exception as e: - logging.error(f"Container {cnt_id} doesn't exist anymore") - logging.error(e) - - return job_containers - - -def find_running_jobs(ps_name: str): - # send_slack_message(f"Job CONTAINER_REAPER is FINDING RUNNING JOBS at {datetime.datetime.now()}") - - "Return a list of processes matching 'name'." - ls = [] - for p in psutil.process_iter(attrs=["name", "cmdline"]): - if ps_name in p.info["cmdline"]: - ls.append(p.info["cmdline"][-2]) - return ls +from docker.models.containers import Container def send_slack_message(message: str): """ - :param message: Escaped Message to send to slack - :return: """ - + webhook_url = os.environ.get("SLACK_WEBHOOK_URL", None) slack_data = {"text": message} - response = requests.post( + requests.post( webhook_url, data=json.dumps(slack_data), headers={"Content-Type": "application/json"}, ) -def notify_slack(cnt_id: str, labels: dict(), running_job_ids: List): - now = datetime.datetime.now() +def filter_containers_by_time(potential_containers, days=0, minutes=0): + filtered_containers = [] + seven_days_ago = datetime.now() - timedelta(days=days, minutes=minutes) - job_id = labels.get("job_id", None) - # app_id = labels['app_id'] - app_name = labels.get("app_name", None) - method_name = labels.get("method_name", None) - condor_id = labels.get("condor_id", None) - username = labels.get("user_name", None) + for old_container in potential_containers: + # Do we need to catch the chance that there is no created attribute? 
+        created_time_str = old_container.attrs['Created'][:26]
+        created_time = datetime.fromisoformat(created_time_str)
+        if created_time <= seven_days_ago:
+            filtered_containers.append(old_container)
+    return filtered_containers
-    msg = f"cnt_id:{cnt_id} job_id:{job_id} condor_id:{condor_id} for {username} not in running_job_ids {running_job_ids} ({now}) hostname:({hostname}) app:{app_name} method:{method_name} (kill = {kill}) "
-    send_slack_message(msg)
+def get_running_time_message(container, title=""):
+    image_name = container.attrs['Config']['Image']
+    if "kbase" in image_name:
+        image_name = image_name.split(":")[1]
+    user_name = container.attrs['Config']['Labels'].get('user_name')
-# @deprecated for EVENTLOG
-def notify_user(cnt_id: str, labels: Dict):
-    username = labels.get("user_name", None)
-    job_id = labels.get("job_id", None)
-    # TODO add this to a configuration somewhere or ENV variable
-    job_directory = f"/mnt/awe/condor/{username}/{job_id}"
+    total_running_time = datetime.now() - datetime.fromisoformat(container.attrs['Created'][:26])
+    days = total_running_time.days
+    hours = total_running_time.seconds // 3600
-    print("About to notify")
-    print(labels)
+    formatted_running_time = f"{days}D:{hours}H"
+    return f"{title}:{hostname} {image_name}:{user_name}:{formatted_running_time}"
-    env_files = []
-    for file in os.listdir(job_directory):
-        if fnmatch.fnmatch(file, "env_*"):
-            env_files.append(file)
+def remove_with_backoff(container, message, backoff=30):
+    try:
+        container.stop()
+        time.sleep(backoff)  # Wait for backoff period before attempting to remove
+        container.remove()
+    except Exception as e:
+        # Not much we can do here, just hope that the next pass will remove it
+        pass
+def reap_containers_running_more_than_7_days(potential_containers: Set[Container]):
+    old_containers = filter_containers_by_time(potential_containers, days=7)
-    print(env_files)
-    env_filepath = env_files[0]
-    if os.path.isfile(env_filepath):
-        with open(env_filepath, "r") as content_file:
-            content = content_file.readlines()
+    if old_containers:
+        for old_container in old_containers:
+            message = get_running_time_message(old_container, title="reaper7daylimit")
+            send_slack_message(message)
+            remove_with_backoff(old_container, message)
-    token = None
-    for line in content:
-        if "KB_AUTH_TOKEN" in line:
-            token = line.split("=")[1]
-    if token:
-        njs = NarrativeJobService(token=token, url=njs_endpoint_url)
-        status = njs.check_job(job_id)
-        print(status)
+def reap_containers_when_there_is_no_starter(potential_containers: Set[Container]):
+    """
+    This function will reap containers that are running but have no starter, and have been running for 30 mins
+    """
+    condor_starter = check_for_condor_starter()
+    if condor_starter:
+        return
-def kill_docker_container(cnt_id: str):
-    if kill is True:
-        cnt = dc.containers.get(cnt_id)
-        cnt.kill()
-    else:
-        pass
+    runaway_containers = filter_containers_by_time(potential_containers, minutes=30)
+    if runaway_containers:
+        for runaway_container in runaway_containers:
+            message = get_running_time_message(runaway_container, title="reaper_no_starter")
+            send_slack_message(message)
+            remove_with_backoff(runaway_container, message)
-def kill_dead_jobs(running_jobs: List, docker_processes: Dict):
-    # send_slack_message(f"Job CONTAINER_REAPER is KILLING DEAD JOBS at {datetime.datetime.now()}")
-    for cnt_id in docker_processes:
-        labels = docker_processes[cnt_id]
-        job_id = labels.get("job_id", None)
-        if job_id not in running_jobs:
-            if kill is True:
-                kill_docker_container(cnt_id)
-
notify_slack(cnt_id, labels, running_jobs) +def check_for_condor_starter(): + result = subprocess.run("ps -ef | grep '[c]ondor_starter'", shell=True, stdout=subprocess.PIPE, text=True) + count = len(result.stdout.strip().split('\n')) if result.stdout.strip() else 0 + return count > 0 if __name__ == "__main__": - try: - # send_slack_message(f"Job CONTAINER_REAPER is beginning at {datetime.datetime.now()}") - name = "us.kbase.narrativejobservice.sdkjobs.SDKLocalMethodRunner" + """ + PDSH_SSH_ARGS_APPEND="-o StrictHostKeyChecking=no -q" pdsh -w rancher@km[2-28]-p "docker ps | grep kbase| grep days" | sort -V | grep -v worker + """ - running_java_jobs = find_running_jobs(name) - docker_jobs = find_dockerhub_jobs() - kill_dead_jobs(running_java_jobs, docker_jobs) - # send_slack_message(f"Job CONTAINER_REAPER is ENDING at {datetime.datetime.now()}") - except Exception as e: - send_slack_message(f"FAILURE on {hostname}" + str(e.with_traceback())) - logging.error(e.with_traceback()) + CONTAINER_REAPER_ENDPOINTS = os.environ.get("CONTAINER_REAPER_ENDPOINTS", "").split(",") + DELETE_ABANDONED_CONTAINERS = os.environ.get("DELETE_ABANDONED_CONTAINERS", "false").lower() == "true" + + if not DELETE_ABANDONED_CONTAINERS: + exit("DELETE_ABANDONED_CONTAINERS is not set to true") + if not CONTAINER_REAPER_ENDPOINTS or CONTAINER_REAPER_ENDPOINTS == [""]: + exit("No CONTAINER_REAPER_ENDPOINTS set, unsure where to manage containers") + + hostname = socket.gethostname() + dc = docker.from_env() + + # Define the filters to specify that you are searching for only your specific containers in a multi worker environment + # Also add user_name as a filter to make sure you aren't killing containers that happen to have EE2_ENDPOINT set, + # The chances of EE2_endpoint and user_name as labels on a container should be very small. 
+ # CONTAINER_REAPER_ENDPOINTS = ["https://kbase.us/services/ee2", "https://appdev.kbase.us/services/ee2", "https://services.kbase.us/services/ee2/"] + unique_containers = set() + filters = {} + for endpoint in CONTAINER_REAPER_ENDPOINTS: + + filters.update({ + "status": "running", + "label": [ + f"ee2_endpoint={endpoint.strip()}", + "user_name" + ] + }) + containers = dc.containers.list(filters=filters) + for container in containers: + unique_containers.add(container) + + reap_containers_running_more_than_7_days(potential_containers=unique_containers) + reap_containers_when_there_is_no_starter(potential_containers=unique_containers) diff --git a/deployment/bin/cron/container_reaper_ee2.py b/deployment/bin/cron/container_reaper_ee2.py deleted file mode 100755 index d93ca77..0000000 --- a/deployment/bin/cron/container_reaper_ee2.py +++ /dev/null @@ -1,154 +0,0 @@ -#!/miniconda/bin/python -import datetime -import json -import logging -import os -import socket -from typing import List, Dict - -import docker -from docker.models.containers import Container -import psutil -import requests - -logging.basicConfig(level=logging.INFO) - -slack_key = os.environ.get("SLACK_WEBHOOK_KEY", None) -# ee_notifications_channel -webhook_url = os.environ.get("SLACK_WEBHOOK_URL", None) - -kill = os.environ.get("DELETE_ABANDONED_CONTAINERS", "false") -if kill.lower() == "true": - kill = True -else: - kill = False - -ee2_endpoint_url = os.environ.get("EE2_ENDPOINT", None) - -if ee2_endpoint_url is None: - raise Exception("EE2 Endpoint not set") - -hostname = socket.gethostname() -dc = docker.from_env() - - -def find_dockerhub_jobs() -> Dict: - # send_slack_message(f"Job CONTAINER_REAPER is FINDING DOCKERHUB JOBS at {datetime.datetime.now()}") - - try: - all_containers = dc.containers - container_list = all_containers.list() - except Exception as e: - send_slack_message(str(e) + hostname) - raise e - - job_containers = {} - - for container in container_list: - cnt_id = container.id - try: - cnt = all_containers.get(cnt_id) - labels = cnt.labels - label_keys = labels.keys() - if ( - "condor_id" in label_keys - and "ee2_endpoint" in label_keys - and "worker_hostname" in label_keys - ): - if ( - labels.get("worker_hostname") == hostname - and labels.get("ee2_endpoint") == ee2_endpoint_url - ): - labels["image"] = cnt.image - job_containers[cnt_id] = labels - except Exception as e: - logging.error(f"Container {cnt_id} doesn't exist anymore") - logging.error(e) - - return job_containers - - -def find_running_jobs(): - "Return a list of job ids from running job processes. 
Since python procs have multiple entries, keep only 1 version" - # send_slack_message(f"Job CONTAINER_REAPER is FINDING RUNNING JOBS at {datetime.datetime.now()}") - ls = [] - for p in psutil.process_iter(attrs=["name", "cmdline"]): - if ( - "/miniconda/bin/python" in p.info["cmdline"] - and "./jobrunner.py" in p.info["cmdline"] - ): - ls.append(p.info["cmdline"][-2]) - return list(set(ls)) - - -def send_slack_message(message: str): - """ - - :param message: Escaped Message to send to slack - :return: - """ - - slack_data = {"text": message} - response = requests.post( - webhook_url, - data=json.dumps(slack_data), - headers={"Content-Type": "application/json"}, - ) - - -def notify_slack(cnt_id: str, labels: dict(), running_job_ids: List): - now = datetime.datetime.now() - - job_id = labels.get("job_id", None) - # app_id = labels['app_id'] - app_name = labels.get("app_name", None) - method_name = labels.get("method_name", None) - condor_id = labels.get("condor_id", None) - username = labels.get("user_name", None) - - msg = f"cnt_id:{cnt_id} job_id:{job_id} condor_id:{condor_id} for {username} not in running_job_ids {running_job_ids} ({now}) hostname:({hostname}) app:{app_name} method:{method_name} (kill = {kill}) " - send_slack_message(msg) - - -def kill_docker_container(cnt_id: str): - """ - Kill a docker container. The job finish script should clean up after itself. - :param cnt_id: The container to kill/remove - """ - if kill is True: - cnt = dc.containers.get(cnt_id) # type: Container - try: - cnt.kill() - except Exception: - try: - cnt.remove(force=True) - except Exception: - send_slack_message(f"Couldn't delete {cnt_id} on {hostname}") - - -def kill_dead_jobs(running_jobs: List, docker_processes: Dict): - """ - Check whether there are runaway docker containers - :param running_jobs: A list of condor jobs gathered from the starter scripts - :param docker_processes: A list of docker containers - """ - # send_slack_message(f"Job CONTAINER_REAPER is KILLING DEAD JOBS at {datetime.datetime.now()}") - for cnt_id in docker_processes: - labels = docker_processes[cnt_id] - job_id = labels.get("job_id", None) - if job_id not in running_jobs: - notify_slack(cnt_id, labels, running_jobs) - if kill is True: - kill_docker_container(cnt_id) - - -if __name__ == "__main__": - try: - # send_slack_message(f"Job CONTAINER_REAPER is beginning at {datetime.datetime.now()}") - locally_running_jobrunners = find_running_jobs() - docker_jobs = find_dockerhub_jobs() - kill_dead_jobs(locally_running_jobrunners, docker_jobs) - # send_slack_message(f"Job CONTAINER_REAPER is ENDING at {datetime.datetime.now()}") - except Exception as e: - send_slack_message(f"FAILURE on {hostname}" + str(e)) - logging.error(str(e)) diff --git a/deployment/bin/cron/delete_exited_containers.py b/deployment/bin/cron/delete_exited_containers.py index 60c9ca5..7dfb2e3 100755 --- a/deployment/bin/cron/delete_exited_containers.py +++ b/deployment/bin/cron/delete_exited_containers.py @@ -1,17 +1,18 @@ #!/miniconda/bin/python -import os +# This script is automatically run by the condor cronjob periodically +# in order to clean up exited docker containers. 
import json -import requests -import docker +import os import socket -import datetime + +import docker +import requests def send_slack_message(message: str): """ :param message: Escaped Message to send to slack """ - # ee_notifications_channel webhook_url = os.environ.get("SLACK_WEBHOOK_URL", None) slack_data = {"text": message} requests.post( @@ -22,14 +23,15 @@ def send_slack_message(message: str): if __name__ == "__main__": - # send_slack_message(f"Job DELETE_EXITED is beginning at {datetime.datetime.now()}") hostname = socket.gethostname() dc = docker.from_env() ec = dc.containers.list(filters={"status": "exited"}) - count = len(ec) - - if count > 0: - dc.containers.prune() - send_slack_message(f"Deleted {count} stopped containers on {hostname}") - - # send_slack_message(f"Job DELETE_EXITED is ENDING at {datetime.datetime.now()}") + kbase_containers = [c for c in ec if "kbase" in c.attrs["Config"]["Image"]] + container_image_names = [c.attrs["Config"]["Image"] for c in kbase_containers] + if kbase_containers: + for container in kbase_containers: + container.remove() + debug_mode = os.environ.get("DEBUG", "false").lower() == "true" + if debug_mode: + send_slack_message( + f"Deleted {len(kbase_containers)} `exited` containers with 'kbase' in image name on {hostname}: {container_image_names}") diff --git a/deployment/bin/cron/health_check.py b/deployment/bin/cron/health_check.py index 80cebc4..0a18970 100755 --- a/deployment/bin/cron/health_check.py +++ b/deployment/bin/cron/health_check.py @@ -33,8 +33,10 @@ def send_slack_message(message: str): debug = False -scratch = os.environ.get("CONDOR_SUBMIT_WORKDIR", "/cdr") -scratch += os.environ.get("EXECUTE_SUFFIX", "") +workdir = os.environ.get("CONDOR_SUBMIT_WORKDIR", "/cdr") +suffix = os.environ.get("EXECUTE_SUFFIX", "") +scratch = f"{workdir}/{suffix}" + check_condor_starter_health = ( os.environ.get("CHECK_CONDOR_STARTER_HEALTH", "true").lower() == "true" ) @@ -140,7 +142,7 @@ def test_docker_socket(): socket_gid = os.stat(socket).st_gid # TODO FIX THIS TEST.. 
GROUPS ARE NOT BEING CORRECTLY SET INSIDE THE DOCKER CONTAINER - gids = [999, 996, 995, 987] + gids = [1000, 999, 996, 995, 987] if socket_gid in gids: return @@ -154,12 +156,19 @@ def test_docker_socket2(): """ Check to see if the nobody user has access to the docker socket """ - dc = docker.from_env() - if len(dc.containers.list()) < 1: - message = f"Cannot access docker socket" + try: + dc = docker.DockerClient(base_url='unix:///var/run/docker.sock') + dc.ping() + except Exception as e : + whoami = subprocess.check_output("whoami", shell=True).decode().strip() + my_groups = subprocess.check_output("groups", shell=True).decode().strip() + ggid = os.getgid() + uid = os.getuid() + message = f"Cannot access docker socket {e} user={whoami} groups={my_groups} uid={uid} gid={ggid}" exit_unsuccessfully(message) + def test_world_writeable(): """ Check to see if /mnt/awe/condor is writeable @@ -235,6 +244,7 @@ def checkEndpoints(): + def main(): try: # send_slack_message(f"Job HEALTH_CHECK is beginning at {datetime.datetime.now()}") diff --git a/deployment/bin/install_python_dependencies.sh b/deployment/bin/install_python_dependencies.sh deleted file mode 100755 index 191437c..0000000 --- a/deployment/bin/install_python_dependencies.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env bash - -#Install Python3 Libraries for cronjobs and for job runner - -source /miniconda/bin/activate -pip install requests==2.29.0 -pip install docker==6.1.3 -pip install slackclient==2.9.4 -pip install htcondor==10.7.0 -pip install psutil==5.9.5 -pip install lockfile==0.12.2 -pip install sanic==21.9.3 -pip install websockets==10.4 diff --git a/deployment/bin/misc/java_stats.sh b/deployment/bin/misc/java_stats.sh deleted file mode 100755 index aed096e..0000000 --- a/deployment/bin/misc/java_stats.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env jshell-wrapper - -System.out.println("Available Processors"); -System.out.println(Runtime.getRuntime().availableProcessors()); - -System.out.println("Free Memory"); -System.out.println(Runtime.getRuntime().freeMemory() + " " + Runtime.getRuntime().freeMemory() / 1000000000 + "G"); - -System.out.println("Max Memory"); -System.out.println(Runtime.getRuntime().maxMemory() + " " + Runtime.getRuntime().maxMemory() / 1000000000 + "G"); diff --git a/deployment/bin/misc/jshell-wrapper b/deployment/bin/misc/jshell-wrapper deleted file mode 100755 index ab5ad0f..0000000 --- a/deployment/bin/misc/jshell-wrapper +++ /dev/null @@ -1,7 +0,0 @@ -TMP=`mktemp` -tail -n +2 $@ >> $TMP -echo "/exit" >> $TMP -$JAVA_HOME/bin/jshell -q --execution local $TMP -rm $TMP - -#put this file in /usr/local/bin/ or somewhere in your $PATH diff --git a/deployment/bin/start-condor.sh b/deployment/bin/start-condor.sh index d2d7ac2..fbbbcb2 100755 --- a/deployment/bin/start-condor.sh +++ b/deployment/bin/start-condor.sh @@ -4,7 +4,9 @@ # condor pool password if [ "$GROUPMOD_DOCKER" ] ; then - groupmod -g $GROUPMOD_DOCKER docker + groupmod -o -g $GROUPMOD_DOCKER docker + usermod -aG docker kbase # for jobs running as kbase user/nobody user + usermod -aG docker condor # for condor cronjobs fi if [ "$POOL_PASSWORD" ] ; then @@ -14,10 +16,12 @@ if [ "$POOL_PASSWORD" ] ; then fi if [ "$SET_NOBODY_USER_GUID" ] ; then + # For file permissions usermod -a -G "$SET_NOBODY_USER_GUID" nobody usermod -a -G "$SET_NOBODY_USER_GUID" condor -# For backwards compatibility for directories already created by the kbase user + # For backwards compatibility for directories already created by the kbase user usermod -a -G "kbase" nobody 
+ usermod -a -G "docker" nobody fi if [ "$SET_NOBODY_USER_UID" ] ; then @@ -33,6 +37,7 @@ if [ "$CONDOR_SUBMIT_WORKDIR" ] ; then chmod 01777 "$CONDOR_SUBMIT_WORKDIR/logs" chmod 01777 "$CONDOR_SUBMIT_WORKDIR/${EXECUTE_SUFFIX}/logs" chmod 01777 "$CONDOR_SUBMIT_WORKDIR/${EXECUTE_SUFFIX}/../logs" + chown condor $CONDOR_SUBMIT_WORKDIR/${EXECUTE_SUFFIX} else mkdir -p "/cdr/${EXECUTE_SUFFIX}" chmod 01777 "/cdr/${EXECUTE_SUFFIX}" diff --git a/deployment/condor_config.local.jinja b/deployment/condor_config.local.jinja new file mode 100644 index 0000000..e69de29 diff --git a/deployment/conf/.templates/cronjobs.config.templ b/deployment/conf/.templates/cronjobs.config.templ index b0cbb03..f490f93 100644 --- a/deployment/conf/.templates/cronjobs.config.templ +++ b/deployment/conf/.templates/cronjobs.config.templ @@ -1,32 +1,26 @@ -# SLACK_WEBHOOK_KEY={{ .Env.SLACK_WEBHOOK_KEY }} - -# startd hook to check if node is healthy +# This checks if the node is healthy and reports to slack if it is not. Sets NODE_IS_HEALTHY to True or False STARTD_CRON_NodeHealth_EXECUTABLE = /kb/deployment/bin/cron/health_check.py STARTD_CRON_NodeHealth_PERIOD = 6m STARTD_CRON_NodeHealth_MODE = Periodic STARTD_CRON_NodeHealth_RECONFIG_RERUN = True -STARTD_CRON_NodeHealth_ENV = "SLACK_WEBHOOK_URL={{ .Env.SLACK_WEBHOOK_URL }} SERVICE_ENDPOINT={{ .Env.SERVICE_ENDPOINT }} CONDOR_SUBMIT_WORKDIR={{ .Env.CONDOR_SUBMIT_WORKDIR }} DOCKER_CACHE={{ .Env.DOCKER_CACHE }} DELETE_ABANDONED_CONTAINERS={{ .Env.DELETE_ABANDONED_CONTAINERS }}" +STARTD_CRON_NodeHealth_ENV = "SLACK_WEBHOOK_URL={{ .Env.SLACK_WEBHOOK_URL }} SERVICE_ENDPOINT={{ .Env.SERVICE_ENDPOINT }} CONDOR_SUBMIT_WORKDIR={{ .Env.CONDOR_SUBMIT_WORKDIR }} DOCKER_CACHE={{ .Env.DOCKER_CACHE }} EXECUTE_SUFFIX={{ .Env.EXECUTE_SUFFIX }} CHECK_CONDOR_STARTER_HEALTH={{ .Env.CHECK_CONDOR_STARTER_HEALTH }} " + -# startd hook to delete exited containers +# startd hook to delete exited containers (Might want to leave this longer for debugging) STARTD_CRON_DeleteExitedContainers_EXECUTABLE = /kb/deployment/bin/cron/delete_exited_containers.py -STARTD_CRON_DeleteExitedContainers_PERIOD = 10m +STARTD_CRON_DeleteExitedContainers_PERIOD = 30m STARTD_CRON_DeleteExitedContainers_MODE = Periodic STARTD_CRON_DeleteExitedContainers_RECONFIG_RERUN = True STARTD_CRON_DeleteExitedContainers_ENV = "SLACK_WEBHOOK_URL={{ .Env.SLACK_WEBHOOK_URL }}" -# startd hook to delete abandoned containers -STARTD_CRON_ReapAbandondedContainers_EXECUTABLE = /kb/deployment/bin/cron/container_reaper.py -STARTD_CRON_ReapAbandondedContainers_PERIOD = 6m -STARTD_CRON_ReapAbandondedContainers_MODE = Periodic -STARTD_CRON_ReapAbandondedContainers_RECONFIG_RERUN = True -STARTD_CRON_ReapAbandondedContainers_ENV = "SLACK_WEBHOOK_URL={{ .Env.SLACK_WEBHOOK_URL }} SERVICE_ENDPOINT={{ .Env.SERVICE_ENDPOINT }} CONDOR_SUBMIT_WORKDIR={{ .Env.CONDOR_SUBMIT_WORKDIR }} DOCKER_CACHE={{ .Env.DOCKER_CACHE }} DELETE_ABANDONED_CONTAINERS={{ .Env.DELETE_ABANDONED_CONTAINERS }}" -# startd hook to delete abandoned containers -STARTD_CRON_ReapAbandondedContainersEE2_EXECUTABLE = /kb/deployment/bin/cron/container_reaper_ee2.py -STARTD_CRON_ReapAbandondedContainersEE2_PERIOD = 6m -STARTD_CRON_ReapAbandondedContainersEE2_MODE = Periodic -STARTD_CRON_ReapAbandondedContainersEE2_RECONFIG_RERUN = True -STARTD_CRON_ReapAbandondedContainersEE2_ENV = "EE2_ENDPOINT={{ .Env.EE2_ENDPOINT }} SLACK_WEBHOOK_URL={{ .Env.SLACK_WEBHOOK_URL }} SERVICE_ENDPOINT={{ .Env.SERVICE_ENDPOINT }} CONDOR_SUBMIT_WORKDIR={{ .Env.CONDOR_SUBMIT_WORKDIR }} DOCKER_CACHE={{ 
.Env.DOCKER_CACHE }} DELETE_ABANDONED_CONTAINERS={{ .Env.DELETE_ABANDONED_CONTAINERS }}" +# Container Reaper Version 2024 +STARTD_CRON_ContainerReaper_EXECUTABLE = /kb/deployment/bin/cron/container_reaper.py +STARTD_CRON_ContainerReaper_PERIOD = 6m +STARTD_CRON_ContainerReaper_MODE = Periodic +STARTD_CRON_ContainerReaper_RECONFIG_RERUN = True +STARTD_CRON_ContainerReaper_ENV = "SLACK_WEBHOOK_URL={{ .Env.SLACK_WEBHOOK_URL }} CONTAINER_REAPER_ENDPOINTS={{ .Env.CONTAINER_REAPER_ENDPOINTS }} DELETE_ABANDONED_CONTAINERS={{ .Env.DELETE_ABANDONED_CONTAINERS }}" + # Tmpwatch $CONDOR_SUBMIT_WORKDIR STARTD_CRON_ManageCondorSubmitWorkdir_EXECUTABLE = /usr/sbin/tmpwatch @@ -36,12 +30,12 @@ STARTD_CRON_ManageCondorSubmitWorkdir_MODE = Periodic STARTD_CRON_ManageCondorSubmitWorkdir_RECONFIG_RERUN = True STARTD_CRON_ManageCondorSubmitWorkdir_ENV = "CONDOR_SUBMIT_WORKDIR={{ .Env.CONDOR_SUBMIT_WORKDIR }} " -# Prune docker every 14 days.. This works right now, but need to redirect to a script +# Prune docker every 14 days STARTD_CRON_ManageVarLibDocker_EXECUTABLE = /usr/bin/docker STARTD_CRON_ManageVarLibDocker_ARGS = system prune -a -f STARTD_CRON_ManageVarLibDocker_PERIOD = 336h STARTD_CRON_ManageVarLibDocker_MODE = Periodic -STARTD_CRON_ManageCondorSubmitWorkdir_RECONFIG_RERUN = True +STARTD_CRON_ManageVarLibDocker_RECONFIG_RERUN = True -STARTD_CRON_JOBLIST = NodeHealth ReapAbandondedContainersEE2 ManageVarLibDocker ManageCondorSubmitWorkdir +STARTD_CRON_JOBLIST = NodeHealth ContainerReaper ManageVarLibDocker ManageCondorSubmitWorkdir DeleteExitedContainers # STARTD_CRON_AUTOPUBLISH = If_Changed diff --git a/deployment/conf/.templates/deployment.cfg.templ b/deployment/conf/legacy/deployment.cfg.templ similarity index 100% rename from deployment/conf/.templates/deployment.cfg.templ rename to deployment/conf/legacy/deployment.cfg.templ diff --git a/deployment/conf/.templates/limitBigMemSlots.templ b/deployment/conf/legacy/limitBigMemSlots.templ similarity index 100% rename from deployment/conf/.templates/limitBigMemSlots.templ rename to deployment/conf/legacy/limitBigMemSlots.templ diff --git a/deployment/conf/.templates/shared_port_config.templ b/deployment/conf/legacy/shared_port_config.templ similarity index 100% rename from deployment/conf/.templates/shared_port_config.templ rename to deployment/conf/legacy/shared_port_config.templ diff --git a/deployment/conf/.templates/start_server.sh.templ b/deployment/conf/legacy/start_server.sh.templ similarity index 100% rename from deployment/conf/.templates/start_server.sh.templ rename to deployment/conf/legacy/start_server.sh.templ