From 648db22317b8b045a6d9543775a73d84cafe972d Mon Sep 17 00:00:00 2001 From: Will Date: Fri, 14 Jul 2023 10:47:31 +0100 Subject: [PATCH 001/152] Added Open Ondemand to image --- Dockerfile | 3 +++ docker-entrypoint.sh | 6 +++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 167584b..bcc6231 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,6 +15,8 @@ RUN set -ex \ && yum -y install dnf-plugins-core epel-release \ && yum -y install dnf-plugins-core \ && yum config-manager --set-enabled powertools \ + && yum -y module enable ruby:2.7 nodejs:14 \ + && yum -y install https://yum.osc.edu/ondemand/2.0/ondemand-release-web-2.0-1.noarch.rpm \ && yum -y install \ wget \ bzip2 \ @@ -42,6 +44,7 @@ RUN set -ex \ hwloc-devel \ openssh-server \ apptainer \ + ondemand \ && yum clean all \ && rm -rf /var/cache/yum diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index ee12fab..eac6aa1 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -100,8 +100,12 @@ then /usr/sbin/sshd echo "---> Starting the MUNGE Authentication service (munged) ..." 
- gosu munge /usr/sbin/munged -F + gosu munge /usr/sbin/munged echo "---> MUNGE Complete" + + echo "---> Starting Apache Server" + /usr/libexec/httpd-ssl-gencerts + /usr/sbin/httpd -k start -X -e debug fi if [ "$1" = "check-queue-hook" ] From b241c36d985305e31ac276cefa46d988d6c3aed7 Mon Sep 17 00:00:00 2001 From: Will Date: Fri, 14 Jul 2023 15:38:38 +0100 Subject: [PATCH 002/152] Running ood portal generator --- docker-entrypoint.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index eac6aa1..e7b4505 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -105,6 +105,7 @@ then echo "---> Starting Apache Server" /usr/libexec/httpd-ssl-gencerts + /opt/ood/ood-portal-generator/sbin/update_ood_portal /usr/sbin/httpd -k start -X -e debug fi From 1995fd954ab064c914e44eaa513a856ea899fb1c Mon Sep 17 00:00:00 2001 From: Will Date: Fri, 14 Jul 2023 16:19:35 +0100 Subject: [PATCH 003/152] Trying adding ood user before starts --- docker-entrypoint.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index e7b4505..16a002d 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -106,6 +106,8 @@ then echo "---> Starting Apache Server" /usr/libexec/httpd-ssl-gencerts /opt/ood/ood-portal-generator/sbin/update_ood_portal + groupadd ood + useradd -d /home/ood -g ood -k /etc/skel -m ood /usr/sbin/httpd -k start -X -e debug fi From 26a475046835eb3a5a847d0033552cfbdd997e1a Mon Sep 17 00:00:00 2001 From: Will Date: Mon, 17 Jul 2023 09:10:26 +0100 Subject: [PATCH 004/152] Apache runs but auth errors --- slurm-cluster-chart/files/httpd.conf | 356 ++++++++++++++++++ slurm-cluster-chart/files/ood_portal.yaml | 246 ++++++++++++ .../templates/httpd-configmap.yaml | 8 + .../templates/login-deployment.yaml | 13 + .../templates/login-service.yaml | 4 + .../templates/ood-portal-configmap.yaml | 8 + slurm-cluster-chart/values.yaml | 2 +- 7 files changed, 636 insertions(+), 1 deletion(-) 
create mode 100644 slurm-cluster-chart/files/httpd.conf create mode 100644 slurm-cluster-chart/files/ood_portal.yaml create mode 100644 slurm-cluster-chart/templates/httpd-configmap.yaml create mode 100644 slurm-cluster-chart/templates/ood-portal-configmap.yaml diff --git a/slurm-cluster-chart/files/httpd.conf b/slurm-cluster-chart/files/httpd.conf new file mode 100644 index 0000000..6d3783a --- /dev/null +++ b/slurm-cluster-chart/files/httpd.conf @@ -0,0 +1,356 @@ +# +# This is the main Apache HTTP server configuration file. It contains the +# configuration directives that give the server its instructions. +# See for detailed information. +# In particular, see +# +# for a discussion of each configuration directive. +# +# See the httpd.conf(5) man page for more information on this configuration, +# and httpd.service(8) on using and configuring the httpd service. +# +# Do NOT simply read the instructions in here without understanding +# what they do. They're here only as hints or reminders. If you are unsure +# consult the online docs. You have been warned. +# +# Configuration and logfile names: If the filenames you specify for many +# of the server's control files begin with "/" (or "drive:/" for Win32), the +# server will use that explicit path. If the filenames do *not* begin +# with "/", the value of ServerRoot is prepended -- so 'log/access_log' +# with ServerRoot set to '/www' will be interpreted by the +# server as '/www/log/access_log', where as '/log/access_log' will be +# interpreted as '/log/access_log'. + +# +# ServerRoot: The top of the directory tree under which the server's +# configuration, error, and log files are kept. +# +# Do not add a slash at the end of the directory path. If you point +# ServerRoot at a non-local disk, be sure to specify a local disk on the +# Mutex directive, if file-based mutexes are used. If you wish to share the +# same ServerRoot for multiple httpd daemons, you will need to change at +# least PidFile. 
+# +ServerRoot "/etc/httpd" + +# +# Listen: Allows you to bind Apache to specific IP addresses and/or +# ports, instead of the default. See also the +# directive. +# +# Change this to Listen on specific IP addresses as shown below to +# prevent Apache from glomming onto all bound IP addresses. +# +#Listen 12.34.56.78:80 +Listen 80 + +# +# Dynamic Shared Object (DSO) Support +# +# To be able to use the functionality of a module which was built as a DSO you +# have to place corresponding `LoadModule' lines at this location so the +# directives contained in it are actually available _before_ they are used. +# Statically compiled modules (those listed by `httpd -l') do not need +# to be loaded here. +# +# Example: +# LoadModule foo_module modules/mod_foo.so +# +Include conf.modules.d/*.conf + +# +# If you wish httpd to run as a different user or group, you must run +# httpd as root initially and it will switch. +# +# User/Group: The name (or #number) of the user/group to run httpd as. +# It is usually good practice to create a dedicated user and group for +# running httpd, as with most system services. +# +User apache +Group apache + +# 'Main' server configuration +# +# The directives in this section set up the values used by the 'main' +# server, which responds to any requests that aren't handled by a +# definition. These values also provide defaults for +# any containers you may define later in the file. +# +# All of these directives may appear inside containers, +# in which case these default settings will be overridden for the +# virtual host being defined. +# + +# +# ServerAdmin: Your address, where problems with the server should be +# e-mailed. This address appears on some server-generated pages, such +# as error documents. e.g. admin@your-domain.com +# +ServerAdmin root@localhost + +# +# ServerName gives the name and port that the server uses to identify itself. 
+# This can often be determined automatically, but we recommend you specify +# it explicitly to prevent problems during startup. +# +# If your host doesn't have a registered DNS name, enter its IP address here. +# +#ServerName www.example.com:80 + +# +# Deny access to the entirety of your server's filesystem. You must +# explicitly permit access to web content directories in other +# blocks below. +# + + AllowOverride none + Require all denied + + +# +# Note that from this point forward you must specifically allow +# particular features to be enabled - so if something's not working as +# you might expect, make sure that you have specifically enabled it +# below. +# + +# +# DocumentRoot: The directory out of which you will serve your +# documents. By default, all requests are taken from this directory, but +# symbolic links and aliases may be used to point to other locations. +# +DocumentRoot "/var/www/html" + +# +# Relax access to content within /var/www. +# + + AllowOverride None + # Allow open access: + Require all granted + + +# Further relax access to the default document root: + + # + # Possible values for the Options directive are "None", "All", + # or any combination of: + # Indexes Includes FollowSymLinks SymLinksifOwnerMatch ExecCGI MultiViews + # + # Note that "MultiViews" must be named *explicitly* --- "Options All" + # doesn't give it to you. + # + # The Options directive is both complicated and important. Please see + # http://httpd.apache.org/docs/2.4/mod/core.html#options + # for more information. + # + Options Indexes FollowSymLinks + + # + # AllowOverride controls what directives may be placed in .htaccess files. + # It can be "All", "None", or any combination of the keywords: + # Options FileInfo AuthConfig Limit + # + AllowOverride None + + # + # Controls who can get stuff from this server. + # + Require all granted + + +# +# DirectoryIndex: sets the file that Apache will serve if a directory +# is requested. 
+# + + DirectoryIndex index.html + + +# +# The following lines prevent .htaccess and .htpasswd files from being +# viewed by Web clients. +# + + Require all denied + + +# +# ErrorLog: The location of the error log file. +# If you do not specify an ErrorLog directive within a +# container, error messages relating to that virtual host will be +# logged here. If you *do* define an error logfile for a +# container, that host's errors will be logged there and not here. +# +ErrorLog "logs/error_log" + +# +# LogLevel: Control the number of messages logged to the error_log. +# Possible values include: debug, info, notice, warn, error, crit, +# alert, emerg. +# +LogLevel debug + + + # + # The following directives define some format nicknames for use with + # a CustomLog directive (see below). + # + LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"" combined + LogFormat "%h %l %u %t \"%r\" %>s %b" common + + + # You need to enable mod_logio.c to use %I and %O + LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" %I %O" combinedio + + + # + # The location and format of the access logfile (Common Logfile Format). + # If you do not define any access logfiles within a + # container, they will be logged here. Contrariwise, if you *do* + # define per- access logfiles, transactions will be + # logged therein and *not* in this file. + # + #CustomLog "logs/access_log" common + + # + # If you prefer a logfile with access, agent, and referer information + # (Combined Logfile Format) you can use the following directive. + # + CustomLog "logs/access_log" combined + + + + # + # Redirect: Allows you to tell clients about documents that used to + # exist in your server's namespace, but do not anymore. The client + # will make a new request for the document at its new location. 
+ # Example: + # Redirect permanent /foo http://www.example.com/bar + + # + # Alias: Maps web paths into filesystem paths and is used to + # access content that does not live under the DocumentRoot. + # Example: + # Alias /webpath /full/filesystem/path + # + # If you include a trailing / on /webpath then the server will + # require it to be present in the URL. You will also likely + # need to provide a section to allow access to + # the filesystem path. + + # + # ScriptAlias: This controls which directories contain server scripts. + # ScriptAliases are essentially the same as Aliases, except that + # documents in the target directory are treated as applications and + # run by the server when requested rather than as documents sent to the + # client. The same rules about trailing "/" apply to ScriptAlias + # directives as to Alias. + # + ScriptAlias /cgi-bin/ "/var/www/cgi-bin/" + + + +# +# "/var/www/cgi-bin" should be changed to whatever your ScriptAliased +# CGI directory exists, if you have that configured. +# + + AllowOverride None + Options None + Require all granted + + + + # + # TypesConfig points to the file containing the list of mappings from + # filename extension to MIME-type. + # + TypesConfig /etc/mime.types + + # + # AddType allows you to add to or override the MIME configuration + # file specified in TypesConfig for specific file types. + # + #AddType application/x-gzip .tgz + # + # AddEncoding allows you to have certain browsers uncompress + # information on the fly. Note: Not all browsers support this. + # + #AddEncoding x-compress .Z + #AddEncoding x-gzip .gz .tgz + # + # If the AddEncoding directives above are commented-out, then you + # probably should define those extensions to indicate media types: + # + AddType application/x-compress .Z + AddType application/x-gzip .gz .tgz + + # + # AddHandler allows you to map certain file extensions to "handlers": + # actions unrelated to filetype. 
These can be either built into the server + # or added with the Action directive (see below) + # + # To use CGI scripts outside of ScriptAliased directories: + # (You will also need to add "ExecCGI" to the "Options" directive.) + # + #AddHandler cgi-script .cgi + + # For type maps (negotiated resources): + #AddHandler type-map var + + # + # Filters allow you to process content before it is sent to the client. + # + # To parse .shtml files for server-side includes (SSI): + # (You will also need to add "Includes" to the "Options" directive.) + # + AddType text/html .shtml + AddOutputFilter INCLUDES .shtml + + +# +# Specify a default charset for all content served; this enables +# interpretation of all content as UTF-8 by default. To use the +# default browser choice (ISO-8859-1), or to allow the META tags +# in HTML content to override this choice, comment out this +# directive: +# +AddDefaultCharset UTF-8 + + + # + # The mod_mime_magic module allows the server to use various hints from the + # contents of the file itself to determine its type. The MIMEMagicFile + # directive tells the module where the hint definitions are located. + # + MIMEMagicFile conf/magic + + +# +# Customizable error responses come in three flavors: +# 1) plain text 2) local redirects 3) external redirects +# +# Some examples: +#ErrorDocument 500 "The server made a boo boo." +#ErrorDocument 404 /missing.html +#ErrorDocument 404 "/cgi-bin/missing_handler.pl" +#ErrorDocument 402 http://www.example.com/subscription_info.html +# + +# +# EnableMMAP and EnableSendfile: On systems that support it, +# memory-mapping or the sendfile syscall may be used to deliver +# files. This usually improves server performance, but must +# be turned off when serving from networked-mounted +# filesystems or if support for these functions is otherwise +# broken on your system. 
+# Defaults if commented: EnableMMAP On, EnableSendfile Off +# +#EnableMMAP off +EnableSendfile on + +# Supplemental configuration +# +# Load config files in the "/etc/httpd/conf.d" directory, if any. +IncludeOptional conf.d/*.conf \ No newline at end of file diff --git a/slurm-cluster-chart/files/ood_portal.yaml b/slurm-cluster-chart/files/ood_portal.yaml new file mode 100644 index 0000000..88b6ed4 --- /dev/null +++ b/slurm-cluster-chart/files/ood_portal.yaml @@ -0,0 +1,246 @@ +--- +# +# Portal configuration +# + +# The address and port to listen for connections on +# Example: +# listen_addr_port: 443 +# Default: null (don't add any more listen directives) +#listen_addr_port: 80 + +# The server name used for name-based Virtual Host +# Example: +# servername: 'www.example.com' +# Default: null (don't use name-based Virtual Host) +servername: 128.232.226.84 +#serverAlias: 128.232.226.84 + +# The port specification for the Virtual Host +# Example: +# port: 8080 +#Default: null (use default port 80 or 443 if SSL enabled) +#port: null + +# List of SSL Apache directives +# Example: +# ssl: +# - 'SSLCertificateFile "/etc/pki/tls/certs/www.example.com.crt"' +# - 'SSLCertificateKeyFile "/etc/pki/tls/private/www.example.com.key"' +# Default: null (no SSL support) +#ssl: null + +# Root directory of log files (can be relative ServerRoot) +# Example: +# logroot: '/path/to/my/logs' +# Default: 'logs' (this is relative to ServerRoot) +#logroot: 'logs' + +# Root directory of the Lua handler code +# Example: +# lua_root: '/path/to/lua/handlers' +# Default : '/opt/ood/mod_ood_proxy/lib' (default install directory of mod_ood_proxy) +#lua_root: '/opt/ood/mod_ood_proxy/lib' + +# Verbosity of the Lua module logging +# (see https://httpd.apache.org/docs/2.4/mod/core.html#loglevel) +# Example: +# lua_log_level: 'warn' +# Default: 'info' (get verbose logs) +#lua_log_level: 'info' + +# System command used to map authenticated-user to system-user +# Example: +# user_map_cmd: 
'/opt/ood/ood_auth_map/bin/ood_auth_map.regex --regex=''^(\w+)@example.com$''' +# Default: '/opt/ood/ood_auth_map/bin/ood_auth_map.regex' (this echo's back auth-user) +#user_map_cmd: '/opt/ood/ood_auth_map/bin/ood_auth_map.regex' + +# Use an alternative CGI environment variable instead of REMOTE_USER for +# determining the authenticated-user fed to the mapping script +# Example: +# user_env: 'OIDC_CLAIM_preferred_username' +# Default: null (use REMOTE_USER) +#user_env: null + +# Redirect user to the following URI if fail to map there authenticated-user to +# a system-user +# Example: +# map_fail_uri: '/register' +# Default: null (don't redirect, just display error message) +#map_fail_uri: null + +# System command used to run the `nginx_stage` script with sudo privileges +# Example: +# pun_stage_cmd: 'sudo /path/to/nginx_stage' +# Default: 'sudo /opt/ood/nginx_stage/sbin/nginx_stage' (don't forget sudo) +#pun_stage_cmd: 'sudo /opt/ood/nginx_stage/sbin/nginx_stage' + +# List of Apache authentication directives +# NB: Be sure the appropriate Apache module is installed for this +# Default: (see below, uses basic auth with an htpasswd file) +# auth: +# - 'AuthType Basic' +# - 'AuthName "private"' +# - 'AuthUserFile "/opt/rh/httpd24/root/etc/httpd/.htpasswd"' +# - 'RequestHeader unset Authorization' +# - 'Require valid-user' + +# Redirect user to the following URI when accessing root URI +# Example: +# root_uri: '/my_uri' +# # https://www.example.com/ => https://www.example.com/my_uri +# Default: '/pun/sys/dashboard' (default location of the OOD Dashboard app) +#root_uri: '/pun/sys/dashboard' + +# Track server-side analytics with a Google Analytics account and property +# (see https://github.com/OSC/mod_ood_proxy/blob/master/lib/analytics.lua for +# information on how to setup the GA property) +# Example: +# analytics: +# url: 'http://www.google-analytics.com/collect' +# id: 'UA-79331310-4' +# Default: null (do not track) +#analytics: null + +# +# Publicly available 
assets +# + +# Public sub-uri (available to public with no authentication) +# Example: +# public_uri: '/assets' +# Default: '/public' +#public_uri: '/public' + +# Root directory that serves the public sub-uri (be careful, everything under +# here is open to the public) +# Example: +# public_root: '/path/to/public/assets' +# Default: '/var/www/ood/public' +#public_root: '/var/www/ood/public' + +# +# Logout redirect helper +# + +# Logout sub-uri +# Example +# logout_uri: '/log_me_out' +# NB: If you change this, then modify the Dashboard app with the new sub-uri +# Default: '/logout' (the Dashboard app is by default going to expect this) +#logout_uri: '/logout' + +# Redirect user to the following URI when accessing logout URI +# Example: +# logout_redirect: '/oidc?logout=https%3A%2F%2Fwww.example.com' +# Default: '/pun/sys/dashboard/logout' (the Dashboard app provides a simple +# HTML page explaining logout to the user) +#logout_redirect: '/pun/sys/dashboard/logout' + +# +# Reverse proxy to backend nodes +# + +# Regular expression used for whitelisting allowed hostnames of nodes +# Example: +# host_regex: '[\w.-]+\.example\.com' +# Default: '[^/]+' (allow reverse proxying to all hosts, this allows external +# hosts as well) +#host_regex: '[^/]+' + +# Sub-uri used to reverse proxy to backend web server running on node that +# knows the full URI path +# Example: +# node_uri: '/node' +# Default: null (disable this feature) +#node_uri: null + +# Sub-uri used to reverse proxy to backend web server running on node that +# ONLY uses *relative* URI paths +# Example: +# rnode_uri: '/rnode' +# Default: null (disable this feature) +#rnode_uri: null + +# +# Per-user NGINX Passenger apps +# + +# Sub-uri used to control PUN processes +# Example: +# nginx_uri: '/my_pun_controller' +# Default: '/nginx' +#nginx_uri: '/nginx' + +# Sub-uri used to access the PUN processes +# Example: +# pun_uri: '/my_pun_apps' +# Default: '/pun' +#pun_uri: '/pun' + +# Root directory that contains the 
PUN Unix sockets that the proxy uses to +# connect to +# Example: +# pun_socket_root: '/path/to/pun/sockets' +# Default: '/var/run/ondemand-nginx' (default location set in nginx_stage) +#pun_socket_root: '/var/run/ondemand-nginx' + +# Number of times the proxy attempts to connect to the PUN Unix socket before +# giving up and displaying an error to the user +# Example: +# pun_max_retries: 25 +# Default: 5 (only try 5 times) +#pun_max_retries: 5 + +# +# Support for OpenID Connect +# + +# Sub-uri used by mod_auth_openidc for authentication +# Example: +# oidc_uri: '/oidc' +# Default: null (disable OpenID Connect support) +#oidc_uri: null + +# Sub-uri user is redirected to if they are not authenticated. This is used to +# *discover* what ID provider the user will login through. +# Example: +# oidc_discover_uri: '/discover' +# Default: null (disable support for discovering OpenID Connect IdP) +#oidc_discover_uri: null + +# Root directory on the filesystem that serves the HTML code used to display +# the discovery page +# Example: +# oidc_discover_root: '/var/www/ood/discover' +# Default: null (disable support for discovering OpenID Connect IdP) +#oidc_discover_root: null + +# +# Support for registering unmapped users +# +# (Not necessary if using regular expressions for mapping users) +# + +# Sub-uri user is redirected to if unable to map authenticated-user to +# system-user +# Example: +# register_uri: '/register' +# Default: null (display error to user if mapping fails) +#register_uri: null + +# Root directory on the filesystem that serves the HTML code used to register +# an unmapped user +# Example: +# register_root: '/var/www/ood/register' +# Default: null (display error to user if mapping fails) +#register_root: null + +host_regex: 'head' +auth: + - 'AuthType Basic' + - 'AuthName "private"' + - 'AuthBasicProvider dbm' + - 'AuthDBMUserFile "/opt/rh/httpd24/root/etc/httpd/.htpasswd.dbm"' + - 'RequestHeader unset Authorization' + - 'Require valid-user' \ No newline 
at end of file diff --git a/slurm-cluster-chart/templates/httpd-configmap.yaml b/slurm-cluster-chart/templates/httpd-configmap.yaml new file mode 100644 index 0000000..93eb6ea --- /dev/null +++ b/slurm-cluster-chart/templates/httpd-configmap.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: httpd-configmap +data: + httpd.conf: | + {{- .Files.Get "files/httpd.conf" | nindent 4 -}} + \ No newline at end of file diff --git a/slurm-cluster-chart/templates/login-deployment.yaml b/slurm-cluster-chart/templates/login-deployment.yaml index 2b49536..b27a1af 100644 --- a/slurm-cluster-chart/templates/login-deployment.yaml +++ b/slurm-cluster-chart/templates/login-deployment.yaml @@ -28,6 +28,7 @@ spec: name: login ports: - containerPort: 22 + - containerPort: 80 volumeMounts: - mountPath: {{ .Values.nfs.mountPath }} name: slurm-jobdir @@ -43,6 +44,12 @@ spec: - name: authorized-keys mountPath: /tempmounts/authorized_keys subPath: authorized_keys + - name: ood-portal + mountPath: /etc/ood/config/ood_portal.yml + subPath: ood_portal.yml + - name: httpd-config + mountPath: /etc/httpd/conf/httpd.conf + subPath: httpd.conf resources: {} hostname: login restartPolicy: Always @@ -62,3 +69,9 @@ spec: - name: authorized-keys configMap: name: {{ .Values.configmaps.authorizedKeys }} + - name: ood-portal + configMap: + name: ood-portal-configmap + - name: httpd-config + configMap: + name: httpd-configmap diff --git a/slurm-cluster-chart/templates/login-service.yaml b/slurm-cluster-chart/templates/login-service.yaml index 0a38ba4..fee3480 100644 --- a/slurm-cluster-chart/templates/login-service.yaml +++ b/slurm-cluster-chart/templates/login-service.yaml @@ -11,6 +11,10 @@ spec: - name: "ssh" port: 22 targetPort: 22 + - name: "apache" + port: 80 + targetPort: 80 + protocol: TCP type: LoadBalancer selector: app.kubernetes.io/name: slurm diff --git a/slurm-cluster-chart/templates/ood-portal-configmap.yaml b/slurm-cluster-chart/templates/ood-portal-configmap.yaml new 
file mode 100644 index 0000000..6770d82 --- /dev/null +++ b/slurm-cluster-chart/templates/ood-portal-configmap.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: ood-portal-configmap +data: + ood_portal.yml: | + {{- .Files.Get "files/ood_portal.yaml" | nindent 4 -}} + \ No newline at end of file diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 92e5088..68c209b 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:05bbb87 +sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:1995fd9 replicas: slurmd: 2 From 6abcad04da63cc532eb15a58009477ec371f46fd Mon Sep 17 00:00:00 2001 From: Will Date: Mon, 17 Jul 2023 10:25:49 +0100 Subject: [PATCH 005/152] Creating htpasswd file and adding user on startup --- docker-entrypoint.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 16a002d..0ff2dc8 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -106,8 +106,10 @@ then echo "---> Starting Apache Server" /usr/libexec/httpd-ssl-gencerts /opt/ood/ood-portal-generator/sbin/update_ood_portal + mkdir --parents /opt/rh/httpd24/root/etc/httpd/ groupadd ood useradd -d /home/ood -g ood -k /etc/skel -m ood + /usr/bin/htpasswd -cb /opt/rh/httpd24/root/etc/httpd/.htpasswd.dbm ood password /usr/sbin/httpd -k start -X -e debug fi From 494a7a522fda5a55e764e0aad895d4213006f2db Mon Sep 17 00:00:00 2001 From: Will Date: Mon, 17 Jul 2023 11:06:33 +0100 Subject: [PATCH 006/152] Now adds rocky as authenticated user and uses htdbm to generate auth file --- docker-entrypoint.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 0ff2dc8..951e8d0 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -107,9 +107,7 @@ then /usr/libexec/httpd-ssl-gencerts /opt/ood/ood-portal-generator/sbin/update_ood_portal mkdir --parents 
/opt/rh/httpd24/root/etc/httpd/ - groupadd ood - useradd -d /home/ood -g ood -k /etc/skel -m ood - /usr/bin/htpasswd -cb /opt/rh/httpd24/root/etc/httpd/.htpasswd.dbm ood password + /usr/bin/htdbm -cb /opt/rh/httpd24/root/etc/httpd/.htpasswd.dbm rocky password /usr/sbin/httpd -k start -X -e debug fi From 547428befb091bf32cd9ce49f7d3b8fc28c0f3f7 Mon Sep 17 00:00:00 2001 From: Will Date: Mon, 17 Jul 2023 13:50:09 +0100 Subject: [PATCH 007/152] Updated image + mounted cluster config --- slurm-cluster-chart/files/ood-cluster-config.yml | 6 ++++++ slurm-cluster-chart/files/ood_portal.yaml | 2 +- .../templates/cluster-config-configmap.yaml | 7 +++++++ slurm-cluster-chart/templates/login-deployment.yaml | 6 ++++++ slurm-cluster-chart/values.yaml | 2 +- 5 files changed, 21 insertions(+), 2 deletions(-) create mode 100644 slurm-cluster-chart/files/ood-cluster-config.yml create mode 100644 slurm-cluster-chart/templates/cluster-config-configmap.yaml diff --git a/slurm-cluster-chart/files/ood-cluster-config.yml b/slurm-cluster-chart/files/ood-cluster-config.yml new file mode 100644 index 0000000..c1b1905 --- /dev/null +++ b/slurm-cluster-chart/files/ood-cluster-config.yml @@ -0,0 +1,6 @@ +v2: + metadata: + title: "My Cluster" + login: + #host: www.example.com + host: 128.232.226.84 \ No newline at end of file diff --git a/slurm-cluster-chart/files/ood_portal.yaml b/slurm-cluster-chart/files/ood_portal.yaml index 88b6ed4..4eee040 100644 --- a/slurm-cluster-chart/files/ood_portal.yaml +++ b/slurm-cluster-chart/files/ood_portal.yaml @@ -13,7 +13,7 @@ # Example: # servername: 'www.example.com' # Default: null (don't use name-based Virtual Host) -servername: 128.232.226.84 +#servername: 128.232.226.84 #serverAlias: 128.232.226.84 # The port specification for the Virtual Host diff --git a/slurm-cluster-chart/templates/cluster-config-configmap.yaml b/slurm-cluster-chart/templates/cluster-config-configmap.yaml new file mode 100644 index 0000000..3076dc9 --- /dev/null +++ 
b/slurm-cluster-chart/templates/cluster-config-configmap.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: cluster-config +data: + httpd.conf: | + {{- .Files.Get "files/ood-cluster-config.yaml" | nindent 4 -}} \ No newline at end of file diff --git a/slurm-cluster-chart/templates/login-deployment.yaml b/slurm-cluster-chart/templates/login-deployment.yaml index b27a1af..ae3e91b 100644 --- a/slurm-cluster-chart/templates/login-deployment.yaml +++ b/slurm-cluster-chart/templates/login-deployment.yaml @@ -50,6 +50,9 @@ spec: - name: httpd-config mountPath: /etc/httpd/conf/httpd.conf subPath: httpd.conf + - name: cluster-config + mountPath: /etc/ood/config/cluster.d/ood-cluster-config.yml + subPath: ood-cluster-config.yml resources: {} hostname: login restartPolicy: Always @@ -69,6 +72,9 @@ spec: - name: authorized-keys configMap: name: {{ .Values.configmaps.authorizedKeys }} + - name: cluster-config + configMap: + name: cluster-config - name: ood-portal configMap: name: ood-portal-configmap diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 68c209b..824d5da 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:1995fd9 +sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:494a7a5 replicas: slurmd: 2 From a1bd3706ed8fadd952d44fd15cee5c3651f04a77 Mon Sep 17 00:00:00 2001 From: Will Date: Mon, 17 Jul 2023 13:51:29 +0100 Subject: [PATCH 008/152] Trying creating shell directory on startup --- docker-entrypoint.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 951e8d0..a228afb 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -107,6 +107,7 @@ then /usr/libexec/httpd-ssl-gencerts /opt/ood/ood-portal-generator/sbin/update_ood_portal mkdir --parents /opt/rh/httpd24/root/etc/httpd/ + mkdir --parents /etc/ood/config/apps/shell /usr/bin/htdbm -cb 
/opt/rh/httpd24/root/etc/httpd/.htpasswd.dbm rocky password /usr/sbin/httpd -k start -X -e debug fi From ee321c9ea55b27450d09238e1c99ca2e1c8c2a1b Mon Sep 17 00:00:00 2001 From: Will Date: Mon, 17 Jul 2023 14:27:42 +0100 Subject: [PATCH 009/152] Trying adding env file to shell directory --- docker-entrypoint.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index a228afb..8c37d2c 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -108,6 +108,7 @@ then /opt/ood/ood-portal-generator/sbin/update_ood_portal mkdir --parents /opt/rh/httpd24/root/etc/httpd/ mkdir --parents /etc/ood/config/apps/shell + touch /etc/ood/config/apps/shell/env /usr/bin/htdbm -cb /opt/rh/httpd24/root/etc/httpd/.htpasswd.dbm rocky password /usr/sbin/httpd -k start -X -e debug fi From d48976b87039c6c92cb77aa3392b8ed12dc2db04 Mon Sep 17 00:00:00 2001 From: Will Date: Mon, 17 Jul 2023 14:40:24 +0100 Subject: [PATCH 010/152] Bump values.yaml --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 824d5da..3a2ec3c 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:494a7a5 +sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:ee321c9 replicas: slurmd: 2 From e3b877436a67ac5611063a4bb9fb4d49f0ab61c1 Mon Sep 17 00:00:00 2001 From: Will Date: Tue, 18 Jul 2023 11:13:49 +0100 Subject: [PATCH 011/152] Trying installing modules in Dockerfile --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index bcc6231..855a1cc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,6 +17,7 @@ RUN set -ex \ && yum config-manager --set-enabled powertools \ && yum -y module enable ruby:2.7 nodejs:14 \ && yum -y install https://yum.osc.edu/ondemand/2.0/ondemand-release-web-2.0-1.noarch.rpm \ + && yum -y module install ruby nodejs \ 
&& yum -y install \ wget \ bzip2 \ From 2172d7bec37e02ee865332a6741e44664a5735a7 Mon Sep 17 00:00:00 2001 From: Will Date: Tue, 18 Jul 2023 13:14:00 +0100 Subject: [PATCH 012/152] Trying to configure clusters (not working) --- slurm-cluster-chart/files/ood-cluster-config.yml | 8 ++++++-- slurm-cluster-chart/values.yaml | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/slurm-cluster-chart/files/ood-cluster-config.yml b/slurm-cluster-chart/files/ood-cluster-config.yml index c1b1905..c643189 100644 --- a/slurm-cluster-chart/files/ood-cluster-config.yml +++ b/slurm-cluster-chart/files/ood-cluster-config.yml @@ -2,5 +2,9 @@ v2: metadata: title: "My Cluster" login: - #host: www.example.com - host: 128.232.226.84 \ No newline at end of file + host: "localhost" + job: + cluster: "linux" + adapter: "slurm" + bin: "/usr/sbin" + conf: "/etc/slurm/slurm.conf" \ No newline at end of file diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 3a2ec3c..f3d0e41 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:ee321c9 +sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:e3b8774 replicas: slurmd: 2 From 3f86fbe7853e977eb8099e00b2f52e99a271235a Mon Sep 17 00:00:00 2001 From: Will Date: Tue, 18 Jul 2023 14:00:53 +0100 Subject: [PATCH 013/152] Trying entrypoint tweaks --- docker-entrypoint.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 8c37d2c..ff8b5e2 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -104,11 +104,14 @@ then echo "---> MUNGE Complete" echo "---> Starting Apache Server" + + mkdir --parents /etc/ood/config/apps/shell + env > /etc/ood/config/apps/shell/env + /usr/libexec/httpd-ssl-gencerts /opt/ood/ood-portal-generator/sbin/update_ood_portal mkdir --parents /opt/rh/httpd24/root/etc/httpd/ - mkdir --parents /etc/ood/config/apps/shell - touch
/etc/ood/config/apps/shell/env + /usr/bin/htdbm -cb /opt/rh/httpd24/root/etc/httpd/.htpasswd.dbm rocky password /usr/sbin/httpd -k start -X -e debug fi From 7c541b0b22efbed961f1d655d6bded1c0f0ed8b1 Mon Sep 17 00:00:00 2001 From: Will Date: Tue, 18 Jul 2023 15:06:02 +0100 Subject: [PATCH 014/152] Trying to configure cluster with the login nodes --- slurm-cluster-chart/files/ood-cluster-config.yml | 3 ++- slurm-cluster-chart/values.yaml | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/slurm-cluster-chart/files/ood-cluster-config.yml b/slurm-cluster-chart/files/ood-cluster-config.yml index c643189..d50a59b 100644 --- a/slurm-cluster-chart/files/ood-cluster-config.yml +++ b/slurm-cluster-chart/files/ood-cluster-config.yml @@ -1,8 +1,9 @@ +--- v2: metadata: title: "My Cluster" login: - host: "localhost" + host: "login" job: cluster: "linux" adapter: "slurm" diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index f3d0e41..c9c72c2 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:e3b8774 +sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:3f86fbe replicas: slurmd: 2 From c24c181a29f5f35c956dcd8ef43104ecf3a575df Mon Sep 17 00:00:00 2001 From: Will Date: Tue, 18 Jul 2023 15:16:57 +0100 Subject: [PATCH 015/152] Image now sets up rocky OOD password with env variable from secret --- docker-entrypoint.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index ff8b5e2..528b454 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -112,7 +112,7 @@ then /opt/ood/ood-portal-generator/sbin/update_ood_portal mkdir --parents /opt/rh/httpd24/root/etc/httpd/ - /usr/bin/htdbm -cb /opt/rh/httpd24/root/etc/httpd/.htpasswd.dbm rocky password + /usr/bin/htdbm -cb /opt/rh/httpd24/root/etc/httpd/.htpasswd.dbm rocky $ROCKY_OOD_PASS /usr/sbin/httpd -k start -X -e debug fi From 
ad79e16d07774696417f437ba1a16e6959cbf4a0 Mon Sep 17 00:00:00 2001 From: Will Date: Tue, 18 Jul 2023 15:30:05 +0100 Subject: [PATCH 016/152] Rocky OOD password now set as secret from generate-secrets.sh --- generate-secrets.sh | 15 ++++++++++++++- .../templates/login-deployment.yaml | 6 ++++++ slurm-cluster-chart/values.yaml | 2 +- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/generate-secrets.sh b/generate-secrets.sh index db64a53..37fc2ef 100755 --- a/generate-secrets.sh +++ b/generate-secrets.sh @@ -10,4 +10,17 @@ kubectl create secret generic munge-key-secret \ --dry-run=client \ --from-literal=munge.key=$(dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64 -w 0) \ -o yaml | \ -kubectl apply -f - \ No newline at end of file +kubectl apply -f - + +OOD_PASS=$(tr -dc 'A-Za-z0-9' Date: Wed, 19 Jul 2023 09:44:17 +0100 Subject: [PATCH 017/152] Fixed broken mountpath for cluster config --- slurm-cluster-chart/templates/cluster-config-configmap.yaml | 4 ++-- slurm-cluster-chart/templates/login-deployment.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/slurm-cluster-chart/templates/cluster-config-configmap.yaml b/slurm-cluster-chart/templates/cluster-config-configmap.yaml index 3076dc9..914a456 100644 --- a/slurm-cluster-chart/templates/cluster-config-configmap.yaml +++ b/slurm-cluster-chart/templates/cluster-config-configmap.yaml @@ -3,5 +3,5 @@ kind: ConfigMap metadata: name: cluster-config data: - httpd.conf: | - {{- .Files.Get "files/ood-cluster-config.yaml" | nindent 4 -}} \ No newline at end of file + ood-cluster-config.yml: | + {{- .Files.Get "files/ood-cluster-config.yml" | nindent 4 -}} \ No newline at end of file diff --git a/slurm-cluster-chart/templates/login-deployment.yaml b/slurm-cluster-chart/templates/login-deployment.yaml index 95497a7..7631485 100644 --- a/slurm-cluster-chart/templates/login-deployment.yaml +++ b/slurm-cluster-chart/templates/login-deployment.yaml @@ -57,7 +57,7 @@ spec: mountPath: 
/etc/httpd/conf/httpd.conf subPath: httpd.conf - name: cluster-config - mountPath: /etc/ood/config/cluster.d/ood-cluster-config.yml + mountPath: /etc/ood/config/clusters.d/ood-cluster-config.yml subPath: ood-cluster-config.yml resources: {} hostname: login From 44e71b4d9ec62bde84f69e05aa4c3b4a45640e34 Mon Sep 17 00:00:00 2001 From: Will Date: Wed, 19 Jul 2023 10:06:55 +0100 Subject: [PATCH 018/152] Fixed incorrect slurm binaries path --- slurm-cluster-chart/files/ood-cluster-config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/slurm-cluster-chart/files/ood-cluster-config.yml b/slurm-cluster-chart/files/ood-cluster-config.yml index d50a59b..336c0af 100644 --- a/slurm-cluster-chart/files/ood-cluster-config.yml +++ b/slurm-cluster-chart/files/ood-cluster-config.yml @@ -1,11 +1,11 @@ --- v2: metadata: - title: "My Cluster" + title: "Slurm Cluster" login: host: "login" job: cluster: "linux" adapter: "slurm" - bin: "/usr/sbin" + bin: "/usr/bin" conf: "/etc/slurm/slurm.conf" \ No newline at end of file From 804c74dc388bca8a3c68948483feeeacb09de567 Mon Sep 17 00:00:00 2001 From: Will Date: Wed, 19 Jul 2023 10:09:51 +0100 Subject: [PATCH 019/152] Updated docs --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 92183f5..a23317e 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,8 @@ On initial deployment ONLY, run ``` This generates a set of secrets. If these need to be regenerated, see "Reconfiguring the Cluster" +Be sure to take note of the Open Ondemand credentials, you will need them to access the cluster through a browser + ### Connecting RWX Volume A ReadWriteMany (RWX) volume is required, if a named volume exists, set `nfs.claimName` in the `values.yaml` file to its name. If not, manifests to deploy a Rook NFS volume are provided in the `/nfs` directory. 
You can deploy this by running From 0e2666afe261ef4fb616bb45d62cd07ba93fa1b6 Mon Sep 17 00:00:00 2001 From: Will Date: Wed, 19 Jul 2023 11:10:39 +0100 Subject: [PATCH 020/152] Changed image to allow self-sshing --- docker-entrypoint.sh | 4 ++++ slurm-cluster-chart/files/ood-cluster-config.yml | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 528b454..8a0cad5 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -103,6 +103,10 @@ then gosu munge /usr/sbin/munged echo "---> MUNGE Complete" + echo "---> Setting up self ssh capabilities for OOD" + ssh-keyscan localhost > /etc/ssh/ssh_known_hosts + cat /home/rocky/.ssh/id_rsa.pub >> /home/rocky/.ssh/known_hosts + echo "---> Starting Apache Server" mkdir --parents /etc/ood/config/apps/shell diff --git a/slurm-cluster-chart/files/ood-cluster-config.yml b/slurm-cluster-chart/files/ood-cluster-config.yml index 336c0af..cc0ab76 100644 --- a/slurm-cluster-chart/files/ood-cluster-config.yml +++ b/slurm-cluster-chart/files/ood-cluster-config.yml @@ -3,7 +3,7 @@ v2: metadata: title: "Slurm Cluster" login: - host: "login" + host: "localhost" job: cluster: "linux" adapter: "slurm" From 7513b720200a0615dc9f402dd5ac3abc93baa82a Mon Sep 17 00:00:00 2001 From: Will Date: Wed, 19 Jul 2023 11:16:17 +0100 Subject: [PATCH 021/152] Fixed incorrect path --- docker-entrypoint.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 8a0cad5..2961a12 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -105,7 +105,7 @@ then echo "---> Setting up self ssh capabilities for OOD" ssh-keyscan localhost > /etc/ssh/ssh_known_hosts - cat /home/rocky/.ssh/id_rsa.pub >> /home/rocky/.ssh/known_hosts + cat /home/rocky/.ssh/id_rsa.pub >> /home/rocky/.ssh/authorized_keys echo "---> Starting Apache Server" From 4ba09915c728620098661ca9f4ddc8cea7ce54b4 Mon Sep 17 00:00:00 2001 From: Will Date: Wed, 19 Jul 2023 
11:25:30 +0100 Subject: [PATCH 022/152] Added newline to avoid breaking authorized_keys file --- docker-entrypoint.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 2961a12..039fc8d 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -105,6 +105,7 @@ then echo "---> Setting up self ssh capabilities for OOD" ssh-keyscan localhost > /etc/ssh/ssh_known_hosts + echo "" >> /home/rocky/.ssh/authorized_keys #Adding newline to avoid breaking authorized_keys file cat /home/rocky/.ssh/id_rsa.pub >> /home/rocky/.ssh/authorized_keys echo "---> Starting Apache Server" From 833b0d24cc55568cac3bdde440ada6e5f736c676 Mon Sep 17 00:00:00 2001 From: Will Date: Wed, 19 Jul 2023 12:16:21 +0100 Subject: [PATCH 023/152] Bumped image --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 739ae8d..6a669c7 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:c24c181 +sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:4ba0991 replicas: slurmd: 2 From d38e241e5dc0e7d239a3456173810a972aae3a2d Mon Sep 17 00:00:00 2001 From: Will Date: Wed, 19 Jul 2023 13:24:18 +0100 Subject: [PATCH 024/152] Removed host key generation from login image --- docker-entrypoint.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 039fc8d..aa8506a 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -96,7 +96,6 @@ then done echo "---> Complete" echo "Starting sshd" - ssh-keygen -A /usr/sbin/sshd echo "---> Starting the MUNGE Authentication service (munged) ..." 
From a89e584a1d63028c090dcf127e653f4602fbb5fe Mon Sep 17 00:00:00 2001 From: Will Date: Wed, 19 Jul 2023 13:54:50 +0100 Subject: [PATCH 025/152] Updated image to copy and set permissions for host keys from mount --- docker-entrypoint.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index aa8506a..75be39a 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -96,6 +96,11 @@ then done echo "---> Complete" echo "Starting sshd" + cp /tempmounts/etc/ssh/* /etc/ssh/ + chmod 600 /etc/ssh/ssh_host_dsa_key + chmod 600 /etc/ssh/ssh_host_ecdsa_key + chmod 600 /etc/ssh/ssh_host_ed25519_key + chmod 600 /etc/ssh/ssh_host_rsa_key /usr/sbin/sshd echo "---> Starting the MUNGE Authentication service (munged) ..." From a6c8e3840b0e39f34ab586deed5901497af9e92e Mon Sep 17 00:00:00 2001 From: Will Date: Wed, 19 Jul 2023 14:02:25 +0100 Subject: [PATCH 026/152] Server now has persistent set of host keys from mount --- generate-secrets.sh | 9 +++++ .../templates/login-deployment.yaml | 34 +++++++++++++++++++ slurm-cluster-chart/values.yaml | 2 +- 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/generate-secrets.sh b/generate-secrets.sh index 37fc2ef..e98b97e 100755 --- a/generate-secrets.sh +++ b/generate-secrets.sh @@ -12,6 +12,15 @@ kubectl create secret generic munge-key-secret \ -o yaml | \ kubectl apply -f - +mkdir -p ./temphostkeys/etc/ssh +ssh-keygen -A -f ./temphostkeys +kubectl create secret generic host-keys-secret \ +--dry-run=client \ +--from-file=./temphostkeys/etc/ssh \ +-o yaml | \ +kubectl apply -f - +rm -rf ./temphostkeys + OOD_PASS=$(tr -dc 'A-Za-z0-9' Date: Wed, 19 Jul 2023 14:03:03 +0100 Subject: [PATCH 027/152] Removed comments --- .../templates/login-deployment.yaml | 29 ------------------- 1 file changed, 29 deletions(-) diff --git a/slurm-cluster-chart/templates/login-deployment.yaml b/slurm-cluster-chart/templates/login-deployment.yaml index f0052e3..5d15550 100644 --- 
a/slurm-cluster-chart/templates/login-deployment.yaml +++ b/slurm-cluster-chart/templates/login-deployment.yaml @@ -61,35 +61,6 @@ spec: subPath: ood-cluster-config.yml - name: host-keys mountPath: /tempmounts/etc/ssh - #Host keys - # - name: host-keys - # mountPath: /etc/ssh/ssh_host_dsa_key - # subPath: ssh_host_dsa_key - # readOnly: true - # - name: host-keys - # mountPath: /etc/ssh/ssh_host_dsa_key.pub - # subPath: ssh_host_dsa_key.pub - # - name: host-keys - # mountPath: /etc/ssh/ssh_host_ecdsa_key - # subPath: ssh_host_ecdsa_key - # readOnly: true - # - name: host-keys - # mountPath: /etc/ssh/ssh_host_ecdsa_key.pub - # subPath: ssh_host_ecdsa_key.pub - # - name: host-keys - # mountPath: /etc/ssh/ssh_host_ed25519_key - # subPath: ssh_host_ed25519_key - # readOnly: true - # - name: host-keys - # mountPath: /etc/ssh/ssh_host_ed25519_key.pub - # subPath: ssh_host_ed25519_key.pub - # - name: host-keys - # mountPath: /etc/ssh/ssh_host_rsa_key - # subPath: ssh_host_rsa_key - # readOnly: true - # - name: host-keys - # mountPath: /etc/ssh/ssh_host_rsa_key.pub - # subPath: ssh_host_rsa_key.pub resources: {} hostname: login restartPolicy: Always From 1345a581f1fb5f8c1cf83315b9f8247d4a0fce06 Mon Sep 17 00:00:00 2001 From: Will Date: Wed, 19 Jul 2023 16:12:43 +0100 Subject: [PATCH 028/152] Added https (fixes job composer) --- slurm-cluster-chart/files/ood_portal.yaml | 3 +++ slurm-cluster-chart/templates/login-deployment.yaml | 1 + slurm-cluster-chart/templates/login-service.yaml | 4 ++++ 3 files changed, 8 insertions(+) diff --git a/slurm-cluster-chart/files/ood_portal.yaml b/slurm-cluster-chart/files/ood_portal.yaml index 4eee040..9be3295 100644 --- a/slurm-cluster-chart/files/ood_portal.yaml +++ b/slurm-cluster-chart/files/ood_portal.yaml @@ -29,6 +29,9 @@ # - 'SSLCertificateKeyFile "/etc/pki/tls/private/www.example.com.key"' # Default: null (no SSL support) #ssl: null +ssl: +- 'SSLCertificateFile "/etc/pki/tls/certs/localhost.crt"' +- 'SSLCertificateKeyFile 
"/etc/pki/tls/private/localhost.key"' # Root directory of log files (can be relative ServerRoot) # Example: diff --git a/slurm-cluster-chart/templates/login-deployment.yaml b/slurm-cluster-chart/templates/login-deployment.yaml index 5d15550..1f24e8a 100644 --- a/slurm-cluster-chart/templates/login-deployment.yaml +++ b/slurm-cluster-chart/templates/login-deployment.yaml @@ -35,6 +35,7 @@ spec: ports: - containerPort: 22 - containerPort: 80 + - containerPort: 443 volumeMounts: - mountPath: {{ .Values.nfs.mountPath }} name: slurm-jobdir diff --git a/slurm-cluster-chart/templates/login-service.yaml b/slurm-cluster-chart/templates/login-service.yaml index fee3480..fcc3e49 100644 --- a/slurm-cluster-chart/templates/login-service.yaml +++ b/slurm-cluster-chart/templates/login-service.yaml @@ -15,6 +15,10 @@ spec: port: 80 targetPort: 80 protocol: TCP + - name: "https" + port: 443 + targetPort: 443 + protocol: TCP type: LoadBalancer selector: app.kubernetes.io/name: slurm From 0f286ed3c67afd6dc4cce590d803767eb6316e18 Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 20 Jul 2023 15:47:20 +0100 Subject: [PATCH 029/152] Now generates keys for rocky to self-ssh if don't already exist (in image) --- docker-entrypoint.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 75be39a..cba2464 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -84,9 +84,16 @@ fi if [ "$1" = "login" ] then + echo "---> Setting up ssh for user" mkdir -p /home/rocky/.ssh cp tempmounts/authorized_keys /home/rocky/.ssh/authorized_keys + if [ -f /home/rocky/.ssh/id_rsa.pub ]; then + echo "ssh keys already found" + else + ssh-keygen -t rsa -f /home/rocky/.ssh/id_rsa -N "" + fi + echo "---> Setting permissions for user home directories" cd /home for DIR in */; From c0947542a68868591b9ae19f7f58c9385ac81b90 Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 20 Jul 2023 15:52:59 +0100 Subject: [PATCH 030/152] Updated image tag --- 
slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 93c606c..93964f0 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:a89e584 +sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:0f286ed replicas: slurmd: 2 From a5b71c24f4c57b939a1c37bea034d6bf0a8a2f80 Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 20 Jul 2023 16:26:21 +0100 Subject: [PATCH 031/152] Updated image after merge --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index a0e5fdc..1bef86e 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:0f286ed #CHANGE AFTER REBUILD +sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:3daa29f replicas: slurmd: 2 From 56c57ef17eaf2045bb0fd45a506f375d218a3be9 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 25 Jul 2023 10:17:05 +0000 Subject: [PATCH 032/152] add kubectl to slurmctl /etc/slurm --- generate-secrets.sh | 4 +++- slurm-cluster-chart/files/kubectl | 19 +++++++++++++++++++ slurm-cluster-chart/templates/kubectl.yml | 8 ++++++++ .../templates/slurmctld-statefulset.yaml | 8 ++++++++ 4 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 slurm-cluster-chart/files/kubectl create mode 100644 slurm-cluster-chart/templates/kubectl.yml diff --git a/generate-secrets.sh b/generate-secrets.sh index db64a53..a3eeac9 100755 --- a/generate-secrets.sh +++ b/generate-secrets.sh @@ -10,4 +10,6 @@ kubectl create secret generic munge-key-secret \ --dry-run=client \ --from-literal=munge.key=$(dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64 -w 0) \ -o yaml | \ -kubectl apply -f - \ No newline at end of file +kubectl apply -f - + +cp 
$KUBECONFIG slurm-cluster-chart/files/kubectl diff --git a/slurm-cluster-chart/files/kubectl b/slurm-cluster-chart/files/kubectl new file mode 100644 index 0000000..e51032d --- /dev/null +++ b/slurm-cluster-chart/files/kubectl @@ -0,0 +1,19 @@ +apiVersion: v1 +clusters: +- cluster: + certificate-authority-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUM2akNDQWRLZ0F3SUJBZ0lCQURBTkJna3Foa2lHOXcwQkFRc0ZBREFWTVJNd0VRWURWUVFERXdwcmRXSmwKY201bGRHVnpNQjRYRFRJek1EY3dOREUxTWpVME5Gb1hEVE16TURjd01URTFNekEwTkZvd0ZURVRNQkVHQTFVRQpBeE1LYTNWaVpYSnVaWFJsY3pDQ0FTSXdEUVlKS29aSWh2Y05BUUVCQlFBRGdnRVBBRENDQVFvQ2dnRUJBTE5QCnZXQ0dVUk5Da09URGw4cnoxeTdyV2V4eVFrVngrOWNQb0xwSGR1cTVnQ2ExRUpiQU5UYTAveVlhY0FQcjhtUGIKUDdsbmRRc3k5WFVKT3NQaXl1ckQwN1lrbllDeE1PZTl5WWtjaExpQ2o1N1dnTnN0QWxmc0VRUWlpLzFFeGw1RgpIeXZQYVlNTERRUjNFcUE1WjYwek9Xd2NTUGt3UlYvM2NjMy9oWlNXcG1uQi9WOGdaTUxjSmFKMDZ2dHFLZDVMCmxQbzFEK0F3a2RKV25GWlQyWWQ2aDQxSlJhUWw4Q1ZLSGVGZ256YkF3K2xnSXRUT0diODlDeS9vWVdxeERzNEYKREtHTitpTjk2WkdpWUJ0KzU3NndPYmVWaml3aW9pNldlUjFCRzZjQnBpdXgzT1pKZ0lkYjdFVDB2Tjl6bkxxMQpkd3NhcERHNFhEazVIb0llZlhNQ0F3RUFBYU5GTUVNd0RnWURWUjBQQVFIL0JBUURBZ0trTUJJR0ExVWRFd0VCCi93UUlNQVlCQWY4Q0FRQXdIUVlEVlIwT0JCWUVGRE5YR2JNMzNoSTBKbk0rc0F0MmtSYzdTWko5TUEwR0NTcUcKU0liM0RRRUJDd1VBQTRJQkFRQWkzWk9kRFR5aVc0Rm9PWU5NbmRhRTIrWGlYOVBPMDR2eFFRRzFUM0pyMlQxaApHQnpuRHVGMkQ0NEcrUVRsVGhUYXZpNGR5WTRMMmlkS29rSVp4U3gyMmZDUXlGaFhlb3dyTEJSaFZXam1BMGVqClhjczJWMkZJeWhTNHYrQkFtMHV4SDZ4S2JnREpDbS90WEpCOFZaMnRNVUZaVHk5T1p6WVFONnNHYi9tMUo2S0sKYUk3cmhjZWp0RVhkemZmUTFxSUZZbVRKQXVYck4rS3N6Wi82LzFPbGR1NkFEamw3M2hzUTNnbU9ka1FmeGdwTgpuSGVBazM1TnBEcitHUSttZ0NqZjZvSDBXWVI4QkMwT09aZGdGeGlXbkwvZXEybjVqUzlPdGh6dm9aZVM0T011CnI2cndEVDZoK2kzV1UvL0NoaTljcXZFeTJLTlJZMGpBYmxpbytxTnkKLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo= + server: https://tl4s3ncd18fezjy2ejctxdeem2ggh8zd5lrlf2yrcntgi.apps.hpc.cam.ac.uk:443 + name: sb-test +contexts: +- context: + cluster: sb-test + user: sb-test-admin + name: sb-test-admin@sb-test +current-context: sb-test-admin@sb-test +kind: Config 
+preferences: {} +users: +- name: sb-test-admin + user: + client-certificate-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSURFekNDQWZ1Z0F3SUJBZ0lJUFFmc2tza25YNlV3RFFZSktvWklodmNOQVFFTEJRQXdGVEVUTUJFR0ExVUUKQXhNS2EzVmlaWEp1WlhSbGN6QWVGdzB5TXpBM01EUXhOVEkxTkRSYUZ3MHlOREEzTURNeE5UTXdORFphTURReApGekFWQmdOVkJBb1REbk41YzNSbGJUcHRZWE4wWlhKek1Sa3dGd1lEVlFRREV4QnJkV0psY201bGRHVnpMV0ZrCmJXbHVNSUlCSWpBTkJna3Foa2lHOXcwQkFRRUZBQU9DQVE4QU1JSUJDZ0tDQVFFQXdxSElVSTZ5U29SaHpJQVcKaFZ1NlNIcnFUdFFZc2t0U0xwbjhTL2RvWTVFbmNWb09Dd2U1OTNneXV1dHVoZ2VoZ1N4RzUzVHY3K0hubjk5YwpOM0pvUHI2em9ybG1Scmg3OHFPRUlZUXVLd3RSSUpZeUJ5UXRmUThMRDExOWQ0Si9nTjc2QVpoNW5RR25UWEk5CnRDSm56ZGZid1FoT1Y2TWkrak1jYW1ZWXZNYjcvWXpZa0ZQRWNsYmZlbzhkTWhCWENKcWZjNHdjY0J0cG9kTngKNmpvZktqMTZGNTRLMmtKcFFnZEZUc1JKTEhtVjdjaWhLSWJVWjBUeWE1YS9ZYitoeU5rS21ad2JhR0NFZEdsWQorNkNmUjlscVhqaFBqVnVFR2Y4em1aQXUyQ2doR0dWSjUwb0xhUWZpVHpudlYwRlVlTlpWSk1UeTdjbDl0ZEZ1CmZMaFlPUUlEQVFBQm8wZ3dSakFPQmdOVkhROEJBZjhFQkFNQ0JhQXdFd1lEVlIwbEJBd3dDZ1lJS3dZQkJRVUgKQXdJd0h3WURWUjBqQkJnd0ZvQVVNMWNac3pmZUVqUW1jejZ3QzNhUkZ6dEprbjB3RFFZSktvWklodmNOQVFFTApCUUFEZ2dFQkFDOGl4Mk91V3p3Zk1YSXV6QlRqM05PTDFIUldidDhWUUxmVFVWMFBJQTVOcEM3S2ZTbzR3TzFXCnh5SjAyS25KclpxUkRpN1RSdXRxcUN5ZDJHV3pmajZmc1AvUXlOWUVlZ3dxVjN1dDZoQVBleUpxTHVLL3pvVWUKRnBZOVZDbUFVT0VnWGd2alExTEpTb1dJZDVGdjRqaW01Mk5JV0ZMdEdaREMxYXF1WjdEditrR0tQamlKb1l3ZQp3OTZkYWlJdDFvOE9KZkY4QW82L0FVSnF3Y05kZTdyUkt3Um5JMXFTVkJ2UGY2WVNLcEZKNjRERnZQL05WQ3pYCm4yQkdDZFZIRnNickpOeEhIbFRqMHhtUDE4VEtRWTJTTXM0MS95bzFLL3pFUENZSjdWSUlOQUdUK0NuQ2lqNysKanVlbWx2VGtWODdpeDFqUm1xenQ1V2EyaFBTdkFyMD0KLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo= + client-key-data: 
LS0tLS1CRUdJTiBSU0EgUFJJVkFURSBLRVktLS0tLQpNSUlFb3dJQkFBS0NBUUVBd3FISVVJNnlTb1JoeklBV2hWdTZTSHJxVHRRWXNrdFNMcG44Uy9kb1k1RW5jVm9PCkN3ZTU5M2d5dXV0dWhnZWhnU3hHNTNUdjcrSG5uOTljTjNKb1ByNnpvcmxtUnJoNzhxT0VJWVF1S3d0UklKWXkKQnlRdGZROExEMTE5ZDRKL2dONzZBWmg1blFHblRYSTl0Q0puemRmYndRaE9WNk1pK2pNY2FtWVl2TWI3L1l6WQprRlBFY2xiZmVvOGRNaEJYQ0pxZmM0d2NjQnRwb2ROeDZqb2ZLajE2RjU0SzJrSnBRZ2RGVHNSSkxIbVY3Y2loCktJYlVaMFR5YTVhL1liK2h5TmtLbVp3YmFHQ0VkR2xZKzZDZlI5bHFYamhQalZ1RUdmOHptWkF1MkNnaEdHVkoKNTBvTGFRZmlUem52VjBGVWVOWlZKTVR5N2NsOXRkRnVmTGhZT1FJREFRQUJBb0lCQUcxVENuMlZhYmhKbTlXTwpyUmZEYW1PRUIxQzMraGRNRDZGMWhTMzJqb0ErN0hUVExNZ3RVdHdhZkFSYWNmNS9Fc3pIM2h2c3AwbUxEdHZTClRxNG1hVCsxUnBuRW9ocGZUZUFBMFJzeWIreGxzdkFtN1hydGEwK3Z2M3FsL08vQU1YWmx5UEJVZ1JzYjdxbWwKM2RyczZIbkxJZmpQZlpIa1pLVTlTRnpMZEdHME40ODczOUtzSDdHMld4VlNTd2hVcGdGRUxzVHZhR1VQWHE5bQo1KzYwckhveFZEUWJBeGZpYWk4QU56Z3A4dzFHaVoya2tweDBpWWtaQXJZZXpiWmZxSjZVczdKWjRzK2xaMmdwCmJ4L1NGZHpzQWF4N1ZVN3RHdFFkUVJBbGkrdjlMUGEzZ2ZXN250RHAyaEUyUXRkbmg4SVJJUXJZYkRjbjJjME4KRmFIZGo4RUNnWUVBOXh3anNhM0FSVTNnUDZCV0kvS0o2QkhoN2NDa0YvQ2FKbEF2NVpsZ3EyMjBKZlBxOHRKRAp1WGlnOTNNS1JsQk54QWFsWW8rQW1zWitVMDF4UjVJWFMzY1ZaSXZzVlVtMG9iczIwb2JxN2xqMjhlUlpVbWxqCnRvTDdzVE0rSkROVHphcXcvblErQU02SEt1YkRYOUlzb29sWjI5OWNKWGtuRzRmeStCeHkrMjBDZ1lFQXlhSlQKWUxMdERRMVZTMHA3Z214bGJSMVE5dGdjUDZsZkJhRVoyOGNmaDJJWFBBTWFmc3VDbTcrbG9NYzNvMlcrT29WOApGeDVHRkVJbkNXS0NrcDViTlNWYjdsUWkyRVlQRDFTK2lJRnhCdXdsUk45QnoydmkwUjA2OE1TUy9tdm9mSklWCjdZQ3h0enR5aVc3YjE0M3BBQmNYYTdKVXBFb1M1WTE3M0RpeFpIMENnWUJ3K2dtTHMzK2piKzVseUoxNWcrcnYKRWpYMEtFNGRyK0FhUWpFVHpPTDRuWWh1amExT1pUbVhjNEpNZitranFwVlRXU0tHQkV2czkzRk1EcTBLNXMwRgpzS1UwT2hETUVZMm5IOXY1dHJ0MFMzSmp0MTNySXNuMjZMM0FEMGlLN25pVElFWVpuL1cxRXJlVHNydUNkS241CmljaHVrUmtrL05ZWGJUbDFuRFFwcFFLQmdRQ3FINEtYd1ArUEZxUFRqYmxkMXBWUkZmNGM5MFFHVnFJc3ZydHgKbXJVNFpnUFNoNC9RVVdjV2dBR0FBUFlwc0F0cmx2cVhDdHozOU1TNC8xdkoxMEIvTzlFdjZkOG9lUnYxeEh1cAo4d1RwWVU4a3AvWC95emdwVmE5SU82TUdkUWRJSzMzQzBPV2hBdEJsc3BwY2FZaWdvZHNKN0FITVNBOWZqUnRuCk9KSTdoUUtCZ0FHSzlreDJOcFJRMmt3dDl6NHZXNGpoajRRTld2
UVkzU0krWnc2ZE9qTGdLQ1IvU1dwU0dGUngKWXN1SVZBTGF4Z0c0STNhQ0V1UjlEaGhGMUFsUjQ1Y2MzL1liRndjYlhqeWtiMnZtR29lMy8velhNN1JKaVJpUApXZ0lwZmwvb1cyYzdxSjJZYjNWemt0TkdyVkdyZmlXRjFyK2d0YzBsbWxockUxNjFVdVpOCi0tLS0tRU5EIFJTQSBQUklWQVRFIEtFWS0tLS0tCg== diff --git a/slurm-cluster-chart/templates/kubectl.yml b/slurm-cluster-chart/templates/kubectl.yml new file mode 100644 index 0000000..7e5d74b --- /dev/null +++ b/slurm-cluster-chart/templates/kubectl.yml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: Secret +metadata: + name: kubectl-secret +data: + kubectl: | + {{- .Files.Get "files/kubectl" | b64enc | nindent 4 -}} + \ No newline at end of file diff --git a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml b/slurm-cluster-chart/templates/slurmctld-statefulset.yaml index e46dd7b..d5ac425 100644 --- a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml +++ b/slurm-cluster-chart/templates/slurmctld-statefulset.yaml @@ -38,6 +38,10 @@ spec: subPath: munge.key - mountPath: /var/spool/slurmctld name: slurmctld-state + - mountPath: /etc/slurm/kubectl + name: kubectl-secret + subPath: kubectl + readOnly: true dnsConfig: searches: - slurmd.default.svc.cluster.local @@ -55,3 +59,7 @@ spec: - name: munge-key-secret secret: secretName: {{ .Values.secrets.mungeKey }} + - name: kubectl-secret + secret: + secretName: kubectl-secret + defaultMode: 0400 From 5705d4364a403b50f3f22397c92f4e28e561b1cb Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 25 Jul 2023 15:03:02 +0000 Subject: [PATCH 033/152] make slurmd resource a template for use from slurmctld pod --- .../templates/slurmctld-statefulset.yaml | 6 +++ ...nt.yaml => slurmd-template-configmap.yaml} | 42 ++++++++----------- 2 files changed, 24 insertions(+), 24 deletions(-) rename slurm-cluster-chart/templates/{slurmd-deployment.yaml => slurmd-template-configmap.yaml} (63%) diff --git a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml b/slurm-cluster-chart/templates/slurmctld-statefulset.yaml index 848d8ea..52cd6a7 100644 
--- a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml +++ b/slurm-cluster-chart/templates/slurmctld-statefulset.yaml @@ -42,6 +42,9 @@ spec: name: kubectl-secret subPath: kubectl readOnly: true + - mountPath: /etc/slurm/slurmd-pod-template.yml + name: slurmd-pod-template + subPath: podTemplate dnsConfig: searches: - slurmd.default.svc.cluster.local @@ -64,3 +67,6 @@ spec: secret: secretName: kubectl-secret defaultMode: 0400 + - name: slurmd-pod-template + configMap: + name: slurmd-pod-template diff --git a/slurm-cluster-chart/templates/slurmd-deployment.yaml b/slurm-cluster-chart/templates/slurmd-template-configmap.yaml similarity index 63% rename from slurm-cluster-chart/templates/slurmd-deployment.yaml rename to slurm-cluster-chart/templates/slurmd-template-configmap.yaml index 71ddc94..0ea8cdc 100644 --- a/slurm-cluster-chart/templates/slurmd-deployment.yaml +++ b/slurm-cluster-chart/templates/slurmd-template-configmap.yaml @@ -1,33 +1,27 @@ -apiVersion: apps/v1 -kind: StatefulSet +apiVersion: v1 +kind: ConfigMap metadata: - creationTimestamp: null - labels: - app.kubernetes.io/name: slurm - app.kubernetes.io/component: slurmd - name: slurmd -spec: - replicas: {{ .Values.replicas.slurmd }} - selector: - matchLabels: - app.kubernetes.io/name: slurm - app.kubernetes.io/component: slurmd - serviceName: slurmd - template: + name: slurmd-pod-template +data: + podTemplate: | + apiVersion: v1 + kind: Pod metadata: - creationTimestamp: null labels: app.kubernetes.io/name: slurm app.kubernetes.io/component: slurmd + name: slurmd-0 # Irrelevant for DNS but must be be currently-unique so using slurmd name is convenient spec: - topologySpreadConstraints: - - maxSkew: 1 - whenUnsatisfiable: ScheduleAnyway - topologyKey: kubernetes.io/hostname - labelSelector: - matchLabels: - app.kubernetes.io/name: slurm - app.kubernetes.io/component: slurmd + # topologySpreadConstraints: + # - maxSkew: 1 + # whenUnsatisfiable: ScheduleAnyway + # topologyKey: kubernetes.io/hostname 
+ # labelSelector: + # matchLabels: + # app.kubernetes.io/name: slurm + # app.kubernetes.io/component: slurmd + hostname: slurmd-0 # required to create DNS records for pod + subdomain: slurmd # has to match name of headless service to create DNS records for pod containers: - args: - slurmd From bf903fca14d785b7fc4af9bd82b8bea802a354d8 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 25 Jul 2023 15:04:00 +0000 Subject: [PATCH 034/152] fix hook mungekey --- slurm-cluster-chart/templates/check-jobs-finished-hook.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/slurm-cluster-chart/templates/check-jobs-finished-hook.yaml b/slurm-cluster-chart/templates/check-jobs-finished-hook.yaml index 58cac40..8687814 100644 --- a/slurm-cluster-chart/templates/check-jobs-finished-hook.yaml +++ b/slurm-cluster-chart/templates/check-jobs-finished-hook.yaml @@ -29,6 +29,7 @@ spec: - name: munge-key-secret secret: secretName: {{ .Values.secrets.mungeKey }} + defaultMode: 0400 - name: slurm-config-volume configMap: name: {{ .Values.configmaps.slurmConf }} From f23c1d76555915f0052c43809281d0216372c5b2 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 25 Jul 2023 15:46:50 +0000 Subject: [PATCH 035/152] WIP - autoscale programs and config --- Dockerfile | 1 + docker-entrypoint.sh | 11 +++++++++++ slurm-cluster-chart/files/slurm.conf | 14 +++++++++++++- .../templates/slurmd-template-configmap.yaml | 4 ++-- 4 files changed, 27 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 167584b..4ae21c3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -92,6 +92,7 @@ RUN mkdir /etc/sysconfig/slurm \ && usermod -p '*' rocky # unlocks account but sets no password VOLUME /etc/slurm +COPY --chown=slurm:slurm k8s-slurmd-* /usr/sbin/ COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"] diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index a4ee0bf..807f416 100755 --- a/docker-entrypoint.sh +++ 
b/docker-entrypoint.sh @@ -38,6 +38,17 @@ fi if [ "$1" = "slurmctld" ] then + echo "---> Installing kubectl ..." + cat <<-EOF > /etc/yum.repos.d/kubernetes.repo + [kubernetes] + name=Kubernetes + baseurl=https://packages.cloud.google.com/yum/repos/kubernetes-el7-\$basearch + enabled=1 + gpgcheck=1 + gpgkey=https://packages.cloud.google.com/yum/doc/yum-key.gpg https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg + EOF + dnf install -y kubectl + start_munge echo "---> Waiting for slurmdbd to become active before starting slurmctld ..." diff --git a/slurm-cluster-chart/files/slurm.conf b/slurm-cluster-chart/files/slurm.conf index 4c072a7..bacb5cc 100644 --- a/slurm-cluster-chart/files/slurm.conf +++ b/slurm-cluster-chart/files/slurm.conf @@ -49,10 +49,22 @@ AccountingStoragePort=6819 # SlurmctldParameters=cloud_dns,cloud_reg_addrs CommunicationParameters=NoAddrCache +ReconfigFlags=KeepPowerSaveSettings +#ResumeFailProgram=TODO? +ResumeProgram=/usr/sbin/k8s-slurmd-create +#ResumeTimeout=60 # default +SlurmctldParameters=idle_on_node_suspend +#SuspendExcNodes= +#SuspendExcParts= +#SuspendExcStates= +SuspendProgram=/usr/sbin/k8s-slurmd-delete +SuspendTime=30 # for debugging +#SuspendTimeout= +TreeWidth=65533 # NODES MaxNodeCount=10 -NodeName=slurmd-[0-9] State=FUTURE +NodeName=slurmd-[0-9] State=CLOUD # PARTITIONS PartitionName=all Default=yes Nodes=ALL diff --git a/slurm-cluster-chart/templates/slurmd-template-configmap.yaml b/slurm-cluster-chart/templates/slurmd-template-configmap.yaml index 0ea8cdc..7c01fca 100644 --- a/slurm-cluster-chart/templates/slurmd-template-configmap.yaml +++ b/slurm-cluster-chart/templates/slurmd-template-configmap.yaml @@ -10,7 +10,7 @@ data: labels: app.kubernetes.io/name: slurm app.kubernetes.io/component: slurmd - name: slurmd-0 # Irrelevant for DNS but must be be currently-unique so using slurmd name is convenient + name: SLURMD_NODENAME # Irrelevant for DNS but must be be currently-unique so using slurmd name is convenient 
spec: # topologySpreadConstraints: # - maxSkew: 1 @@ -20,7 +20,7 @@ data: # matchLabels: # app.kubernetes.io/name: slurm # app.kubernetes.io/component: slurmd - hostname: slurmd-0 # required to create DNS records for pod + hostname: SLURMD_NODENAME # required to create DNS records for pod subdomain: slurmd # has to match name of headless service to create DNS records for pod containers: - args: From e2a8041363733a1ebd05de409af3eed17d47cc40 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 25 Jul 2023 16:04:12 +0000 Subject: [PATCH 036/152] try to force rebuild with autoscale scripts --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 4ae21c3..7dd64b2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -92,8 +92,8 @@ RUN mkdir /etc/sysconfig/slurm \ && usermod -p '*' rocky # unlocks account but sets no password VOLUME /etc/slurm -COPY --chown=slurm:slurm k8s-slurmd-* /usr/sbin/ COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh +COPY --chown=slurm:slurm k8s-slurmd-* /usr/sbin/ ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"] CMD ["slurmdbd"] From 5b3dc256f0d3569fb889b820038691d322aec82f Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 25 Jul 2023 16:10:30 +0000 Subject: [PATCH 037/152] try to build image with autoscale scripts --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 7dd64b2..527a39a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -93,7 +93,7 @@ RUN mkdir /etc/sysconfig/slurm \ VOLUME /etc/slurm COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh -COPY --chown=slurm:slurm k8s-slurmd-* /usr/sbin/ +COPY --chown=slurm:slurm k8s-slurmd-* /usr/local/bin ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"] CMD ["slurmdbd"] From 85fe5c5a801b4808cc5578db1892667c6807bc2a Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 25 Jul 2023 16:12:57 +0000 Subject: [PATCH 038/152] try to build with autoscale scripts --- Dockerfile 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 527a39a..ea3a7b5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -93,7 +93,7 @@ RUN mkdir /etc/sysconfig/slurm \ VOLUME /etc/slurm COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh -COPY --chown=slurm:slurm k8s-slurmd-* /usr/local/bin +COPY k8s-slurmd-* /usr/local/bin ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"] CMD ["slurmdbd"] From 740d2d4bb60ba50ee48c98bf77b8f3694158a9a4 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 25 Jul 2023 16:17:49 +0000 Subject: [PATCH 039/152] try to build with autoscale scripts --- Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index ea3a7b5..f2f7b99 100644 --- a/Dockerfile +++ b/Dockerfile @@ -93,7 +93,8 @@ RUN mkdir /etc/sysconfig/slurm \ VOLUME /etc/slurm COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh -COPY k8s-slurmd-* /usr/local/bin +COPY k8s-slurmd-create /usr/local/bin/k8s-slurmd-create +COPY k8s-slurmd-delete /usr/local/bin/k8s-slurmd-delete ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"] CMD ["slurmdbd"] From d96e1825c53959db470cd4174095bad9c8cc25aa Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 25 Jul 2023 16:20:19 +0000 Subject: [PATCH 040/152] try to build with autoscale scripts --- Dockerfile | 3 +-- k8s-slurmd-create | 11 +++++++++++ k8s-slurmd-delete | 11 +++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 k8s-slurmd-create create mode 100644 k8s-slurmd-delete diff --git a/Dockerfile b/Dockerfile index f2f7b99..8d5e84f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -93,8 +93,7 @@ RUN mkdir /etc/sysconfig/slurm \ VOLUME /etc/slurm COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh -COPY k8s-slurmd-create /usr/local/bin/k8s-slurmd-create -COPY k8s-slurmd-delete /usr/local/bin/k8s-slurmd-delete +COPY --chown=slurm:slurm k8s-slurmd-* /usr/local/bin/ ENTRYPOINT 
["/usr/local/bin/docker-entrypoint.sh"] CMD ["slurmdbd"] diff --git a/k8s-slurmd-create b/k8s-slurmd-create new file mode 100644 index 0000000..3108537 --- /dev/null +++ b/k8s-slurmd-create @@ -0,0 +1,11 @@ +#!/usr/bin/bash + +export KUBECONFIG=/etc/slurm/kubectl + +echo "$(date) Resume invoked $0 $*" >> /var/log/power_save.log + +hosts=$(scontrol show hostnames $1) # this is purely a textual expansion, doens't depend on defined nodes +for host in $hosts +do + sed s/SLURMD_NODENAME/$host/ | kubectl create -f - +done diff --git a/k8s-slurmd-delete b/k8s-slurmd-delete new file mode 100644 index 0000000..86f6622 --- /dev/null +++ b/k8s-slurmd-delete @@ -0,0 +1,11 @@ +#!/usr/bin/bash + +export KUBECONFIG=/etc/slurm/kubectl + +echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log + +hosts=$(scontrol show hostnames $1) # this is purely a textual expansion, doens't depend on defined nodes +for host in $hosts +do + kubectl delete pod $host +done From 456b5aadc76d60fa6f1da423f7aa054fe99688b4 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 25 Jul 2023 16:24:19 +0000 Subject: [PATCH 041/152] try to build with autoscale scripts --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 8d5e84f..19d2638 100644 --- a/Dockerfile +++ b/Dockerfile @@ -93,7 +93,7 @@ RUN mkdir /etc/sysconfig/slurm \ VOLUME /etc/slurm COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh -COPY --chown=slurm:slurm k8s-slurmd-* /usr/local/bin/ +COPY --chown=slurm:slurm --chmod=744 k8s-slurmd-* /usr/local/bin/ ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"] CMD ["slurmdbd"] From 4d93a5e0ed4ff613ad8fe66a22587557fb781fb7 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 25 Jul 2023 16:32:33 +0000 Subject: [PATCH 042/152] install kubectl in image --- Dockerfile | 3 +++ docker-entrypoint.sh | 11 ----------- kubernetes.repo | 7 +++++++ 3 files changed, 10 insertions(+), 11 deletions(-) create mode 100644 kubernetes.repo 
diff --git a/Dockerfile b/Dockerfile index 19d2638..2919ccf 100644 --- a/Dockerfile +++ b/Dockerfile @@ -91,6 +91,9 @@ RUN mkdir /etc/sysconfig/slurm \ && useradd -u 1000 rocky \ && usermod -p '*' rocky # unlocks account but sets no password +COPY kubernetes.repo /etc/yum.repos.d/kubernetes.repo +RUN dnf install -y kubectl + VOLUME /etc/slurm COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh COPY --chown=slurm:slurm --chmod=744 k8s-slurmd-* /usr/local/bin/ diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 807f416..a4ee0bf 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -38,17 +38,6 @@ fi if [ "$1" = "slurmctld" ] then - echo "---> Installing kubectl ..." - cat <<-EOF > /etc/yum.repos.d/kubernetes.repo - [kubernetes] - name=Kubernetes - baseurl=https://packages.cloud.google.com/yum/repos/kubernetes-el7-\$basearch - enabled=1 - gpgcheck=1 - gpgkey=https://packages.cloud.google.com/yum/doc/yum-key.gpg https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg - EOF - dnf install -y kubectl - start_munge echo "---> Waiting for slurmdbd to become active before starting slurmctld ..." 
diff --git a/kubernetes.repo b/kubernetes.repo new file mode 100644 index 0000000..9e28c23 --- /dev/null +++ b/kubernetes.repo @@ -0,0 +1,7 @@ +[kubernetes] +name=Kubernetes +baseurl=https://packages.cloud.google.com/yum/repos/kubernetes-el7-$basearch +enabled=1 +gpgcheck=1 +gpgkey=https://packages.cloud.google.com/yum/doc/yum-key.gpg https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg +EOF From bca49f97f494cecfc1f27eedef01a149d3a7946d Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 26 Jul 2023 08:16:44 +0000 Subject: [PATCH 043/152] rename kubectl to kubeconfig --- docker-entrypoint.sh | 6 ++++-- generate-secrets.sh | 3 ++- k8s-slurmd-create | 2 +- k8s-slurmd-delete | 2 +- slurm-cluster-chart/files/kubectl | 19 ------------------- slurm-cluster-chart/templates/kubectl.yml | 4 ++-- .../templates/slurmctld-statefulset.yaml | 10 +++++----- 7 files changed, 15 insertions(+), 31 deletions(-) delete mode 100644 slurm-cluster-chart/files/kubectl diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index a4ee0bf..7e079be 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -49,8 +49,10 @@ then done echo "-- slurmdbd is now active ..." - echo "---> Setting permissions for state directory ..." - chown slurm:slurm /var/spool/slurmctld + echo "---> Setting owernship ..." + chown slurm:slurm \ + /var/spool/slurmctld \ + /etc/slurm/kubeconfig echo "---> Starting the Slurm Controller Daemon (slurmctld) ..." 
if /usr/sbin/slurmctld -V | grep -q '17.02' ; then diff --git a/generate-secrets.sh b/generate-secrets.sh index a3eeac9..334b8a7 100755 --- a/generate-secrets.sh +++ b/generate-secrets.sh @@ -12,4 +12,5 @@ kubectl create secret generic munge-key-secret \ -o yaml | \ kubectl apply -f - -cp $KUBECONFIG slurm-cluster-chart/files/kubectl +cp $KUBECONFIG slurm-cluster-chart/files/kubeconfig +echo "copied $KUBECONFIG into slurm-cluster-chart/files/" \ No newline at end of file diff --git a/k8s-slurmd-create b/k8s-slurmd-create index 3108537..b38b37b 100644 --- a/k8s-slurmd-create +++ b/k8s-slurmd-create @@ -1,6 +1,6 @@ #!/usr/bin/bash -export KUBECONFIG=/etc/slurm/kubectl +export KUBECONFIG=/etc/slurm/kubeconfig echo "$(date) Resume invoked $0 $*" >> /var/log/power_save.log diff --git a/k8s-slurmd-delete b/k8s-slurmd-delete index 86f6622..0dff8a0 100644 --- a/k8s-slurmd-delete +++ b/k8s-slurmd-delete @@ -1,6 +1,6 @@ #!/usr/bin/bash -export KUBECONFIG=/etc/slurm/kubectl +export KUBECONFIG=/etc/slurm/kubeconfig echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log diff --git a/slurm-cluster-chart/files/kubectl b/slurm-cluster-chart/files/kubectl deleted file mode 100644 index e51032d..0000000 --- a/slurm-cluster-chart/files/kubectl +++ /dev/null @@ -1,19 +0,0 @@ -apiVersion: v1 -clusters: -- cluster: - certificate-authority-data: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUM2akNDQWRLZ0F3SUJBZ0lCQURBTkJna3Foa2lHOXcwQkFRc0ZBREFWTVJNd0VRWURWUVFERXdwcmRXSmwKY201bGRHVnpNQjRYRFRJek1EY3dOREUxTWpVME5Gb1hEVE16TURjd01URTFNekEwTkZvd0ZURVRNQkVHQTFVRQpBeE1LYTNWaVpYSnVaWFJsY3pDQ0FTSXdEUVlKS29aSWh2Y05BUUVCQlFBRGdnRVBBRENDQVFvQ2dnRUJBTE5QCnZXQ0dVUk5Da09URGw4cnoxeTdyV2V4eVFrVngrOWNQb0xwSGR1cTVnQ2ExRUpiQU5UYTAveVlhY0FQcjhtUGIKUDdsbmRRc3k5WFVKT3NQaXl1ckQwN1lrbllDeE1PZTl5WWtjaExpQ2o1N1dnTnN0QWxmc0VRUWlpLzFFeGw1RgpIeXZQYVlNTERRUjNFcUE1WjYwek9Xd2NTUGt3UlYvM2NjMy9oWlNXcG1uQi9WOGdaTUxjSmFKMDZ2dHFLZDVMCmxQbzFEK0F3a2RKV25GWlQyWWQ2aDQxSlJhUWw4Q1ZLSGVGZ256YkF3K2xnSXRUT0diODlDeS9vWVdxeERzNEYKREtHTitpTjk2WkdpWUJ0KzU3NndPYmVWaml3aW9pNldlUjFCRzZjQnBpdXgzT1pKZ0lkYjdFVDB2Tjl6bkxxMQpkd3NhcERHNFhEazVIb0llZlhNQ0F3RUFBYU5GTUVNd0RnWURWUjBQQVFIL0JBUURBZ0trTUJJR0ExVWRFd0VCCi93UUlNQVlCQWY4Q0FRQXdIUVlEVlIwT0JCWUVGRE5YR2JNMzNoSTBKbk0rc0F0MmtSYzdTWko5TUEwR0NTcUcKU0liM0RRRUJDd1VBQTRJQkFRQWkzWk9kRFR5aVc0Rm9PWU5NbmRhRTIrWGlYOVBPMDR2eFFRRzFUM0pyMlQxaApHQnpuRHVGMkQ0NEcrUVRsVGhUYXZpNGR5WTRMMmlkS29rSVp4U3gyMmZDUXlGaFhlb3dyTEJSaFZXam1BMGVqClhjczJWMkZJeWhTNHYrQkFtMHV4SDZ4S2JnREpDbS90WEpCOFZaMnRNVUZaVHk5T1p6WVFONnNHYi9tMUo2S0sKYUk3cmhjZWp0RVhkemZmUTFxSUZZbVRKQXVYck4rS3N6Wi82LzFPbGR1NkFEamw3M2hzUTNnbU9ka1FmeGdwTgpuSGVBazM1TnBEcitHUSttZ0NqZjZvSDBXWVI4QkMwT09aZGdGeGlXbkwvZXEybjVqUzlPdGh6dm9aZVM0T011CnI2cndEVDZoK2kzV1UvL0NoaTljcXZFeTJLTlJZMGpBYmxpbytxTnkKLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo= - server: https://tl4s3ncd18fezjy2ejctxdeem2ggh8zd5lrlf2yrcntgi.apps.hpc.cam.ac.uk:443 - name: sb-test -contexts: -- context: - cluster: sb-test - user: sb-test-admin - name: sb-test-admin@sb-test -current-context: sb-test-admin@sb-test -kind: Config -preferences: {} -users: -- name: sb-test-admin - user: - client-certificate-data: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSURFekNDQWZ1Z0F3SUJBZ0lJUFFmc2tza25YNlV3RFFZSktvWklodmNOQVFFTEJRQXdGVEVUTUJFR0ExVUUKQXhNS2EzVmlaWEp1WlhSbGN6QWVGdzB5TXpBM01EUXhOVEkxTkRSYUZ3MHlOREEzTURNeE5UTXdORFphTURReApGekFWQmdOVkJBb1REbk41YzNSbGJUcHRZWE4wWlhKek1Sa3dGd1lEVlFRREV4QnJkV0psY201bGRHVnpMV0ZrCmJXbHVNSUlCSWpBTkJna3Foa2lHOXcwQkFRRUZBQU9DQVE4QU1JSUJDZ0tDQVFFQXdxSElVSTZ5U29SaHpJQVcKaFZ1NlNIcnFUdFFZc2t0U0xwbjhTL2RvWTVFbmNWb09Dd2U1OTNneXV1dHVoZ2VoZ1N4RzUzVHY3K0hubjk5YwpOM0pvUHI2em9ybG1Scmg3OHFPRUlZUXVLd3RSSUpZeUJ5UXRmUThMRDExOWQ0Si9nTjc2QVpoNW5RR25UWEk5CnRDSm56ZGZid1FoT1Y2TWkrak1jYW1ZWXZNYjcvWXpZa0ZQRWNsYmZlbzhkTWhCWENKcWZjNHdjY0J0cG9kTngKNmpvZktqMTZGNTRLMmtKcFFnZEZUc1JKTEhtVjdjaWhLSWJVWjBUeWE1YS9ZYitoeU5rS21ad2JhR0NFZEdsWQorNkNmUjlscVhqaFBqVnVFR2Y4em1aQXUyQ2doR0dWSjUwb0xhUWZpVHpudlYwRlVlTlpWSk1UeTdjbDl0ZEZ1CmZMaFlPUUlEQVFBQm8wZ3dSakFPQmdOVkhROEJBZjhFQkFNQ0JhQXdFd1lEVlIwbEJBd3dDZ1lJS3dZQkJRVUgKQXdJd0h3WURWUjBqQkJnd0ZvQVVNMWNac3pmZUVqUW1jejZ3QzNhUkZ6dEprbjB3RFFZSktvWklodmNOQVFFTApCUUFEZ2dFQkFDOGl4Mk91V3p3Zk1YSXV6QlRqM05PTDFIUldidDhWUUxmVFVWMFBJQTVOcEM3S2ZTbzR3TzFXCnh5SjAyS25KclpxUkRpN1RSdXRxcUN5ZDJHV3pmajZmc1AvUXlOWUVlZ3dxVjN1dDZoQVBleUpxTHVLL3pvVWUKRnBZOVZDbUFVT0VnWGd2alExTEpTb1dJZDVGdjRqaW01Mk5JV0ZMdEdaREMxYXF1WjdEditrR0tQamlKb1l3ZQp3OTZkYWlJdDFvOE9KZkY4QW82L0FVSnF3Y05kZTdyUkt3Um5JMXFTVkJ2UGY2WVNLcEZKNjRERnZQL05WQ3pYCm4yQkdDZFZIRnNickpOeEhIbFRqMHhtUDE4VEtRWTJTTXM0MS95bzFLL3pFUENZSjdWSUlOQUdUK0NuQ2lqNysKanVlbWx2VGtWODdpeDFqUm1xenQ1V2EyaFBTdkFyMD0KLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo= - client-key-data: 
LS0tLS1CRUdJTiBSU0EgUFJJVkFURSBLRVktLS0tLQpNSUlFb3dJQkFBS0NBUUVBd3FISVVJNnlTb1JoeklBV2hWdTZTSHJxVHRRWXNrdFNMcG44Uy9kb1k1RW5jVm9PCkN3ZTU5M2d5dXV0dWhnZWhnU3hHNTNUdjcrSG5uOTljTjNKb1ByNnpvcmxtUnJoNzhxT0VJWVF1S3d0UklKWXkKQnlRdGZROExEMTE5ZDRKL2dONzZBWmg1blFHblRYSTl0Q0puemRmYndRaE9WNk1pK2pNY2FtWVl2TWI3L1l6WQprRlBFY2xiZmVvOGRNaEJYQ0pxZmM0d2NjQnRwb2ROeDZqb2ZLajE2RjU0SzJrSnBRZ2RGVHNSSkxIbVY3Y2loCktJYlVaMFR5YTVhL1liK2h5TmtLbVp3YmFHQ0VkR2xZKzZDZlI5bHFYamhQalZ1RUdmOHptWkF1MkNnaEdHVkoKNTBvTGFRZmlUem52VjBGVWVOWlZKTVR5N2NsOXRkRnVmTGhZT1FJREFRQUJBb0lCQUcxVENuMlZhYmhKbTlXTwpyUmZEYW1PRUIxQzMraGRNRDZGMWhTMzJqb0ErN0hUVExNZ3RVdHdhZkFSYWNmNS9Fc3pIM2h2c3AwbUxEdHZTClRxNG1hVCsxUnBuRW9ocGZUZUFBMFJzeWIreGxzdkFtN1hydGEwK3Z2M3FsL08vQU1YWmx5UEJVZ1JzYjdxbWwKM2RyczZIbkxJZmpQZlpIa1pLVTlTRnpMZEdHME40ODczOUtzSDdHMld4VlNTd2hVcGdGRUxzVHZhR1VQWHE5bQo1KzYwckhveFZEUWJBeGZpYWk4QU56Z3A4dzFHaVoya2tweDBpWWtaQXJZZXpiWmZxSjZVczdKWjRzK2xaMmdwCmJ4L1NGZHpzQWF4N1ZVN3RHdFFkUVJBbGkrdjlMUGEzZ2ZXN250RHAyaEUyUXRkbmg4SVJJUXJZYkRjbjJjME4KRmFIZGo4RUNnWUVBOXh3anNhM0FSVTNnUDZCV0kvS0o2QkhoN2NDa0YvQ2FKbEF2NVpsZ3EyMjBKZlBxOHRKRAp1WGlnOTNNS1JsQk54QWFsWW8rQW1zWitVMDF4UjVJWFMzY1ZaSXZzVlVtMG9iczIwb2JxN2xqMjhlUlpVbWxqCnRvTDdzVE0rSkROVHphcXcvblErQU02SEt1YkRYOUlzb29sWjI5OWNKWGtuRzRmeStCeHkrMjBDZ1lFQXlhSlQKWUxMdERRMVZTMHA3Z214bGJSMVE5dGdjUDZsZkJhRVoyOGNmaDJJWFBBTWFmc3VDbTcrbG9NYzNvMlcrT29WOApGeDVHRkVJbkNXS0NrcDViTlNWYjdsUWkyRVlQRDFTK2lJRnhCdXdsUk45QnoydmkwUjA2OE1TUy9tdm9mSklWCjdZQ3h0enR5aVc3YjE0M3BBQmNYYTdKVXBFb1M1WTE3M0RpeFpIMENnWUJ3K2dtTHMzK2piKzVseUoxNWcrcnYKRWpYMEtFNGRyK0FhUWpFVHpPTDRuWWh1amExT1pUbVhjNEpNZitranFwVlRXU0tHQkV2czkzRk1EcTBLNXMwRgpzS1UwT2hETUVZMm5IOXY1dHJ0MFMzSmp0MTNySXNuMjZMM0FEMGlLN25pVElFWVpuL1cxRXJlVHNydUNkS241CmljaHVrUmtrL05ZWGJUbDFuRFFwcFFLQmdRQ3FINEtYd1ArUEZxUFRqYmxkMXBWUkZmNGM5MFFHVnFJc3ZydHgKbXJVNFpnUFNoNC9RVVdjV2dBR0FBUFlwc0F0cmx2cVhDdHozOU1TNC8xdkoxMEIvTzlFdjZkOG9lUnYxeEh1cAo4d1RwWVU4a3AvWC95emdwVmE5SU82TUdkUWRJSzMzQzBPV2hBdEJsc3BwY2FZaWdvZHNKN0FITVNBOWZqUnRuCk9KSTdoUUtCZ0FHSzlreDJOcFJRMmt3dDl6NHZXNGpoajRRTld2
UVkzU0krWnc2ZE9qTGdLQ1IvU1dwU0dGUngKWXN1SVZBTGF4Z0c0STNhQ0V1UjlEaGhGMUFsUjQ1Y2MzL1liRndjYlhqeWtiMnZtR29lMy8velhNN1JKaVJpUApXZ0lwZmwvb1cyYzdxSjJZYjNWemt0TkdyVkdyZmlXRjFyK2d0YzBsbWxockUxNjFVdVpOCi0tLS0tRU5EIFJTQSBQUklWQVRFIEtFWS0tLS0tCg== diff --git a/slurm-cluster-chart/templates/kubectl.yml b/slurm-cluster-chart/templates/kubectl.yml index 7e5d74b..78d865b 100644 --- a/slurm-cluster-chart/templates/kubectl.yml +++ b/slurm-cluster-chart/templates/kubectl.yml @@ -1,8 +1,8 @@ apiVersion: v1 kind: Secret metadata: - name: kubectl-secret + name: kubeconfig-secret data: kubectl: | - {{- .Files.Get "files/kubectl" | b64enc | nindent 4 -}} + {{- .Files.Get "files/kubeconfig" | b64enc | nindent 4 -}} \ No newline at end of file diff --git a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml b/slurm-cluster-chart/templates/slurmctld-statefulset.yaml index 52cd6a7..1418ef2 100644 --- a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml +++ b/slurm-cluster-chart/templates/slurmctld-statefulset.yaml @@ -38,9 +38,9 @@ spec: subPath: munge.key - mountPath: /var/spool/slurmctld name: slurmctld-state - - mountPath: /etc/slurm/kubectl - name: kubectl-secret - subPath: kubectl + - mountPath: /etc/slurm/kubeconfig + name: kubeconfig-secret + subPath: kubeconfig readOnly: true - mountPath: /etc/slurm/slurmd-pod-template.yml name: slurmd-pod-template @@ -63,9 +63,9 @@ spec: secret: secretName: {{ .Values.secrets.mungeKey }} defaultMode: 0400 - - name: kubectl-secret + - name: kubeconfig-secret secret: - secretName: kubectl-secret + secretName: kubeconfig-secret defaultMode: 0400 - name: slurmd-pod-template configMap: From 68b77e5b0dfa37b625429a2688dacf655b07c5bf Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 26 Jul 2023 08:27:25 +0000 Subject: [PATCH 044/152] fix kubernetes repo --- kubernetes.repo | 1 - 1 file changed, 1 deletion(-) diff --git a/kubernetes.repo b/kubernetes.repo index 9e28c23..f4ae4ff 100644 --- a/kubernetes.repo +++ b/kubernetes.repo @@ -4,4 +4,3 
@@ baseurl=https://packages.cloud.google.com/yum/repos/kubernetes-el7-$basearch enabled=1 gpgcheck=1 gpgkey=https://packages.cloud.google.com/yum/doc/yum-key.gpg https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg -EOF From b13010082b546ddbe82187276569eb096340f3f8 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 26 Jul 2023 08:31:29 +0000 Subject: [PATCH 045/152] move docker build into directory --- .github/workflows/build-containers.yml | 4 ++-- Dockerfile => image/Dockerfile | 0 docker-entrypoint.sh => image/docker-entrypoint.sh | 0 k8s-slurmd-create => image/k8s-slurmd-create | 0 k8s-slurmd-delete => image/k8s-slurmd-delete | 0 kubernetes.repo => image/kubernetes.repo | 0 6 files changed, 2 insertions(+), 2 deletions(-) rename Dockerfile => image/Dockerfile (100%) rename docker-entrypoint.sh => image/docker-entrypoint.sh (100%) rename k8s-slurmd-create => image/k8s-slurmd-create (100%) rename k8s-slurmd-delete => image/k8s-slurmd-delete (100%) rename kubernetes.repo => image/kubernetes.repo (100%) diff --git a/.github/workflows/build-containers.yml b/.github/workflows/build-containers.yml index db15721..dfda115 100644 --- a/.github/workflows/build-containers.yml +++ b/.github/workflows/build-containers.yml @@ -3,8 +3,7 @@ on: push: paths: - .github/workflows/build-containers.yml - - Dockerfile - - docker-entrypoint.sh + - image/ workflow_dispatch: jobs: @@ -49,6 +48,7 @@ jobs: with: provenance: false push: true + context: image/ tags: ${{ steps.image-meta.outputs.tags }} labels: ${{ steps.image-meta.outputs.labels }} cache-from: type=local,src=/tmp/.buildx-cache diff --git a/Dockerfile b/image/Dockerfile similarity index 100% rename from Dockerfile rename to image/Dockerfile diff --git a/docker-entrypoint.sh b/image/docker-entrypoint.sh similarity index 100% rename from docker-entrypoint.sh rename to image/docker-entrypoint.sh diff --git a/k8s-slurmd-create b/image/k8s-slurmd-create similarity index 100% rename from k8s-slurmd-create rename to 
image/k8s-slurmd-create diff --git a/k8s-slurmd-delete b/image/k8s-slurmd-delete similarity index 100% rename from k8s-slurmd-delete rename to image/k8s-slurmd-delete diff --git a/kubernetes.repo b/image/kubernetes.repo similarity index 100% rename from kubernetes.repo rename to image/kubernetes.repo From a459fd839e1d5e424e83b4d6a43ad5eea2a2b6c8 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 26 Jul 2023 09:00:29 +0000 Subject: [PATCH 046/152] change ownership of kubeconfig --- image/docker-entrypoint.sh | 1 + slurm-cluster-chart/templates/slurmctld-statefulset.yaml | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/image/docker-entrypoint.sh b/image/docker-entrypoint.sh index 7e079be..63ee90c 100755 --- a/image/docker-entrypoint.sh +++ b/image/docker-entrypoint.sh @@ -50,6 +50,7 @@ then echo "-- slurmdbd is now active ..." echo "---> Setting owernship ..." + cp /tmp/kubeconfig /etc/slurm/kubeconfig chown slurm:slurm \ /var/spool/slurmctld \ /etc/slurm/kubeconfig diff --git a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml b/slurm-cluster-chart/templates/slurmctld-statefulset.yaml index 1418ef2..63d1ec2 100644 --- a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml +++ b/slurm-cluster-chart/templates/slurmctld-statefulset.yaml @@ -38,10 +38,9 @@ spec: subPath: munge.key - mountPath: /var/spool/slurmctld name: slurmctld-state - - mountPath: /etc/slurm/kubeconfig + - mountPath: /tmp/kubeconfig name: kubeconfig-secret subPath: kubeconfig - readOnly: true - mountPath: /etc/slurm/slurmd-pod-template.yml name: slurmd-pod-template subPath: podTemplate From 2d9cb4c97da807082b791f08b0387e941695239f Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 26 Jul 2023 09:02:10 +0000 Subject: [PATCH 047/152] fix workflow path filter for image build --- .github/workflows/build-containers.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-containers.yml b/.github/workflows/build-containers.yml 
index dfda115..e69797c 100644 --- a/.github/workflows/build-containers.yml +++ b/.github/workflows/build-containers.yml @@ -3,7 +3,7 @@ on: push: paths: - .github/workflows/build-containers.yml - - image/ + - image/** workflow_dispatch: jobs: From d487326dc7f4b1f5ff5e4308ab298c50c1c402ee Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 26 Jul 2023 12:48:05 +0000 Subject: [PATCH 048/152] move kubeconfig out of /etc/slurm volume --- image/docker-entrypoint.sh | 11 ++++++----- image/k8s-slurmd-create | 2 +- image/k8s-slurmd-delete | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/image/docker-entrypoint.sh b/image/docker-entrypoint.sh index 8ec2d0f..276949f 100755 --- a/image/docker-entrypoint.sh +++ b/image/docker-entrypoint.sh @@ -48,11 +48,12 @@ then done echo "-- slurmdbd is now active ..." - echo "---> Setting owernship ..." - cp /tmp/kubeconfig /etc/slurm/kubeconfig - chown slurm:slurm \ - /var/spool/slurmctld \ - /etc/slurm/kubeconfig + echo "---> Setting ownership for state directory ..." + chown slurm:slurm /var/spool/slurmctld + + echo "---> Copying Kubeconfig ..." + install -o slurm -g slurm -m u=rw,go= -d /var/lib/slurmctld/ + install -o slurm -g slurm u=r,go= /tmp/kubeconfig /var/lib/slurmctld/ echo "---> Starting the Slurm Controller Daemon (slurmctld) ..." 
if /usr/sbin/slurmctld -V | grep -q '17.02' ; then diff --git a/image/k8s-slurmd-create b/image/k8s-slurmd-create index b38b37b..de63505 100644 --- a/image/k8s-slurmd-create +++ b/image/k8s-slurmd-create @@ -1,6 +1,6 @@ #!/usr/bin/bash -export KUBECONFIG=/etc/slurm/kubeconfig +export KUBECONFIG=/var/lib/slurmctld/kubeconfig echo "$(date) Resume invoked $0 $*" >> /var/log/power_save.log diff --git a/image/k8s-slurmd-delete b/image/k8s-slurmd-delete index 0dff8a0..ddbd7b7 100644 --- a/image/k8s-slurmd-delete +++ b/image/k8s-slurmd-delete @@ -1,6 +1,6 @@ #!/usr/bin/bash -export KUBECONFIG=/etc/slurm/kubeconfig +export KUBECONFIG=/var/lib/slurmctld/kubeconfig echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log From 800ab39f4cddecf7728604062658dcfeb0a413b4 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 26 Jul 2023 14:12:41 +0000 Subject: [PATCH 049/152] move non-secrets to projected /etc/slurm volume on slurmctld, use /var/lib/slurmctd for secrets owned by slurm --- image/docker-entrypoint.sh | 2 +- image/k8s-slurmd-create | 2 +- .../templates/{kubectl.yml => kubeconfig.yml} | 2 +- .../templates/slurmctld-statefulset.yaml | 14 ++++++-------- .../templates/slurmd-template-configmap.yaml | 2 +- 5 files changed, 10 insertions(+), 12 deletions(-) rename slurm-cluster-chart/templates/{kubectl.yml => kubeconfig.yml} (89%) diff --git a/image/docker-entrypoint.sh b/image/docker-entrypoint.sh index 276949f..35e07ef 100755 --- a/image/docker-entrypoint.sh +++ b/image/docker-entrypoint.sh @@ -53,7 +53,7 @@ then echo "---> Copying Kubeconfig ..." install -o slurm -g slurm -m u=rw,go= -d /var/lib/slurmctld/ - install -o slurm -g slurm u=r,go= /tmp/kubeconfig /var/lib/slurmctld/ + install -o slurm -g slurm -m u=r,go= /tmp/kubeconfig /var/lib/slurmctld/ echo "---> Starting the Slurm Controller Daemon (slurmctld) ..." 
if /usr/sbin/slurmctld -V | grep -q '17.02' ; then diff --git a/image/k8s-slurmd-create b/image/k8s-slurmd-create index de63505..048b3d6 100644 --- a/image/k8s-slurmd-create +++ b/image/k8s-slurmd-create @@ -7,5 +7,5 @@ echo "$(date) Resume invoked $0 $*" >> /var/log/power_save.log hosts=$(scontrol show hostnames $1) # this is purely a textual expansion, doens't depend on defined nodes for host in $hosts do - sed s/SLURMD_NODENAME/$host/ | kubectl create -f - + sed s/SLURMD_NODENAME/$host/ /etc/slurm/slurmd-pod-template.yml | kubectl create -f - done diff --git a/slurm-cluster-chart/templates/kubectl.yml b/slurm-cluster-chart/templates/kubeconfig.yml similarity index 89% rename from slurm-cluster-chart/templates/kubectl.yml rename to slurm-cluster-chart/templates/kubeconfig.yml index 78d865b..4938798 100644 --- a/slurm-cluster-chart/templates/kubectl.yml +++ b/slurm-cluster-chart/templates/kubeconfig.yml @@ -3,6 +3,6 @@ kind: Secret metadata: name: kubeconfig-secret data: - kubectl: | + kubeconfig: | {{- .Files.Get "files/kubeconfig" | b64enc | nindent 4 -}} \ No newline at end of file diff --git a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml b/slurm-cluster-chart/templates/slurmctld-statefulset.yaml index 6dacb87..0274652 100644 --- a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml +++ b/slurm-cluster-chart/templates/slurmctld-statefulset.yaml @@ -40,9 +40,6 @@ spec: - mountPath: /tmp/kubeconfig name: kubeconfig-secret subPath: kubeconfig - - mountPath: /etc/slurm/slurmd-pod-template.yml - name: slurmd-pod-template - subPath: podTemplate dnsConfig: searches: - slurmd.default.svc.cluster.local @@ -55,8 +52,12 @@ spec: persistentVolumeClaim: claimName: var-spool-slurmctld - name: slurm-config-volume - configMap: - name: {{ .Values.configmaps.slurmConf }} + projected: + sources: + - configMap: + name: {{ .Values.configmaps.slurmConf }} + - configMap: + name: slurmd-pod-template - name: munge-key-secret secret: secretName: {{ 
.Values.secrets.mungeKey }} @@ -65,6 +66,3 @@ spec: secret: secretName: kubeconfig-secret defaultMode: 0400 - - name: slurmd-pod-template - configMap: - name: slurmd-pod-template diff --git a/slurm-cluster-chart/templates/slurmd-template-configmap.yaml b/slurm-cluster-chart/templates/slurmd-template-configmap.yaml index 125e318..e5a60bf 100644 --- a/slurm-cluster-chart/templates/slurmd-template-configmap.yaml +++ b/slurm-cluster-chart/templates/slurmd-template-configmap.yaml @@ -3,7 +3,7 @@ kind: ConfigMap metadata: name: slurmd-pod-template data: - podTemplate: | + slurmd-pod-template.yml: | apiVersion: v1 kind: Pod metadata: From bf4ec14e0fa3fbd1c68369f020be852c6a6975a1 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 26 Jul 2023 15:04:17 +0000 Subject: [PATCH 050/152] fix perms on slurm secrets dir --- image/docker-entrypoint.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/image/docker-entrypoint.sh b/image/docker-entrypoint.sh index 35e07ef..67e5484 100755 --- a/image/docker-entrypoint.sh +++ b/image/docker-entrypoint.sh @@ -52,7 +52,7 @@ then chown slurm:slurm /var/spool/slurmctld echo "---> Copying Kubeconfig ..." - install -o slurm -g slurm -m u=rw,go= -d /var/lib/slurmctld/ + install -o slurm -g slurm -m u=rwX,go= -d /var/lib/slurmctld/ install -o slurm -g slurm -m u=r,go= /tmp/kubeconfig /var/lib/slurmctld/ echo "---> Starting the Slurm Controller Daemon (slurmctld) ..." 
From ecd9cb0c775f1e8b74a1bfac7d243ad1821ced0b Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 26 Jul 2023 15:04:50 +0000 Subject: [PATCH 051/152] resume/suspend programs write logs to directory with correct permissions --- image/k8s-slurmd-create | 2 +- image/k8s-slurmd-delete | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/image/k8s-slurmd-create b/image/k8s-slurmd-create index 048b3d6..676796f 100644 --- a/image/k8s-slurmd-create +++ b/image/k8s-slurmd-create @@ -2,7 +2,7 @@ export KUBECONFIG=/var/lib/slurmctld/kubeconfig -echo "$(date) Resume invoked $0 $*" >> /var/log/power_save.log +echo "$(date) Resume invoked $0 $*" >> /var/log/slurm/power_save.log hosts=$(scontrol show hostnames $1) # this is purely a textual expansion, doens't depend on defined nodes for host in $hosts diff --git a/image/k8s-slurmd-delete b/image/k8s-slurmd-delete index ddbd7b7..19a1828 100644 --- a/image/k8s-slurmd-delete +++ b/image/k8s-slurmd-delete @@ -2,7 +2,7 @@ export KUBECONFIG=/var/lib/slurmctld/kubeconfig -echo "$(date) Suspend invoked $0 $*" >> /var/log/power_save.log +echo "$(date) Suspend invoked $0 $*" >> /var/log/slurm/power_save.log hosts=$(scontrol show hostnames $1) # this is purely a textual expansion, doens't depend on defined nodes for host in $hosts From d8eeb38fc9148e4c92cc2a5d95927a3c42938b42 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 26 Jul 2023 15:05:47 +0000 Subject: [PATCH 052/152] fix duplicate SlurmctldParameters --- slurm-cluster-chart/files/slurm.conf | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/slurm-cluster-chart/files/slurm.conf b/slurm-cluster-chart/files/slurm.conf index bacb5cc..1f021ae 100644 --- a/slurm-cluster-chart/files/slurm.conf +++ b/slurm-cluster-chart/files/slurm.conf @@ -47,13 +47,12 @@ AccountingStorageType=accounting_storage/slurmdbd AccountingStorageHost=slurmdbd AccountingStoragePort=6819 # -SlurmctldParameters=cloud_dns,cloud_reg_addrs 
+SlurmctldParameters=cloud_dns,cloud_reg_addrs,idle_on_node_suspend CommunicationParameters=NoAddrCache ReconfigFlags=KeepPowerSaveSettings #ResumeFailProgram=TODO? ResumeProgram=/usr/sbin/k8s-slurmd-create #ResumeTimeout=60 # default -SlurmctldParameters=idle_on_node_suspend #SuspendExcNodes= #SuspendExcParts= #SuspendExcStates= From f80b5a8321f710fc8d850c35f03921854a532aac Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 26 Jul 2023 15:06:13 +0000 Subject: [PATCH 053/152] fix paths for resume/suspend programs --- slurm-cluster-chart/files/slurm.conf | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/slurm-cluster-chart/files/slurm.conf b/slurm-cluster-chart/files/slurm.conf index 1f021ae..0246bf6 100644 --- a/slurm-cluster-chart/files/slurm.conf +++ b/slurm-cluster-chart/files/slurm.conf @@ -36,6 +36,7 @@ SlurmctldDebug=3 SlurmctldLogFile=/var/log/slurm/slurmctld.log SlurmdDebug=3 SlurmdLogFile=/var/log/slurm/slurmd.log +DebugFlags=Power JobCompType=jobcomp/filetxt JobCompLoc=/var/log/slurm/jobcomp.log # @@ -51,12 +52,12 @@ SlurmctldParameters=cloud_dns,cloud_reg_addrs,idle_on_node_suspend CommunicationParameters=NoAddrCache ReconfigFlags=KeepPowerSaveSettings #ResumeFailProgram=TODO? 
-ResumeProgram=/usr/sbin/k8s-slurmd-create +ResumeProgram=/usr/local/bin/k8s-slurmd-create #ResumeTimeout=60 # default #SuspendExcNodes= #SuspendExcParts= #SuspendExcStates= -SuspendProgram=/usr/sbin/k8s-slurmd-delete +SuspendProgram=/usr/local/bin/k8s-slurmd-delete SuspendTime=30 # for debugging #SuspendTimeout= TreeWidth=65533 From 79eaf11512e0887e46f4909b774b1a1d3cc328b2 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 26 Jul 2023 15:06:55 +0000 Subject: [PATCH 054/152] bump image --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index d1dd881..b0bc24c 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:aa80f98 +slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:f80b5a8 replicas: slurmd: 2 From 688885bf62d5158dadb4c88f017bf4d91197f5d0 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 27 Jul 2023 07:53:16 +0000 Subject: [PATCH 055/152] pass slurmd flags via container args --- image/docker-entrypoint.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/image/docker-entrypoint.sh b/image/docker-entrypoint.sh index 67e5484..c289c96 100755 --- a/image/docker-entrypoint.sh +++ b/image/docker-entrypoint.sh @@ -82,7 +82,7 @@ then echo "-- slurmctld is now active ..." echo "---> Starting the Slurm Node Daemon (slurmd) ..." 
- exec /usr/sbin/slurmd -F -Dvvv + exec /usr/sbin/slurmd "${@:2}" elif [ "$1" = "login" ] then From c95b5e4a78d371d171fb629dd1f282b10d309b99 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 27 Jul 2023 07:56:38 +0000 Subject: [PATCH 056/152] bump image --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index b0bc24c..ff7fbb5 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:f80b5a8 +slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:688885b replicas: slurmd: 2 From a9b7d4ab935efd0b7fde1a2595477ffa7236715b Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 27 Jul 2023 09:43:37 +0000 Subject: [PATCH 057/152] pass options to all slurm deamons via container args, set to max debug verbosity --- image/docker-entrypoint.sh | 6 +++--- slurm-cluster-chart/templates/slurmctld-statefulset.yaml | 1 + .../templates/slurmd-template-configmap.yaml | 2 ++ 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/image/docker-entrypoint.sh b/image/docker-entrypoint.sh index c289c96..7af4c51 100755 --- a/image/docker-entrypoint.sh +++ b/image/docker-entrypoint.sh @@ -57,9 +57,9 @@ then echo "---> Starting the Slurm Controller Daemon (slurmctld) ..." if /usr/sbin/slurmctld -V | grep -q '17.02' ; then - exec gosu slurm /usr/sbin/slurmctld -Dvvv + exec gosu slurm /usr/sbin/slurmctld -D "${@:2}" else - exec gosu slurm /usr/sbin/slurmctld -i -Dvvv + exec gosu slurm /usr/sbin/slurmctld -i -D "${@:2}" fi elif [ "$1" = "slurmd" ] @@ -82,7 +82,7 @@ then echo "-- slurmctld is now active ..." echo "---> Starting the Slurm Node Daemon (slurmd) ..." 
- exec /usr/sbin/slurmd "${@:2}" + exec /usr/sbin/slurmd -D "${@:2}" elif [ "$1" = "login" ] then diff --git a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml b/slurm-cluster-chart/templates/slurmctld-statefulset.yaml index 0274652..2654107 100644 --- a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml +++ b/slurm-cluster-chart/templates/slurmctld-statefulset.yaml @@ -22,6 +22,7 @@ spec: containers: - args: - slurmctld + - -vvvvv image: {{ .Values.slurmImage }} name: slurmctld ports: diff --git a/slurm-cluster-chart/templates/slurmd-template-configmap.yaml b/slurm-cluster-chart/templates/slurmd-template-configmap.yaml index e5a60bf..960e1a9 100644 --- a/slurm-cluster-chart/templates/slurmd-template-configmap.yaml +++ b/slurm-cluster-chart/templates/slurmd-template-configmap.yaml @@ -25,6 +25,8 @@ data: containers: - args: - slurmd + - -vvvvv + image: {{ .Values.slurmImage }} name: slurmd ports: From 3d393f02020c5a122f48e40bfb257b3dcd2d92e3 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 27 Jul 2023 09:46:21 +0000 Subject: [PATCH 058/152] bump image --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index ff7fbb5..13966e6 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:688885b +slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:a9b7d4a replicas: slurmd: 2 From f2c222e8ed45704770f71ecfc538d530a234931e Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 27 Jul 2023 10:12:35 +0000 Subject: [PATCH 059/152] add h/w definition for nodes --- slurm-cluster-chart/files/slurm.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/files/slurm.conf b/slurm-cluster-chart/files/slurm.conf index 0246bf6..6ecc335 100644 --- a/slurm-cluster-chart/files/slurm.conf +++ b/slurm-cluster-chart/files/slurm.conf 
@@ -64,7 +64,7 @@ TreeWidth=65533 # NODES MaxNodeCount=10 -NodeName=slurmd-[0-9] State=CLOUD +NodeName=slurmd-[0-9] State=CLOUD CPUs=4 Boards=1 SocketsPerBoard=1 CoresPerSocket=2 ThreadsPerCore=2 # PARTITIONS PartitionName=all Default=yes Nodes=ALL From a908255f38a3ad63a38f314d59d1ef98465bbaf8 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 27 Jul 2023 11:00:01 +0000 Subject: [PATCH 060/152] use reboot flag on slurmd start to make resume work --- slurm-cluster-chart/templates/slurmd-template-configmap.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/slurm-cluster-chart/templates/slurmd-template-configmap.yaml b/slurm-cluster-chart/templates/slurmd-template-configmap.yaml index 960e1a9..c02bfd7 100644 --- a/slurm-cluster-chart/templates/slurmd-template-configmap.yaml +++ b/slurm-cluster-chart/templates/slurmd-template-configmap.yaml @@ -25,6 +25,7 @@ data: containers: - args: - slurmd + - -b - -vvvvv image: {{ .Values.slurmImage }} From 0963ea9205e9a46f281065591a3f9992a91ddba2 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 27 Jul 2023 13:14:45 +0000 Subject: [PATCH 061/152] fix NFS-mounted /home permissions --- image/docker-entrypoint.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/image/docker-entrypoint.sh b/image/docker-entrypoint.sh index 7af4c51..e2540ed 100755 --- a/image/docker-entrypoint.sh +++ b/image/docker-entrypoint.sh @@ -87,6 +87,9 @@ then elif [ "$1" = "login" ] then + chown root:root /home + chmod 755 /home + mkdir -p /home/rocky/.ssh cp /tmp/authorized_keys /home/rocky/.ssh/authorized_keys From 348c7eaeeca6d21f5744718b61cec37088866f78 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 27 Jul 2023 13:15:45 +0000 Subject: [PATCH 062/152] bump image --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 13966e6..95dbefb 100644 --- a/slurm-cluster-chart/values.yaml +++ 
b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:a9b7d4a +slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:0963ea9 replicas: slurmd: 2 From 4dba961837c096bd0d3c7496b93b19f850c56e06 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 27 Jul 2023 13:16:40 +0000 Subject: [PATCH 063/152] remove cpu definition from slurm.conf --- slurm-cluster-chart/files/slurm.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/files/slurm.conf b/slurm-cluster-chart/files/slurm.conf index 6ecc335..0246bf6 100644 --- a/slurm-cluster-chart/files/slurm.conf +++ b/slurm-cluster-chart/files/slurm.conf @@ -64,7 +64,7 @@ TreeWidth=65533 # NODES MaxNodeCount=10 -NodeName=slurmd-[0-9] State=CLOUD CPUs=4 Boards=1 SocketsPerBoard=1 CoresPerSocket=2 ThreadsPerCore=2 +NodeName=slurmd-[0-9] State=CLOUD # PARTITIONS PartitionName=all Default=yes Nodes=ALL From 1af9d0872e3092c2b82dc6c3e4426ed7cabbbb0b Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 27 Jul 2023 14:54:47 +0000 Subject: [PATCH 064/152] bump image --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 95dbefb..ad16ab7 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:0963ea9 +slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:07b2502 replicas: slurmd: 2 From d93320760213dd0cb4b7823007ea3a744906cf8d Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 27 Jul 2023 16:21:18 +0000 Subject: [PATCH 065/152] don't use DNS for nodes --- slurm-cluster-chart/files/slurm.conf | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/slurm-cluster-chart/files/slurm.conf b/slurm-cluster-chart/files/slurm.conf index 0246bf6..61a3d15 100644 --- a/slurm-cluster-chart/files/slurm.conf +++ b/slurm-cluster-chart/files/slurm.conf 
@@ -48,8 +48,7 @@ AccountingStorageType=accounting_storage/slurmdbd AccountingStorageHost=slurmdbd AccountingStoragePort=6819 # -SlurmctldParameters=cloud_dns,cloud_reg_addrs,idle_on_node_suspend -CommunicationParameters=NoAddrCache +SlurmctldParameters=cloud_reg_addrs,idle_on_node_suspend ReconfigFlags=KeepPowerSaveSettings #ResumeFailProgram=TODO? ResumeProgram=/usr/local/bin/k8s-slurmd-create From b8b7d4891d0c366aab74d7ab51a3f6a043b57b2d Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 27 Jul 2023 16:22:45 +0000 Subject: [PATCH 066/152] use host network for slurmd --- slurm-cluster-chart/templates/slurmd-template-configmap.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/slurm-cluster-chart/templates/slurmd-template-configmap.yaml b/slurm-cluster-chart/templates/slurmd-template-configmap.yaml index 822747b..cba1681 100644 --- a/slurm-cluster-chart/templates/slurmd-template-configmap.yaml +++ b/slurm-cluster-chart/templates/slurmd-template-configmap.yaml @@ -27,6 +27,8 @@ data: - slurmd - -b - -vvvvv + - -N + - SLURMD_NODENAME image: {{ .Values.slurmImage }} name: slurmd ports: @@ -42,6 +44,8 @@ data: subPath: munge.key securityContext: privileged: true + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet dnsConfig: searches: - slurmd.default.svc.cluster.local From 186db3ca8ae51b373f058a9fee35e34b5eabfd05 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 27 Jul 2023 16:37:30 +0000 Subject: [PATCH 067/152] add hostPort to slurmd pods to avoid multiple on one k8s-node --- slurm-cluster-chart/templates/slurmd-template-configmap.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/slurm-cluster-chart/templates/slurmd-template-configmap.yaml b/slurm-cluster-chart/templates/slurmd-template-configmap.yaml index cba1681..dca08eb 100644 --- a/slurm-cluster-chart/templates/slurmd-template-configmap.yaml +++ b/slurm-cluster-chart/templates/slurmd-template-configmap.yaml @@ -33,6 +33,7 @@ data: name: slurmd ports: - containerPort: 6818 + 
hostPort: 6818 resources: {} volumeMounts: - mountPath: /etc/slurm/ From 416eccdce120ffb5b95bd547d6774a0a688f4d90 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 28 Jul 2023 07:56:39 +0000 Subject: [PATCH 068/152] don't default to 1x CPU --- slurm-cluster-chart/files/slurm.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/files/slurm.conf b/slurm-cluster-chart/files/slurm.conf index 61a3d15..2871ef0 100644 --- a/slurm-cluster-chart/files/slurm.conf +++ b/slurm-cluster-chart/files/slurm.conf @@ -63,7 +63,7 @@ TreeWidth=65533 # NODES MaxNodeCount=10 -NodeName=slurmd-[0-9] State=CLOUD +NodeName=slurmd-[0-9] State=CLOUD CPUs=4 # PARTITIONS PartitionName=all Default=yes Nodes=ALL From 06fd90418932e70fa1e6b1ce4c3bb54253a2389c Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 28 Jul 2023 08:28:30 +0000 Subject: [PATCH 069/152] add back in noaddrcache --- slurm-cluster-chart/files/slurm.conf | 1 + 1 file changed, 1 insertion(+) diff --git a/slurm-cluster-chart/files/slurm.conf b/slurm-cluster-chart/files/slurm.conf index 2871ef0..5bc40f7 100644 --- a/slurm-cluster-chart/files/slurm.conf +++ b/slurm-cluster-chart/files/slurm.conf @@ -49,6 +49,7 @@ AccountingStorageHost=slurmdbd AccountingStoragePort=6819 # SlurmctldParameters=cloud_reg_addrs,idle_on_node_suspend +CommunicationParameters=NoAddrCache ReconfigFlags=KeepPowerSaveSettings #ResumeFailProgram=TODO? 
ResumeProgram=/usr/local/bin/k8s-slurmd-create From 9391fdcfba6dc1b0ac4bcb7e280d4647832345b7 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 28 Jul 2023 08:38:43 +0000 Subject: [PATCH 070/152] remove commented-out topology constraints on slurmd --- .../templates/slurmd-template-configmap.yaml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/slurm-cluster-chart/templates/slurmd-template-configmap.yaml b/slurm-cluster-chart/templates/slurmd-template-configmap.yaml index dca08eb..5ff5614 100644 --- a/slurm-cluster-chart/templates/slurmd-template-configmap.yaml +++ b/slurm-cluster-chart/templates/slurmd-template-configmap.yaml @@ -12,14 +12,6 @@ data: app.kubernetes.io/component: slurmd name: SLURMD_NODENAME # Irrelevant for DNS but must be be currently-unique so using slurmd name is convenient spec: - # topologySpreadConstraints: - # - maxSkew: 1 - # whenUnsatisfiable: ScheduleAnyway - # topologyKey: kubernetes.io/hostname - # labelSelector: - # matchLabels: - # app.kubernetes.io/name: slurm - # app.kubernetes.io/component: slurmd hostname: SLURMD_NODENAME # required to create DNS records for pod subdomain: slurmd # has to match name of headless service to create DNS records for pod containers: From 7fd2796ff1ce6a2c2e3e9f40bbef4c050762cadd Mon Sep 17 00:00:00 2001 From: Will Date: Tue, 8 Aug 2023 12:35:37 +0100 Subject: [PATCH 071/152] Changed hook to drain nodes before checking for jobs --- docker-entrypoint.sh | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 23ad303..1345c20 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -110,10 +110,21 @@ then gosu munge /usr/sbin/munged echo "---> MUNGE Complete" + ALL_NODES=$( sinfo --Node --noheader --Format=NodeList ) + + for i in $ALL_NODES + do + scontrol update NodeName=$i State=DRAIN Reason="Preventing new jobs running before upgrade" + done + RUNNING_JOBS=$(squeue 
--states=RUNNING,COMPLETING,CONFIGURING,RESIZING,SIGNALING,STAGE_OUT,STOPPED,SUSPENDED --noheader --array | wc --lines) if [[ $RUNNING_JOBS -eq 0 ]] then + for i in $ALL_NODES + do + scontrol update NodeName=$i State=RESUME + done exit 0 else exit 1 From 18be119fad8a37139cda606f1765f61f6325b1f4 Mon Sep 17 00:00:00 2001 From: Will Date: Tue, 8 Aug 2023 12:52:21 +0100 Subject: [PATCH 072/152] Updated tag and docs --- README.md | 2 ++ slurm-cluster-chart/values.yaml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 34abe12..2ccbda1 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,8 @@ Subsequent releases can be deployed using: helm upgrade slurm-cluster-chart ``` +Note: When updating the cluster with `helm upgrade`, a pre-upgrade hook will prevent upgrades if there are running jobs in the Slurm queue. Attempting to upgrade will set all Slurm nodes to `DRAINED` state. If an upgrade fails due to running jobs, you can undrain the nodes either by waiting for running jobs to complete and then retrying the upgrade or by manually undraining them by accessing the cluster as a privileged user. 
Alternatively you can bypass the hook by running `helm upgrade` with the `--no-hooks` flag (may result in running jobs being lost) + ## Accessing the Cluster Retrieve the external IP address of the login node using: diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index cd9d34d..4e2ef24 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:c12d04e +sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:7fd2796 replicas: slurmd: 2 From d2531aff4f8c06fefd7e4106375021ee95283d19 Mon Sep 17 00:00:00 2001 From: Will Date: Tue, 8 Aug 2023 14:51:02 +0100 Subject: [PATCH 073/152] Tweaks + now undrains rather than resuming drained nodes --- docker-entrypoint.sh | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 1345c20..61957a0 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -110,24 +110,16 @@ then gosu munge /usr/sbin/munged echo "---> MUNGE Complete" - ALL_NODES=$( sinfo --Node --noheader --Format=NodeList ) - - for i in $ALL_NODES - do - scontrol update NodeName=$i State=DRAIN Reason="Preventing new jobs running before upgrade" - done + scontrol update NodeName=all State=DRAIN Reason="Preventing new jobs running before upgrade" RUNNING_JOBS=$(squeue --states=RUNNING,COMPLETING,CONFIGURING,RESIZING,SIGNALING,STAGE_OUT,STOPPED,SUSPENDED --noheader --array | wc --lines) if [[ $RUNNING_JOBS -eq 0 ]] then - for i in $ALL_NODES - do - scontrol update NodeName=$i State=RESUME - done - exit 0 + scontrol update NodeName=all State=UNDRAIN + exit 0 else - exit 1 + exit 1 fi fi From 856d837a61345a4bb1cb9bef936a350d81d9add5 Mon Sep 17 00:00:00 2001 From: Will Date: Tue, 8 Aug 2023 15:19:43 +0100 Subject: [PATCH 074/152] Update tag --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml 
b/slurm-cluster-chart/values.yaml index 4e2ef24..807e47b 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:7fd2796 +sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:d2531af replicas: slurmd: 2 From 540ed62e354631052c648a681cd1045e563d67b2 Mon Sep 17 00:00:00 2001 From: Will Date: Tue, 8 Aug 2023 15:53:37 +0100 Subject: [PATCH 075/152] Updated tag --- slurm-cluster-chart/values.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index f40ee09..8912f99 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,5 +1,4 @@ -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:7f4d64e -#OUTDATED, CHANGE AFTER REBUILD +slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:2c59b39 replicas: slurmd: 2 From 845584b019a2560d5931632e48d911b6c988e16e Mon Sep 17 00:00:00 2001 From: Will Date: Tue, 8 Aug 2023 15:59:46 +0100 Subject: [PATCH 076/152] Added entrypoint for post-upgrade hook --- image/docker-entrypoint.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/image/docker-entrypoint.sh b/image/docker-entrypoint.sh index 174a61c..0397b03 100755 --- a/image/docker-entrypoint.sh +++ b/image/docker-entrypoint.sh @@ -115,12 +115,17 @@ then if [[ $RUNNING_JOBS -eq 0 ]] then - scontrol update NodeName=all State=UNDRAIN exit 0 else exit 1 fi +elif [ "$1" = "undrain-nodes-hook" ] +then + start_munge + scontrol update NodeName=all State=UNDRAIN + exit 0 + elif [ "$1" = "debug" ] then start_munge --foreground From 7a70ba33aa80656fd158a1adc570e6d36b6274da Mon Sep 17 00:00:00 2001 From: Will Date: Tue, 8 Aug 2023 16:05:44 +0100 Subject: [PATCH 077/152] Added post-upgrade hook to undrain nodes --- .../templates/undrain-nodes-hook.yaml | 34 +++++++++++++++++++ slurm-cluster-chart/values.yaml | 2 +- 2 files changed, 35 insertions(+), 1 deletion(-) create mode 
100644 slurm-cluster-chart/templates/undrain-nodes-hook.yaml diff --git a/slurm-cluster-chart/templates/undrain-nodes-hook.yaml b/slurm-cluster-chart/templates/undrain-nodes-hook.yaml new file mode 100644 index 0000000..3c0f189 --- /dev/null +++ b/slurm-cluster-chart/templates/undrain-nodes-hook.yaml @@ -0,0 +1,34 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: undrain-nodes-hook + annotations: + "helm.sh/hook": post-upgrade + "helm.sh/hook-delete-policy": hook-succeeded +spec: + backoffLimit: 0 + ttlSecondsAfterFinished: 0 + template: + metadata: + name: undrain-nodes-hook + spec: + restartPolicy: Never + containers: + - name: undrain-nodes-hook + image: {{ .Values.slurmImage }} + args: + - undrain-nodes-hook + volumeMounts: + - mountPath: /tmp/munge.key + name: munge-key-secret + subPath: munge.key + - mountPath: /etc/slurm/ + name: slurm-config-volume + volumes: + - name: munge-key-secret + secret: + secretName: {{ .Values.secrets.mungeKey }} + defaultMode: 0400 + - name: slurm-config-volume + configMap: + name: {{ .Values.configmaps.slurmConf }} diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 8912f99..f329e01 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:2c59b39 +slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:845584b replicas: slurmd: 2 From 057651abe653c9f943010cdfe27fd51c450a5f6a Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 8 Aug 2023 15:20:24 +0000 Subject: [PATCH 078/152] bump image --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 7873e5c..8c57722 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:7f4d64e +slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:9e4598e replicas: 
slurmd: 2 From f52e91848584aa261dda5ed5cb2c0b4211d0dc7a Mon Sep 17 00:00:00 2001 From: Will Date: Tue, 8 Aug 2023 16:47:25 +0100 Subject: [PATCH 079/152] Fixed munge --- image/docker-entrypoint.sh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/image/docker-entrypoint.sh b/image/docker-entrypoint.sh index 9e6b085..14b511c 100755 --- a/image/docker-entrypoint.sh +++ b/image/docker-entrypoint.sh @@ -106,7 +106,7 @@ then chmod 600 $DIR/.ssh/authorized_keys || echo "Couldn't set permissions for .ssh/authorized_keys for $DIR" done popd > /dev/null - + echo "---> Complete" echo "---> Starting sshd" cp /tempmounts/etc/ssh/* /etc/ssh/ @@ -116,9 +116,7 @@ then chmod 600 /etc/ssh/ssh_host_rsa_key /usr/sbin/sshd - echo "---> Starting the MUNGE Authentication service (munged) ..." - gosu munge /usr/sbin/munged - echo "---> MUNGE Complete" + start_munge echo "---> Setting up self ssh capabilities for OOD" ssh-keyscan localhost > /etc/ssh/ssh_known_hosts From 303e6f0de44ba692c2b01d621596e4cb4cb7029b Mon Sep 17 00:00:00 2001 From: Will Date: Tue, 8 Aug 2023 16:51:13 +0100 Subject: [PATCH 080/152] Updated tag --- slurm-cluster-chart/values.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index f40ee09..6a394cb 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,5 +1,4 @@ -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:7f4d64e -#OUTDATED, CHANGE AFTER REBUILD +slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:f52e918 replicas: slurmd: 2 From 7ca06682d58e3bd05ff32bbbd5e719036c686f2a Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 10 Aug 2023 11:59:23 +0100 Subject: [PATCH 081/152] Moved database auth to helm templating --- generate-secrets.sh | 6 ------ slurm-cluster-chart/templates/database-auth-secret.yaml | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) create mode 100644 
slurm-cluster-chart/templates/database-auth-secret.yaml diff --git a/generate-secrets.sh b/generate-secrets.sh index e98b97e..b4cc01c 100755 --- a/generate-secrets.sh +++ b/generate-secrets.sh @@ -1,11 +1,5 @@ #!/bin/bash -kubectl create secret generic database-auth-secret \ ---dry-run=client \ ---from-literal=password=$(tr -dc 'A-Za-z0-9' /dev/null | base64 -w 0) \ diff --git a/slurm-cluster-chart/templates/database-auth-secret.yaml b/slurm-cluster-chart/templates/database-auth-secret.yaml new file mode 100644 index 0000000..27c4e3f --- /dev/null +++ b/slurm-cluster-chart/templates/database-auth-secret.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: Secret +metadata: + name: database-auth-secret +data: + password: {{ randAlphaNum 32 | b64enc }} \ No newline at end of file From 656aa6c058ca0cc337ed72ac83879098110f4a34 Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 10 Aug 2023 12:16:08 +0100 Subject: [PATCH 082/152] Moved munge key generation to helm --- generate-secrets.sh | 6 ------ slurm-cluster-chart/templates/munge-key-secret.yaml | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) create mode 100644 slurm-cluster-chart/templates/munge-key-secret.yaml diff --git a/generate-secrets.sh b/generate-secrets.sh index b4cc01c..f64c116 100755 --- a/generate-secrets.sh +++ b/generate-secrets.sh @@ -1,11 +1,5 @@ #!/bin/bash -kubectl create secret generic munge-key-secret \ ---dry-run=client \ ---from-literal=munge.key=$(dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64 -w 0) \ --o yaml | \ -kubectl apply -f - - mkdir -p ./temphostkeys/etc/ssh ssh-keygen -A -f ./temphostkeys kubectl create secret generic host-keys-secret \ diff --git a/slurm-cluster-chart/templates/munge-key-secret.yaml b/slurm-cluster-chart/templates/munge-key-secret.yaml new file mode 100644 index 0000000..153b5fe --- /dev/null +++ b/slurm-cluster-chart/templates/munge-key-secret.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: Secret +metadata: + name: munge-key-secret +data: + munge.key: {{ 
randAscii 128 | b64enc }} \ No newline at end of file From a9003f7a17aba22cc1c0e373f8fbfa6e73c6f742 Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 10 Aug 2023 13:37:40 +0100 Subject: [PATCH 083/152] Moved OOD password to values/yaml --- generate-secrets.sh | 13 ------------- slurm-cluster-chart/templates/login-deployment.yaml | 5 +---- slurm-cluster-chart/values.yaml | 5 ++++- 3 files changed, 5 insertions(+), 18 deletions(-) diff --git a/generate-secrets.sh b/generate-secrets.sh index f64c116..b6d4267 100755 --- a/generate-secrets.sh +++ b/generate-secrets.sh @@ -8,16 +8,3 @@ kubectl create secret generic host-keys-secret \ -o yaml | \ kubectl apply -f - rm -rf ./temphostkeys - -OOD_PASS=$(tr -dc 'A-Za-z0-9' Date: Thu, 10 Aug 2023 14:15:09 +0100 Subject: [PATCH 084/152] Random secrets now generated pre-install only --- slurm-cluster-chart/templates/database-auth-secret.yaml | 5 ++++- slurm-cluster-chart/templates/munge-key-secret.yaml | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/slurm-cluster-chart/templates/database-auth-secret.yaml b/slurm-cluster-chart/templates/database-auth-secret.yaml index 27c4e3f..6133576 100644 --- a/slurm-cluster-chart/templates/database-auth-secret.yaml +++ b/slurm-cluster-chart/templates/database-auth-secret.yaml @@ -2,5 +2,8 @@ apiVersion: v1 kind: Secret metadata: name: database-auth-secret + annotations: + helm.sh/hook: pre-install + helm.sh/resource-policy: keep data: - password: {{ randAlphaNum 32 | b64enc }} \ No newline at end of file + password: {{ randAlphaNum 32 | b64enc }} diff --git a/slurm-cluster-chart/templates/munge-key-secret.yaml b/slurm-cluster-chart/templates/munge-key-secret.yaml index 153b5fe..65825d6 100644 --- a/slurm-cluster-chart/templates/munge-key-secret.yaml +++ b/slurm-cluster-chart/templates/munge-key-secret.yaml @@ -2,5 +2,8 @@ apiVersion: v1 kind: Secret metadata: name: munge-key-secret + annotations: + helm.sh/hook: pre-install + helm.sh/resource-policy: keep data: - 
munge.key: {{ randAscii 128 | b64enc }} \ No newline at end of file + munge.key: {{ randAscii 128 | b64enc }} From e0514f6c47bfc000264708d2be82151a805a16c1 Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 10 Aug 2023 14:31:46 +0100 Subject: [PATCH 085/152] Added kubectl to image --- image/Dockerfile | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/image/Dockerfile b/image/Dockerfile index 855a1cc..dceaeeb 100644 --- a/image/Dockerfile +++ b/image/Dockerfile @@ -18,6 +18,14 @@ RUN set -ex \ && yum -y module enable ruby:2.7 nodejs:14 \ && yum -y install https://yum.osc.edu/ondemand/2.0/ondemand-release-web-2.0-1.noarch.rpm \ && yum -y module install ruby nodejs \ + && cat < Date: Thu, 10 Aug 2023 14:35:29 +0100 Subject: [PATCH 086/152] Fixed Dockerfile --- image/Dockerfile | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/image/Dockerfile b/image/Dockerfile index dceaeeb..bcc3fdb 100644 --- a/image/Dockerfile +++ b/image/Dockerfile @@ -18,13 +18,13 @@ RUN set -ex \ && yum -y module enable ruby:2.7 nodejs:14 \ && yum -y install https://yum.osc.edu/ondemand/2.0/ondemand-release-web-2.0-1.noarch.rpm \ && yum -y module install ruby nodejs \ - && cat < Date: Thu, 10 Aug 2023 14:46:31 +0100 Subject: [PATCH 087/152] Testing with separate command --- image/Dockerfile | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/image/Dockerfile b/image/Dockerfile index bcc3fdb..14ad93b 100644 --- a/image/Dockerfile +++ b/image/Dockerfile @@ -9,6 +9,15 @@ LABEL org.opencontainers.image.source="https://github.com/stackhpc/slurm-docker- ARG SLURM_TAG=slurm-23.02 ARG GOSU_VERSION=1.11 +RUN cat < Date: Thu, 10 Aug 2023 14:52:37 +0100 Subject: [PATCH 088/152] Revert "Testing with separate command" This reverts commit cd0d1afb5cfaae3bd234dccb26c673435da21fd0. 
--- image/Dockerfile | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/image/Dockerfile b/image/Dockerfile index 14ad93b..bcc3fdb 100644 --- a/image/Dockerfile +++ b/image/Dockerfile @@ -9,15 +9,6 @@ LABEL org.opencontainers.image.source="https://github.com/stackhpc/slurm-docker- ARG SLURM_TAG=slurm-23.02 ARG GOSU_VERSION=1.11 -RUN cat < Date: Thu, 10 Aug 2023 14:53:58 +0100 Subject: [PATCH 089/152] Removed sudo from dockerfile --- image/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/image/Dockerfile b/image/Dockerfile index bcc3fdb..ee14ea3 100644 --- a/image/Dockerfile +++ b/image/Dockerfile @@ -18,7 +18,7 @@ RUN set -ex \ && yum -y module enable ruby:2.7 nodejs:14 \ && yum -y install https://yum.osc.edu/ondemand/2.0/ondemand-release-web-2.0-1.noarch.rpm \ && yum -y module install ruby nodejs \ - && cat < Date: Thu, 10 Aug 2023 15:00:29 +0100 Subject: [PATCH 090/152] Moved kubernetes repo to separate file --- image/Dockerfile | 9 ++------- image/kubernetes.repo | 6 ++++++ 2 files changed, 8 insertions(+), 7 deletions(-) create mode 100644 image/kubernetes.repo diff --git a/image/Dockerfile b/image/Dockerfile index ee14ea3..9874e58 100644 --- a/image/Dockerfile +++ b/image/Dockerfile @@ -9,6 +9,8 @@ LABEL org.opencontainers.image.source="https://github.com/stackhpc/slurm-docker- ARG SLURM_TAG=slurm-23.02 ARG GOSU_VERSION=1.11 +COPY kubernetes.repo /etc/yum.repos.d/kubernetes.repo + RUN set -ex \ && yum makecache \ && yum -y update \ @@ -19,13 +21,6 @@ RUN set -ex \ && yum -y install https://yum.osc.edu/ondemand/2.0/ondemand-release-web-2.0-1.noarch.rpm \ && yum -y module install ruby nodejs \ && cat < Date: Thu, 10 Aug 2023 15:02:45 +0100 Subject: [PATCH 091/152] Fixed leftover commands --- image/Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/image/Dockerfile b/image/Dockerfile index 9874e58..0d00a6a 100644 --- a/image/Dockerfile +++ b/image/Dockerfile @@ -20,7 +20,6 @@ RUN set -ex \ && 
yum -y module enable ruby:2.7 nodejs:14 \ && yum -y install https://yum.osc.edu/ondemand/2.0/ondemand-release-web-2.0-1.noarch.rpm \ && yum -y module install ruby nodejs \ - && cat < Date: Thu, 10 Aug 2023 16:18:53 +0100 Subject: [PATCH 092/152] Updated tag and created service account to modify host-keys-secret --- .../templates/secret-generator-role.yaml | 22 +++++++++++++++++++ .../secret-generator-serviceaccount.yaml | 10 +++++++++ slurm-cluster-chart/values.yaml | 2 +- 3 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 slurm-cluster-chart/templates/secret-generator-role.yaml create mode 100644 slurm-cluster-chart/templates/secret-generator-serviceaccount.yaml diff --git a/slurm-cluster-chart/templates/secret-generator-role.yaml b/slurm-cluster-chart/templates/secret-generator-role.yaml new file mode 100644 index 0000000..67de05e --- /dev/null +++ b/slurm-cluster-chart/templates/secret-generator-role.yaml @@ -0,0 +1,22 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: secret-generator-role +rules: +- apiGroups: [""] # "" indicates the core API group + resources: ["secrets"] + verbs: ["get","apply","create", "patch"] + +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: secret-generator-rolebinding +subjects: + - kind: ServiceAccount + name: secret-generator-account +roleRef: + kind: Role + name: secret-generator-role + apiGroup: rbac.authorization.k8s.io \ No newline at end of file diff --git a/slurm-cluster-chart/templates/secret-generator-serviceaccount.yaml b/slurm-cluster-chart/templates/secret-generator-serviceaccount.yaml new file mode 100644 index 0000000..6510cb9 --- /dev/null +++ b/slurm-cluster-chart/templates/secret-generator-serviceaccount.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: secret-generator-account + annotations: + "kubernetes.io/enforce-mountable-secrets": "true" +automountServiceAccountToken: True +secrets: + - name: 
host-keys-secret + \ No newline at end of file diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index e2aed84..c0b0360 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:f52e918 +slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:763de73 replicas: slurmd: 2 From d58f819e1ed9e46f9cd71e3432c20928f2922887 Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 10 Aug 2023 16:24:38 +0100 Subject: [PATCH 093/152] Added entrypoint for host key generation hook --- image/docker-entrypoint.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/image/docker-entrypoint.sh b/image/docker-entrypoint.sh index 14b511c..01d3519 100755 --- a/image/docker-entrypoint.sh +++ b/image/docker-entrypoint.sh @@ -148,6 +148,18 @@ then exit 1 fi +elif [ "$1" = "generate-keys-hook" ] +then + mkdir -p ./temphostkeys/etc/ssh + ssh-keygen -A -f ./temphostkeys + kubectl create secret generic host-keys-secret \ + --dry-run=client \ + --from-file=./temphostkeys/etc/ssh \ + -o yaml | \ + kubectl apply -f - + + exit 0 + elif [ "$1" = "debug" ] then start_munge --foreground From 16ee05dbb6ef16c234fe1b491a904e4190693a14 Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 10 Aug 2023 16:44:42 +0100 Subject: [PATCH 094/152] Added pre-install hook to generate host keys --- .../templates/generate-keys-hook.yaml | 22 +++++++++++++++++++ .../templates/secret-generator-role.yaml | 6 +++++ .../secret-generator-serviceaccount.yaml | 3 ++- slurm-cluster-chart/values.yaml | 2 +- 4 files changed, 31 insertions(+), 2 deletions(-) create mode 100644 slurm-cluster-chart/templates/generate-keys-hook.yaml diff --git a/slurm-cluster-chart/templates/generate-keys-hook.yaml b/slurm-cluster-chart/templates/generate-keys-hook.yaml new file mode 100644 index 0000000..c05e7f2 --- /dev/null +++ b/slurm-cluster-chart/templates/generate-keys-hook.yaml @@ -0,0 +1,22 @@ +apiVersion: batch/v1 +kind: Job 
+metadata: + name: generate-keys-hook + annotations: + "helm.sh/hook": pre-install + "helm.sh/hook-delete-policy": hook-succeeded + "helm.sh/hook-weight": "3" +spec: + backoffLimit: 0 + ttlSecondsAfterFinished: 0 + template: + metadata: + name: generate-keys-hook + spec: + serviceAccountName: secret-generator-account + restartPolicy: Never + containers: + - name: generate-keys-hook + image: {{ .Values.slurmImage }} + args: + - generate-keys-hook diff --git a/slurm-cluster-chart/templates/secret-generator-role.yaml b/slurm-cluster-chart/templates/secret-generator-role.yaml index 67de05e..da914be 100644 --- a/slurm-cluster-chart/templates/secret-generator-role.yaml +++ b/slurm-cluster-chart/templates/secret-generator-role.yaml @@ -2,6 +2,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: name: secret-generator-role + annotations: + "helm.sh/hook": pre-install + "helm.sh/hook-weight": "1" rules: - apiGroups: [""] # "" indicates the core API group resources: ["secrets"] @@ -13,6 +16,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: name: secret-generator-rolebinding + annotations: + "helm.sh/hook": pre-install + "helm.sh/hook-weight": "2" subjects: - kind: ServiceAccount name: secret-generator-account diff --git a/slurm-cluster-chart/templates/secret-generator-serviceaccount.yaml b/slurm-cluster-chart/templates/secret-generator-serviceaccount.yaml index 6510cb9..ce860b0 100644 --- a/slurm-cluster-chart/templates/secret-generator-serviceaccount.yaml +++ b/slurm-cluster-chart/templates/secret-generator-serviceaccount.yaml @@ -4,7 +4,8 @@ metadata: name: secret-generator-account annotations: "kubernetes.io/enforce-mountable-secrets": "true" + "helm.sh/hook": pre-install + "helm.sh/hook-weight": "0" automountServiceAccountToken: True secrets: - name: host-keys-secret - \ No newline at end of file diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index c0b0360..0421371 100644 --- 
a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:763de73 +slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:d58f819 replicas: slurmd: 2 From 15b07a671b8e49a4e0d73bd6899a4290510bc065 Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 10 Aug 2023 16:47:52 +0100 Subject: [PATCH 095/152] Removed generate-secrets.sh --- generate-secrets.sh | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100755 generate-secrets.sh diff --git a/generate-secrets.sh b/generate-secrets.sh deleted file mode 100755 index b6d4267..0000000 --- a/generate-secrets.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -mkdir -p ./temphostkeys/etc/ssh -ssh-keygen -A -f ./temphostkeys -kubectl create secret generic host-keys-secret \ ---dry-run=client \ ---from-file=./temphostkeys/etc/ssh \ --o yaml | \ -kubectl apply -f - -rm -rf ./temphostkeys From 4b8e114aed5468a9cc74b68b0118272c194279cc Mon Sep 17 00:00:00 2001 From: Will Date: Fri, 11 Aug 2023 09:52:16 +0100 Subject: [PATCH 096/152] Now option to give public key explicitly through values.yaml --- .../templates/helm-authorized-keys-configmap.yaml | 9 +++++++++ slurm-cluster-chart/templates/login-deployment.yaml | 6 +++++- slurm-cluster-chart/values.yaml | 4 +++- 3 files changed, 17 insertions(+), 2 deletions(-) create mode 100644 slurm-cluster-chart/templates/helm-authorized-keys-configmap.yaml diff --git a/slurm-cluster-chart/templates/helm-authorized-keys-configmap.yaml b/slurm-cluster-chart/templates/helm-authorized-keys-configmap.yaml new file mode 100644 index 0000000..75ad249 --- /dev/null +++ b/slurm-cluster-chart/templates/helm-authorized-keys-configmap.yaml @@ -0,0 +1,9 @@ +#Only applied if sshPublicKey provided in values.yaml, if not assumes you have run publish-keys.sh prior to helm release +{{ if .Values.sshPublicKey }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: helm-authorized-keys-configmap +data: + authorized_keys: {{ 
.Values.sshPublicKey }} +{{ end }} diff --git a/slurm-cluster-chart/templates/login-deployment.yaml b/slurm-cluster-chart/templates/login-deployment.yaml index 37fb46b..0984560 100644 --- a/slurm-cluster-chart/templates/login-deployment.yaml +++ b/slurm-cluster-chart/templates/login-deployment.yaml @@ -80,7 +80,11 @@ spec: defaultMode: 0400 - name: authorized-keys configMap: - name: {{ .Values.configmaps.authorizedKeys }} + {{ if .Values.sshPublicKey }} + name: helm-authorized-keys-configmap + {{ else }} + name: authorized-keys-configmap + {{ end }} - name: cluster-config configMap: name: cluster-config diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 0421371..d7fc033 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -13,11 +13,13 @@ sqlImage: mariadb:10.10 databaseStorage: 100Mi configmaps: - authorizedKeys: authorized-keys-configmap slurmConf: slurm-conf-configmap slurmdbdConf: slurmdbd-conf-configmap sshdConfig: sshd-config-configmap +# If let undefined, assumes you have run publish-keys.sh to publish your public key prior to deployment +sshPublicKey: + secrets: databaseAuth: database-auth-secret mungeKey: munge-key-secret From c7a724886ba2aaea3b65c4d228d3391717d8c0d7 Mon Sep 17 00:00:00 2001 From: Will Date: Fri, 11 Aug 2023 10:42:00 +0100 Subject: [PATCH 097/152] Added custom packaging to workflow --- .github/workflows/publish-helm-chart.yml | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/.github/workflows/publish-helm-chart.yml b/.github/workflows/publish-helm-chart.yml index 8ce0698..1806817 100644 --- a/.github/workflows/publish-helm-chart.yml +++ b/.github/workflows/publish-helm-chart.yml @@ -1,9 +1,6 @@ name: Release Charts -on: - push: - branches: - - main +on: push jobs: release: @@ -17,6 +14,11 @@ jobs: uses: actions/checkout@v3 with: fetch-depth: 0 + submodules: true + + - name: Get SemVer version for current commit + id: semver + uses: 
stackhpc/github-actions/semver@master - name: Configure Git run: | @@ -28,10 +30,15 @@ jobs: env: GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}" + - name: "Package Chart" + run: | + helm package slurm-cluster-chart --version ${{ steps.semver.outputs.version }} + - name: Run chart-releaser uses: helm/chart-releaser-action@v1.5.0 with: charts_dir: . + skip_packaging: True env: CR_TOKEN: "${{ secrets.GITHUB_TOKEN }}" From 69122f7b503dfd55c21bad56d077e6eec0b957a9 Mon Sep 17 00:00:00 2001 From: Will Date: Fri, 11 Aug 2023 10:46:19 +0100 Subject: [PATCH 098/152] Trying adding charts to cr packages --- .github/workflows/publish-helm-chart.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/publish-helm-chart.yml b/.github/workflows/publish-helm-chart.yml index 1806817..8a6f4f7 100644 --- a/.github/workflows/publish-helm-chart.yml +++ b/.github/workflows/publish-helm-chart.yml @@ -32,7 +32,8 @@ jobs: - name: "Package Chart" run: | - helm package slurm-cluster-chart --version ${{ steps.semver.outputs.version }} + mkdir -p .cr-release-packages + helm package slurm-cluster-chart --version ${{ steps.semver.outputs.version }} --destination .cr-release-packages - name: Run chart-releaser uses: helm/chart-releaser-action@v1.5.0 From ca27405f537d3eff24f9d6201f0c614961ddaa7b Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Fri, 11 Aug 2023 10:54:02 +0100 Subject: [PATCH 099/152] Added source in slurm-cluster-chart/files/httpd.conf Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- slurm-cluster-chart/files/httpd.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/files/httpd.conf b/slurm-cluster-chart/files/httpd.conf index 6d3783a..248afb2 100644 --- a/slurm-cluster-chart/files/httpd.conf +++ b/slurm-cluster-chart/files/httpd.conf @@ -1,4 +1,4 @@ -# +# Modified from file installed by httpd package # This is the main Apache HTTP server 
configuration file. It contains the # configuration directives that give the server its instructions. # See for detailed information. From 1a3c3adb269f6a2161942fe3000d5eeadf30b022 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Fri, 11 Aug 2023 10:54:25 +0100 Subject: [PATCH 100/152] Added source in slurm-cluster-chart/files/ood_portal.yaml Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- slurm-cluster-chart/files/ood_portal.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/slurm-cluster-chart/files/ood_portal.yaml b/slurm-cluster-chart/files/ood_portal.yaml index 9be3295..d5227b2 100644 --- a/slurm-cluster-chart/files/ood_portal.yaml +++ b/slurm-cluster-chart/files/ood_portal.yaml @@ -1,3 +1,4 @@ +# Modified from file installed by ondemand package --- # # Portal configuration From 09d25127aa6a4825d772c444e74b08020497943d Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Fri, 11 Aug 2023 11:29:43 +0100 Subject: [PATCH 101/152] Add Known Issues heading to start documenting these --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 2edf8a0..11fe8b8 100644 --- a/README.md +++ b/README.md @@ -171,3 +171,5 @@ and then restart the other dependent deployments to propagate changes: ```console kubectl rollout restart deployment slurmd slurmctld login slurmdbd ``` + +# Known Issues From 9979627bbe7c4a5f993a23ce5ca3ba7aacf17f21 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Fri, 11 Aug 2023 12:24:59 +0100 Subject: [PATCH 102/152] Convert Rook NFS to Helm chart - Adds Rook NFS Helm chart as dependency of Slurm cluster chart - Refactors main values file to allow additional customisation - Adds cleanup job as pre-delete hook to fix uninstall behaviour --- .gitignore | 3 + nfs/deploy-nfs.sh | 11 ---- nfs/pvc.yaml | 11 ---- nfs/sc.yaml | 13 ----- nfs/teardown-nfs.sh | 16 ------ rooknfs/Chart.yaml | 4 ++ rooknfs/README.md | 0 {nfs => 
rooknfs/crds}/crds.yaml | 0 {nfs => rooknfs/templates}/nfs.yaml | 18 +++--- {nfs => rooknfs/templates}/operator.yaml | 12 ++-- {nfs => rooknfs/templates}/rbac.yaml | 10 ++-- rooknfs/templates/sc.yaml | 17 ++++++ rooknfs/values.yaml | 30 ++++++++++ slurm-cluster-chart/Chart.yaml | 7 ++- .../templates/hooks/pre-delete.yaml | 55 +++++++++++++++++++ .../{login-deployment.yaml => login.yaml} | 8 +-- slurm-cluster-chart/templates/pvc.yaml | 14 +++++ ...rmctld-statefulset.yaml => slurmctld.yaml} | 6 +- .../{slurmd-deployment.yaml => slurmd.yaml} | 9 +-- slurm-cluster-chart/values.yaml | 50 +++++++++++++++-- 20 files changed, 211 insertions(+), 83 deletions(-) create mode 100644 .gitignore delete mode 100755 nfs/deploy-nfs.sh delete mode 100644 nfs/pvc.yaml delete mode 100644 nfs/sc.yaml delete mode 100755 nfs/teardown-nfs.sh create mode 100644 rooknfs/Chart.yaml create mode 100644 rooknfs/README.md rename {nfs => rooknfs/crds}/crds.yaml (100%) rename {nfs => rooknfs/templates}/nfs.yaml (61%) rename {nfs => rooknfs/templates}/operator.yaml (91%) rename {nfs => rooknfs/templates}/rbac.yaml (88%) create mode 100644 rooknfs/templates/sc.yaml create mode 100644 rooknfs/values.yaml create mode 100644 slurm-cluster-chart/templates/hooks/pre-delete.yaml rename slurm-cluster-chart/templates/{login-deployment.yaml => login.yaml} (90%) create mode 100644 slurm-cluster-chart/templates/pvc.yaml rename slurm-cluster-chart/templates/{slurmctld-statefulset.yaml => slurmctld.yaml} (91%) rename slurm-cluster-chart/templates/{slurmd-deployment.yaml => slurmd.yaml} (88%) diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0ba5327 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +# Build artifacts from local helm install +slurm-cluster-chart/Chart.lock +slurm-cluster-chart/charts/ diff --git a/nfs/deploy-nfs.sh b/nfs/deploy-nfs.sh deleted file mode 100755 index b2d2f75..0000000 --- a/nfs/deploy-nfs.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -# Based on 
https://rook.io/docs/nfs/v1.7/quickstart.html -# Manifests listed explicitly here to guarantee ordering - -kubectl create -f nfs/crds.yaml -kubectl create -f nfs/operator.yaml -kubectl create -f nfs/rbac.yaml -kubectl create -f nfs/nfs.yaml -kubectl create -f nfs/sc.yaml -kubectl create -f nfs/pvc.yaml diff --git a/nfs/pvc.yaml b/nfs/pvc.yaml deleted file mode 100644 index 7f0a3d7..0000000 --- a/nfs/pvc.yaml +++ /dev/null @@ -1,11 +0,0 @@ -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: rook-nfs-pv-claim -spec: - storageClassName: "rook-nfs-share1" - accessModes: - - ReadWriteMany - resources: - requests: - storage: 10Gi diff --git a/nfs/sc.yaml b/nfs/sc.yaml deleted file mode 100644 index 6f9e3ae..0000000 --- a/nfs/sc.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: storage.k8s.io/v1 -kind: StorageClass -metadata: - labels: - app: rook-nfs - name: rook-nfs-share1 -parameters: - exportName: share1 - nfsServerName: rook-nfs - nfsServerNamespace: rook-nfs -provisioner: nfs.rook.io/rook-nfs-provisioner -reclaimPolicy: Delete -volumeBindingMode: Immediate diff --git a/nfs/teardown-nfs.sh b/nfs/teardown-nfs.sh deleted file mode 100755 index 4dde364..0000000 --- a/nfs/teardown-nfs.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -kubectl delete -f web-service.yaml -kubectl delete -f web-rc.yaml -kubectl delete -f busybox-rc.yaml -kubectl delete -f pvc.yaml -kubectl delete -f pv.yaml -kubectl delete -f nfs.yaml -kubectl delete -f nfs-xfs.yaml -kubectl delete -f nfs-ceph.yaml -kubectl delete -f rbac.yaml -kubectl delete -f psp.yaml -kubectl delete -f scc.yaml # if deployed -kubectl delete -f operator.yaml -kubectl delete -f webhook.yaml # if deployed -kubectl delete -f crds.yaml diff --git a/rooknfs/Chart.yaml b/rooknfs/Chart.yaml new file mode 100644 index 0000000..83a2a11 --- /dev/null +++ b/rooknfs/Chart.yaml @@ -0,0 +1,4 @@ +apiVersion: v2 +name: rooknfs +version: 0.0.1 +description: An packaged installation of Rook NFS for Kubernetes. 
\ No newline at end of file diff --git a/rooknfs/README.md b/rooknfs/README.md new file mode 100644 index 0000000..e69de29 diff --git a/nfs/crds.yaml b/rooknfs/crds/crds.yaml similarity index 100% rename from nfs/crds.yaml rename to rooknfs/crds/crds.yaml diff --git a/nfs/nfs.yaml b/rooknfs/templates/nfs.yaml similarity index 61% rename from nfs/nfs.yaml rename to rooknfs/templates/nfs.yaml index 742fa34..6fde553 100644 --- a/nfs/nfs.yaml +++ b/rooknfs/templates/nfs.yaml @@ -1,32 +1,36 @@ +{{- if .Values.enabled }} --- # A default storageclass must be present apiVersion: v1 kind: PersistentVolumeClaim metadata: - name: nfs-default-claim - namespace: rook-nfs + name: {{ .Values.claimName}} + namespace: {{ .Values.serverNamespace }} spec: accessModes: - ReadWriteMany resources: requests: - storage: 1Gi + storage: {{ .Values.storageCapacity }} --- apiVersion: nfs.rook.io/v1alpha1 kind: NFSServer metadata: - name: rook-nfs - namespace: rook-nfs + name: {{ .Values.serverName }} + namespace: {{ .Values.serverNamespace }} spec: replicas: 1 exports: - - name: share1 + - name: {{ .Values.shareName }} server: accessMode: ReadWrite squash: "none" # A Persistent Volume Claim must be created before creating NFS CRD instance. 
persistentVolumeClaim: - claimName: nfs-default-claim + claimName: {{ .Values.claimName }} # A key/value list of annotations annotations: rook: nfs +--- +{{- end }} + diff --git a/nfs/operator.yaml b/rooknfs/templates/operator.yaml similarity index 91% rename from nfs/operator.yaml rename to rooknfs/templates/operator.yaml index b289909..4a1d542 100644 --- a/nfs/operator.yaml +++ b/rooknfs/templates/operator.yaml @@ -1,13 +1,15 @@ +{{- if .Values.enabled }} +--- apiVersion: v1 kind: Namespace metadata: - name: rook-nfs-system # namespace:operator + name: {{ .Values.systemNamespace }} --- apiVersion: v1 kind: ServiceAccount metadata: name: rook-nfs-operator - namespace: rook-nfs-system # namespace:operator + namespace: {{ .Values.systemNamespace }} --- kind: ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1 @@ -20,7 +22,7 @@ roleRef: subjects: - kind: ServiceAccount name: rook-nfs-operator - namespace: rook-nfs-system # namespace:operator + namespace: {{ .Values.systemNamespace }} --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole @@ -106,7 +108,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: rook-nfs-operator - namespace: rook-nfs-system # namespace:operator + namespace: {{ .Values.systemNamespace }} labels: app: rook-nfs-operator spec: @@ -134,3 +136,5 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace +--- +{{- end}} diff --git a/nfs/rbac.yaml b/rooknfs/templates/rbac.yaml similarity index 88% rename from nfs/rbac.yaml rename to rooknfs/templates/rbac.yaml index 8e3d9f7..b327740 100644 --- a/nfs/rbac.yaml +++ b/rooknfs/templates/rbac.yaml @@ -1,14 +1,15 @@ +{{- if .Values.enabled }} --- apiVersion: v1 kind: Namespace metadata: - name: rook-nfs + name: {{ .Values.serverNamespace }} --- apiVersion: v1 kind: ServiceAccount metadata: name: rook-nfs-server - namespace: rook-nfs + namespace: {{ .Values.serverNamespace }} --- kind: ClusterRole apiVersion: rbac.authorization.k8s.io/v1 @@ -51,9 +52,10 @@ metadata: subjects: - kind: 
ServiceAccount name: rook-nfs-server - # replace with namespace where provisioner is deployed - namespace: rook-nfs + namespace: {{ .Values.serverNamespace }} roleRef: kind: ClusterRole name: rook-nfs-provisioner-runner apiGroup: rbac.authorization.k8s.io +--- +{{- end }} \ No newline at end of file diff --git a/rooknfs/templates/sc.yaml b/rooknfs/templates/sc.yaml new file mode 100644 index 0000000..0ad75fe --- /dev/null +++ b/rooknfs/templates/sc.yaml @@ -0,0 +1,17 @@ +{{- if .Values.enabled }} +--- +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + labels: + app: rook-nfs + name: {{ .Values.storageClassName }} +parameters: + exportName: {{ .Values.shareName }} + nfsServerName: {{ .Values.serverName }} + nfsServerNamespace: {{ .Values.serverNamespace }} +provisioner: nfs.rook.io/rook-nfs-provisioner +reclaimPolicy: Delete +volumeBindingMode: Immediate +--- +{{- end }} \ No newline at end of file diff --git a/rooknfs/values.yaml b/rooknfs/values.yaml new file mode 100644 index 0000000..1961fa6 --- /dev/null +++ b/rooknfs/values.yaml @@ -0,0 +1,30 @@ +# Global flag for enabling/disabling all chart resources +# This is useful for allowing charts which use this chart +# as a dependency to toggle usage of this chart based on +# values in the parent chart +enabled: true + +# Name for the NFSServer resource created by rook +serverName: rook-nfs + +# Name for the created storage class +storageClassName: rook-nfs + +# Name for the Read-Write-Once backing PVC created by Rook +claimName: rook-nfs-backing-pv + +# Name for the NFS share within the NFS Resource instance +shareName: share-1 + +# Size of the Read-Write-Once backing storage volume +storageCapacity: 10Gi + +# Image to use for the Rook NFS operator +operatorImage: rook/nfs:master + +# NOTE: For some reason deploying everything in the default +# namespace leads to R-W-M PVCs getting stuck in 'pending' +# state indefinitely, so here we separate out namespaces as +# of various components in the same way 
as the Rook docs +serverNamespace: rook-nfs +systemNamespace: rook-nfs-system \ No newline at end of file diff --git a/slurm-cluster-chart/Chart.yaml b/slurm-cluster-chart/Chart.yaml index 9e592c0..4dad59b 100644 --- a/slurm-cluster-chart/Chart.yaml +++ b/slurm-cluster-chart/Chart.yaml @@ -21,4 +21,9 @@ version: 0.1.0 # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. -appVersion: "1.16.0" \ No newline at end of file +appVersion: "1.16.0" + +dependencies: + - name: rooknfs + version: 0.0.1 + repository: file://../rooknfs \ No newline at end of file diff --git a/slurm-cluster-chart/templates/hooks/pre-delete.yaml b/slurm-cluster-chart/templates/hooks/pre-delete.yaml new file mode 100644 index 0000000..8cdb1f3 --- /dev/null +++ b/slurm-cluster-chart/templates/hooks/pre-delete.yaml @@ -0,0 +1,55 @@ +{{- if .Values.rooknfs.enabled }} +# NOTE: The cleanup jobs defined here are required to ensure that things which +# Rook NFS is responsible for cleaning up are deleted before deleting the Rook +# pods which do the actual clean up of NFS resources. For example, the RWM PVC +# must be deleted before the Rook StorageClass and provisioner pod. However, +# the PVC cannot be deleted until the pods which are using it are deleted, so +# the various Slurm node pods must actually be the first resources deleted. 
+--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: rook-nfs-cleanup +--- +# TODO: Create a job-specific ClusterRole for the ServiceAccount +# instead of using the cluster-admin role here +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: rook-nfs-cleanup +subjects: +- kind: ServiceAccount + name: rook-nfs-cleanup + namespace: {{ .Release.Namespace }} +roleRef: + kind: ClusterRole + name: cluster-admin +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: rook-nfs-pre-delete-cleanup + annotations: + "helm.sh/hook": pre-delete + "helm.sh/hook-delete-policy": hook-succeeded +spec: + template: + metadata: + name: rook-nfs-pre-delete-cleanup + spec: + serviceAccountName: rook-nfs-cleanup + containers: + - name: tester + image: bitnami/kubectl + command: + - "bin/bash" + - "-c" + - | + kubectl delete -n {{ .Release.Namespace }} deployment {{ .Values.login.name }} --wait --cascade=foreground + kubectl delete -n {{ .Release.Namespace }} statefulset {{ .Values.slurmctld.name }} --wait --cascade=foreground + kubectl delete -n {{ .Release.Namespace }} statefulset {{ .Values.slurmd.name }} --wait --cascade=foreground + kubectl delete -n {{ .Release.Namespace }} pvc {{ .Values.storage.claimName }} --wait + kubectl delete -n {{ .Values.rooknfs.serverNamespace }} nfsservers {{ .Values.rooknfs.serverName }} --wait + restartPolicy: Never +--- +{{- end }} diff --git a/slurm-cluster-chart/templates/login-deployment.yaml b/slurm-cluster-chart/templates/login.yaml similarity index 90% rename from slurm-cluster-chart/templates/login-deployment.yaml rename to slurm-cluster-chart/templates/login.yaml index 48f8f17..ca63392 100644 --- a/slurm-cluster-chart/templates/login-deployment.yaml +++ b/slurm-cluster-chart/templates/login.yaml @@ -5,9 +5,9 @@ metadata: labels: app.kubernetes.io/name: slurm app.kubernetes.io/component: login - name: login + name: {{ .Values.login.name }} spec: - replicas: {{ .Values.replicas.login }} + replicas: {{ 
.Values.login.replicas }} selector: matchLabels: app.kubernetes.io/name: slurm @@ -29,7 +29,7 @@ spec: ports: - containerPort: 22 volumeMounts: - - mountPath: {{ .Values.nfs.mountPath }} + - mountPath: {{ .Values.storage.mountPath }} name: slurm-jobdir - mountPath: /etc/slurm/ name: slurm-config-volume @@ -51,7 +51,7 @@ spec: volumes: - name: slurm-jobdir persistentVolumeClaim: - claimName: {{ .Values.nfs.claimName }} + claimName: {{ .Values.storage.claimName }} - name: slurm-config-volume configMap: name: {{ .Values.configmaps.slurmConf }} diff --git a/slurm-cluster-chart/templates/pvc.yaml b/slurm-cluster-chart/templates/pvc.yaml new file mode 100644 index 0000000..c5d5955 --- /dev/null +++ b/slurm-cluster-chart/templates/pvc.yaml @@ -0,0 +1,14 @@ +{{- if .Values.rooknfs.enabled }} +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ .Values.storage.claimName }} +spec: + storageClassName: {{ .Values.storageClassName }} + accessModes: + - ReadWriteMany + resources: + requests: + storage: {{ .Values.storage.capacity }} +{{- end }} \ No newline at end of file diff --git a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml b/slurm-cluster-chart/templates/slurmctld.yaml similarity index 91% rename from slurm-cluster-chart/templates/slurmctld-statefulset.yaml rename to slurm-cluster-chart/templates/slurmctld.yaml index dc0bf90..f919c5f 100644 --- a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml +++ b/slurm-cluster-chart/templates/slurmctld.yaml @@ -5,7 +5,7 @@ metadata: labels: app.kubernetes.io/name: slurm app.kubernetes.io/component: slurmctld - name: slurmctld + name: {{ .Values.slurmctld.name }} spec: replicas: 1 selector: @@ -29,7 +29,7 @@ spec: - containerPort: 6817 resources: {} volumeMounts: - - mountPath: {{ .Values.nfs.mountPath }} + - mountPath: {{ .Values.storage.mountPath }} name: slurm-jobdir - mountPath: /etc/slurm/ name: slurm-config-volume @@ -45,7 +45,7 @@ spec: volumes: - name: slurm-jobdir persistentVolumeClaim: - 
claimName: {{ .Values.nfs.claimName }} + claimName: {{ .Values.storage.claimName }} - name: slurmctld-state persistentVolumeClaim: claimName: var-spool-slurmctld diff --git a/slurm-cluster-chart/templates/slurmd-deployment.yaml b/slurm-cluster-chart/templates/slurmd.yaml similarity index 88% rename from slurm-cluster-chart/templates/slurmd-deployment.yaml rename to slurm-cluster-chart/templates/slurmd.yaml index 4c2396e..4775748 100644 --- a/slurm-cluster-chart/templates/slurmd-deployment.yaml +++ b/slurm-cluster-chart/templates/slurmd.yaml @@ -5,9 +5,9 @@ metadata: labels: app.kubernetes.io/name: slurm app.kubernetes.io/component: slurmd - name: slurmd + name: {{ .Values.slurmd.name }} spec: - replicas: {{ .Values.replicas.slurmd }} + replicas: {{ .Values.slurmd.replicas }} selector: matchLabels: app.kubernetes.io/name: slurm @@ -41,7 +41,8 @@ spec: volumeMounts: - mountPath: /etc/slurm/ name: slurm-config-volume - - mountPath: {{ .Values.nfs.mountPath }} + subPath: slurm.conf + - mountPath: {{ .Values.storage.mountPath }} name: slurm-jobdir - mountPath: /tmp/munge.key name: munge-key-secret @@ -55,7 +56,7 @@ spec: volumes: - name: slurm-jobdir persistentVolumeClaim: - claimName: {{ .Values.nfs.claimName }} + claimName: {{ .Values.storage.claimName }} - name: slurm-config-volume configMap: name: {{ .Values.configmaps.slurmConf }} diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 7873e5c..eb9501c 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,12 +1,52 @@ slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:7f4d64e -replicas: - slurmd: 2 - login: 1 +login: + # Deployment resource name + name: login + replicas: 1 -nfs: +slurmd: + # StatefulSet resource name + name: slurmd + replicas: 2 + +slurmctld: + # StatefulSet resource name + name: slurmctld + # NOTE: We don't include a replicas field here because + # replicas > 1 for slurmctld needs extra Slurm config + +storage: mountPath: /home - 
claimName: rook-nfs-pv-claim + # The name of a Read-Write-Many StorageClass to use for + # the persistent volume which is shared across Slurm nodes + # Note: If using the default value then you must set + # rooknfs.enabled = true below to ensure that Rook NFS is + # installed on the cluster as a dependency of this Slurm + # chart. If you are using a separate RWM StorageClass, then + # set rooknfs.enabled = false + storageClassName: &storageclassname slurm-rook-nfs + # Name for the R-W-M volume to provision + claimName: slurm-shared-storage + # Capacite of the R-W-M volume + capacity: &capacity 10Gi + + +# Values to be passed to the rook-nfs sub-chart +# See rook-nfs sub-chart for full set of available config values +rooknfs: + enabled: true + storageClassName: *storageclassname + # Name for the NFSServer resource created by Rook + serverName: rook-nfs + # Capacity for the backing Read-Write-*Once* volume + # than Rook will create to provide the actual storage to + # the NFS server. Since we're using the Rook NFS in a + # slightly unconventional way here, we just want to anchor + # this value to the requested storage capacity for the RWM + # volume specified in storage.capacity + storageCapacity: *capacity + sqlImage: mariadb:10.10 From a4727da91d175bf1a6a45264104a6b1045a8940f Mon Sep 17 00:00:00 2001 From: Will Date: Fri, 11 Aug 2023 13:51:58 +0100 Subject: [PATCH 103/152] Removed quotes --- slurm-cluster-chart/templates/login-service.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/slurm-cluster-chart/templates/login-service.yaml b/slurm-cluster-chart/templates/login-service.yaml index f5f8aa3..df8892d 100644 --- a/slurm-cluster-chart/templates/login-service.yaml +++ b/slurm-cluster-chart/templates/login-service.yaml @@ -11,11 +11,11 @@ spec: - name: ssh port: 22 targetPort: 22 - - name: "apache" + - name: apache port: 80 targetPort: 80 protocol: TCP - - name: "https" + - name: https port: 443 targetPort: 443 protocol: TCP From 
62c6f3431740bb80d744666bc627ab0f5d738c43 Mon Sep 17 00:00:00 2001 From: Will Date: Fri, 11 Aug 2023 13:56:50 +0100 Subject: [PATCH 104/152] Testing without env file for shell --- image/docker-entrypoint.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/image/docker-entrypoint.sh b/image/docker-entrypoint.sh index 14b511c..c0f854d 100755 --- a/image/docker-entrypoint.sh +++ b/image/docker-entrypoint.sh @@ -125,8 +125,8 @@ then echo "---> Starting Apache Server" - mkdir --parents /etc/ood/config/apps/shell - env > /etc/ood/config/apps/shell/env + # mkdir --parents /etc/ood/config/apps/shell + # env > /etc/ood/config/apps/shell/env /usr/libexec/httpd-ssl-gencerts /opt/ood/ood-portal-generator/sbin/update_ood_portal From 4d90e24398aa3c8ab53d1c46e7f3eb83c0c30f8e Mon Sep 17 00:00:00 2001 From: Will Date: Fri, 11 Aug 2023 14:16:31 +0100 Subject: [PATCH 105/152] Moved rocky ssh generation to make purpose clearer --- image/docker-entrypoint.sh | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/image/docker-entrypoint.sh b/image/docker-entrypoint.sh index c0f854d..55bc66d 100755 --- a/image/docker-entrypoint.sh +++ b/image/docker-entrypoint.sh @@ -91,12 +91,6 @@ then mkdir -p /home/rocky/.ssh cp /tmp/authorized_keys /home/rocky/.ssh/authorized_keys - if [ -f /home/rocky/.ssh/id_rsa.pub ]; then - echo "ssh keys already found" - else - ssh-keygen -t rsa -f /home/rocky/.ssh/id_rsa -N "" - fi - echo "---> Setting permissions for user home directories" pushd /home > /dev/null for DIR in * @@ -119,6 +113,13 @@ then start_munge echo "---> Setting up self ssh capabilities for OOD" + + if [ -f /home/rocky/.ssh/id_rsa.pub ]; then + echo "ssh keys already found" + else + ssh-keygen -t rsa -f /home/rocky/.ssh/id_rsa -N "" + fi + ssh-keyscan localhost > /etc/ssh/ssh_known_hosts echo "" >> /home/rocky/.ssh/authorized_keys #Adding newline to avoid breaking authorized_keys file cat /home/rocky/.ssh/id_rsa.pub >> 
/home/rocky/.ssh/authorized_keys From 1a4a3e44ba23f0c2a1b817edbb2e17fff7476f74 Mon Sep 17 00:00:00 2001 From: Will Date: Fri, 11 Aug 2023 14:19:53 +0100 Subject: [PATCH 106/152] Updated tag --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 6a394cb..0ca35c9 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:f52e918 +slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:4d90e24 replicas: slurmd: 2 From edfdd7c1fe8e14e889f7632249c16b3bb580dcf3 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Fri, 11 Aug 2023 14:26:24 +0100 Subject: [PATCH 107/152] Fix storageClassName templating typo --- slurm-cluster-chart/templates/pvc.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/templates/pvc.yaml b/slurm-cluster-chart/templates/pvc.yaml index c5d5955..5e934ef 100644 --- a/slurm-cluster-chart/templates/pvc.yaml +++ b/slurm-cluster-chart/templates/pvc.yaml @@ -5,7 +5,7 @@ kind: PersistentVolumeClaim metadata: name: {{ .Values.storage.claimName }} spec: - storageClassName: {{ .Values.storageClassName }} + storageClassName: {{ .Values.storage.storageClassName }} accessModes: - ReadWriteMany resources: From 4407fbe486a3b78bda85f93ac39fc9adda94d0f6 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Fri, 11 Aug 2023 14:55:20 +0100 Subject: [PATCH 108/152] Remove broken subPath spec --- slurm-cluster-chart/templates/slurmd.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/slurm-cluster-chart/templates/slurmd.yaml b/slurm-cluster-chart/templates/slurmd.yaml index 4775748..ff13019 100644 --- a/slurm-cluster-chart/templates/slurmd.yaml +++ b/slurm-cluster-chart/templates/slurmd.yaml @@ -41,7 +41,6 @@ spec: volumeMounts: - mountPath: /etc/slurm/ name: slurm-config-volume - subPath: slurm.conf - mountPath: {{ 
.Values.storage.mountPath }} name: slurm-jobdir - mountPath: /tmp/munge.key From f9d4f9a95ea28c999cc80076f949a712735f8b45 Mon Sep 17 00:00:00 2001 From: Will Date: Mon, 14 Aug 2023 13:53:22 +0100 Subject: [PATCH 109/152] Changed OOD key names --- .gitignore | 3 +++ slurm-cluster-chart/templates/login-deployment.yaml | 2 +- slurm-cluster-chart/values.yaml | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0ba5327 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +# Build artifacts from local helm install +slurm-cluster-chart/Chart.lock +slurm-cluster-chart/charts/ diff --git a/slurm-cluster-chart/templates/login-deployment.yaml b/slurm-cluster-chart/templates/login-deployment.yaml index 0984560..64a6469 100644 --- a/slurm-cluster-chart/templates/login-deployment.yaml +++ b/slurm-cluster-chart/templates/login-deployment.yaml @@ -28,7 +28,7 @@ spec: name: login env: - name: ROCKY_OOD_PASS - value: {{ .Values.openOndemand.password }} + value: {{ .Values.openOnDemand.password }} ports: - containerPort: 22 - containerPort: 80 diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index d7fc033..c555b98 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -25,5 +25,5 @@ secrets: mungeKey: munge-key-secret #OOD username is rocky -openOndemand: +openOnDemand: password: password From 2ac2fd5aae4a3cd7fb824662e87fdf9b6071c384 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Mon, 14 Aug 2023 13:57:03 +0100 Subject: [PATCH 110/152] Working Helm chart publisher workflow (#25) * Added custom packaging to workflow * Trying adding charts to cr packages * Now publishes rook chart * Temporarily removed slurm chart from publisher to publish initial rook chart to repo * Trying with new workflow and temporarily removing dependency * Re-added rook dependency * Added upterm 
debugging * Changed rooknfs version * Removed debug --- .github/workflows/publish-helm-chart.yml | 47 +++++++------------ rooknfs/values.yaml | 2 +- slurm-cluster-chart/Chart.yaml | 4 +- .../{ => hooks}/check-jobs-finished-hook.yaml | 0 4 files changed, 21 insertions(+), 32 deletions(-) rename slurm-cluster-chart/templates/{ => hooks}/check-jobs-finished-hook.yaml (100%) diff --git a/.github/workflows/publish-helm-chart.yml b/.github/workflows/publish-helm-chart.yml index 8ce0698..516e388 100644 --- a/.github/workflows/publish-helm-chart.yml +++ b/.github/workflows/publish-helm-chart.yml @@ -1,37 +1,26 @@ -name: Release Charts - -on: - push: - branches: - - main - +name: Publish charts +# Run the tasks on every push +on: push jobs: - release: - # depending on default permission settings for your org (contents being read-only or read-write for workloads), you will have to add permissions - # see: https://docs.github.com/en/actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token - permissions: - contents: write + publish_charts: + name: Build and push Helm charts runs-on: ubuntu-latest steps: - - name: Checkout - uses: actions/checkout@v3 + - name: Check out the repository + uses: actions/checkout@v2 with: + # This is important for the semver action to work correctly + # when determining the number of commits since the last tag fetch-depth: 0 + submodules: true - - name: Configure Git - run: | - git config user.name "$GITHUB_ACTOR" - git config user.email "$GITHUB_ACTOR@users.noreply.github.com" - - - name: Install Helm - uses: azure/setup-helm@v3 - env: - GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}" + - name: Get SemVer version for current commit + id: semver + uses: stackhpc/github-actions/semver@master - - name: Run chart-releaser - uses: helm/chart-releaser-action@v1.5.0 + - name: Publish Helm charts + uses: stackhpc/github-actions/helm-publish@master with: - charts_dir: . 
- env: - CR_TOKEN: "${{ secrets.GITHUB_TOKEN }}" - + token: ${{ secrets.GITHUB_TOKEN }} + version: ${{ steps.semver.outputs.version }} + app-version: ${{ steps.semver.outputs.short-sha }} diff --git a/rooknfs/values.yaml b/rooknfs/values.yaml index 1961fa6..00a3e7f 100644 --- a/rooknfs/values.yaml +++ b/rooknfs/values.yaml @@ -27,4 +27,4 @@ operatorImage: rook/nfs:master # state indefinitely, so here we separate out namespaces as # of various components in the same way as the Rook docs serverNamespace: rook-nfs -systemNamespace: rook-nfs-system \ No newline at end of file +systemNamespace: rook-nfs-system diff --git a/slurm-cluster-chart/Chart.yaml b/slurm-cluster-chart/Chart.yaml index 4dad59b..0177e24 100644 --- a/slurm-cluster-chart/Chart.yaml +++ b/slurm-cluster-chart/Chart.yaml @@ -25,5 +25,5 @@ appVersion: "1.16.0" dependencies: - name: rooknfs - version: 0.0.1 - repository: file://../rooknfs \ No newline at end of file + version: ">=0-0" + repository: file://../rooknfs diff --git a/slurm-cluster-chart/templates/check-jobs-finished-hook.yaml b/slurm-cluster-chart/templates/hooks/check-jobs-finished-hook.yaml similarity index 100% rename from slurm-cluster-chart/templates/check-jobs-finished-hook.yaml rename to slurm-cluster-chart/templates/hooks/check-jobs-finished-hook.yaml From f25fe6ec0df1c4df5ad79c744570a7d7a28fb447 Mon Sep 17 00:00:00 2001 From: Will Date: Mon, 14 Aug 2023 14:29:19 +0100 Subject: [PATCH 111/152] Removed resource policies --- slurm-cluster-chart/templates/database-auth-secret.yaml | 1 - slurm-cluster-chart/templates/munge-key-secret.yaml | 1 - 2 files changed, 2 deletions(-) diff --git a/slurm-cluster-chart/templates/database-auth-secret.yaml b/slurm-cluster-chart/templates/database-auth-secret.yaml index 6133576..1a1d6ea 100644 --- a/slurm-cluster-chart/templates/database-auth-secret.yaml +++ b/slurm-cluster-chart/templates/database-auth-secret.yaml @@ -4,6 +4,5 @@ metadata: name: database-auth-secret annotations: helm.sh/hook: 
pre-install - helm.sh/resource-policy: keep data: password: {{ randAlphaNum 32 | b64enc }} diff --git a/slurm-cluster-chart/templates/munge-key-secret.yaml b/slurm-cluster-chart/templates/munge-key-secret.yaml index 65825d6..df97e19 100644 --- a/slurm-cluster-chart/templates/munge-key-secret.yaml +++ b/slurm-cluster-chart/templates/munge-key-secret.yaml @@ -4,6 +4,5 @@ metadata: name: munge-key-secret annotations: helm.sh/hook: pre-install - helm.sh/resource-policy: keep data: munge.key: {{ randAscii 128 | b64enc }} From af39470ad767c002050cc6be9dce364f0da7eb2f Mon Sep 17 00:00:00 2001 From: Scott Davidson <49713135+sd109@users.noreply.github.com> Date: Mon, 14 Aug 2023 14:31:04 +0100 Subject: [PATCH 112/152] Fix typo Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- rooknfs/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rooknfs/Chart.yaml b/rooknfs/Chart.yaml index 83a2a11..b8abd25 100644 --- a/rooknfs/Chart.yaml +++ b/rooknfs/Chart.yaml @@ -1,4 +1,4 @@ apiVersion: v2 name: rooknfs version: 0.0.1 -description: An packaged installation of Rook NFS for Kubernetes. \ No newline at end of file +description: A packaged installation of Rook NFS for Kubernetes. \ No newline at end of file From 336f95f01c26924faf2c51c8864f1b656df10dcc Mon Sep 17 00:00:00 2001 From: Scott Davidson <49713135+sd109@users.noreply.github.com> Date: Mon, 14 Aug 2023 14:31:52 +0100 Subject: [PATCH 113/152] Remove yaml anchor Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index eb9501c..e8e6e09 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -25,7 +25,7 @@ storage: # installed on the cluster as a dependency of this Slurm # chart. 
If you are using a separate RWM StorageClass, then # set rooknfs.enabled = false - storageClassName: &storageclassname slurm-rook-nfs + storageClassName: slurm-rook-nfs # Name for the R-W-M volume to provision claimName: slurm-shared-storage # Capacite of the R-W-M volume From 5f121966277344c7fe0834c4895cf2ac4f50c9d3 Mon Sep 17 00:00:00 2001 From: Scott Davidson <49713135+sd109@users.noreply.github.com> Date: Mon, 14 Aug 2023 14:32:29 +0100 Subject: [PATCH 114/152] Remove anchor ref and add explanatory comment Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index e8e6e09..98fe170 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -36,7 +36,7 @@ storage: # See rook-nfs sub-chart for full set of available config values rooknfs: enabled: true - storageClassName: *storageclassname + storageClassName: slurm-rook-nfs # NB this must match storage.storageClassName when using rook # Name for the NFSServer resource created by Rook serverName: rook-nfs # Capacity for the backing Read-Write-*Once* volume From 350d39b4b9a6fe56a7daa0c217156e026f4a16cd Mon Sep 17 00:00:00 2001 From: Scott Davidson <49713135+sd109@users.noreply.github.com> Date: Mon, 14 Aug 2023 14:33:06 +0100 Subject: [PATCH 115/152] Add yaml anchor explanation Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 98fe170..2a9eaf8 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -29,7 +29,7 @@ storage: # Name for the R-W-M volume to provision claimName: slurm-shared-storage # Capacite of the R-W-M volume - capacity: &capacity 10Gi + capacity: &capacity 10Gi # NB yaml 
anchor used so this value is also set for `rooknfs.storageCapacity` if necessary. # Values to be passed to the rook-nfs sub-chart From 58a89d4b27e7cabf5d5203ab9b0d3294a08c1b15 Mon Sep 17 00:00:00 2001 From: Scott Davidson <49713135+sd109@users.noreply.github.com> Date: Mon, 14 Aug 2023 14:33:36 +0100 Subject: [PATCH 116/152] Add comment about name constraints Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 2a9eaf8..b89ca85 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -7,7 +7,7 @@ login: slurmd: # StatefulSet resource name - name: slurmd + name: slurmd # NB this must match NodeName= in slurm-cluster-chart/files/slurm.conf replicas: 2 slurmctld: From 474450b7e68b0272a53e65bae9cb75ff8b30bb64 Mon Sep 17 00:00:00 2001 From: Will Date: Mon, 14 Aug 2023 14:53:49 +0100 Subject: [PATCH 117/152] Refactored and documented values.yaml --- .github/workflows/publish-helm-chart.yml | 46 ++++++------------- .../templates/mysql-deployment.yaml | 2 +- .../var-lib-mysql-persistentvolumeclaim.yaml | 2 +- slurm-cluster-chart/values.yaml | 19 +++++--- 4 files changed, 28 insertions(+), 41 deletions(-) diff --git a/.github/workflows/publish-helm-chart.yml b/.github/workflows/publish-helm-chart.yml index 8a6f4f7..99e4c45 100644 --- a/.github/workflows/publish-helm-chart.yml +++ b/.github/workflows/publish-helm-chart.yml @@ -1,18 +1,16 @@ -name: Release Charts - +name: Publish charts +# Run the tasks on every push on: push - jobs: - release: - # depending on default permission settings for your org (contents being read-only or read-write for workloads), you will have to add permissions - # see: https://docs.github.com/en/actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token - permissions: - contents: write + 
publish_charts: + name: Build and push Helm charts runs-on: ubuntu-latest steps: - - name: Checkout - uses: actions/checkout@v3 + - name: Check out the repository + uses: actions/checkout@v2 with: + # This is important for the semver action to work correctly + # when determining the number of commits since the last tag fetch-depth: 0 submodules: true @@ -20,26 +18,10 @@ jobs: id: semver uses: stackhpc/github-actions/semver@master - - name: Configure Git - run: | - git config user.name "$GITHUB_ACTOR" - git config user.email "$GITHUB_ACTOR@users.noreply.github.com" - - - name: Install Helm - uses: azure/setup-helm@v3 - env: - GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}" - - - name: "Package Chart" - run: | - mkdir -p .cr-release-packages - helm package slurm-cluster-chart --version ${{ steps.semver.outputs.version }} --destination .cr-release-packages - - - name: Run chart-releaser - uses: helm/chart-releaser-action@v1.5.0 + - name: Publish Helm charts + uses: stackhpc/github-actions/helm-publish@master with: - charts_dir: . 
- skip_packaging: True - env: - CR_TOKEN: "${{ secrets.GITHUB_TOKEN }}" - + token: ${{ secrets.GITHUB_TOKEN }} + version: ${{ steps.semver.outputs.version }} + app-version: ${{ steps.semver.outputs.short-sha }} + \ No newline at end of file diff --git a/slurm-cluster-chart/templates/mysql-deployment.yaml b/slurm-cluster-chart/templates/mysql-deployment.yaml index 8ffd49e..debf962 100644 --- a/slurm-cluster-chart/templates/mysql-deployment.yaml +++ b/slurm-cluster-chart/templates/mysql-deployment.yaml @@ -34,7 +34,7 @@ spec: value: "yes" - name: MYSQL_USER value: "slurm" - image: {{ .Values.sqlImage }} + image: {{ .Values.mySQL.image }} name: mysql ports: - containerPort: 3306 diff --git a/slurm-cluster-chart/templates/var-lib-mysql-persistentvolumeclaim.yaml b/slurm-cluster-chart/templates/var-lib-mysql-persistentvolumeclaim.yaml index 841bb0f..56fc7dd 100644 --- a/slurm-cluster-chart/templates/var-lib-mysql-persistentvolumeclaim.yaml +++ b/slurm-cluster-chart/templates/var-lib-mysql-persistentvolumeclaim.yaml @@ -11,4 +11,4 @@ spec: - ReadWriteOnce resources: requests: - storage: {{ .Values.databaseStorage }} + storage: {{ .Values.mySQL.storage }} diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index c555b98..63e3531 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -8,22 +8,27 @@ nfs: mountPath: /home claimName: rook-nfs-pv-claim -sqlImage: mariadb:10.10 - -databaseStorage: 100Mi - +# Values for Slurm's database container +mySQL: + #Database image to be used + image: mariadb:10.10 + #Storage requested by the var-lib-mysql volume backing the database + storage: 100Mi + +# Configmap resource names configmaps: slurmConf: slurm-conf-configmap slurmdbdConf: slurmdbd-conf-configmap sshdConfig: sshd-config-configmap -# If let undefined, assumes you have run publish-keys.sh to publish your public key prior to deployment +# Public key used for ssh access to the login node +# If let undefined, assumes you 
have run the provided publish-keys.sh script to publish your public key prior to deployment sshPublicKey: +# Secret resource names secrets: - databaseAuth: database-auth-secret mungeKey: munge-key-secret -#OOD username is rocky openOnDemand: + #Password for default Open OnDemand user 'rocky' password: password From 908f808efd07c1c66653a53b78c8e2d1ca7d9a6a Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Mon, 14 Aug 2023 17:06:32 +0100 Subject: [PATCH 118/152] Add namespace as command line arg --- publish-keys.sh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/publish-keys.sh b/publish-keys.sh index d293e81..bdd4e0f 100755 --- a/publish-keys.sh +++ b/publish-keys.sh @@ -1,3 +1,8 @@ -kubectl create configmap authorized-keys-configmap \ +NAMESPACE="$1" +if [[ -z $1 ]]; then + NAMESPACE=default +fi +echo Installing in namespace $NAMESPACE +kubectl -n $NAMESPACE create configmap authorized-keys-configmap \ "--from-literal=authorized_keys=$(cat ~/.ssh/*.pub)" --dry-run=client -o yaml | \ -kubectl apply -f - \ No newline at end of file +kubectl -n $NAMESPACE apply -f - \ No newline at end of file From 925ad806fe072878206310db0422f34039723b91 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 15 Aug 2023 11:12:44 +0100 Subject: [PATCH 119/152] Add namespace as script arg --- generate-secrets.sh | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/generate-secrets.sh b/generate-secrets.sh index db64a53..10b7f98 100755 --- a/generate-secrets.sh +++ b/generate-secrets.sh @@ -1,13 +1,17 @@ #!/bin/bash +NAMESPACE="$1" +if [[ -z $1 ]]; then + NAMESPACE=default +fi -kubectl create secret generic database-auth-secret \ +kubectl -n $NAMESPACE create secret generic database-auth-secret \ --dry-run=client \ --from-literal=password=$(tr -dc 'A-Za-z0-9' /dev/null | base64 -w 0) \ -o yaml | \ -kubectl apply -f - \ No newline at end of file +kubectl -n $NAMESPACE apply -f - \ No newline at end of file From 
e6c5275179a62dbf7bb86d3d8bda8e60f3600d0b Mon Sep 17 00:00:00 2001 From: Will Date: Tue, 15 Aug 2023 11:35:10 +0100 Subject: [PATCH 120/152] Now gives ownership to rocky affter keygen --- image/docker-entrypoint.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/image/docker-entrypoint.sh b/image/docker-entrypoint.sh index 55bc66d..2f87d39 100755 --- a/image/docker-entrypoint.sh +++ b/image/docker-entrypoint.sh @@ -118,6 +118,7 @@ then echo "ssh keys already found" else ssh-keygen -t rsa -f /home/rocky/.ssh/id_rsa -N "" + chown rocky:rocky id_rsa id_rsa.pub fi ssh-keyscan localhost > /etc/ssh/ssh_known_hosts From f32b4f1fdfeb830569ba63446de22fec6db3ac98 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 15 Aug 2023 11:36:52 +0100 Subject: [PATCH 121/152] Fix dnsConfig namespace --- slurm-cluster-chart/templates/login.yaml | 2 +- slurm-cluster-chart/templates/slurmctld.yaml | 2 +- slurm-cluster-chart/templates/slurmd.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/slurm-cluster-chart/templates/login.yaml b/slurm-cluster-chart/templates/login.yaml index ca63392..d8a813c 100644 --- a/slurm-cluster-chart/templates/login.yaml +++ b/slurm-cluster-chart/templates/login.yaml @@ -46,7 +46,7 @@ spec: hostname: login dnsConfig: searches: - - slurmd.default.svc.cluster.local + - slurmd.{{ .Release.Namespace }}.svc.cluster.local restartPolicy: Always volumes: - name: slurm-jobdir diff --git a/slurm-cluster-chart/templates/slurmctld.yaml b/slurm-cluster-chart/templates/slurmctld.yaml index f919c5f..1644463 100644 --- a/slurm-cluster-chart/templates/slurmctld.yaml +++ b/slurm-cluster-chart/templates/slurmctld.yaml @@ -40,7 +40,7 @@ spec: name: slurmctld-state dnsConfig: searches: - - slurmd.default.svc.cluster.local + - slurmd.{{ .Release.Namespace }}.svc.cluster.local restartPolicy: Always volumes: - name: slurm-jobdir diff --git a/slurm-cluster-chart/templates/slurmd.yaml b/slurm-cluster-chart/templates/slurmd.yaml index ff13019..62646b7 100644 
--- a/slurm-cluster-chart/templates/slurmd.yaml +++ b/slurm-cluster-chart/templates/slurmd.yaml @@ -50,7 +50,7 @@ spec: privileged: true dnsConfig: searches: - - slurmd.default.svc.cluster.local + - slurmd.{{ .Release.Namespace }}.svc.cluster.local restartPolicy: Always volumes: - name: slurm-jobdir From 7c0e2d9a79b62be220da2808c20c887f6db0e3a8 Mon Sep 17 00:00:00 2001 From: Will Date: Tue, 15 Aug 2023 11:45:54 +0100 Subject: [PATCH 122/152] Fixed path --- image/docker-entrypoint.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/image/docker-entrypoint.sh b/image/docker-entrypoint.sh index 2f87d39..f2b0bbc 100755 --- a/image/docker-entrypoint.sh +++ b/image/docker-entrypoint.sh @@ -118,7 +118,7 @@ then echo "ssh keys already found" else ssh-keygen -t rsa -f /home/rocky/.ssh/id_rsa -N "" - chown rocky:rocky id_rsa id_rsa.pub + chown rocky:rocky /home/rocky/.ssh/id_rsa /home/rocky/.ssh/id_rsa.pub fi ssh-keyscan localhost > /etc/ssh/ssh_known_hosts From 171010d0a523a51d3efd6bbfee4672bbfd4a917e Mon Sep 17 00:00:00 2001 From: Will Date: Tue, 15 Aug 2023 11:51:19 +0100 Subject: [PATCH 123/152] Updated values.yaml --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 0ca35c9..56a5e38 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:4d90e24 +slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:7c0e2d9 replicas: slurmd: 2 From a33790b2a35cf4b94aebd4bcaa977e69269d2d89 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 15 Aug 2023 12:06:50 +0100 Subject: [PATCH 124/152] Use builtin Helm optional dependency feature --- rooknfs/templates/nfs.yaml | 3 --- rooknfs/templates/operator.yaml | 2 -- rooknfs/templates/rbac.yaml | 4 +--- rooknfs/templates/sc.yaml | 4 +--- rooknfs/values.yaml | 5 ----- slurm-cluster-chart/Chart.yaml | 1 + 
slurm-cluster-chart/templates/pvc.yaml | 4 +--- 7 files changed, 4 insertions(+), 19 deletions(-) diff --git a/rooknfs/templates/nfs.yaml b/rooknfs/templates/nfs.yaml index 6fde553..1da86bc 100644 --- a/rooknfs/templates/nfs.yaml +++ b/rooknfs/templates/nfs.yaml @@ -1,4 +1,3 @@ -{{- if .Values.enabled }} --- # A default storageclass must be present apiVersion: v1 @@ -32,5 +31,3 @@ spec: annotations: rook: nfs --- -{{- end }} - diff --git a/rooknfs/templates/operator.yaml b/rooknfs/templates/operator.yaml index 4a1d542..56318f6 100644 --- a/rooknfs/templates/operator.yaml +++ b/rooknfs/templates/operator.yaml @@ -1,4 +1,3 @@ -{{- if .Values.enabled }} --- apiVersion: v1 kind: Namespace @@ -137,4 +136,3 @@ spec: fieldRef: fieldPath: metadata.namespace --- -{{- end}} diff --git a/rooknfs/templates/rbac.yaml b/rooknfs/templates/rbac.yaml index b327740..422a43b 100644 --- a/rooknfs/templates/rbac.yaml +++ b/rooknfs/templates/rbac.yaml @@ -1,4 +1,3 @@ -{{- if .Values.enabled }} --- apiVersion: v1 kind: Namespace @@ -57,5 +56,4 @@ roleRef: kind: ClusterRole name: rook-nfs-provisioner-runner apiGroup: rbac.authorization.k8s.io ---- -{{- end }} \ No newline at end of file +--- \ No newline at end of file diff --git a/rooknfs/templates/sc.yaml b/rooknfs/templates/sc.yaml index 0ad75fe..505bd44 100644 --- a/rooknfs/templates/sc.yaml +++ b/rooknfs/templates/sc.yaml @@ -1,4 +1,3 @@ -{{- if .Values.enabled }} --- apiVersion: storage.k8s.io/v1 kind: StorageClass @@ -13,5 +12,4 @@ parameters: provisioner: nfs.rook.io/rook-nfs-provisioner reclaimPolicy: Delete volumeBindingMode: Immediate ---- -{{- end }} \ No newline at end of file +--- \ No newline at end of file diff --git a/rooknfs/values.yaml b/rooknfs/values.yaml index 00a3e7f..4150967 100644 --- a/rooknfs/values.yaml +++ b/rooknfs/values.yaml @@ -1,8 +1,3 @@ -# Global flag for enabling/disabling all chart resources -# This is useful for allowing charts which use this chart -# as a dependency to toggle usage of this chart 
based on -# values in the parent chart -enabled: true # Name for the NFSServer resource created by rook serverName: rook-nfs diff --git a/slurm-cluster-chart/Chart.yaml b/slurm-cluster-chart/Chart.yaml index 0177e24..e3d003c 100644 --- a/slurm-cluster-chart/Chart.yaml +++ b/slurm-cluster-chart/Chart.yaml @@ -27,3 +27,4 @@ dependencies: - name: rooknfs version: ">=0-0" repository: file://../rooknfs + condition: rooknfs.enabled diff --git a/slurm-cluster-chart/templates/pvc.yaml b/slurm-cluster-chart/templates/pvc.yaml index 5e934ef..aab0856 100644 --- a/slurm-cluster-chart/templates/pvc.yaml +++ b/slurm-cluster-chart/templates/pvc.yaml @@ -1,4 +1,3 @@ -{{- if .Values.rooknfs.enabled }} --- apiVersion: v1 kind: PersistentVolumeClaim @@ -10,5 +9,4 @@ spec: - ReadWriteMany resources: requests: - storage: {{ .Values.storage.capacity }} -{{- end }} \ No newline at end of file + storage: {{ .Values.storage.capacity }} \ No newline at end of file From f86952f405ee251f212024a58d8ec6dd75e40314 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 15 Aug 2023 12:22:21 +0100 Subject: [PATCH 125/152] Separate Rook cleanup into correct chart --- generate-secrets.sh | 4 +- rooknfs/templates/hooks/pre-delete.yaml | 50 +++++++++++++++++++ .../templates/hooks/pre-delete.yaml | 14 +++--- 3 files changed, 59 insertions(+), 9 deletions(-) create mode 100644 rooknfs/templates/hooks/pre-delete.yaml diff --git a/generate-secrets.sh b/generate-secrets.sh index 10b7f98..5956181 100755 --- a/generate-secrets.sh +++ b/generate-secrets.sh @@ -6,12 +6,12 @@ fi kubectl -n $NAMESPACE create secret generic database-auth-secret \ --dry-run=client \ ---from-literal=password=$(tr -dc 'A-Za-z0-9' /dev/null | base64 -w 0) \ +--from-literal=munge.key=$(dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64) \ -o yaml | \ kubectl -n $NAMESPACE apply -f - \ No newline at end of file diff --git a/rooknfs/templates/hooks/pre-delete.yaml b/rooknfs/templates/hooks/pre-delete.yaml new file mode 100644 
index 0000000..2c75c89 --- /dev/null +++ b/rooknfs/templates/hooks/pre-delete.yaml @@ -0,0 +1,50 @@ +# NOTE: The cleanup jobs defined here are required to ensure that things which +# Rook NFS is responsible for cleaning up are deleted before deleting the Rook +# pods which do the actual clean up of NFS resources. For example, the RWM PVC +# must be deleted before the Rook StorageClass and provisioner pod. However, +# the PVC cannot be deleted until the pods which are using it are deleted, so +# the various Slurm node pods must actually be the first resources deleted. +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: rook-nfs-cleanup +--- +# TODO: Create a job-specific ClusterRole for the ServiceAccount +# instead of using the cluster-admin role here +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: rook-nfs-cleanup +subjects: +- kind: ServiceAccount + name: rook-nfs-cleanup + namespace: {{ .Release.Namespace }} +roleRef: + kind: ClusterRole + name: cluster-admin +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: rook-nfs-pre-delete-cleanup + annotations: + "helm.sh/hook": pre-delete + "helm.sh/hook-delete-policy": hook-succeeded + "helm.sh/hook-weight": "10" +spec: + template: + metadata: + name: rook-nfs-pre-delete-cleanup + spec: + serviceAccountName: rook-nfs-cleanup + containers: + - name: tester + image: bitnami/kubectl + command: + - "bin/bash" + - "-c" + - | + kubectl delete -n {{ .Values.serverNamespace }} nfsservers {{ .Values.serverName }} --wait + restartPolicy: Never +--- \ No newline at end of file diff --git a/slurm-cluster-chart/templates/hooks/pre-delete.yaml b/slurm-cluster-chart/templates/hooks/pre-delete.yaml index 8cdb1f3..868cbbd 100644 --- a/slurm-cluster-chart/templates/hooks/pre-delete.yaml +++ b/slurm-cluster-chart/templates/hooks/pre-delete.yaml @@ -9,17 +9,17 @@ apiVersion: v1 kind: ServiceAccount metadata: - name: rook-nfs-cleanup + name: slurm-k8s-cleanup --- # TODO: Create a 
job-specific ClusterRole for the ServiceAccount # instead of using the cluster-admin role here apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: rook-nfs-cleanup + name: slurm-k8s-cleanup subjects: - kind: ServiceAccount - name: rook-nfs-cleanup + name: slurm-k8s-cleanup namespace: {{ .Release.Namespace }} roleRef: kind: ClusterRole @@ -28,16 +28,17 @@ roleRef: apiVersion: batch/v1 kind: Job metadata: - name: rook-nfs-pre-delete-cleanup + name: slurm-k8s-pre-delete-cleanup annotations: "helm.sh/hook": pre-delete "helm.sh/hook-delete-policy": hook-succeeded + "helm.sh/hook-weight": "1" spec: template: metadata: - name: rook-nfs-pre-delete-cleanup + name: slurm-k8s-pre-delete-cleanup spec: - serviceAccountName: rook-nfs-cleanup + serviceAccountName: slurm-k8s-cleanup containers: - name: tester image: bitnami/kubectl @@ -49,7 +50,6 @@ spec: kubectl delete -n {{ .Release.Namespace }} statefulset {{ .Values.slurmctld.name }} --wait --cascade=foreground kubectl delete -n {{ .Release.Namespace }} statefulset {{ .Values.slurmd.name }} --wait --cascade=foreground kubectl delete -n {{ .Release.Namespace }} pvc {{ .Values.storage.claimName }} --wait - kubectl delete -n {{ .Values.rooknfs.serverNamespace }} nfsservers {{ .Values.rooknfs.serverName }} --wait restartPolicy: Never --- {{- end }} From 1371681210c766da9871ec90ba2e140f645522be Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 15 Aug 2023 13:40:08 +0100 Subject: [PATCH 126/152] Update docs --- README.md | 30 +++++++++++++++++------------- rooknfs/README.md | 3 +++ 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 11fe8b8..7411656 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,7 @@ # Slurm Docker Cluster -This is a multi-container Slurm cluster using Kubernetes. The Helm chart -creates a named volume for persistent storage of MySQL data files as well as -an NFS volume for shared storage. 
+This is a multi-container Slurm cluster using Kubernetes. The Slurm cluster Helm chart creates a named volume for persistent storage of MySQL data files. By default, it also installs the +RookNFS Helm chart (also in this repo) to provide shared storage across the Slurm cluster nodes. ## Dependencies @@ -27,12 +26,11 @@ The Helm chart will create the following named volumes: * var_lib_mysql ( -> /var/lib/mysql ) -A named ReadWriteMany (RWX) volume mounted to `/home` is also expected, this can be external or can be deployed using the scripts in the `/nfs` directory (See "Deploying the Cluster") +A named ReadWriteMany (RWX) volume mounted to `/home` is also expected, this can be external or can be deployed using the provided `rooknfs` chart directory (See "Deploying the Cluster") ## Configuring the Cluster -All config files in `slurm-cluster-chart/files` will be mounted into the container to configure their respective services on startup. Note that changes to these files will not all be propagated to existing deployments (see "Reconfiguring the Cluster"). -Additional parameters can be found in the `values.yaml` file, which will be applied on a Helm chart deployment. Note that some of these values will also not propagate until the cluster is restarted (see "Reconfiguring the Cluster"). +All config files in `slurm-cluster-chart/files` will be mounted into the container to configure their respective services on startup. Note that changes to these files will not all be propagated to existing deployments (see "Reconfiguring the Cluster"). Additional parameters can be found in the `values.yaml` file for the Helm chart. Note that some of these values will also not propagate until the cluster is restarted (see "Reconfiguring the Cluster"). ## Deploying the Cluster @@ -44,21 +42,20 @@ On initial deployment ONLY, run ``` This generates a set of secrets. 
If these need to be regenerated, see "Reconfiguring the Cluster" -### Connecting RWX Volume +### Connecting a RWX Volume -A ReadWriteMany (RWX) volume is required, if a named volume exists, set `nfs.claimName` in the `values.yaml` file to its name. If not, manifests to deploy a Rook NFS volume are provided in the `/nfs` directory. You can deploy this by running -```console -./nfs/deploy-nfs.sh -``` -and leaving `nfs.claimName` as the provided value. +A ReadWriteMany (RWX) volume is required for shared storage across cluster nodes. By default, the Rook NFS Helm chart is installed as a dependency of the Slurm cluster chart in order to provide a RWX capable Storage Class for the required shared volume. If the target Kubernetes cluster has an existing storage class which should be used instead, then `storageClass` in `values.yaml` should be set to the name of this existing class and the RookNFS dependency should be disabled by setting `rooknfs.enabled = false`. + +See the separate RookNFS chart [values.yaml](./rooknfs/values.yaml) for further configuration options when using the RookNFS to provide the shared storage volume. ### Supplying Public Keys To access the cluster via `ssh`, you will need to make your public keys available. All your public keys from localhost can be added by running ```console -./publish-keys.sh +./publish-keys.sh ``` +where `` is the namespace in which the Slurm cluster chart will be deployed (i.e. using `helm install -n ...`). This will create a Kubernetes Secret in the appropriate namespace for the Slurm cluster to use. Omitting the namespace arg will install the secrets in the default namespace. 
### Deploying with Helm @@ -66,6 +63,12 @@ After configuring `kubectl` with the appropriate `kubeconfig` file, deploy the c ```console helm install slurm-cluster-chart ``` + +NOTE: If using the RookNFS dependency, then the following must be run before installing the Slurm cluster chart +```console +helm dependency update slurm-cluster-chart +``` + Subsequent releases can be deployed using: ```console @@ -128,6 +131,7 @@ srun singularity exec docker://ghcr.io/stackhpc/mpitests-container:${MPI_CONTAIN ``` Note: The mpirun script assumes you are running as user 'rocky'. If you are running as root, you will need to include the --allow-run-as-root argument + ## Reconfiguring the Cluster ### Changes to config files diff --git a/rooknfs/README.md b/rooknfs/README.md index e69de29..5b7ad6d 100644 --- a/rooknfs/README.md +++ b/rooknfs/README.md @@ -0,0 +1,3 @@ +# RookNFS Helm Chart + +See `values.yaml` for available config options. \ No newline at end of file From fe58891e7ccd9de6cb87f92083db59841f98e7e1 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 15 Aug 2023 13:54:30 +0100 Subject: [PATCH 127/152] Make backing RWO storage class configurable --- rooknfs/templates/nfs.yaml | 1 + rooknfs/values.yaml | 3 +++ slurm-cluster-chart/values.yaml | 7 ++++++- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/rooknfs/templates/nfs.yaml b/rooknfs/templates/nfs.yaml index 1da86bc..a88fb6f 100644 --- a/rooknfs/templates/nfs.yaml +++ b/rooknfs/templates/nfs.yaml @@ -6,6 +6,7 @@ metadata: name: {{ .Values.claimName}} namespace: {{ .Values.serverNamespace }} spec: + storageClassName: {{ .Values.backingStorageClass }} accessModes: - ReadWriteMany resources: diff --git a/rooknfs/values.yaml b/rooknfs/values.yaml index 4150967..4ada627 100644 --- a/rooknfs/values.yaml +++ b/rooknfs/values.yaml @@ -8,6 +8,9 @@ storageClassName: rook-nfs # Name for the Read-Write-Once backing PVC created by Rook claimName: rook-nfs-backing-pv +# Storage class to use for the 
Read-Write-Once backing PVC +backingStorageClass: + # Name for the NFS share within the NFS Resource instance shareName: share-1 diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index b89ca85..1f59a5a 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -36,7 +36,9 @@ storage: # See rook-nfs sub-chart for full set of available config values rooknfs: enabled: true - storageClassName: slurm-rook-nfs # NB this must match storage.storageClassName when using rook + # Name given to the RWM StorageClass created by Rook + # NB this must match storage.storageClassName when using Rook + storageClassName: slurm-rook-nfs # Name for the NFSServer resource created by Rook serverName: rook-nfs # Capacity for the backing Read-Write-*Once* volume @@ -46,6 +48,9 @@ rooknfs: # this value to the requested storage capacity for the RWM # volume specified in storage.capacity storageCapacity: *capacity + # Storage class to use for the Read-Write-Once backing PVC + # backingStorageClass: + sqlImage: mariadb:10.10 From 303d156f78087eefa761aab7746fd6fafbab5399 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 15 Aug 2023 13:57:00 +0100 Subject: [PATCH 128/152] Mention storage capacity config --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7411656..5ac48f2 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ This generates a set of secrets. If these need to be regenerated, see "Reconfigu ### Connecting a RWX Volume -A ReadWriteMany (RWX) volume is required for shared storage across cluster nodes. By default, the Rook NFS Helm chart is installed as a dependency of the Slurm cluster chart in order to provide a RWX capable Storage Class for the required shared volume. 
If the target Kubernetes cluster has an existing storage class which should be used instead, then `storageClass` in `values.yaml` should be set to the name of this existing class and the RookNFS dependency should be disabled by setting `rooknfs.enabled = false`. +A ReadWriteMany (RWX) volume is required for shared storage across cluster nodes. By default, the Rook NFS Helm chart is installed as a dependency of the Slurm cluster chart in order to provide a RWX capable Storage Class for the required shared volume. If the target Kubernetes cluster has an existing storage class which should be used instead, then `storageClass` in `values.yaml` should be set to the name of this existing class and the RookNFS dependency should be disabled by setting `rooknfs.enabled = false`. In either case, the storage capacity of the provisioned RWX volume can be configured by setting the value of `storage.capacity`. See the separate RookNFS chart [values.yaml](./rooknfs/values.yaml) for further configuration options when using the RookNFS to provide the shared storage volume. From 1debdedcd97a78f17ab5dc6884ce1c26400cf624 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 15 Aug 2023 14:00:44 +0100 Subject: [PATCH 129/152] Add note on target namespace --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5ac48f2..4e21c3d 100644 --- a/README.md +++ b/README.md @@ -38,9 +38,9 @@ All config files in `slurm-cluster-chart/files` will be mounted into the contain On initial deployment ONLY, run ```console -./generate-secrets.sh +./generate-secrets.sh <target-namespace> ``` -This generates a set of secrets. If these need to be regenerated, see "Reconfiguring the Cluster" +This generates a set of secrets in the target namespace to be used by the Slurm cluster. 
If these need to be regenerated, see "Reconfiguring the Cluster" ### Connecting a RWX Volume From 8818a94a30df63d312b645b0450e090c6f9f1587 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 15 Aug 2023 14:20:56 +0100 Subject: [PATCH 130/152] Revert to randomly generated DB password --- generate-secrets.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generate-secrets.sh b/generate-secrets.sh index 261f3be..dab0688 100755 --- a/generate-secrets.sh +++ b/generate-secrets.sh @@ -6,7 +6,7 @@ fi kubectl -n $NAMESPACE create secret generic database-auth-secret \ --dry-run=client \ ---from-literal=password=abcdefghijklmnopqrstuvwxyz123456 \ +--from-literal=password=$(tr -dc 'A-Za-z0-9' Date: Tue, 15 Aug 2023 14:49:06 +0100 Subject: [PATCH 131/152] Conditionally include backing storage class field --- rooknfs/templates/nfs.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rooknfs/templates/nfs.yaml b/rooknfs/templates/nfs.yaml index a88fb6f..cf7b1de 100644 --- a/rooknfs/templates/nfs.yaml +++ b/rooknfs/templates/nfs.yaml @@ -3,10 +3,12 @@ apiVersion: v1 kind: PersistentVolumeClaim metadata: - name: {{ .Values.claimName}} + name: {{ .Values.claimName }} namespace: {{ .Values.serverNamespace }} spec: + {{- if .Values.backingStorageClass }} storageClassName: {{ .Values.backingStorageClass }} + {{- end }} accessModes: - ReadWriteMany resources: From 4c7f875813917e9753542d4e600591efef016e23 Mon Sep 17 00:00:00 2001 From: Will Date: Tue, 15 Aug 2023 16:03:08 +0100 Subject: [PATCH 132/152] Changed database template name --- slurm-cluster-chart/templates/mysql-deployment.yaml | 2 +- .../templates/var-lib-mysql-persistentvolumeclaim.yaml | 2 +- slurm-cluster-chart/values.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/slurm-cluster-chart/templates/mysql-deployment.yaml b/slurm-cluster-chart/templates/mysql-deployment.yaml index debf962..96dc88f 100644 --- 
a/slurm-cluster-chart/templates/mysql-deployment.yaml +++ b/slurm-cluster-chart/templates/mysql-deployment.yaml @@ -34,7 +34,7 @@ spec: value: "yes" - name: MYSQL_USER value: "slurm" - image: {{ .Values.mySQL.image }} + image: {{ .Values.database.image }} name: mysql ports: - containerPort: 3306 diff --git a/slurm-cluster-chart/templates/var-lib-mysql-persistentvolumeclaim.yaml b/slurm-cluster-chart/templates/var-lib-mysql-persistentvolumeclaim.yaml index 56fc7dd..a5f4503 100644 --- a/slurm-cluster-chart/templates/var-lib-mysql-persistentvolumeclaim.yaml +++ b/slurm-cluster-chart/templates/var-lib-mysql-persistentvolumeclaim.yaml @@ -11,4 +11,4 @@ spec: - ReadWriteOnce resources: requests: - storage: {{ .Values.mySQL.storage }} + storage: {{ .Values.database.storage }} diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 63e3531..7c3a481 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -9,7 +9,7 @@ nfs: claimName: rook-nfs-pv-claim # Values for Slurm's database container -mySQL: +database: #Database image to be used image: mariadb:10.10 #Storage requested by the var-lib-mysql volume backing the database From 50e728515c3f2416a508d121ca6ed180278cab43 Mon Sep 17 00:00:00 2001 From: Scott Davidson <49713135+sd109@users.noreply.github.com> Date: Wed, 16 Aug 2023 10:13:36 +0100 Subject: [PATCH 133/152] Punctuation Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2458e39..aad9b4b 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ The Helm chart will create the following named volumes: * var_lib_mysql ( -> /var/lib/mysql ) -A named ReadWriteMany (RWX) volume mounted to `/home` is also expected, this can be external or can be deployed using the provided `rooknfs` chart directory (See "Deploying the Cluster") +A named ReadWriteMany (RWX) volume mounted to `/home` is also 
expected, this can be external or can be deployed using the provided `rooknfs` chart directory (See "Deploying the Cluster"). ## Configuring the Cluster From 729e43c0f07aad5f114131be8dbc5e0096b0cb76 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Wed, 16 Aug 2023 10:17:10 +0100 Subject: [PATCH 134/152] Clarify namespace arg as optional --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index aad9b4b..c0b7d61 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ All config files in `slurm-cluster-chart/files` will be mounted into the contain On initial deployment ONLY, run ```console -./generate-secrets.sh <target-namespace> +./generate-secrets.sh [<target-namespace>] ``` This generates a set of secrets in the target namespace to be used by the Slurm cluster. If these need to be regenerated, see "Reconfiguring the Cluster" @@ -55,7 +55,7 @@ See the separate RookNFS chart [values.yaml](./rooknfs/values.yaml) for further To access the cluster via `ssh`, you will need to make your public keys available. All your public keys from localhost can be added by running ```console -./publish-keys.sh <target-namespace> +./publish-keys.sh [<target-namespace>] ``` where `<target-namespace>` is the namespace in which the Slurm cluster chart will be deployed (i.e. using `helm install -n <target-namespace> ...`). This will create a Kubernetes Secret in the appropriate namespace for the Slurm cluster to use. Omitting the namespace arg will install the secrets in the default namespace. 
From 43a5dd7232c5bc149a232bfe9acb71f732b08f40 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Wed, 16 Aug 2023 10:18:07 +0100 Subject: [PATCH 135/152] Re-disable line wrapping --- generate-secrets.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generate-secrets.sh b/generate-secrets.sh index dab0688..a49ede2 100755 --- a/generate-secrets.sh +++ b/generate-secrets.sh @@ -12,7 +12,7 @@ kubectl -n $NAMESPACE apply -f - kubectl -n $NAMESPACE create secret generic munge-key-secret \ --dry-run=client \ ---from-literal=munge.key=$(dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64) \ +--from-literal=munge.key=$(dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64 -w 0) \ -o yaml | \ kubectl -n $NAMESPACE apply -f - From 7c5b6c4cbb2b4f3f7055fe7f0a325f1a2252ab1a Mon Sep 17 00:00:00 2001 From: Will Date: Wed, 16 Aug 2023 12:12:30 +0100 Subject: [PATCH 136/152] Updated image --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index df19aa5..3d41248 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:7c0e2d9 #OUTDATED, DON'T USE! 
+slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:d3daba4 login: # Deployment resource name From 968515ee55e84d1b40a15414e6e84bd7b329a2d5 Mon Sep 17 00:00:00 2001 From: Will Date: Wed, 16 Aug 2023 16:16:27 +0100 Subject: [PATCH 137/152] Replaced kubeconfig mount with ServiceAccount --- generate-secrets.sh | 3 -- image/Dockerfile | 6 ++-- image/docker-entrypoint.sh | 4 --- image/k8s-slurmd-create | 2 -- image/k8s-slurmd-delete | 2 -- slurm-cluster-chart/templates/kubeconfig.yml | 8 ----- .../slurm-autoscaler-service-account.yaml | 30 +++++++++++++++++++ .../templates/slurmctld-statefulset.yaml | 8 +---- 8 files changed, 34 insertions(+), 29 deletions(-) delete mode 100644 slurm-cluster-chart/templates/kubeconfig.yml create mode 100644 slurm-cluster-chart/templates/slurm-autoscaler-service-account.yaml diff --git a/generate-secrets.sh b/generate-secrets.sh index 334b8a7..70dcb22 100755 --- a/generate-secrets.sh +++ b/generate-secrets.sh @@ -11,6 +11,3 @@ kubectl create secret generic munge-key-secret \ --from-literal=munge.key=$(dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64 -w 0) \ -o yaml | \ kubectl apply -f - - -cp $KUBECONFIG slurm-cluster-chart/files/kubeconfig -echo "copied $KUBECONFIG into slurm-cluster-chart/files/" \ No newline at end of file diff --git a/image/Dockerfile b/image/Dockerfile index 2919ccf..41e6e5e 100644 --- a/image/Dockerfile +++ b/image/Dockerfile @@ -9,6 +9,8 @@ LABEL org.opencontainers.image.source="https://github.com/stackhpc/slurm-docker- ARG SLURM_TAG=slurm-23.02 ARG GOSU_VERSION=1.11 +COPY kubernetes.repo /etc/yum.repos.d/kubernetes.repo + RUN set -ex \ && yum makecache \ && yum -y update \ @@ -42,6 +44,7 @@ RUN set -ex \ hwloc-devel \ openssh-server \ apptainer \ + kubectl \ && yum clean all \ && rm -rf /var/cache/yum @@ -91,9 +94,6 @@ RUN mkdir /etc/sysconfig/slurm \ && useradd -u 1000 rocky \ && usermod -p '*' rocky # unlocks account but sets no password -COPY kubernetes.repo /etc/yum.repos.d/kubernetes.repo -RUN dnf 
install -y kubectl - VOLUME /etc/slurm COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh COPY --chown=slurm:slurm --chmod=744 k8s-slurmd-* /usr/local/bin/ diff --git a/image/docker-entrypoint.sh b/image/docker-entrypoint.sh index 0c33fb1..6e0fdad 100755 --- a/image/docker-entrypoint.sh +++ b/image/docker-entrypoint.sh @@ -51,10 +51,6 @@ then echo "---> Setting ownership for state directory ..." chown slurm:slurm /var/spool/slurmctld - echo "---> Copying Kubeconfig ..." - install -o slurm -g slurm -m u=rwX,go= -d /var/lib/slurmctld/ - install -o slurm -g slurm -m u=r,go= /tmp/kubeconfig /var/lib/slurmctld/ - echo "---> Starting the Slurm Controller Daemon (slurmctld) ..." if /usr/sbin/slurmctld -V | grep -q '17.02' ; then exec gosu slurm /usr/sbin/slurmctld -D "${@:2}" diff --git a/image/k8s-slurmd-create b/image/k8s-slurmd-create index 676796f..b35fb1d 100644 --- a/image/k8s-slurmd-create +++ b/image/k8s-slurmd-create @@ -1,7 +1,5 @@ #!/usr/bin/bash -export KUBECONFIG=/var/lib/slurmctld/kubeconfig - echo "$(date) Resume invoked $0 $*" >> /var/log/slurm/power_save.log hosts=$(scontrol show hostnames $1) # this is purely a textual expansion, doens't depend on defined nodes diff --git a/image/k8s-slurmd-delete b/image/k8s-slurmd-delete index 19a1828..a026bfe 100644 --- a/image/k8s-slurmd-delete +++ b/image/k8s-slurmd-delete @@ -1,7 +1,5 @@ #!/usr/bin/bash -export KUBECONFIG=/var/lib/slurmctld/kubeconfig - echo "$(date) Suspend invoked $0 $*" >> /var/log/slurm/power_save.log hosts=$(scontrol show hostnames $1) # this is purely a textual expansion, doens't depend on defined nodes diff --git a/slurm-cluster-chart/templates/kubeconfig.yml b/slurm-cluster-chart/templates/kubeconfig.yml deleted file mode 100644 index 4938798..0000000 --- a/slurm-cluster-chart/templates/kubeconfig.yml +++ /dev/null @@ -1,8 +0,0 @@ -apiVersion: v1 -kind: Secret -metadata: - name: kubeconfig-secret -data: - kubeconfig: | - {{- .Files.Get "files/kubeconfig" | b64enc | nindent 4 -}} 
- \ No newline at end of file diff --git a/slurm-cluster-chart/templates/slurm-autoscaler-service-account.yaml b/slurm-cluster-chart/templates/slurm-autoscaler-service-account.yaml new file mode 100644 index 0000000..162366f --- /dev/null +++ b/slurm-cluster-chart/templates/slurm-autoscaler-service-account.yaml @@ -0,0 +1,30 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: slurm-autoscaler-account +automountServiceAccountToken: True + +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: slurm-autoscaler-role +rules: +- apiGroups: [""] # "" indicates the core API group + resources: ["pods"] + verbs: ["get","apply","create", "patch", "delete" ] + +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: slurm-autoscaler-rolebinding +subjects: + - kind: ServiceAccount + name: slurm-autoscaler-account +roleRef: + kind: Role + name: slurm-autoscaler-role + apiGroup: rbac.authorization.k8s.io \ No newline at end of file diff --git a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml b/slurm-cluster-chart/templates/slurmctld-statefulset.yaml index 2654107..05df745 100644 --- a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml +++ b/slurm-cluster-chart/templates/slurmctld-statefulset.yaml @@ -19,6 +19,7 @@ spec: app.kubernetes.io/name: slurm app.kubernetes.io/component: slurmctld spec: + serviceAccountName: slurm-autoscaler-account containers: - args: - slurmctld @@ -38,9 +39,6 @@ spec: subPath: munge.key - mountPath: /var/spool/slurmctld name: slurmctld-state - - mountPath: /tmp/kubeconfig - name: kubeconfig-secret - subPath: kubeconfig dnsConfig: searches: - slurmd.default.svc.cluster.local @@ -63,7 +61,3 @@ spec: secret: secretName: {{ .Values.secrets.mungeKey }} defaultMode: 0400 - - name: kubeconfig-secret - secret: - secretName: kubeconfig-secret - defaultMode: 0400 From e25332e722a4e7d12b6ad52d084d69d9198e2de2 Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 17 Aug 2023 09:03:32 +0100 
Subject: [PATCH 138/152] Added debug to k8s files --- image/k8s-slurmd-create | 1 + image/k8s-slurmd-delete | 1 + 2 files changed, 2 insertions(+) diff --git a/image/k8s-slurmd-create b/image/k8s-slurmd-create index b35fb1d..a32de97 100644 --- a/image/k8s-slurmd-create +++ b/image/k8s-slurmd-create @@ -1,4 +1,5 @@ #!/usr/bin/bash +set -euo pipefail echo "$(date) Resume invoked $0 $*" >> /var/log/slurm/power_save.log diff --git a/image/k8s-slurmd-delete b/image/k8s-slurmd-delete index a026bfe..3f611de 100644 --- a/image/k8s-slurmd-delete +++ b/image/k8s-slurmd-delete @@ -1,4 +1,5 @@ #!/usr/bin/bash +set -euo pipefail echo "$(date) Suspend invoked $0 $*" >> /var/log/slurm/power_save.log From d31306322c8ecd5d9e4326c11f6f2ccf0d930b9c Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 17 Aug 2023 08:22:42 +0000 Subject: [PATCH 139/152] only permit one slurmd pod per k8s node --- slurm-cluster-chart/templates/slurmd.yaml | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/slurm-cluster-chart/templates/slurmd.yaml b/slurm-cluster-chart/templates/slurmd.yaml index 62646b7..b017093 100644 --- a/slurm-cluster-chart/templates/slurmd.yaml +++ b/slurm-cluster-chart/templates/slurmd.yaml @@ -20,14 +20,6 @@ spec: app.kubernetes.io/name: slurm app.kubernetes.io/component: slurmd spec: - topologySpreadConstraints: - - maxSkew: 1 - whenUnsatisfiable: ScheduleAnyway - topologyKey: kubernetes.io/hostname - labelSelector: - matchLabels: - app.kubernetes.io/name: slurm - app.kubernetes.io/component: slurmd containers: - args: - slurmd @@ -37,6 +29,7 @@ spec: name: slurmd ports: - containerPort: 6818 + hostPort: 6818 resources: {} volumeMounts: - mountPath: /etc/slurm/ From e90f227bb1f8747414e79aed258257532bbf76c3 Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 17 Aug 2023 09:44:54 +0100 Subject: [PATCH 140/152] Added more debugging for k8s --- image/k8s-slurmd-create | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/image/k8s-slurmd-create 
b/image/k8s-slurmd-create index a32de97..b45adfc 100644 --- a/image/k8s-slurmd-create +++ b/image/k8s-slurmd-create @@ -3,8 +3,13 @@ set -euo pipefail echo "$(date) Resume invoked $0 $*" >> /var/log/slurm/power_save.log +echo "Arguments: $* $0 $1" + hosts=$(scontrol show hostnames $1) # this is purely a textual expansion, doens't depend on defined nodes +echo "Powering up hosts: $hosts" for host in $hosts do + echo "Creating $host" sed s/SLURMD_NODENAME/$host/ /etc/slurm/slurmd-pod-template.yml | kubectl create -f - + echo "done" done From 6530f783a681a4758b767b23bc5e79997d9f8011 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 17 Aug 2023 08:46:13 +0000 Subject: [PATCH 141/152] use host networking --- slurm-cluster-chart/templates/slurmd.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/slurm-cluster-chart/templates/slurmd.yaml b/slurm-cluster-chart/templates/slurmd.yaml index b017093..bec55ce 100644 --- a/slurm-cluster-chart/templates/slurmd.yaml +++ b/slurm-cluster-chart/templates/slurmd.yaml @@ -25,6 +25,13 @@ spec: - slurmd - -F - -vvv + - -N + - "$(POD_NAME)" + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name image: {{ .Values.slurmImage }} name: slurmd ports: @@ -41,6 +48,8 @@ spec: subPath: munge.key securityContext: privileged: true + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet dnsConfig: searches: - slurmd.{{ .Release.Namespace }}.svc.cluster.local From f5c1261fefe2d54626f31f1040553c5a8b6db05b Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 17 Aug 2023 09:51:25 +0100 Subject: [PATCH 142/152] Sending debug to log files --- image/k8s-slurmd-create | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/image/k8s-slurmd-create b/image/k8s-slurmd-create index b45adfc..cf0c9a5 100644 --- a/image/k8s-slurmd-create +++ b/image/k8s-slurmd-create @@ -3,13 +3,13 @@ set -euo pipefail echo "$(date) Resume invoked $0 $*" >> /var/log/slurm/power_save.log -echo "Arguments: $* $0 $1" +echo 
"Arguments: $* $0 $1" >> /var/log/slurm/power_save.log hosts=$(scontrol show hostnames $1) # this is purely a textual expansion, doens't depend on defined nodes -echo "Powering up hosts: $hosts" +echo "Powering up hosts: $hosts" >> /var/log/slurm/power_save.log for host in $hosts do - echo "Creating $host" + echo "Creating $host" >> /var/log/slurm/power_save.log sed s/SLURMD_NODENAME/$host/ /etc/slurm/slurmd-pod-template.yml | kubectl create -f - - echo "done" + echo "done" >> /var/log/slurm/power_save.log done From 10b8e8e94dd5e4ecca3ffdd7bdb4f187ea5cacb3 Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 17 Aug 2023 10:08:17 +0100 Subject: [PATCH 143/152] Adding kubectl output to logs --- image/k8s-slurmd-create | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/image/k8s-slurmd-create b/image/k8s-slurmd-create index cf0c9a5..63dfc7b 100644 --- a/image/k8s-slurmd-create +++ b/image/k8s-slurmd-create @@ -10,6 +10,6 @@ echo "Powering up hosts: $hosts" >> /var/log/slurm/power_save.log for host in $hosts do echo "Creating $host" >> /var/log/slurm/power_save.log - sed s/SLURMD_NODENAME/$host/ /etc/slurm/slurmd-pod-template.yml | kubectl create -f - + sed s/SLURMD_NODENAME/$host/ /etc/slurm/slurmd-pod-template.yml | kubectl create -f - >> /var/log/slurm/power_save.log echo "done" >> /var/log/slurm/power_save.log done From ef184aa01f64b1149334279f2b1768ee461554d3 Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 17 Aug 2023 10:18:25 +0100 Subject: [PATCH 144/152] Adding error check --- image/k8s-slurmd-create | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/image/k8s-slurmd-create b/image/k8s-slurmd-create index 63dfc7b..22e9f7c 100644 --- a/image/k8s-slurmd-create +++ b/image/k8s-slurmd-create @@ -10,6 +10,6 @@ echo "Powering up hosts: $hosts" >> /var/log/slurm/power_save.log for host in $hosts do echo "Creating $host" >> /var/log/slurm/power_save.log - sed s/SLURMD_NODENAME/$host/ /etc/slurm/slurmd-pod-template.yml | kubectl create -f - >> 
/var/log/slurm/power_save.log + sed s/SLURMD_NODENAME/$host/ /etc/slurm/slurmd-pod-template.yml | kubectl create -f - || echo "kubectl error" >> /var/log/slurm/power_save.log echo "done" >> /var/log/slurm/power_save.log done From a9ea92b96cb1dc0546163184ef3d442c3d95b99f Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 17 Aug 2023 10:24:29 +0100 Subject: [PATCH 145/152] Adding /dev/tty pipes --- image/k8s-slurmd-create | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/image/k8s-slurmd-create b/image/k8s-slurmd-create index 22e9f7c..7f12e1c 100644 --- a/image/k8s-slurmd-create +++ b/image/k8s-slurmd-create @@ -10,6 +10,6 @@ echo "Powering up hosts: $hosts" >> /var/log/slurm/power_save.log for host in $hosts do echo "Creating $host" >> /var/log/slurm/power_save.log - sed s/SLURMD_NODENAME/$host/ /etc/slurm/slurmd-pod-template.yml | kubectl create -f - || echo "kubectl error" >> /var/log/slurm/power_save.log + sed s/SLURMD_NODENAME/$host/ /etc/slurm/slurmd-pod-template.yml | tee /dev/tty | kubectl create -f - | tee /dev/tty echo "done" >> /var/log/slurm/power_save.log done From be00d249f500d419bb7d8e3cdf27625b9c8992df Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 17 Aug 2023 10:31:55 +0100 Subject: [PATCH 146/152] Debug --- image/k8s-slurmd-create | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/image/k8s-slurmd-create b/image/k8s-slurmd-create index 7f12e1c..fb92963 100644 --- a/image/k8s-slurmd-create +++ b/image/k8s-slurmd-create @@ -10,6 +10,9 @@ echo "Powering up hosts: $hosts" >> /var/log/slurm/power_save.log for host in $hosts do echo "Creating $host" >> /var/log/slurm/power_save.log - sed s/SLURMD_NODENAME/$host/ /etc/slurm/slurmd-pod-template.yml | tee /dev/tty | kubectl create -f - | tee /dev/tty + touch tmpfile.yml + sed s/SLURMD_NODENAME/$host/ /etc/slurm/slurmd-pod-template.yml > tmpfile.yml + echo "sed successful" >> /var/log/slurm/power_save.log + kubectl create -f tmpfile.yml &> /var/log/slurm/power_save.log echo 
"done" >> /var/log/slurm/power_save.log done From 63795d3056fa765a009a2defe8c8816224a1236f Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 17 Aug 2023 10:48:11 +0100 Subject: [PATCH 147/152] Added error redirection --- image/k8s-slurmd-create | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/image/k8s-slurmd-create b/image/k8s-slurmd-create index fb92963..d397771 100644 --- a/image/k8s-slurmd-create +++ b/image/k8s-slurmd-create @@ -1,18 +1,14 @@ #!/usr/bin/bash -set -euo pipefail -echo "$(date) Resume invoked $0 $*" >> /var/log/slurm/power_save.log +echo "$(date) Resume invoked $0 $*" &>> /var/log/slurm/power_save.log -echo "Arguments: $* $0 $1" >> /var/log/slurm/power_save.log +echo "Arguments: $* $0 $1" &>> /var/log/slurm/power_save.log hosts=$(scontrol show hostnames $1) # this is purely a textual expansion, doens't depend on defined nodes -echo "Powering up hosts: $hosts" >> /var/log/slurm/power_save.log +echo "Powering up hosts: $hosts" &>> /var/log/slurm/power_save.log for host in $hosts do - echo "Creating $host" >> /var/log/slurm/power_save.log - touch tmpfile.yml - sed s/SLURMD_NODENAME/$host/ /etc/slurm/slurmd-pod-template.yml > tmpfile.yml - echo "sed successful" >> /var/log/slurm/power_save.log - kubectl create -f tmpfile.yml &> /var/log/slurm/power_save.log - echo "done" >> /var/log/slurm/power_save.log -done + echo "Creating $host" &>> /var/log/slurm/power_save.log + ( sed s/SLURMD_NODENAME/$host/ /etc/slurm/slurmd-pod-template.yml | kubectl create -f - ) &>> /var/log/slurm/power_save.log + echo "done" &>> /var/log/slurm/power_save.log +done \ No newline at end of file From a731c60b8edd7b07d3d780fcc6904e8c80a39363 Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 17 Aug 2023 15:12:44 +0100 Subject: [PATCH 148/152] Fixed missing environment variables in power up/down scripts --- image/k8s-slurmd-create | 12 +++++++----- image/k8s-slurmd-delete | 9 +++++++-- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git 
a/image/k8s-slurmd-create b/image/k8s-slurmd-create index d397771..4a99918 100644 --- a/image/k8s-slurmd-create +++ b/image/k8s-slurmd-create @@ -2,13 +2,15 @@ echo "$(date) Resume invoked $0 $*" &>> /var/log/slurm/power_save.log -echo "Arguments: $* $0 $1" &>> /var/log/slurm/power_save.log +APISERVER=https://kubernetes.default.svc +SERVICEACCOUNT=/var/run/secrets/kubernetes.io/serviceaccount +NAMESPACE=$(cat ${SERVICEACCOUNT}/namespace) +TOKEN=$(cat ${SERVICEACCOUNT}/token) +CACERT=${SERVICEACCOUNT}/ca.crt hosts=$(scontrol show hostnames $1) # this is purely a textual expansion, doens't depend on defined nodes -echo "Powering up hosts: $hosts" &>> /var/log/slurm/power_save.log for host in $hosts do - echo "Creating $host" &>> /var/log/slurm/power_save.log - ( sed s/SLURMD_NODENAME/$host/ /etc/slurm/slurmd-pod-template.yml | kubectl create -f - ) &>> /var/log/slurm/power_save.log - echo "done" &>> /var/log/slurm/power_save.log + ( sed s/SLURMD_NODENAME/$host/ /etc/slurm/slurmd-pod-template.yml | \ + kubectl --server $APISERVER --token $TOKEN --certificate-authority $CACERT create -f - ) done \ No newline at end of file diff --git a/image/k8s-slurmd-delete b/image/k8s-slurmd-delete index 3f611de..da4e438 100644 --- a/image/k8s-slurmd-delete +++ b/image/k8s-slurmd-delete @@ -1,10 +1,15 @@ #!/usr/bin/bash -set -euo pipefail echo "$(date) Suspend invoked $0 $*" >> /var/log/slurm/power_save.log +APISERVER=https://kubernetes.default.svc +SERVICEACCOUNT=/var/run/secrets/kubernetes.io/serviceaccount +NAMESPACE=$(cat ${SERVICEACCOUNT}/namespace) +TOKEN=$(cat ${SERVICEACCOUNT}/token) +CACERT=${SERVICEACCOUNT}/ca.crt + hosts=$(scontrol show hostnames $1) # this is purely a textual expansion, doens't depend on defined nodes for host in $hosts do - kubectl delete pod $host + kubectl --server $APISERVER --token $TOKEN --certificate-authority $CACERT delete pod $host done From a2ca5e30de730119076ac2a48b7273d2885bb669 Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 17 Aug 2023 
15:38:07 +0100 Subject: [PATCH 149/152] Updated values.yaml and gave all pod permissions to account --- .../templates/slurm-autoscaler-service-account.yaml | 2 +- slurm-cluster-chart/values.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/slurm-cluster-chart/templates/slurm-autoscaler-service-account.yaml b/slurm-cluster-chart/templates/slurm-autoscaler-service-account.yaml index 162366f..8bb98c9 100644 --- a/slurm-cluster-chart/templates/slurm-autoscaler-service-account.yaml +++ b/slurm-cluster-chart/templates/slurm-autoscaler-service-account.yaml @@ -13,7 +13,7 @@ metadata: rules: - apiGroups: [""] # "" indicates the core API group resources: ["pods"] - verbs: ["get","apply","create", "patch", "delete" ] + verbs: ["get","apply","create", "patch", "delete", "list", "watch"] --- diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 8c57722..6ad7359 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:9e4598e +slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:a731c60 replicas: slurmd: 2 From 89981e61c740cc74ea3a14247048584d30e59e87 Mon Sep 17 00:00:00 2001 From: Will Date: Fri, 18 Aug 2023 13:45:56 +0100 Subject: [PATCH 150/152] Updated image tag --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 0fd341a..086ccf1 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:d3daba4 #OUTDATED, DO NOT USE THIS COMMIT! 
+slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:1f51003 login: # Deployment resource name From 3ebcfe412c7e3afbf574f83915588c98deafe104 Mon Sep 17 00:00:00 2001 From: Will Date: Fri, 18 Aug 2023 15:33:24 +0100 Subject: [PATCH 151/152] Updated image --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 4b4b32d..182816a 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:d3daba4 #OUTDATED, DON'T USE! +slurmImage: ghcr.io/stackhpc/slurm-k8s-cluster:6ca2cd0 login: # Deployment resource name From 344b9b2419d8f0f78a81cb71c183c97db09579df Mon Sep 17 00:00:00 2001 From: Will Date: Fri, 18 Aug 2023 15:47:40 +0100 Subject: [PATCH 152/152] Updated image tag --- slurm-cluster-chart/values.yaml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 19746e0..a88f282 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,8 +1,4 @@ -<<<<<<< HEAD -slurmImage: ghcr.io/stackhpc/slurm-k8s-cluster:6ca2cd0 -======= -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:1f51003 ->>>>>>> main +slurmImage: ghcr.io/stackhpc/slurm-k8s-cluster:0602876 login: # Deployment resource name