diff --git a/.gitignore b/.gitignore index 6544b994b..6b028ec3a 100755 --- a/.gitignore +++ b/.gitignore @@ -38,6 +38,7 @@ src/WebUI/dotnet/WebPortal/userconfig.json src/WebUI/dotnet/WebPortal/configAuth.json src/WebUI/dotnet/WebPortal/Master-Templates.json src/WebUI/dotnet/WebPortal/hosting.json +**package-lock.json **/wwwroot/* **/bin/Release/* **/bin/Debug/* @@ -61,3 +62,8 @@ src/WebUI/dotnet/WebPortal/hosting.json /.vs/DLWorkspace/v15/.suo cluster-autoscaler +src/ClusterBootstrap/services/monitor/grafana-config.yaml +src/ClusterBootstrap/services/monitor/prometheus-alerting.yaml +src/ClusterBootstrap/services/monitor/alert-templates.yaml +src/ClusterBootstrap/services/jobmanager/dlws-scripts.yaml +src/ClusterBootstrap/services/monitor/alerting/kill-idle.rules diff --git a/azure-pipelines.yml b/azure-pipelines.yml new file mode 100644 index 000000000..f87ed15ca --- /dev/null +++ b/azure-pipelines.yml @@ -0,0 +1,40 @@ +# Starter pipeline +# Start with a minimal pipeline that you can customize to build and deploy your code. +# Add steps that build, run tests, deploy, and more: +# https://aka.ms/yaml + +trigger: +- dltsdev + +pool: + name: 'DLTS-Platform' + +# container: ubuntu:18.04 + +variables: { SUBSCRIPTION_NAME: "'Bing DLTS'" } + +steps: +- script: | + cd src/ClusterBootstrap/ + sudo ./install_prerequisites.sh + az account set --subscription $(SUBSCRIPTION_NAME) + az account list | grep -A5 -B5 '"isDefault": true' + displayName: 'Install prerequisites' + +- script: | + cd src/ClusterBootstrap/ + cp /mnt/_work/dlts_ci_config.yaml config.yaml + ./bash_step_by_step_deploy.sh + displayName: 'Deploy DLWorkspace' + +- script: | + echo TODO: verify the cluster is ready! + displayName: 'Verify deployment' + +- script: | + echo TODO: RUN functional tests! + displayName: 'Functional tests' + +- script: | + echo TODO: cleanup the deployment! + displayName: 'Cleanup' diff --git a/docs/deployment/Azure/FAQ.md b/docs/deployment/Azure/FAQ.md index 4debf3dcf..139abe035 100755 --- a/docs/deployment/Azure/FAQ.md +++ b/docs/deployment/Azure/FAQ.md @@ -1,32 +1,64 @@ -# Frequently Asked Questions (FAQ) for Azure Cluster Deployment. - -Please refer to [this](../knownissues/Readme.md) for more general deployment issues. - -## After setup, I cannot visit the deployed DL Workspace portal. - -* Please wait a few minutes after the deployment script runs through to allow the portal container to be pulled and scheduled for execution. - -## I can't execute Spark job on Azure. - -The current default deployment procedure on Azure doesn't deploy HDFS/Spark. So Spark job execution is not available. - -## For 'az login', when I type in the device code, the web page prompt me again for the code. - -It seems that sometime the browser (Edge, Chrome) cache another identity not intended to be used with az login. To get around, please start the browser in (in-private) or (incognito) mode, you may then enter the proper device code. - -## I have launched a job (e.g., TensorFlow-iPython-GPU). However, I am unable to access the endpoint with error - - ```This site can’t be reached - ....cloudapp.azure.com refused to connect. - ``` - - Please check the docker image of the job you are running. Sometime, the iPython (or SSH server) hasn't been properly started, which caused the endpoint to be not accessible. - -## I notice that my azure command is failing. - -Azure CLI may time out after inactivity. You may need to re-login via 'az login'. - -## Common configuration errors. 
-
-* "merge_config( config["azure_cluster"], tmpconfig["azure_cluster"][config["azure_cluster"]["cluster_name"]], verbose )"
+# Frequently Asked Questions (FAQ) for Azure Cluster Deployment.
+
+Please refer to [this](../knownissues/Readme.md) for more general deployment issues.
+
+## After setup, I cannot visit the deployed DL Workspace portal.
+
+* Please wait a few minutes after the deployment script finishes to allow the portal container to be pulled and scheduled for execution.
+
+## sudo ./az_tools.py create failed.
+
+* Check whether your subscription is correct. Always execute ```az account list | grep -A5 -B5 '"isDefault": true'``` to double-check.
+
+## Lost connection at the very first step of deploying the infra node to Azure, or during ```./deploy.py runscriptonall ./scripts/prepare_vm_disk.sh```.
+
+* Check whether the hostname and source address in config.yaml are set correctly. Also make sure that you can ssh to the node.
+
+## I cannot ssh to the node when my devbox is a physical server instead of a virtual one.
+
+* The source IP address in config.yaml should probably be your public IP (which can be obtained with ```curl ifconfig.me```), not the private IP you use to ssh to the devbox (as reported by ```hostname -I```). If you cannot even ssh to the node after creating it, first add a temporary rule in the Azure portal allowing any source and destination IP, with the destination port range set to 22. Then ssh to the node and type ```who``` to get the actual IP used to log in. Delete the temporary rule, and in the Networking settings add a /16 range derived from the ```who``` IP (last two octets set to 0, e.g., 167.220.2.105 becomes 167.220.0.0/16) to the allowed source IPs.
+
+## How do I know the node has been deployed?
+
+* You can log into the master node: ```./deploy.py connect master```
+
+## I could not build a docker image/No such image/An image does not exist locally with the tag/The repository XXX does not have a Release file
+
+* Check whether Docker can resolve DNS correctly. First ping a website from your devbox, then do the same inside a container, e.g., `docker run -it busybox`.
+  If the former works but the latter does not, your devbox probably reaches the public Internet through a private DNS server; add it to `/etc/docker/daemon.json` on your devbox. Refer to [this article](https://medium.com/@faithfulanere/solved-docker-build-could-not-resolve-archive-ubuntu-com-apt-get-fails-to-install-anything-9ea4dfdcdcf2).
+  Use `systemd-resolve --status` to get more information about DNS if it is not managed by network-manager.
+
+## I can connect to the master/infra node, but the UI is not working (cannot access it from a browser). How do I debug?
+
+* Log in to the master node and use ```docker ps | grep web``` to get the container ID of the Web UI, then use ```docker logs --follow <container ID>``` to figure out what happened.
+  A better way is ```sudo docker logs --tail 100 --follow $(sudo docker ps | grep webui | awk '{print $1}')```, since the ID changes every time the container is restarted.
+  Every time you modify /etc/WebUI/userconfig.json etc., remember to restart that container: ```docker rm -f <container ID>```
+
+## Deployment finished, but I am not able to connect to the master node via ```./deploy.py connect master```; ssh is denied even with ```ssh -i deploy/sshkey/id_rsa core@<master IP>```.
+
+* You need to change the owner: ```sudo chown -R <user>:<group> DLWorkspace/```. You can check ownership with ```ls -l```.
+
+## I can't execute Spark jobs on Azure.
+ +* The current default deployment procedure on Azure doesn't deploy HDFS/Spark. So Spark job execution is not available. + +## For 'az login', when I type in the device code, the web page prompt me again for the code. + +* It seems that sometime the browser (Edge, Chrome) cache another identity not intended to be used with az login. To get around, please start the browser in (in-private) or (incognito) mode, you may then enter the proper device code. + +## I have launched a job (e.g., TensorFlow-iPython-GPU). However, I am unable to access the endpoint with error + + ```This site can’t be reached + ....cloudapp.azure.com refused to connect. + ``` + + Please check the docker image of the job you are running. Sometime, the iPython (or SSH server) hasn't been properly started, which caused the endpoint to be not accessible. + +## I notice that my azure command is failing. + +Azure CLI may time out after inactivity. You may need to re-login via 'az login'. + +## Common configuration errors. + +* "merge_config( config["azure_cluster"], tmpconfig["azure_cluster"][config["azure_cluster"]["cluster_name"]], verbose )" Please check if the cluster_name used in azure_cluster is the same as the DL workspace cluster name. \ No newline at end of file diff --git a/docs/deployment/Azure/Readme.md b/docs/deployment/Azure/Readme.md index 726de1e0a..d009ab424 100755 --- a/docs/deployment/Azure/Readme.md +++ b/docs/deployment/Azure/Readme.md @@ -4,20 +4,36 @@ This document describes the procedure to deploy a DL Workspace cluster on Azure. Please note that the procedure below doesn't deploy HDFS/Spark on DLWorkspace cluster on Azure (Spark job execution is not available on Azure Cluster). -1. Follow [this document](../../DevEnvironment/Readme.md) to setup the dev environment of DLWorkspace. Login to your Azure subscription on your dev machine via: +Prerequisite steps: +First require the manager to add you into a subscription group., then either +1. go to that group from Azure Portal and add ubuntu server from resources, this virtual server is your devbox, or +2. if you have a physical machine, install ubuntu server system(18.04) on that and use it as your devbox +then use the devbox to deploy node on cloud. +Workflow: +1. Please [configure](configure.md) your azure cluster. Put config.yaml under src/ClusterBootstrap + +2. Change directory to src/ClusterBootstrap on devbox, and install prerequisite packages: +``` +cd src/ClusterBootstrap/ +sudo ./install_prerequisites.sh ``` +3. Login to Azure, setup proper subscription and confirm +``` +SUBSCRIPTION_NAME="" az login +az account set --subscription "${SUBSCRIPTION_NAME}" +az account list | grep -A5 -B5 '"isDefault": true' ``` - -2. Please [configure](configure.md) your azure cluster. - -3. Set proper [authentication](../authentication/Readme.md). - -4. Initial cluster and generate certificates and keys: +Configure your location, should be the same as you specified in config.yaml file: +```AZ_LOCATION=""``` +Execute this command, log out(exit) and log in back +```sudo usermod -aG docker zhe_ms``` +4. Initiate cluster and generate certificates and keys: ``` ./deploy.py -y build ``` + 5. Create Azure Cluster: ``` ./az_tools.py create @@ -40,9 +56,10 @@ Please note that if you are not Microsoft user, you should remove the ``` where machine1 is your azure infrastructure node. (you may get the address by ./deploy.py display) - The script block execute the following command in sequences: (you do NOT need to run the following commands if you have run step 5) - 1. 
Setup basic tools on the Ubuntu image. + This command sequetially execute following steps: + 1. Setup basic tools on VM and on the Ubuntu image. ``` + ./deploy.py runscriptonall ./scripts/prepare_vm_disk.sh ./deploy.py runscriptonall ./scripts/prepare_ubuntu.sh ``` @@ -57,16 +74,28 @@ Please note that if you are not Microsoft user, you should remove the ./deploy.py -y kubernetes labels ``` - 4. Build and deploy jobmanager, restfulapi, and webportal. Mount storage. + 4. Start Nvidia device plugins: ``` + ./deploy.py kubernetes start nvidia-device-plugin + ``` + + 5. Build and deploy jobmanager, restfulapi, and webportal. Mount storage. + ``` + ./deploy.py webui ./deploy.py docker push restfulapi ./deploy.py docker push webui - ./deploy.py webui ./deploy.py mount ./deploy.py kubernetes start jobmanager restfulapi webportal ``` -8. If you run into a deployment issue, please check [here](FAQ.md) first. - -9. If you want to deploy a DLWorkspace cluster that can be autoscaled (i.e., automatically create/release VM when needed), please follow the following additional steps. - +8. Manually connect to the infrastructure/master node: + ```./deploy.py connect master``` + On master node(log in from devbox by ./deploy.py connect master), manually add ```"Grafana": "",``` to /etc/WebUI/userconfig.json, under "Restapi" entry. + Restart the WebUI docker: + Login to the master node, and use + ```docker ps | grep web``` + to get the ID corresponding to Web UI, then restart that docker image: + ```docker rm -f ``` + Wait for minutes for it to restart (can follow by using ```docker logs --follow ```) and visit the infra node from web browser. + +9. If you run into a deployment issue, please check [here](FAQ.md) first. \ No newline at end of file diff --git a/docs/deployment/Azure/auto_scale.md b/docs/deployment/Azure/auto_scale.md index 8816c721e..0570b487c 100755 --- a/docs/deployment/Azure/auto_scale.md +++ b/docs/deployment/Azure/auto_scale.md @@ -1,31 +1,31 @@ -# The following describe the procedures to autoscale a DLWorkspace cluster (i.e., automatically create/release VM when needed). - -1. Download auto_scaler binary - ``` - wget https://github.com/DLWorkspace/autoscaler/releases/download/v1.9.0/cluster-autoscaler - ``` - -2. Setup azure running environment and login (via az login) - -3. For the Azure machine types supported, please check the document at: - ``` - src/ClusterBootstrap/templates/machine-types/azure/machineTypes.yaml - ``` - - A sample template is as follows. Please fill in additional worker VM SKUs if you need. - - ``` - --- - Standard_NC6: - cpu: 6 - memoryInMb: 56339 - gpu: 1 - Standard_D3_v2: - cpu: 4 - memoryInMb: 14339 - ``` - -4. Start auto_scaler: - ``` - ./cluster-autoscaler --v=5 --stderrthreshold=error --logtostderr=true --cloud-provider=aztools --skip-nodes-with-local-storage=false --nodes=0:10:Standard_NC6 --nodes=0:10:Standard_D3_v2 --leader-elect=false --scale-down-enabled=true --kubeconfig=./deploy/kubeconfig/kubeconfig.yaml --expander=least-waste +# The following describe the procedures to autoscale a DLWorkspace cluster (i.e., automatically create/release VM when needed). + +1. Download auto_scaler binary + ``` + wget https://github.com/DLWorkspace/autoscaler/releases/download/v1.9.0/cluster-autoscaler + ``` + +2. Setup azure running environment and login (via az login) + +3. For the Azure machine types supported, please check the document at: + ``` + src/ClusterBootstrap/templates/machine-types/azure/machineTypes.yaml + ``` + + A sample template is as follows. 
Please fill in additional worker VM SKUs if you need. + + ``` + --- + Standard_NC6: + cpu: 6 + memoryInMb: 56339 + gpu: 1 + Standard_D3_v2: + cpu: 4 + memoryInMb: 14339 + ``` + +4. Start auto_scaler: + ``` + ./cluster-autoscaler --v=5 --stderrthreshold=error --logtostderr=true --cloud-provider=aztools --skip-nodes-with-local-storage=false --nodes=0:10:Standard_NC6 --nodes=0:10:Standard_D3_v2 --leader-elect=false --scale-down-enabled=true --kubeconfig=./deploy/kubeconfig/kubeconfig.yaml --expander=least-waste ``` \ No newline at end of file diff --git a/docs/deployment/Azure/configure.md b/docs/deployment/Azure/configure.md index 6e9222b85..92532d09d 100755 --- a/docs/deployment/Azure/configure.md +++ b/docs/deployment/Azure/configure.md @@ -1,58 +1,80 @@ -# Configuration: Azure Cluster - -For more customized configuration, please refer to the [Configuration Section](../configuration/Readme.md). - -## Azure Cluster specific configuration - -We have greatly simplified Azure Cluster Configuration. As a minimum, you will only need to create a config.yaml file under src/ClusterBootstrap, with the cluster name. - -### Cluster Name - -Cluster name must be unique, and should be specified as: - -``` -cluster_name: [your cluster name] -``` - - -### Authentication -If you are not building a cluster for Microsoft employee usage, you will also need to configure [Authentication](../authentication/Readme.md). - -### Additional configuration. - -You may provide/change the specification of the deployed Azure cluster by adding the following information on config.yaml file: - -``` -azure_cluster: - <>: - "infra_node_num": 1, - "worker_node_num": 2, - "azure_location": "westus2", - "infra_vm_size" : "Standard_D1_v2", - "worker_vm_size": "Standard_NC6", -``` - -* infra_node_num: should be odd (1, 3 or 5), number of infrastructure node for the deployment. 3 infrastructure nodes tolerate 1 failure, and 5 infrastructure nodes tolerate 2 failures. However, more infrastructure nodes (and more failure tolerance) will reduce performance of the node. - -* worker_node_num: number of worker node used for deployment. - -* azure_location: - -Please use the following to find all available azure locations. -``` -az account list-locations -``` - -* infra_vm_size, worker_vm_size: infrastructure and worker VM size. - -Usually, a CPU VM will be used for infra_vm_size, and a GPU VM will be used for worker_vm_size. Please use the following to find all available Azure VM size. -``` -az vm list-sizes --location westus2 -``` - -* Configure MySql as follows: - -``` -datasource: MySQL -mysql_password: <> -``` +# Configuration: Azure Cluster + +For more customized configuration, please refer to the [Configuration Section](../configuration/Readme.md) and [Azure doc](https://docs.microsoft.com/en-us/cli/azure/vm?view=azure-cli-latest). + +## Azure Cluster specific configuration + +We have greatly simplified Azure Cluster Configuration. As a minimum, you will only need to create a config.yaml file under src/ClusterBootstrap, with the cluster name. + +### Cluster Name + +Cluster name must be unique, and should be specified as: + +``` +cluster_name: +``` + + +### Authentication +If you are not building a cluster for Microsoft employee usage, you will also need to configure [Authentication](../authentication/Readme.md). + +### Additional configuration. 
+ +You may provide/change the specification of the deployed Azure cluster by adding the following information on config.yaml file: + +``` +cluster_name: exitedtoad +#vm size: westus:Standard_DS1, westus2:Standard_DS1_v2 +azure_cluster: + : + "infra_node_num": 1 + "worker_node_num": 2 + "azure_location": "westus2" + "infra_vm_size": "Standard_DS1_v2" + "worker_vm_size": "Standard_NC6" + "vm_image" : "Canonical:UbuntuServer:18.04-LTS:18.04.201907221" +datasource: MySQL +webuiport: 80 +mysql_password: +cloud_config: + default_admin_username: core + dev_network: + source_addresses_prefixes: + # These are the dev box of the cluster, only the machine in the IP address below will have access to the cluster. + - "/32" +registry_credential: + : + username: + password: +``` + +* cluster_name: A name without underscore or numbers (purely consisting of lower case letters) is recommended. + +* infra_node_num: Should be odd (1, 3 or 5), number of infrastructure node for the deployment. 3 infrastructure nodes tolerate 1 failure, and 5 infrastructure nodes tolerate 2 failures. However, more infrastructure nodes (and more failure tolerance) will reduce performance of the node. + +* worker_node_num: Number of worker node used for deployment. + +* vm_image: Used to fix the image version if the changing LTS is breaking the consistency of the deployment. + +* azure_location: + +Please use the following to find all available azure locations. +``` +az account list-locations +``` + +* infra_vm_size, worker_vm_size: infrastructure and worker VM size. + +Usually, a CPU VM will be used for infra_vm_size, and a GPU VM will be used for worker_vm_size. Please use the following to find all available Azure VM size. +``` +az vm list-sizes --location westus2 +``` + +* Configure MySql as follows: + +``` +datasource: MySQL +mysql_password: <> +``` + +* registry_credential: defines your access to certain dockers. A docker image name consists of three parts - registry name, image name, and image tag. If your job needs a certain private docker, then use 0. the registry name of that docker, 1. your user name and 2. your password to specify your access to it. \ No newline at end of file diff --git a/docs/deployment/On-Prem/Ubuntu-Machines.md b/docs/deployment/On-Prem/Ubuntu-Machines.md new file mode 100755 index 000000000..9e0dd3d85 --- /dev/null +++ b/docs/deployment/On-Prem/Ubuntu-Machines.md @@ -0,0 +1,135 @@ +# Steps to deploy DL workspace cluster for a off-prem cluster Ubuntu. + +This document describes the procedure to deploy DL workspace cluster on a off-prem Clusters (VM or actual machine) that is already been imaged with Ubuntu OS. + +1. On dev node, +..* Enable password-less sudo, with sudo visudo +..* Generate ssh-key, and grant github access for the dev machine. +..* Install git, if not installed yet. +..* Find data partition, if any, use mkfs ext4 to make partition, and to prepare the partition for mount in fstab. +..* use ```sudo mount -a``` to remount. +..* check DNS setting, check if network is working. If shows TLS error message, network may not be stable. + +2. [Run Once] Setup [development environment](../../DevEnvironment/Readme.md). + +3. [Configuration the cluster](../configuration/Readme.md), and determine important information of the cluster (e.g., cluster name, number of Etcd servers used). Please refer to [Backup/Restore](../Backup.md) on instruction to backup/restore cluster configuration. + +4. Configure and setup the [databased](../database/Readme.md) used in the cluster. + +5. 
Config shared file system to be used in the cluster, following instructions in [Storage](../Storage/Readme.md) and the [configuration](../Storage/nfs.md). + +6. Install sshkey to all nodes. +..* in config.yaml, insert entry of the admin_username of cluster + admin_username: +..* insert admin password to + ./deploy/sshkey/rootpasswd +..* install sshkey via: + ./deploy.py sshkey install + +7. Configure the information of the servers used in the cluster. Please write the following entries in config.yaml. + + ``` + network: + domain: <> + container-network-iprange: "<>" + + platform-scripts : ubuntu + + machines: + <>: + role: infrastructure + <>: + role: worker + <>: + role: worker + .... + ``` + If you are building a high availability cluster, please include multiple infrastructure nodes. The number of infrastructure nodes should be odd, e.g., 1, 3, 5. 3 infrastructure nodes tolerate 1 failure. 5 infrastructure nodes tolerate 2 failures. + +8. Build Ubuntu PXE-server via: + ``` + ./deploy.py -y build + ./deploy.py build pxe-ubuntu + ``` + +9. Start Ubuntu PXE-server. You will need to point DHCP server to the Ubuntu PXE-server. + ``` + ./deploy.py docker run pxe-ubuntu + ``` + Reboot each machine to be deployed. In each boot screen, select to install Ubuntu 16.04. + +10. After the machines is reimaged to Ubuntu, install sshkey. (optional: If you ignore step 2,3 and choose to use an existing ubuntu cluster, you may put root username and password to files: ./deploy/sshkey/rootuser and ./deploy/sshkey/rootpasswd. In this case, the root user should be able to run "sudo" without password.) + ``` + ./deploy.py sshkey install + ``` + +11. Enable password less sudo priviledge, by adding following entry to `sudo visudo` + ``` + ALL=(ALL) NOPASSWD:ALL + ``` + Please verify if password less sudo works on the remote machine. e.g., via, + ``` + ./deploy.py execonall sudo ls -al + ``` + +12. If apt-get gives a crash error, the issue is caused by: + https://askubuntu.com/questions/942895/e-problem-executing-scripts-aptupdatepost-invoke-success + ``` + ./deploy.py execonall sudo apt-get remove libappstream3 + ``` + + +12. Setup basic tools on the Ubuntu image. + ``` + ./deploy.py runscriptonall ./scripts/prepare_ubuntu.sh + ./deploy.py execonall sudo usermod -aG docker core + ``` + Use nvidia-docker as default docker run time (included in kubelet startup) + +13. Partition hard drive, if necessary. Please refer to section [Partition](Repartiuion.md) for details. + +14. Setup kubernetes + If setup in the cluster in China, we tried to use dlws/pause-amd64:3.0 as --pod-infra-container-image, however, for some reason, this doesn't work. Temporary, please do: + ``` + ./deploy.py execonall docker pull dlws/pause-amd64:3.0 + ./deploy.py execonall docker tag dlws/pause-amd64:3.0 gcr.io/google_containers/pause-amd64:3.0 + ``` + ``` + ./deploy.py -y deploy + ./deploy.py -y updateworker + ./deploy.py -y kubernetes labels + ``` + If you are running a small cluster, and need to run workload on the Kubernete master node (this choice may affect cluster stability), please use: + ``` + ./deploy.py -y kubernetes uncordon + ``` + Works now will be scheduled on the master node. If you stop here, you will have a fully functional kubernete cluster. Thus, part of DL Workspace setup can be considered automatic procedure to setup a kubernete cluster. You don't need shared file system or database for kubernete cluster operation. + +15. [optional] Configure, setup and mount [GlusterFS](../Storage/GlusterFS.md) +16. 
[Optional] Configure, setup and mount [HDFS](../Storage/hdfs.md) +17. [Optional] Setup [Spark](../Storage/spark.md) + +18. Mount shared file system + ``` + ./deploy.py mount + ``` +19. Deploy nvidia-device plugin. + ``` + ./deploy.py kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v1.9/nvidia-device-plugin.yml + ``` + +20. Build and deploy jobmanager, restfulapi, and webportal. Mount storage. + ``` + ./deploy.py webui + ./deploy.py docker push restfulapi + ./deploy.py docker push webui + ./deploy.py nginx fqdn + ./deploy.py nginx config + ./deploy.py kubernetes start mysql + ./deploy.py kubernetes start jobmanager + ./deploy.py kubernetes start restfulapi + ./deploy.py kubernetes start webportal + ./deploy.py kubernetes start cloudmonitor + ./deploy.py kubernetes start nginx + ./deploy.py kubernetes start custommetrics + ``` diff --git a/docs/deployment/Storage/nfs.md b/docs/deployment/Storage/nfs.md new file mode 100755 index 000000000..86faaf6e2 --- /dev/null +++ b/docs/deployment/Storage/nfs.md @@ -0,0 +1,36 @@ +# Deployment NFS server + +The document describes the procedure to setup NFS server. We follow the procedure in https://www.digitalocean.com/community/tutorials/how-to-set-up-an-nfs-mount-on-ubuntu-16-04 + +1. Install nfs kernel. + ``` + sudo apt-get update + sudo apt-get install nfs-kernel-server + ``` + +2. Format ext4 partition, and mount the partition to a particular mount point + ``` + sudo mkdir -p /mnt/dlwsdata + # nfs use nobody:nogroup to visit + sudo chown nobody:nogroup /mnt/dlwsdata + # discover the UUID information of the block device. + sudo lsblk -o Name,FSTYPE,UUID + # edit /etc/fstab, and add entry, which mounts a particular UUID storage device to the mount point. Last number is the fsck order. + UUID=e2c91cb7-c97d-46f7-a51b-001a06a14e08 /mnt/dlwsdata ext4 errors=remount-ro 0 2 + # Comment any swap entry, as kubelet doesn't run with swap on + # causes all filesystems mentioned in fstab (of the proper type and/or having or not having the proper options) to be mounted as indicated, except for those whose line contains the noauto keyword. + sudo mount -a + ``` + +3. Modify /etc/exports + /mnt/dlwsdata *(rw,sync,no_root_squash,no_subtree_check) + +4. Check firewall status, if any. + ``` + sudo ufw status + ``` + +5. Start NFS server. + ``` + sudo systemctl restart nfs-kernel-server + ``` diff --git a/docs/deployment/network/dns.md b/docs/deployment/network/dns.md new file mode 100644 index 000000000..4d7639d20 --- /dev/null +++ b/docs/deployment/network/dns.md @@ -0,0 +1,12 @@ +# Add DNS server. + +# https://unix.stackexchange.com/questions/128220/how-do-i-set-my-dns-when-resolv-conf-is-being-overwritten + +sudo vim /etc/resolvconf/resolv.conf.d/base + +Then put your nameserver list in like so: + +nameserver 8.8.8.8 +nameserver 8.8.4.4 + +sudo resolvconf -u diff --git a/sample.output b/sample.output old mode 100644 new mode 100755 diff --git a/src/ARM/azuredeploy.json b/src/ARM/azuredeploy.json index cbedc2406..b93e95005 100755 --- a/src/ARM/azuredeploy.json +++ b/src/ARM/azuredeploy.json @@ -20,7 +20,7 @@ "type": "string", "metadata": { "description": "Username on machines." - } + } }, "adminPassword": { "type": "securestring", @@ -52,7 +52,7 @@ "type": "string", "defaultValue": "Standard_DS2_v2", "metadata": { - "description": "Size of the infra-VM. Use a CPU VM for infra-VM." + "description": "Size of the infra-VM. Use a CPU VM for infra-VM." 
} }, "numberOfWorkerVM": { @@ -101,7 +101,7 @@ "metadata": { "description": "Client Secret of the web application registered with the authentication provider." } - }, + }, "_artifactsLocation": { "type": "string", "metadata": { @@ -467,9 +467,10 @@ "destinationPortRanges": [ "80", "443", - "30000-32767", + "30000-39999", "25826", - "3080" + "3080", + "40000-49999" ] } }, diff --git a/src/ClusterBootstrap/acs_tools.py b/src/ClusterBootstrap/acs_tools.py index 5f253b7d8..8415481f2 100755 --- a/src/ClusterBootstrap/acs_tools.py +++ b/src/ClusterBootstrap/acs_tools.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python # Tools to build ACS cluster import sys @@ -140,7 +140,7 @@ def acs_wait_for_kube(): while numNodes < expectedNodes: binary = os.path.abspath('./deploy/bin/kubectl') kubeconfig = os.path.abspath('./deploy/'+config["acskubeconfig"]) - cmd = binary + ' -o=json --kubeconfig='+kubeconfig+' get nodes' + cmd = binary + ' -o=json --kubeconfig='+kubeconfig+' get nodes' nodeInfo = utils.subproc_runonce(cmd) nodes = yaml.load(nodeInfo) numNodes = len(nodes["items"]) @@ -148,7 +148,7 @@ def acs_wait_for_kube(): print "Waiting for {0} kubernetes nodes to start up, currently have only {1} nodes".format(expectedNodes, numNodes) time.sleep(5) -# divide nodes into master / agent +# divide nodes into master / agent def acs_set_nodes_info(): if "acs_master_nodes" not in config or "acs_agent_nodes" not in config: allnodes = acs_get_kube_nodes() @@ -362,7 +362,7 @@ def acs_get_storage_key(): cmd += " --account-name=%s" % config["mountpoints"]["rootshare"]["accountname"] cmd += " --resource-group=%s" % config["resource_group"] keys = az_cmd(cmd) - return keys[0]["value"] + return keys[0]["value"] def acs_create_storage(): # Create storage account @@ -497,7 +497,7 @@ def acs_deploy(): az_create_sql() # Add rules for NSG - acs_add_nsg_rules({"HTTPAllow" : 80, "RestfulAPIAllow" : 5000, "AllowKubernetesServicePorts" : "30000-32767"}) + acs_add_nsg_rules({"HTTPAllow" : 80, "RestfulAPIAllow" : 5000, "AllowKubernetesServicePorts" : "30000-49999"}) # Get kubectl binary / acs config acs_get_config() diff --git a/src/ClusterBootstrap/az_params.py b/src/ClusterBootstrap/az_params.py index 27849210b..a8cb49b1c 100755 --- a/src/ClusterBootstrap/az_params.py +++ b/src/ClusterBootstrap/az_params.py @@ -1,14 +1,16 @@ -default_az_parameters = { - "azure_cluster" : { - "infra_node_num": 1, - "worker_node_num": 2, - "azure_location": "westus2", - "infra_vm_size" : "Standard_D1_v2", - "worker_vm_size": "Standard_NC6", - "vm_image" : "UbuntuLTS", - "vm_storage_sku" : "Standard_LRS", - # "udp_port_ranges": "" - # Use file_share_name to create Azure file share - # "file_share_name" : "files", - }, -} \ No newline at end of file +default_az_parameters = { + "azure_cluster" : { + "infra_node_num": 1, + "worker_node_num": 2, + "nfs_node_num": 0, + "azure_location": "westus2", + "infra_vm_size" : "Standard_D1_v2", + "worker_vm_size": "Standard_NC6", + "nfs_vm_size": "Standard_D1_v2", + "vm_image" : "Canonical:UbuntuServer:18.04-LTS:18.04.201907221", + "vm_storage_sku" : "Premium_LRS", + # "udp_port_ranges": "" + # Use file_share_name to create Azure file share + # "file_share_name" : "files", + }, +} diff --git a/src/ClusterBootstrap/az_tools.py b/src/ClusterBootstrap/az_tools.py index addd1a366..3f92e2a71 100755 --- a/src/ClusterBootstrap/az_tools.py +++ b/src/ClusterBootstrap/az_tools.py @@ -34,6 +34,7 @@ from params import * verbose = False +no_execution = False # These are the default configuration parameter @@ -76,6 
+77,9 @@ def update_config(config, genSSH=True): config["azure_cluster"]["nsg_name"] = config[ "azure_cluster"]["cluster_name"] + "-nsg" + if int(config["azure_cluster"]["nfs_node_num"]) > 0: + config["azure_cluster"]["nfs_nsg_name"] = config[ + "azure_cluster"]["cluster_name"] + "-nfs-nsg" config["azure_cluster"]["sql_server_name"] = config[ "azure_cluster"]["cluster_name"] + "sqlserver" config["azure_cluster"]["sql_admin_name"] = config[ @@ -125,6 +129,7 @@ def create_vm_pwd(vmname, vm_ip, vm_size, use_private_ip, pwd): --nsg %s \ --admin-username %s \ --storage-sku %s \ + --data-disk-sizes-gb 2047 \ %s \ """ % (config["azure_cluster"]["resource_group_name"], vmname, @@ -141,10 +146,13 @@ def create_vm_pwd(vmname, vm_ip, vm_size, use_private_ip, pwd): if verbose: print(cmd) - output = utils.exec_cmd_local(cmd) - print(output) + if not no_execution: + output = utils.exec_cmd_local(cmd) + print(output) -def create_vm(vmname, vm_ip, bIsWorker, vm_size): +def create_vm(vmname, vm_ip, role, vm_size): + specify_priv_IP = role in ["worker","nfs"] + nsg = "nfs_nsg_name" if role == "nfs" else "nsg_name" cmd = """ az vm create --resource-group %s \ --name %s \ @@ -159,6 +167,7 @@ def create_vm(vmname, vm_ip, bIsWorker, vm_size): --nsg %s \ --admin-username %s \ --storage-sku %s \ + --data-disk-sizes-gb 2047 \ --ssh-key-value "%s" """ % (config["azure_cluster"]["resource_group_name"], vmname, @@ -168,10 +177,10 @@ def create_vm(vmname, vm_ip, bIsWorker, vm_size): config["azure_cluster"]["azure_location"], vm_size, config["azure_cluster"]["vnet_name"], - config["azure_cluster"]["nsg_name"], + config["azure_cluster"][nsg], config["cloud_config"]["default_admin_username"], config["azure_cluster"]["vm_storage_sku"], - config["azure_cluster"]["sshkey"]) if not bIsWorker else """ + config["azure_cluster"]["sshkey"]) if not specify_priv_IP else """ az vm create --resource-group %s \ --name %s \ --image %s \ @@ -184,6 +193,7 @@ def create_vm(vmname, vm_ip, bIsWorker, vm_size): --nsg %s \ --admin-username %s \ --storage-sku %s \ + --data-disk-sizes-gb 2047 \ --ssh-key-value "%s" """ % (config["azure_cluster"]["resource_group_name"], vmname, @@ -192,7 +202,7 @@ def create_vm(vmname, vm_ip, bIsWorker, vm_size): config["azure_cluster"]["azure_location"], vm_size, config["azure_cluster"]["vnet_name"], - config["azure_cluster"]["nsg_name"], + config["azure_cluster"][nsg], config["cloud_config"]["default_admin_username"], config["azure_cluster"]["vm_storage_sku"], config["azure_cluster"]["sshkey"]) @@ -200,8 +210,9 @@ def create_vm(vmname, vm_ip, bIsWorker, vm_size): # --public-ip-address-allocation static \ if verbose: print(cmd) - output = utils.exec_cmd_local(cmd) - print(output) + if not no_execution: + output = utils.exec_cmd_local(cmd) + print(output) def create_group(): @@ -210,8 +221,9 @@ def create_group(): """ % (config["azure_cluster"]["resource_group_name"], config["azure_cluster"]["azure_location"]) if verbose: print(cmd) - output = utils.exec_cmd_local(cmd) - print(output) + if not no_execution: + output = utils.exec_cmd_local(cmd) + print(output) def create_sql(): @@ -228,8 +240,9 @@ def create_sql(): config["azure_cluster"]["sql_admin_password"]) if verbose: print(cmd) - output = utils.exec_cmd_local(cmd) - print(output) + if not no_execution: + output = utils.exec_cmd_local(cmd) + print(output) cmd = """ az sql server firewall-rule create --resource-group %s \ @@ -241,8 +254,9 @@ def create_sql(): config["azure_cluster"]["sql_server_name"]) if verbose: print(cmd) - output = 
utils.exec_cmd_local(cmd) - print(output) + if not no_execution: + output = utils.exec_cmd_local(cmd) + print(output) def create_storage_account(): @@ -258,8 +272,9 @@ def create_storage_account(): config["azure_cluster"]["azure_location"]) if verbose: print(cmd) - output = utils.exec_cmd_local(cmd) - print(output) + if not no_execution: + output = utils.exec_cmd_local(cmd) + print(output) def create_file_share(): @@ -270,8 +285,9 @@ def create_file_share(): --query 'connectionString' \ -o tsv """ % (config["azure_cluster"]["storage_account_name"], config["azure_cluster"]["resource_group_name"]) - output = utils.exec_cmd_local(cmd) - print(output) + if not no_execution: + output = utils.exec_cmd_local(cmd) + print(output) cmd = """ az storage share create \ @@ -281,8 +297,9 @@ def create_file_share(): """ % (config["azure_cluster"]["file_share_name"], output) if verbose: print(cmd) - output = utils.exec_cmd_local(cmd) - print(output) + if not no_execution: + output = utils.exec_cmd_local(cmd) + print(output) def create_vnet(): @@ -299,12 +316,12 @@ def create_vnet(): config["cloud_config"]["vnet_range"]) if verbose: print(cmd) - output = utils.exec_cmd_local(cmd) - print(output) + if not no_execution: + output = utils.exec_cmd_local(cmd) + print(output) def create_nsg(): - # print config["cloud_config"]["dev_network"] if "source_addresses_prefixes" in config["cloud_config"]["dev_network"]: source_addresses_prefixes = config["cloud_config"][ "dev_network"]["source_addresses_prefixes"] @@ -321,8 +338,9 @@ def create_nsg(): config["azure_cluster"]["nsg_name"]) if verbose: print(cmd) - output = utils.exec_cmd_local(cmd) - print(output) + if not no_execution: + output = utils.exec_cmd_local(cmd) + print(output) if "tcp_port_ranges" in config["cloud_config"]: cmd = """ @@ -338,8 +356,9 @@ def create_nsg(): config["azure_cluster"]["nsg_name"], config["cloud_config"]["tcp_port_ranges"] ) - output = utils.exec_cmd_local(cmd) - print(output) + if not no_execution: + output = utils.exec_cmd_local(cmd) + print(output) if "udp_port_ranges" in config["cloud_config"]: cmd = """ @@ -355,8 +374,9 @@ def create_nsg(): config["azure_cluster"]["nsg_name"], config["cloud_config"]["udp_port_ranges"] ) - output = utils.exec_cmd_local(cmd) - print(output) + if not no_execution: + output = utils.exec_cmd_local(cmd) + print(output) cmd = """ az network nsg rule create \ @@ -365,7 +385,7 @@ def create_nsg(): --name allowdevtcp \ --protocol tcp \ --priority 900 \ - --destination-port-range %s \ + --destination-port-ranges %s \ --source-address-prefixes %s \ --access allow """ % ( config["azure_cluster"]["resource_group_name"], @@ -373,8 +393,64 @@ def create_nsg(): config["cloud_config"]["dev_network"]["tcp_port_ranges"], source_addresses_prefixes ) - output = utils.exec_cmd_local(cmd) - print(output) + if not no_execution: + output = utils.exec_cmd_local(cmd) + print(output) + +def create_nfs_nsg(): + if "source_addresses_prefixes" in config["cloud_config"]["dev_network"]: + source_addresses_prefixes = config["cloud_config"][ + "dev_network"]["source_addresses_prefixes"] + else: + print "Please setup source_addresses_prefixes in config.yaml, otherwise, your cluster cannot be accessed" + exit() + cmd = """ + az network nsg create \ + --resource-group %s \ + --name %s + """ % ( config["azure_cluster"]["resource_group_name"], + config["azure_cluster"]["nfs_nsg_name"]) + if verbose: + print(cmd) + if not no_execution: + output = utils.exec_cmd_local(cmd) + print(output) + + print 
type(config["cloud_config"]["nfs_ssh"]["source_ips"]), config["cloud_config"]["nfs_ssh"]["source_ips"],type(source_addresses_prefixes), source_addresses_prefixes + cmd = """ + az network nsg rule create \ + --resource-group %s \ + --nsg-name %s \ + --name allow_ssh\ + --priority 900 \ + --destination-port-ranges %s \ + --source-address-prefixes %s \ + --access allow + """ % ( config["azure_cluster"]["resource_group_name"], + config["azure_cluster"]["nfs_nsg_name"], + config["cloud_config"]["nfs_ssh"]["port"], + " ".join(config["cloud_config"]["nfs_ssh"]["source_ips"] + source_addresses_prefixes), + ) + if not no_execution: + output = utils.exec_cmd_local(cmd) + print(output) + + cmd = """ + az network nsg rule create \ + --resource-group %s \ + --nsg-name %s \ + --name allow_share \ + --priority 1000 \ + --source-address-prefixes %s \ + --destination-port-ranges \'*\' \ + --access allow + """ % ( config["azure_cluster"]["resource_group_name"], + config["azure_cluster"]["nfs_nsg_name"], + " ".join(config["cloud_config"]["nfs_share"]["source_ips"]), + ) + if not no_execution: + output = utils.exec_cmd_local(cmd) + print(output) def delete_group(): @@ -383,17 +459,18 @@ def delete_group(): """ % (config["azure_cluster"]["resource_group_name"]) if verbose: print(cmd) - output = utils.exec_cmd_local(cmd) - print(output) + if not no_execution: + output = utils.exec_cmd_local(cmd) + print(output) -def get_vm_ip(i, bIsWorker, bIsDev): +def get_vm_ip(i, role): vnet_range = config["cloud_config"]["vnet_range"] vnet_ip = vnet_range.split("/")[0] vnet_ips = vnet_ip.split(".") - if bIsWorker: + if role in ["worker", "nfs"]: return vnet_ips[0] + "." + vnet_ips[1] + "." + "1" + "." + str(i + 1) - elif bIsDev: + elif role == "dev": return vnet_ips[0] + "." + vnet_ips[1] + "." + "255" + "." + str(int(config["azure_cluster"]["infra_node_num"]) + 1) else: # 192.168.0 is reserved. @@ -402,6 +479,7 @@ def get_vm_ip(i, bIsWorker, bIsDev): def create_cluster(arm_vm_password=None): bSQLOnly = (config["azure_cluster"]["infra_node_num"] <= 0) + assert int(config["azure_cluster"]["nfs_node_num"]) >= len(config["cloud_config"]["nfs_suffixes"]) print "creating resource group..." create_group() if not bSQLOnly: @@ -414,43 +492,56 @@ def create_cluster(arm_vm_password=None): create_vnet() print "creating network security group..." create_nsg() + if int(config["azure_cluster"]["nfs_node_num"]) > 0: + create_nfs_nsg() if useSqlAzure(): print "creating sql server and database..." 
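        # Note on create_nfs_nsg() above: it reads a few cloud_config entries that
        # are not shown elsewhere in this change. A minimal, purely illustrative
        # sketch of the assumed shape (key names taken from the code, values made up):
        #   cloud_config:
        #     nfs_ssh:
        #       port: 22
        #       source_ips: ["<admin machine IP>/32"]
        #     nfs_share:
        #       source_ips: ["<cluster subnet CIDR>"]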
create_sql() if arm_vm_password is not None: - # dev box - create_vm_param(0, False, True, config["azure_cluster"]["infra_vm_size"], + # dev box, used in extreme condition when there's only one public IP available, then would use dev in cluster to bridge-connect all of them + create_vm_param(0, "dev", config["azure_cluster"]["infra_vm_size"], True, arm_vm_password) for i in range(int(config["azure_cluster"]["infra_node_num"])): - create_vm_param(i, False, False, config["azure_cluster"]["infra_vm_size"], + create_vm_param(i, "infra", config["azure_cluster"]["infra_vm_size"], arm_vm_password is not None, arm_vm_password) for i in range(int(config["azure_cluster"]["worker_node_num"])): - create_vm_param(i, True, False, config["azure_cluster"]["worker_vm_size"], + create_vm_param(i, "worker", config["azure_cluster"]["worker_vm_size"], arm_vm_password is not None, arm_vm_password) - - -def create_vm_param(i, isWorker, isDev, vm_size, no_az=False, arm_vm_password=None): - if isWorker: - if no_az: - vmname = "%s-worker%02d" % (config["azure_cluster"] - ["cluster_name"], i+1) + # create nfs server if specified. + for i in range(int(config["azure_cluster"]["nfs_node_num"])): + if i < len(config["azure_cluster"]["nfs_suffixes"]): + create_vm_role_suffix(i, "nfs", config["azure_cluster"]["nfs_vm_size"], + config["azure_cluster"]["nfs_suffixes"][i], arm_vm_password) else: - vmname = "%s-worker-%s" % (config["azure_cluster"] - ["cluster_name"], random_str(6)) - elif not isDev: + create_vm_param(i, "nfs", config["azure_cluster"]["nfs_vm_size"], + arm_vm_password is not None, arm_vm_password) + +def create_vm_param(i, role, vm_size, no_az=False, arm_vm_password=None): + if role in ["worker","nfs"]: + vmname = "{}-{}".format(config["azure_cluster"]["cluster_name"], role) + ("{:02d}".format(i+1) if no_az else '-'+random_str(6)) + elif role == "infra": vmname = "%s-infra%02d" % (config["azure_cluster"] ["cluster_name"], i + 1) - else: + elif role == "dev": vmname = "%s-dev" % (config["azure_cluster"]["cluster_name"]) print "creating VM %s..." % vmname - vm_ip = get_vm_ip(i, isWorker, isDev) + vm_ip = get_vm_ip(i, role) if arm_vm_password is not None: - create_vm_pwd(vmname, vm_ip, vm_size, not isWorker, arm_vm_password) + create_vm_pwd(vmname, vm_ip, vm_size, not role in ["worker","nfs"], arm_vm_password) else: - create_vm(vmname, vm_ip, isWorker, vm_size) + create_vm(vmname, vm_ip, role, vm_size) return vmname +def create_vm_role_suffix(i, role, vm_size, suffix, arm_vm_password=None): + vmname = "{}-{}-".format(config["azure_cluster"]["cluster_name"], role) + suffix + print "creating VM %s..." 
% vmname + vm_ip = get_vm_ip(i, role) + if arm_vm_password is not None: + create_vm_pwd(vmname, vm_ip, vm_size, not role in ["worker","nfs"], arm_vm_password) + else: + create_vm(vmname, vm_ip, role, vm_size) + return vmname def useSqlAzure(): if "datasource" in config["azure_cluster"]: @@ -518,12 +609,12 @@ def vm_interconnects(): --name tcpinterconnect \ --protocol tcp \ --priority 850 \ - --destination-port-range %s \ + --destination-port-ranges %s \ --source-address-prefixes %s \ --access allow """ % ( config["azure_cluster"]["resource_group_name"], config["azure_cluster"]["nsg_name"], - config["cloud_config"]["dev_network"]["tcp_port_ranges"], + config["cloud_config"]["inter_connect"]["tcp_port_ranges"], portinfo ) if verbose: @@ -597,7 +688,6 @@ def get_disk_from_vm(vmname): return output.split("/")[-1].strip('\n') - def gen_cluster_config(output_file_name, output_file=True, no_az=False): bSQLOnly = (config["azure_cluster"]["infra_node_num"] <= 0) if useAzureFileshare() and not no_az: @@ -656,8 +746,7 @@ def gen_cluster_config(output_file_name, output_file=True, no_az=False): for i in range(int(config["azure_cluster"]["infra_node_num"])): vmname = "%s-infra%02d" % (config["azure_cluster"] ["cluster_name"], i + 1) - cc["machines"][vmname] = { - "role": "infrastructure", "private-ip": get_vm_ip(i, False, False)} + cc["machines"][vmname] = {"role": "infrastructure", "private-ip": get_vm_ip(i, "infra")} # Generate the workers in machines. vm_list = [] @@ -665,23 +754,41 @@ def gen_cluster_config(output_file_name, output_file=True, no_az=False): vm_list = get_vm_list_by_grp() else: vm_list = get_vm_list_by_enum() + + vm_ip_names = get_vm_private_ip() + vm_ip_names = sorted(vm_ip_names, key = lambda x:x['name']) + + sku_mapping = config["sku_mapping"] + for vm in vm_list: vmname = vm["name"] if "-worker" in vmname: if isNewlyScaledMachine(vmname): cc["machines"][vmname] = { "role": "worker", "scaled": True, - "node-group": vm["vmSize"]} + "node-group": vm["vmSize"],"gpu-type":sku_mapping[vm["vmSize"]]["gpu-type"]} else: cc["machines"][vmname] = { "role": "worker", - "node-group": vm["vmSize"]} - + "node-group": vm["vmSize"],"gpu-type":sku_mapping[vm["vmSize"]]["gpu-type"]} + nfs_nodes = [] + for vm in vm_list: + vmname = vm["name"] + if "-nfs" in vmname: + cc["machines"][vmname] = { + "role": "nfs", + "node-group": vm["vmSize"]} + + # Dilemma : Before the servers got created, you don't know there name, cannot specify which server does a mountpoint config group belongs to + if int(config["azure_cluster"]["nfs_node_num"]) > 0: + nfs_names2ip = {rec['name']:rec['privateIP'][0] for rec in vm_ip_names if "-nfs" in rec['name']} + else: + nfs_names2ip = {rec['name']:rec['privateIP'][0] for rec in vm_ip_names if "infra" in rec['name']} if not bSQLOnly: # Require explicit authorization setting. 
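        # Illustrative sketch (assumed, not taken from a real deployment) of the
        # cloud_config["nfs_svr_setup"] list consumed by the NFS mountpoint
        # generation further below: each item may pin a dedicated NFS VM via an
        # optional "server_suffix" (which must also appear in
        # azure_cluster["nfs_suffixes"]) and carries a "mnt_point" map whose
        # entries become cc["mountpoints"] items. The inner field names follow the
        # old rootshare entry and are examples only:
        #   nfs_svr_setup:
        #     - server_suffix: data
        #       mnt_point:
        #         rootshare:
        #           filesharename: /mnt/share
        #           curphysicalmountpoint: /mntdlws/nfs
        #           mountpoints: ""
        #     - mnt_point:
        #         scratch:
        #           filesharename: /mnt/scratch
        #           curphysicalmountpoint: /mntdlws/scratch
        #           mountpoints: ""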
# cc["WinbindServers"] = [] # cc["WebUIauthorizedGroups"] = ['MicrosoftUsers'] - cc["mountpoints"] = {"rootshare": {}} + cc["mountpoints"] = {} if useAzureFileshare(): cc["mountpoints"]["rootshare"]["type"] = "azurefileshare" cc["mountpoints"]["rootshare"]["accountname"] = config[ @@ -692,13 +799,29 @@ def gen_cluster_config(output_file_name, output_file=True, no_az=False): if file_share_key is not None: cc["mountpoints"]["rootshare"]["accesskey"] = file_share_key else: - cc["mountpoints"]["rootshare"]["type"] = "nfs" - cc["mountpoints"]["rootshare"]["server"] = get_vm_ip(0, False, False) - cc["mountpoints"]["rootshare"]["filesharename"] = "/mnt/share" - cc["mountpoints"]["rootshare"][ - "curphysicalmountpoint"] = "/mntdlws/nfs" - cc["mountpoints"]["rootshare"]["mountpoints"] = "" - + named_nfs_suffix = set(config["azure_cluster"]["nfs_suffixes"] if "nfs_suffixes" in config["azure_cluster"] else []) + used_nfs_suffix = set([nfs_cnf["server_suffix"] for nfs_cnf in config["cloud_config"]["nfs_svr_setup"] if "server_suffix" in nfs_cnf]) + assert (used_nfs_suffix - named_nfs_suffix) == set() and "suffix not in nfs_suffixes list!" + assert len(nfs_names2ip) >= len(config["cloud_config"]["nfs_svr_setup"]) and "More NFS config items than #. of NFS server" + suffix2used_nfs = {suffix: "{}-nfs-{}".format(config["cluster_name"], suffix) for suffix in used_nfs_suffix} + # unused, either node without name suffix or those with suffix but not specified in any nfs_svr_setup item + unused_nfs = sorted([s for s in nfs_names2ip.keys() if s not in suffix2used_nfs.values()]) + unused_ID_cnt = 0 + for nfs_cnf in config["cloud_config"]["nfs_svr_setup"]: + if "server_suffix" in nfs_cnf: + server_name = suffix2used_nfs[nfs_cnf["server_suffix"]] + else: + server_name = unused_nfs[unused_ID_cnt] + unused_ID_cnt += 1 + server_ip = nfs_names2ip[server_name] + for mntname, mntcnf in nfs_cnf["mnt_point"].items(): + if mntname in cc["mountpoints"]: + print("Warning, duplicated mountpoints item name {}, skipping".format(mntname)) + continue + cc["mountpoints"][mntname] = mntcnf + cc["mountpoints"][mntname]["type"] = "nfs" + cc["mountpoints"][mntname]["server"] = server_ip + cc["mountpoints"][mntname]["servername"] = server_name if output_file: print yaml.dump(cc, default_flow_style=False) with open(output_file_name, 'w') as outfile: @@ -706,7 +829,6 @@ def gen_cluster_config(output_file_name, output_file=True, no_az=False): return cc - def isNewlyScaledMachine(vmName): scaler_config_file = os.path.join(dirpath, "deploy/scaler.yaml") if os.path.exists(scaler_config_file): @@ -731,15 +853,25 @@ def get_vm_list_by_grp(): return utils.json_loads_byteified(output) +def get_vm_private_ip(): + cmd = """ + az vm list-ip-addresses -g %s --output json --query '[].{name:virtualMachine.name, privateIP:virtualMachine.network.privateIpAddresses}' + + """ % (config["azure_cluster"]["resource_group_name"]) + if verbose: + print(cmd) + output = utils.exec_cmd_local(cmd) + return utils.json_loads_byteified(output) + # simply enumerate to get vm list def get_vm_list_by_enum(): vm_list = [] - for i in range(int(config["azure_cluster"]["worker_node_num"])): - vminfo = {} - vminfo["name"] = "%s-worker%02d" % (config["azure_cluster"] - ["cluster_name"], i + 1) - vminfo["vmSize"] = config["azure_cluster"]["worker_vm_size"] - vm_list.append(vminfo) + for role in ["worker","nfs"]: + for i in range(int(config["azure_cluster"]["{}_node_num".format(role)])): + vminfo = {} + vminfo["name"] = 
"{}-{}{:02d}".format(config["azure_cluster"]["cluster_name"], role, i + 1) + vminfo["vmSize"] = config["azure_cluster"]["{}_vm_size".format(role)] + vm_list.append(vminfo) return vm_list def random_str(length): @@ -756,6 +888,7 @@ def delete_cluster(): def run_command(args, command, nargs, parser): if command == "create": + # print config["azure_cluster"]["infra_vm_size"] create_cluster(args.arm_password) vm_interconnects() @@ -814,13 +947,13 @@ def run_command(args, command, nargs, parser): * Create config.yaml according to instruction in docs/deployment/azure/configure.md. Command: - create Create an Azure VM cluster based on the parameters in config file. - delete Delete the Azure VM cluster. - scaleup Scale up operation. - scaledown shutdown a particular VM. + create Create an Azure VM cluster based on the parameters in config file. + delete Delete the Azure VM cluster. + scaleup Scale up operation. + scaledown shutdown a particular VM. list list VMs. interconnect create network links among VMs - genconfig Generate configuration files for Azure VM cluster. + genconfig Generate configuration files for Azure VM cluster. ''') ) parser.add_argument("--cluster_name", help="Specify a cluster name", @@ -921,7 +1054,9 @@ def run_command(args, command, nargs, parser): config_file = os.path.join(dirpath, "config.yaml") if os.path.exists(config_file): - tmpconfig = yaml.load(open(config_file)) + with open(config_file) as cf: + tmpconfig = yaml.load(cf) + assert tmpconfig["cluster_name"] in tmpconfig["azure_cluster"] merge_config(config, tmpconfig, verbose) if tmpconfig is not None and "cluster_name" in tmpconfig: config["azure_cluster"]["cluster_name"] = tmpconfig["cluster_name"] diff --git a/src/ClusterBootstrap/bash_step_by_step_deploy.sh b/src/ClusterBootstrap/bash_step_by_step_deploy.sh new file mode 100755 index 000000000..2d43f2754 --- /dev/null +++ b/src/ClusterBootstrap/bash_step_by_step_deploy.sh @@ -0,0 +1,26 @@ +./deploy.py -y build +./az_tools.py create +./az_tools.py genconfig +./deploy.py runscriptonroles infra worker ./scripts/prepare_vm_disk.sh +./deploy.py nfs-server create +./deploy.py runscriptonroles infra worker ./scripts/prepare_ubuntu.sh +./deploy.py genscripts +./deploy.py runscriptonroles infra worker ./scripts/dns.sh +./deploy.py -y deploy +./deploy.py -y updateworker +./deploy.py -y kubernetes labels +./deploy.py -y gpulabel +./deploy.py kubernetes start nvidia-device-plugin +./deploy.py webui +./deploy.py docker push restfulapi +./deploy.py docker push webui +./deploy.py mount +./deploy.py kubernetes start mysql +./deploy.py kubernetes start jobmanager +./deploy.py kubernetes start restfulapi +./deploy.py kubernetes start webportal +./deploy.py kubernetes start cloudmonitor +./deploy.py kubernetes start custommetrics +./deploy.py -y kubernetes patchprovider aztools +./deploy.py setconfigmap +./deploy.py --sudo runscriptonrandmaster ./scripts/pass_secret.sh diff --git a/src/ClusterBootstrap/deploy.py b/src/ClusterBootstrap/deploy.py index 5b127f1e9..a7747e89b 100755 --- a/src/ClusterBootstrap/deploy.py +++ b/src/ClusterBootstrap/deploy.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python import json import os import time @@ -17,6 +17,7 @@ import glob import copy import numbers +import requests from os.path import expanduser @@ -50,22 +51,22 @@ defanswer = "" ipAddrMetaname = "hostIP" -# CoreOS version and channels, further configurable. +# CoreOS version and channels, further configurable. 
coreosversion = "1235.9.0" coreoschannel = "stable" coreosbaseurl = "" verbose = False nocache = False limitnodes = None - +allroles = {"infra", "infrastructure", "worker", "nfs", "sql"} # default search for all partitions of hdb, hdc, hdd, and sdb, sdc, sdd sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) -# Path to mount name -# Change path, e.g., /mnt/glusterfs/localvolume to +# Path to mount name +# Change path, e.g., /mnt/glusterfs/localvolume to # name mnt-glusterfs-localvolume def path_to_mount_service_name( path ): ret = path @@ -77,8 +78,8 @@ def path_to_mount_service_name( path ): ret = ret.replace('/','-') return ret -# Generate a server IP according to the cluster ip range. -# E.g., given cluster IP range 10.3.0.0/16, index=1, +# Generate a server IP according to the cluster ip range. +# E.g., given cluster IP range 10.3.0.0/16, index=1, # The generated IP is 10.3.0.1 def generate_ip_from_cluster(cluster_ip_range, index ): slash_pos = cluster_ip_range.find("/") @@ -86,7 +87,7 @@ def generate_ip_from_cluster(cluster_ip_range, index ): ips3 = ips[:ips.rfind(".")] return ips3 + "." + str(index) -# Return a path name, expand on ~, for a particular config, +# Return a path name, expand on ~, for a particular config, # e.g., ssh_key def expand_path_in_config(key_in_config): if key_in_config in config: @@ -97,7 +98,7 @@ def expand_path_in_config(key_in_config): def parse_capacity_in_GB( inp ): # print "match capacity of %s" % inp mt = capacityMatch.search(inp) - if mt is None: + if mt is None: return 0.0 else: digits = digitsMatch.search(mt.group(0)).group(0) @@ -141,10 +142,10 @@ def copy_to_ISO(): def _check_config_items(cnfitem, cnf): if not cnfitem in cnf: - raise Exception("ERROR: we cannot find %s in config file" % cnfitem) + raise Exception("ERROR: we cannot find %s in config file" % cnfitem) else: print "Checking configurations '%s' = '%s'" % (cnfitem, cnf[cnfitem]) - + def check_config(cnf): if not config["isacs"]: _check_config_items("discovery_url",cnf) @@ -158,7 +159,7 @@ def check_config(cnf): _check_config_items("pod_ip_range",cnf) _check_config_items("service_cluster_ip_range",cnf) if not os.path.isfile(config["ssh_cert"]): - raise Exception("ERROR: we cannot find ssh key file at %s. \n please run 'python build-pxe-coreos.py docker_image_name' to generate ssh key file and pxe server image." % config["ssh_cert"]) + raise Exception("ERROR: we cannot find ssh key file at %s. \n please run 'python build-pxe-coreos.py docker_image_name' to generate ssh key file and pxe server image." % config["ssh_cert"]) def generate_trusted_domains(network_config, start_idx ): ret = "" @@ -198,14 +199,14 @@ def get_root_passwd(): # dstname: config name to be used. 
# srcname: config name to be searched for (expressed as a list, see fetch_config) # lambda: lambda function to translate srcname to target name -default_config_mapping = { - "dockerprefix": (["cluster_name"], lambda x:x.lower()+"/"), - "infrastructure-dockerregistry": (["dockerregistry"], lambda x:x), +default_config_mapping = { + "dockerprefix": (["cluster_name"], lambda x:x.lower()+"/"), + "infrastructure-dockerregistry": (["dockerregistry"], lambda x:x), "worker-dockerregistry": (["dockerregistry"], lambda x:x), "glusterfs-device": (["glusterFS"], lambda x: "/dev/%s/%s" % (fetch_dictionary(x, ["volumegroup"]), fetch_dictionary(x, ["volumename"]) ) ), "glusterfs-localvolume": (["glusterFS"], lambda x: fetch_dictionary(x, ["mountpoint"]) ), "storage-mount-path-name": (["storage-mount-path" ], lambda x: path_to_mount_service_name(x) ), - "api-server-ip": (["service_cluster_ip_range"], lambda x: generate_ip_from_cluster(x, 1) ), + "api-server-ip": (["service_cluster_ip_range"], lambda x: generate_ip_from_cluster(x, 1) ), "dns-server-ip": (["service_cluster_ip_range"], lambda x: generate_ip_from_cluster(x, 53) ), "network-trusted-domains": (["network"], lambda x: generate_trusted_domains(x, 5 )), #master deployment scripts @@ -218,12 +219,12 @@ def get_root_passwd(): "postworkerdeploymentscript" : (["platform-scripts"], lambda x: get_platform_script_directory(x)+"post-worker-deploy.sh"), "workercleanupscript" : (["platform-scripts"], lambda x: get_platform_script_directory(x)+"cleanup-worker.sh"), "workerdeploymentlist" : (["platform-scripts"], lambda x: get_platform_script_directory(x)+"deploy.list"), - "pxeserverip": (["pxeserver"], lambda x: fetch_dictionary(x,["ip"])), - "pxeserverrootpasswd": (["pxeserver"], lambda x: get_root_passwd()), - "pxeoptions": (["pxeserver"], lambda x: "" if fetch_dictionary(x,["options"]) is None else fetch_dictionary(x,["options"])), - "hdfs_cluster_name" : ( ["cluster_name"], lambda x:x ), - "etcd_user": ( ["admin_username"], lambda x:x ), - "kubernetes_master_ssh_user": ( ["admin_username"], lambda x:x ), + "pxeserverip": (["pxeserver"], lambda x: fetch_dictionary(x,["ip"])), + "pxeserverrootpasswd": (["pxeserver"], lambda x: get_root_passwd()), + "pxeoptions": (["pxeserver"], lambda x: "" if fetch_dictionary(x,["options"]) is None else fetch_dictionary(x,["options"])), + "hdfs_cluster_name" : ( ["cluster_name"], lambda x:x ), + "etcd_user": ( ["admin_username"], lambda x:x ), + "kubernetes_master_ssh_user": ( ["admin_username"], lambda x:x ), } def isInstallOnCoreOS(): @@ -236,7 +237,7 @@ def update_docker_image_config(): if "container" not in config["dockers"]: config["dockers"]["container"] = {} if "hyperkube" not in config["dockers"]["container"]: - config["dockers"]["container"]["hyperkube"] = {} + config["dockers"]["container"]["hyperkube"] = {} # config["dockers"]["container"]["hyperkube"]["fullname"] = config["worker-dockerregistry"] + config["dockerprefix"] + "kubernetes:" + config["dockertag"] def update_config(): @@ -244,7 +245,7 @@ def update_config(): update_one_config(config, "coreosversion",["coreos","version"], basestring, coreosversion) update_one_config(config, "coreoschannel",["coreos","channel"], basestring, coreoschannel) update_one_config(config, "coreosbaseurl",["coreos","baseurl"], basestring, coreosbaseurl) - if config["coreosbaseurl"] == "": + if config["coreosbaseurl"] == "": config["coreosusebaseurl"] = "" else: config["coreosusebaseurl"] = "-b "+config["coreosbaseurl"] @@ -261,6 +262,8 @@ def update_config(): 
config["elasticsearch_node"] = config["webportal_node"] if ("mysql_node" not in config): config["mysql_node"] = None if len(get_node_lists_for_service("mysql"))==0 else get_node_lists_for_service("mysql")[0] + if ("host" not in config["prometheus"]): + config["prometheus"]["host"] = None if len(get_node_lists_for_service("prometheus"))==0 else get_node_lists_for_service("prometheus")[0] update_docker_image_config() @@ -316,8 +319,8 @@ def add_acs_config(command): config["WinbindServers"] = [] config["etcd_node_num"] = config["master_node_num"] config["kube_addons"] = [] # no addons - config["mountpoints"]["rootshare"]["azstoragesku"] = config["azstoragesku"] - config["mountpoints"]["rootshare"]["azfilesharequota"] = config["azfilesharequota"] + # config["mountpoints"]["rootshare"]["azstoragesku"] = config["azstoragesku"] + # config["mountpoints"]["rootshare"]["azfilesharequota"] = config["azfilesharequota"] config["freeflow"] = True config["useclusterfile"] = True @@ -363,7 +366,7 @@ def add_acs_config(command): def add_kubelet_config(): renderfiles = [] -# Render all deployment script used. +# Render all deployment script used. utils.render_template_directory("./template/kubelet", "./deploy/kubelet",config) kubemaster_cfg_files = [f for f in os.listdir("./deploy/kubelet") if os.path.isfile(os.path.join("./deploy/kubelet", f))] @@ -407,9 +410,9 @@ def add_leading_spaces(content, nspaces): # fill in additional entry of cloud config def add_additional_cloud_config(): - # additional entry to be added to write_files + # additional entry to be added to write_files translate_config_entry( config, ["coreos", "write_files"], "coreoswritefiles", basestring, 2 ) - # additional entry to be added to units + # additional entry to be added to units translate_config_entry( config, ["coreos", "units"], "coreosunits", basestring, 4 ) # additional startup script to be added to report.sh translate_config_entry( config, ["coreos", "startupScripts"], "startupscripts", basestring ) @@ -421,7 +424,7 @@ def init_deployment(): clusterID = utils.get_cluster_ID_from_file() response = raw_input_with_default("There is a cluster (ID:%s) deployment in './deploy', do you want to keep the existing ssh key and CA certificates (y/n)?" % clusterID) if first_char(response) == "n": - # Backup old cluster + # Backup old cluster utils.backup_keys(config["cluster_name"]) regenerate_key = True else: @@ -443,7 +446,7 @@ def init_deployment(): - print "Cluster Id is : %s" % clusterID + print "Cluster Id is : %s" % clusterID config["clusterId"] = clusterID config["sshkey"] = sshkey_public @@ -466,8 +469,8 @@ def init_deployment(): config["role"] = "etcd" utils.render_template(template_file, target_file,config) - # Prepare to Generate the ISO image. - # Using files in PXE as template. + # Prepare to Generate the ISO image. + # Using files in PXE as template. 
copy_to_ISO() @@ -514,9 +517,9 @@ def is_cur_on_same_domain(): pass return False -# Get domain of the node +# Get domain of the node, assigned in add_acs_config (line config["network"]["domain"]) def get_domain(): - if "network" in config and "domain" in config["network"]: + if "network" in config and "domain" in config["network"] and len(config["network"]["domain"]) > 0 : if is_cur_on_same_domain(): domain = "" else: @@ -525,8 +528,9 @@ def get_domain(): domain = "" return domain -# Get a list of nodes from cluster.yaml +# Get a list of nodes from cluster.yaml def get_nodes_from_config(machinerole): + machinerole = "infrastructure" if machinerole == "infra" else machinerole if "machines" not in config: return [] else: @@ -541,7 +545,10 @@ def get_nodes_from_config(machinerole): Nodes.append(nodename) return sorted(Nodes) -# Get a list of scaled nodes from cluster.yaml +def get_node_full_name(nodename): + return nodename + get_domain() if len(nodename.split("."))<3 else nodename + +# Get a list of scaled nodes from cluster.yaml def get_scaled_nodes_from_config(): if "machines" not in config: return [] @@ -617,6 +624,21 @@ def get_worker_nodes_from_config(clusterId): config["worker_node"] = Nodes return Nodes +def get_nodes_by_roles(roles): + """ + role: "infrastructure", "worker", or "nfs" + this function aims to deprecate get_worker_nodes_from_config and get_ETCD_master_nodes_from_config + """ + Nodes = [] + for role in roles: + Nodes += get_nodes_from_config(role) + if role == "infrastructure" or role == "infra": + config["etcd_node"] = Nodes + config["kubernetes_master_node"] = Nodes + else: + config["{}_node".format(role)] = Nodes + return Nodes + def get_worker_nodes(clusterId, isScaledOnly): nodes = [] if "worker_node" in config and len(config["worker_node"]) > 0: @@ -624,7 +646,8 @@ def get_worker_nodes(clusterId, isScaledOnly): if "useclusterfile" not in config or not config["useclusterfile"]: nodes = get_worker_nodes_from_cluster_report(clusterId) else: - nodes = get_worker_nodes_from_config(clusterId) + print("from console") + nodes = get_nodes_by_roles(["worker"]) #get_worker_nodes_from_config(clusterId) if isScaledOnly: return get_scaled_nodes_from_config() @@ -662,10 +685,12 @@ def check_master_ETCD_status(): print "Checking Available Nodes for Deployment..." get_ETCD_master_nodes(config["clusterId"]) get_worker_nodes(config["clusterId"], False) + get_nodes_by_roles(["nfs"]) print "===============================================" print "Activate Master Node(s): %s\n %s \n" % (len(config["kubernetes_master_node"]),",".join(config["kubernetes_master_node"])) print "Activate ETCD Node(s):%s\n %s \n" % (len(config["etcd_node"]),",".join(config["etcd_node"])) print "Activate Worker Node(s):%s\n %s \n" % (len(config["worker_node"]),",".join(config["worker_node"])) + print "Activate NFS Node(s):%s\n %s \n" % (len(config["nfs_node"]),",".join(config["nfs_node"])) def clean_deployment(): print "===============================================" @@ -695,7 +720,7 @@ def GetCertificateProperty(): config["apiserver_ssl_dns"] = "\n".join(["DNS."+str(i+5)+" = "+dns for i,dns in enumerate(masterdns)]) config["apiserver_ssl_ip"] = "IP.1 = "+config["api-server-ip"]+"\nIP.2 = 127.0.0.1\n"+ "\n".join(["IP."+str(i+3)+" = "+ip for i,ip in enumerate(masterips)]) - + # kube-apiserver aggregator use easyrsa to generate crt files, we need to generate a group of master names for it. # It does not care if it's a DNS name or IP. 
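Usage sketch for the two helpers added above (node names and domain are illustrative): get_nodes_by_roles normalizes "infra" to "infrastructure", fills config["etcd_node"] and config["kubernetes_master_node"] for that role, and config["<role>_node"] for any other role; get_node_full_name only appends the cluster domain when the name is not yet fully qualified.

get_nodes_by_roles(["infra"])                 # e.g. ["mycluster-infra01"], also sets kubernetes_master_node
get_nodes_by_roles(["nfs"])                   # fills config["nfs_node"], e.g. ["mycluster-nfs-data"]

get_node_full_name("mycluster-nfs-data")      # one dot-part -> domain from get_domain() is appended
get_node_full_name("node.corp.contoso.com")   # already qualified (3+ parts) -> returned unchanged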
masternames = [] @@ -759,14 +784,14 @@ def gen_configs(): etcd_servers = [] #if int(config["etcd_node_num"]) <= 0: - # raise Exception("ERROR: we need at least one etcd_server.") + # raise Exception("ERROR: we need at least one etcd_server.") if "kubernetes_master_node" in config: kubernetes_masters = config["kubernetes_master_node"] else: kubernetes_masters = [] #if len(kubernetes_masters) <= 0: - # raise Exception("ERROR: we need at least one etcd_server.") + # raise Exception("ERROR: we need at least one etcd_server.") if not config["isacs"]: config["discovery_url"] = utils.get_ETCD_discovery_URL(int(config["etcd_node_num"])) @@ -774,6 +799,7 @@ def gen_configs(): config["ssh_cert"] = expand_path("./deploy/sshkey/id_rsa") config["etcd_user"] = config["admin_username"] + config["nfs_user"] = config["admin_username"] config["kubernetes_master_ssh_user"] = config["admin_username"] #config["api_servers"] = ",".join(["https://"+x for x in config["kubernetes_master_node"]]) @@ -781,7 +807,7 @@ def gen_configs(): config["etcd_endpoints"] = ",".join(["https://"+x+":"+config["etcd3port1"] for x in config["etcd_node"]]) - + if os.path.isfile(config["ssh_cert"]+".pub"): f = open(config["ssh_cert"]+".pub") @@ -805,6 +831,7 @@ def get_ssh_config(): if "ssh_cert" in config: config["ssh_cert"] = expand_path(config["ssh_cert"]) config["etcd_user"] = config["admin_username"] + config["nfs_user"] = config["admin_username"] config["kubernetes_master_ssh_user"] = config["admin_username"] add_ssh_key() @@ -848,7 +875,6 @@ def clean_master(): utils.SSH_exec_script(config["ssh_cert"],kubernetes_master_user, kubernetes_master, "./deploy/master/%s" % config["mastercleanupscript"]) - def deploy_master(kubernetes_master): print "===============================================" kubernetes_master_user = config["kubernetes_master_ssh_user"] @@ -873,18 +899,15 @@ def deploy_master(kubernetes_master): def get_cni_binary(): os.system("mkdir -p ./deploy/bin") - urllib.urlretrieve ("http://ccsdatarepo.westus.cloudapp.azure.com/data/containernetworking/cni-amd64-v0.5.2.tgz", "./deploy/bin/cni-amd64-v0.5.2.tgz") + # This tar file contains binary build from https://github.com/containernetworking/cni which used by weave + urllib.urlretrieve("https://github.com/microsoft/DLWorkspace/releases/download/v1.2.0/cni-v0.7.1.tgz", "./deploy/bin/cni-v0.7.1.tgz") if verbose: print "Extracting CNI binaries" - os.system("tar -zxvf ./deploy/bin/cni-amd64-v0.5.2.tgz -C ./deploy/bin") + os.system("tar -zxvf ./deploy/bin/cni-v0.7.1.tgz -C ./deploy/bin") def get_kubectl_binary(force = False): get_hyperkube_docker(force = force) - #os.system("mkdir -p ./deploy/bin") - urllib.urlretrieve ("http://ccsdatarepo.westus.cloudapp.azure.com/data/kube/kubelet/kubelet", "./deploy/bin/kubelet-old") - #urllib.urlretrieve ("http://ccsdatarepo.westus.cloudapp.azure.com/data/kube/kubelet/kubectl", "./deploy/bin/kubectl") - #os.system("chmod +x ./deploy/bin/*") get_cni_binary() def get_hyperkube_docker(force = False) : @@ -895,7 +918,7 @@ def get_hyperkube_docker(force = False) : if force or not os.path.exists("./deploy/bin/kubelet"): copy_from_docker_image(config["dockers"]["container"]["hyperkube"]["fullname"], "/kubelet", "./deploy/bin/kubelet") if force or not os.path.exists("./deploy/bin/kubectl"): - copy_from_docker_image(config["dockers"]["container"]["hyperkube"]["fullname"], "/kubectl", "./deploy/bin/kubectl") + copy_from_docker_image(config["dockers"]["container"]["hyperkube"]["fullname"], "/kubectl", "./deploy/bin/kubectl") if 
config['kube_custom_cri']: if force or not os.path.exists("./deploy/bin/crishim"): copy_from_docker_image(config["dockers"]["container"]["hyperkube"]["fullname"], "/crishim", "./deploy/bin/crishim") @@ -927,35 +950,35 @@ def deploy_masters(force = False): for i,kubernetes_master in enumerate(kubernetes_masters): deploy_master(kubernetes_master) deploycmd = """ - until curl -q http://127.0.0.1:8080/version/ ; do - sleep 5; - echo 'waiting for master...'; - done; - - until sudo /opt/bin/kubectl apply -f /opt/addons/kube-addons/weave.yaml --validate=false ; do - sleep 5; - echo 'waiting for master...'; + until curl -q http://127.0.0.1:8080/version/ ; do + sleep 5; + echo 'waiting for master...'; + done; + + until sudo /opt/bin/kubectl apply -f /opt/addons/kube-addons/weave.yaml --validate=false ; do + sleep 5; + echo 'waiting for master...'; done ; - until sudo /opt/bin/kubectl apply -f /opt/addons/kube-addons/dashboard.yaml --validate=false ; do - sleep 5; - echo 'waiting for master...'; + until sudo /opt/bin/kubectl apply -f /opt/addons/kube-addons/dashboard.yaml --validate=false ; do + sleep 5; + echo 'waiting for master...'; done ; - until sudo /opt/bin/kubectl apply -f /opt/addons/kube-addons/dns-addon.yml --validate=false ; do - sleep 5; - echo 'waiting for master...'; - done ; + until sudo /opt/bin/kubectl apply -f /opt/addons/kube-addons/dns-addon.yml --validate=false ; do + sleep 5; + echo 'waiting for master...'; + done ; - until sudo /opt/bin/kubectl apply -f /opt/addons/kube-addons/kube-proxy.json --validate=false ; do - sleep 5; - echo 'waiting for master...'; + until sudo /opt/bin/kubectl apply -f /opt/addons/kube-addons/kube-proxy.json --validate=false ; do + sleep 5; + echo 'waiting for master...'; done ; - until sudo /opt/bin/kubectl create -f /etc/kubernetes/clusterroles/ ; do - sleep 5; - echo 'waiting for master...'; - done ; + until sudo /opt/bin/kubectl create -f /etc/kubernetes/clusterroles/ ; do + sleep 5; + echo 'waiting for master...'; + done ; """ utils.SSH_exec_cmd(config["ssh_cert"], kubernetes_master_user, kubernetes_masters[0], deploycmd , False) @@ -998,7 +1021,7 @@ def deploy_ETCD_docker(): print "===============================================" print "deploy certificates to etcd server %s" % etcd_server_address - utils.SSH_exec_cmd (config["ssh_cert"], etcd_server_user, etcd_server_address, "sudo mkdir -p /etc/etcd/ssl ; sudo chown %s /etc/etcd/ssl " % (etcd_server_user), showCmd=verbose) + utils.SSH_exec_cmd (config["ssh_cert"], etcd_server_user, etcd_server_address, "sudo mkdir -p /etc/etcd/ssl ; sudo chown %s /etc/etcd/ssl " % (etcd_server_user), showCmd=verbose) utils.scp(config["ssh_cert"],"./deploy/ssl/etcd/ca.pem","/etc/etcd/ssl", etcd_server_user, etcd_server_address, verbose=verbose ) utils.scp(config["ssh_cert"],"./deploy/ssl/etcd/etcd.pem","/etc/etcd/ssl", etcd_server_user, etcd_server_address, verbose=verbose ) utils.scp(config["ssh_cert"],"./deploy/ssl/etcd/etcd-key.pem","/etc/etcd/ssl", etcd_server_user, etcd_server_address, verbose=verbose ) @@ -1046,8 +1069,8 @@ def deploy_ETCD(): print "===============================================" print "deploy certificates to etcd server %s" % etcd_server_address - utils.SSH_exec_cmd (config["ssh_cert"], etcd_server_user, etcd_server_address, "sudo mkdir -p /etc/etcd/ssl") - utils.SSH_exec_cmd (config["ssh_cert"], etcd_server_user, etcd_server_address, "sudo chown %s /etc/etcd/ssl " % (etcd_server_user)) + utils.SSH_exec_cmd (config["ssh_cert"], etcd_server_user, etcd_server_address, "sudo mkdir -p 
/etc/etcd/ssl") + utils.SSH_exec_cmd (config["ssh_cert"], etcd_server_user, etcd_server_address, "sudo chown %s /etc/etcd/ssl " % (etcd_server_user)) utils.scp(config["ssh_cert"],"./deploy/ssl/etcd/ca.pem","/etc/etcd/ssl", etcd_server_user, etcd_server_address ) utils.scp(config["ssh_cert"],"./deploy/ssl/etcd/etcd.pem","/etc/etcd/ssl", etcd_server_user, etcd_server_address ) utils.scp(config["ssh_cert"],"./deploy/ssl/etcd/etcd-key.pem","/etc/etcd/ssl", etcd_server_user, etcd_server_address ) @@ -1086,11 +1109,35 @@ def deploy_ETCD(): utils.SSH_exec_script( config["ssh_cert"], etcd_server_user, etcd_servers[0], "./deploy/etcd/init_network.sh") def create_nfs_server(): - etcd_servers = config["etcd_node"] - etcd_server_user = config["etcd_user"] - os.system( "mkdir -p ./deploy/scripts") - utils.render_template("./scripts/setup_nfs_server.sh","./deploy/scripts/setup_nfs_server.sh",config) - utils.SSH_exec_script( config["ssh_cert"], etcd_server_user, etcd_servers[0], "./deploy/scripts/setup_nfs_server.sh") + """ + we assume there's only 1 cluster. + """ + etcd_server_user = config["nfs_user"] + cluster_by_name = config["azure_cluster"][config["cluster_name"]] + nfs_servers = config["nfs_node"] if int(cluster_by_name["nfs_node_num"]) > 0 else config["etcd_node"] + # if we have suffixed server, then it must be external + named_nfs_suffix = set(cluster_by_name["nfs_suffixes"] if "nfs_suffixes" in cluster_by_name else []) + used_nfs_suffix = set([nfs_cnf["server_suffix"] for nfs_cnf in config["cloud_config"]["nfs_svr_setup"] if "server_suffix" in nfs_cnf]) + assert (used_nfs_suffix - named_nfs_suffix) == set() and "suffix not in nfs_suffixes list!" + suffix2used_nfs = {suffix: get_node_full_name("{}-nfs-{}".format(config["cluster_name"], suffix)) for suffix in used_nfs_suffix} + # unused, either node without name suffix or those with suffix but not specified in any nfs_svr_setup item + unused_nfs = sorted([s for s in nfs_servers if s not in suffix2used_nfs.values()]) + unused_ID_cnt = 0 + # print(nfs_servers, suffix2used_nfs, unused_nfs) + + + for nfs_cnf in config["cloud_config"]["nfs_svr_setup"]: + nfs_cnf["cloud_config"] = {"vnet_range":config["cloud_config"]["vnet_range"], "samba_range": config["cloud_config"]["samba_range"]} + if "server_suffix" in nfs_cnf: + nfs_server = suffix2used_nfs[nfs_cnf["server_suffix"]] + else: + nfs_server = unused_nfs[unused_ID_cnt] + unused_ID_cnt += 1 + utils.render_template("./template/nfs/nfs_config.sh.template","./deploy/scripts/setup_nfs_server.sh",nfs_cnf) + # os.system("cat ./deploy/scripts/setup_nfs_server.sh") + # print("------------------>nfs_server<------------------------"+nfs_server) + utils.SSH_exec_script( config["ssh_cert"], etcd_server_user, nfs_server, "./deploy/scripts/setup_nfs_server.sh") + def create_ISO(): imagename = "./deploy/iso/dlworkspace-cluster-deploy-"+config["cluster_name"]+".iso" @@ -1100,7 +1147,7 @@ def create_ISO(): os.system("rm -rf ./iso-creator/syslinux-6.03*") os.system("rm -rf ./iso-creator/coreos-*") print "Please find the bootable USB image at: "+imagename - print + print def create_PXE(): @@ -1136,7 +1183,7 @@ def create_PXE_ubuntu(): utils.render_template_directory("./template/pxe-ubuntu", "./deploy/pxe-ubuntu",config, verbose=verbose ) dockername = push_one_docker("./deploy/pxe-ubuntu", config["dockerprefix"], config["dockertag"], "pxe-ubuntu", config ) - # tarname = "deploy/docker/pxe-ubuntu.tar" + # tarname = "deploy/docker/pxe-ubuntu.tar" # os.system("docker save " + dockername + " > " + tarname ) print ("A DL 
workspace docker is built at: "+ dockername) @@ -1206,7 +1253,6 @@ def update_scaled_worker_nodes( nargs ): os.system('sed "s/##api_servers##/%s/" ./deploy/kubelet/kubelet.service.template > ./deploy/kubelet/kubelet.service' % config["api_servers"].replace("/","\\/")) os.system('sed "s/##api_servers##/%s/" ./deploy/kubelet/worker-kubeconfig.yaml.template > ./deploy/kubelet/worker-kubeconfig.yaml' % config["api_servers"].replace("/","\\/")) - #urllib.urlretrieve ("http://ccsdatarepo.westus.cloudapp.azure.com/data/kube/kubelet/kubelet", "./deploy/bin/kubelet") get_hyperkube_docker() workerNodes = get_worker_nodes(config["clusterId"], True) @@ -1260,7 +1306,7 @@ def deploy_restful_API_on_node(ipAddress): masterIP = ipAddress dockername = "%s/dlws-restfulapi" % (config["dockerregistry"]) - # if user didn't give storage server information, use CCS public storage in default. + # if user didn't give storage server information, use CCS public storage in default. if "nfs-server" not in config: config["nfs-server"] = "10.196.44.241:/mnt/data" @@ -1511,7 +1557,7 @@ def acs_post_deploy(): #print config["master_predeploy"] #print config["master_filesdeploy"] #print config["master_postdeploy"] - deploy_on_nodes(config["master_predeploy"], config["master_filesdeploy"], config["master_postdeploy"], + deploy_on_nodes(config["master_predeploy"], config["master_filesdeploy"], config["master_postdeploy"], config["kubernetes_master_node"]) deploy_on_nodes(config["worker_predeploy"], config["worker_filesdeploy"], config["worker_postdeploy"], config["worker_node"]) @@ -1543,36 +1589,39 @@ def get_mount_fileshares(curNode = None): allmountpoints = { } fstab = "" bHasDefaultMountPoints = False - physicalmountpoint = config["physical-mount-path"] + physicalmountpoint = config["physical-mount-path"] storagemountpoint = config["storage-mount-path"] mountshares = {} + # print(config["mountpoints"]) for k,v in config["mountpoints"].iteritems(): + # print("<<<<<<<<<<<<<<<<<<0: - mountpoints = [v["mountpoints"]] + mountpoints = [v["mountpoints"]] else: mountpoints = [] elif isinstance( v["mountpoints"], list): mountpoints = v["mountpoints"] else: mountpoints = [] - + # print("-------------mount points---------------:", mountpoints, bHasDefaultMountPoints) if len(mountpoints)==0: if bHasDefaultMountPoints: errorMsg = "there are more than one default mount points in configuration. " print "!!!Configuration Error!!! " + errorMsg - raise ValueError(erorMsg) + raise ValueError(errorMsg) else: bHasDefaultMountPoints = True + print "default storage folders:", config["default-storage-folders"], "\n" mountpoints = config["default-storage-folders"] mountsharename = v["mountsharename"] if "mountsharename" in v else v["filesharename"] if mountsharename in mountshares: errorMsg = "There are multiple file share to be mounted at %s" % mountsharename print "!!!Configuration Error!!! 
" + errorMsg - raise ValueError(erorMsg) + raise ValueError(erorMsg) if os.path.isabs(mountsharename): curphysicalmountpoint = mountsharename @@ -1655,18 +1704,18 @@ def insert_fstab_section( node, secname, content): fstab = fstabmask + content + fstabmaskend usefstab = fstab if fstabcontent.find("No such file or directory")==-1: - indexst = fstabcontent.find(fstabmask) + indexst = fstabcontent.find(fstabmask) indexend = fstabcontent.find(fstabmaskend) if indexst > 1: if indexend < 0: - usefstab = fstabcontent[:indexst] + fstab + usefstab = fstabcontent[:indexst] + fstab else: usefstab = fstabcontent[:indexst] + fstab + fstabcontent[indexend+len(fstabmaskend):] else: if fstabcontent.endswith("\n"): - usefstab = fstabcontent + fstab + usefstab = fstabcontent + fstab else: - usefstab = fstabcontent + "\n" + fstab + usefstab = fstabcontent + "\n" + fstab if verbose: print "----------- Resultant /etc/fstab --------------------" print usefstab @@ -1682,12 +1731,12 @@ def remove_fstab_section( node, secname): fstabcontent = utils.SSH_exec_cmd_with_output(config["ssh_cert"], config["admin_username"], node, "cat /etc/fstab") bCopyFStab = False if fstabcontent.find("No such file or directory")==-1: - indexst = fstabcontent.find(fstabmask) + indexst = fstabcontent.find(fstabmask) indexend = fstabcontent.find(fstabmaskend) if indexst > 1: bCopyFStab = True if indexend < 0: - usefstab = fstabcontent[:indexst] + usefstab = fstabcontent[:indexst] else: usefstab = fstabcontent[:indexst] + fstabcontent[indexend+len(fstabmaskend):] if bCopyFStab: @@ -1716,17 +1765,17 @@ def fileshare_install(): if isInstallOnCoreOS(): for k,v in allmountpoints.iteritems(): if "curphysicalmountpoint" in v: - physicalmountpoint = v["curphysicalmountpoint"] + physicalmountpoint = v["curphysicalmountpoint"] if v["type"] in config["mountsupportedbycoreos"]: () else: print "Share %s: type %s is not supported in CoreOS, mount failed " % (k, v["type"] ) exit(1) else: - # In service, the mount preparation install relevant software on remote machine. + # In service, the mount preparation install relevant software on remote machine. 
for k,v in allmountpoints.iteritems(): if "curphysicalmountpoint" in v: - physicalmountpoint = v["curphysicalmountpoint"] + physicalmountpoint = v["curphysicalmountpoint"] if v["type"] == "azurefileshare": if not ("azurefileshare" in filesharetype): filesharetype["azurefileshare"] = True @@ -1739,7 +1788,7 @@ def fileshare_install(): if not ("nfs" in filesharetype): filesharetype["nfs"] = True remotecmd += "sudo apt-get install -y nfs-common; " - # Ubuntu has issue of rpc.statd not started automatically + # Ubuntu has issue of rpc.statd not started automatically # https://bugs.launchpad.net/ubuntu/+source/nfs-utils/+bug/1624715 remotecmd += "sudo cp /lib/systemd/system/rpc-statd.service /etc/systemd/system/; " remotecmd += "sudo systemctl add-wants rpc-statd.service nfs-client.target; " @@ -1761,17 +1810,17 @@ def config_fqdn(): all_nodes = get_nodes(config["clusterId"]) for node in all_nodes: remotecmd = "echo %s | sudo tee /etc/hostname-fqdn; sudo chmod +r /etc/hostname-fqdn" % node - utils.SSH_exec_cmd(config["ssh_cert"], config["admin_username"], node, remotecmd) + utils.SSH_exec_cmd(config["ssh_cert"], config["admin_username"], node, remotecmd) def config_nginx(): all_nodes = get_nodes(config["clusterId"]) template_dir = "services/nginx/" target_dir = "deploy/services/nginx/" utils.render_template_directory(template_dir, target_dir,config) - for node in all_nodes: + for node in all_nodes: utils.sudo_scp(config["ssh_cert"],"./deploy/services/nginx/","/etc/nginx/conf.other", config["admin_username"], node ) # See https://github.com/kubernetes/examples/blob/master/staging/https-nginx/README.md - # Please use + # Please use # kubectl create configmap nginxconfigmap --from-file=services/nginx/default.conf # run_kubectl( ["delete", "configmap", "nginxconfigmap"] ) # run_kubectl( ["create", "configmap", "nginxconfigmap", "--from-file=%s/default.conf" % target_dir ] ) @@ -1820,7 +1869,7 @@ def mount_fileshares_by_service(perform_mount=True): # remotecmd += "sudo systemctl stop auto_share.service; " if len(remotecmd)>0: utils.SSH_exec_cmd(config["ssh_cert"], config["admin_username"], node, remotecmd) - # We no longer recommend to insert fstabl into /etc/fstab file, instead, + # We no longer recommend to insert fstabl into /etc/fstab file, instead, # we recommend to use service to start auto mount if needed # insert_fstab_section( node, "DLWS", fstab ) for k, v in allmountpoints.iteritems(): @@ -1862,7 +1911,7 @@ def del_fileshare_links(): remotecmd = "sudo rm -r %s; " % config["storage-mount-path"] remotecmd += "sudo mkdir -p %s; " % config["storage-mount-path"] exec_rmt_cmd(node, remotecmd) - + def link_fileshares(allmountpoints, bForce=False): all_nodes = get_nodes(config["clusterId"]) # print fstab @@ -1948,7 +1997,7 @@ def get_partions_of_node(node, prog): capacity = parse_capacity_in_GB( driveString ) lines = driveString.splitlines() - # Skip to "parted" print out where each partition information is shown + # Skip to "parted" print out where each partition information is shown n = 0 while n < len(lines): segs = lines[n].split() @@ -1978,7 +2027,7 @@ def get_partions_of_node(node, prog): deviceinfo["parted"] = parted partinfo[blockdevice] = deviceinfo blockdevice += 1 - return partinfo + return partinfo # Get Partition of all nodes in a cluster def get_partitions(nodes, regexp): @@ -1997,14 +2046,14 @@ def get_partitions(nodes, regexp): def show_partitions(nodes, regexp): nodesinfo = get_partitions(nodes, regexp) for node in nodesinfo: - print "Node: " + node + print "Node: " + node 
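The partition helpers above return a nested structure per node; a sketch of its shape as consumed later by find_matched_volume and show_partitions (field names come from the code, values are made up):

example_partinfo = {
    0: {                                   # block-device index assigned while parsing parted output
        "name": "/dev/sdb",                # device path; "name" + partition number gives e.g. /dev/sdb1
        "modelName": "Virtual Disk",
        "capacity": 1024.0,                # GB, via parse_capacity_in_GB
        "parted": {1: 512.0, 2: 512.0},    # partition number -> size in GB
    },
}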
alldeviceinfo = nodesinfo[node] for bdevice in alldeviceinfo: - deviceinfo = alldeviceinfo[bdevice] + deviceinfo = alldeviceinfo[bdevice] print deviceinfo["name"] + ", "+ deviceinfo["modelName"] + ", Capacity: " + str(deviceinfo["capacity"]) + "GB" + ", Partition: " + str(deviceinfo["parted"]) return nodesinfo -# Calculate out a partition configuration in GB as follows. +# Calculate out a partition configuration in GB as follows. # partitionConfig is of s1,s2,..,sn: # If s_i < 0, the partition is in absolute size (GB) # If s_i > 0, the partition is in proportion. @@ -2023,7 +2072,7 @@ def calculate_partitions( capacity, partitionConfig): partitionSize[i] = 0.0 else: sumProportion += partitionConfig[i] - #print "Ending Capacity " + str(capacity) + #print "Ending Capacity " + str(capacity) #print partitionSize for i in range(npart): if partitionConfig[i] >= 0.0: @@ -2039,7 +2088,7 @@ def repartition_nodes(nodes, nodesinfo, partitionConfig): cmd = "" alldeviceinfo = nodesinfo[node] for bdevice in alldeviceinfo: - deviceinfo = alldeviceinfo[bdevice] + deviceinfo = alldeviceinfo[bdevice] existingPartitions = deviceinfo["parted"] if len( existingPartitions ) > 0: # remove existing partitions @@ -2111,17 +2160,17 @@ def regmatch_glusterFS( glusterFSargs ): def find_matched_volume( alldeviceinfo, regmatch ): deviceList = {} for bdevice in alldeviceinfo: - deviceinfo = alldeviceinfo[bdevice] + deviceinfo = alldeviceinfo[bdevice] for part in deviceinfo["parted"]: bdevicename = deviceinfo["name"] + str(part) # print bdevicename match = regmatch.search(bdevicename) if not ( match is None ): deviceList[match.group(0)] = deviceinfo["parted"][part] - #print deviceList; + #print deviceList; return deviceList -# Form a configuration file for operation of glusterfs +# Form a configuration file for operation of glusterfs def write_glusterFS_configuration( nodesinfo, glusterFSargs ): config_file = fetch_config_and_check( config, ["glusterFS", "glustefs_nodes_yaml" ]) config_glusterFS = fetch_config_and_check( config, ["glusterFS"] ) @@ -2140,8 +2189,8 @@ def write_glusterFS_configuration( nodesinfo, glusterFSargs ): for volume, volume_config in glusterfs_groups[glusterfs_group]["gluster_volumes"].iteritems(): for param in required_param: if not param in volume_config: - print "Error: please check configuration file ..." - print "Gluster group %s volume %s doesn't have a required parameter %s" % (glusterfs_group, volume, param) + print "Error: please check configuration file ..." + print "Gluster group %s volume %s doesn't have a required parameter %s" % (glusterfs_group, volume, param) exit() glusterfs_groups[glusterfs_group]["nodes"] = [] glusterfs_groups[glusterfs_group]["nodes"].append( node ) @@ -2150,7 +2199,7 @@ def write_glusterFS_configuration( nodesinfo, glusterFSargs ): yaml.dump(config_glusterFS, datafile, default_flow_style=False) return config_glusterFS -# Form YAML file for glusterfs endpoints, launch glusterfs endpoints. +# Form YAML file for glusterfs endpoints, launch glusterfs endpoints. 
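A minimal sketch of the sizing rule documented above (the real calculate_partitions differs in bookkeeping details): entries below zero request an absolute size in GB and are carved out first, then the remaining capacity is split proportionally among the positive entries.

def calculate_partitions(capacity, partition_config):
    sizes = [0.0] * len(partition_config)
    for i, p in enumerate(partition_config):
        if p < 0.0:                               # absolute request, capped by what is left
            sizes[i] = min(-p, capacity)
            capacity -= sizes[i]
    total = sum(p for p in partition_config if p > 0.0)
    for i, p in enumerate(partition_config):
        if p > 0.0 and total > 0.0:
            sizes[i] = capacity * p / total
    return sizes

calculate_partitions(100.0, [-20, 1, 3])          # -> [20.0, 20.0, 60.0]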
def launch_glusterFS_endpoint( nodesinfo, glusterFSargs ): os.system( "mkdir -p ./deploy/services/glusterFS_ep" ) config_glusterFS = write_glusterFS_configuration( nodesinfo, glusterFSargs ) @@ -2178,7 +2227,7 @@ def stop_glusterFS_endpoint( ): def format_mount_partition_volume( nodes, deviceSelect, format=True ): nodesinfo = get_partitions(nodes, deviceSelect ) - #if verbose: + #if verbose: # print nodesinfo reg = re.compile( deviceSelect ) for node in nodesinfo: @@ -2186,14 +2235,14 @@ def format_mount_partition_volume( nodes, deviceSelect, format=True ): volumes = find_matched_volume( alldeviceinfo, reg ) if verbose: print "................. Node %s ................." % node - print "Node = %s, volume = %s " % ( node, str(volumes)) + print "Node = %s, volume = %s " % ( node, str(volumes)) remotecmd = "" - if format: + if format: for volume in volumes: remotecmd += "sudo %s %s; " % ( fetch_config( config, ["localdisk", "mkfscmd"]), volume) - hdfsconfig = {} + hdfsconfig = {} for volume in volumes: - # mount remote volumes. + # mount remote volumes. mountpoint = config["hdfs"]["datadir"][volume] remotecmd += "sudo mkdir -p %s; " % mountpoint remotecmd += "sudo mount %s %s; " % ( volume, mountpoint ) @@ -2203,7 +2252,7 @@ def format_mount_partition_volume( nodes, deviceSelect, format=True ): def unmount_partition_volume( nodes, deviceSelect ): nodesinfo = get_partitions(nodes, deviceSelect ) - #if verbose: + #if verbose: # print nodesinfo reg = re.compile( deviceSelect ) for node in nodesinfo: @@ -2211,10 +2260,10 @@ def unmount_partition_volume( nodes, deviceSelect ): volumes = find_matched_volume( alldeviceinfo, reg ) if verbose: print "................. Node %s ................." % node - print "Node = %s, volume = %s " % ( node, str(volumes)) + print "Node = %s, volume = %s " % ( node, str(volumes)) remotecmd = "" for volume in volumes: - # mount remote volumes. + # mount remote volumes. mountpoint = config["hdfs"]["datadir"][volume] remotecmd += "sudo umount %s; " % ( mountpoint ) utils.SSH_exec_cmd( config["ssh_cert"], config["admin_username"], node, remotecmd, showCmd=verbose ) @@ -2255,13 +2304,13 @@ def generate_hdfs_config( nodes, deviceSelect): config["hdfsconfig"]["namenode"] = hdfsconfig["namenode"] return hdfsconfig -# Write configuration for each hdfs node. +# Write configuration for each hdfs node. def hdfs_config( nodes, deviceSelect): hdfsconfig = generate_hdfs_config( nodes, deviceSelect ) - if verbose: + if verbose: print "HDFS Configuration: %s " % hdfsconfig nodesinfo = get_partitions(nodes, deviceSelect ) - #if verbose: + #if verbose: # print nodesinfo reg = re.compile( deviceSelect ) for node in nodesinfo: @@ -2269,10 +2318,10 @@ def hdfs_config( nodes, deviceSelect): volumes = find_matched_volume( alldeviceinfo, reg ) if verbose: print "................. Node %s ................." % node - print "Node = %s, volume = %s " % ( node, str(volumes)) + print "Node = %s, volume = %s " % ( node, str(volumes)) volumelist = [] for volume in volumes: - # mount remote volumes. + # mount remote volumes. devicename = volume[volume.rfind("/")+1:] mountpoint = os.path.join( config["local-mount-path"], devicename ) volumelist.append( mountpoint ) @@ -2287,7 +2336,7 @@ def hdfs_config( nodes, deviceSelect): zknodes = get_node_lists_for_service("zookeeper") for node in zknodes: if not (node in nodesinfo): - # The node is used for HDFS infrastructure, and needs configuration. + # The node is used for HDFS infrastructure, and needs configuration. 
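For clarity, a sketch of how the per-volume remote command is assembled in format_mount_partition_volume above; mkfscmd and the volume-to-mountpoint mapping are read from config, the values below are stand-ins:

mkfscmd = "mkfs -t ext4"                          # stand-in for config["localdisk"]["mkfscmd"]
datadir = {"/dev/sdb1": "/mnt/hdfs/sdb1"}         # stand-in for config["hdfs"]["datadir"]

remotecmd = ""
for volume, mountpoint in datadir.items():
    remotecmd += "sudo %s %s; " % (mkfscmd, volume)          # format the partition
    remotecmd += "sudo mkdir -p %s; " % mountpoint
    remotecmd += "sudo mount %s %s; " % (volume, mountpoint)
# remotecmd is then executed on the node through utils.SSH_exec_cmd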
os.system( "mkdir -p %s" % config["docker-run"]["hdfs"]["volumes"]["configDir"]["from"]) config_file = "%s/config.yaml" % config["docker-run"]["hdfs"]["volumes"]["configDir"]["from"] hdfsconfig["dfs"]["data"] = "" @@ -2295,10 +2344,10 @@ def hdfs_config( nodes, deviceSelect): yaml.dump(hdfsconfig, datafile, default_flow_style=False) utils.sudo_scp( config["ssh_cert"], config_file, config["hdfsconfig"]["configfile"], config["admin_username"], node) - # Render docker. + # Render docker. # utils.render_template_directory("../docker-images/hdfs", "./deploy/docker-images/hdfs", config, verbose) -# Create gluster FS volume +# Create gluster FS volume def create_glusterFS_volume( nodesinfo, glusterFSargs ): utils.render_template_directory("./storage/glusterFS", "./deploy/storage/glusterFS", config, verbose) config_glusterFS = write_glusterFS_configuration( nodesinfo, glusterFSargs ) @@ -2315,31 +2364,31 @@ def create_glusterFS_volume( nodesinfo, glusterFSargs ): remotecmd += "sudo apt-get install -y thin-provisioning-tools; " capacityGB = 0.0 for volume in volumes: - remotecmd += "sudo pvcreate -f " + remotecmd += "sudo pvcreate -f " dataalignment = fetch_config( config, ["glusterFS", "dataalignment"] ) - if not dataalignment is None: + if not dataalignment is None: remotecmd += " --dataalignment " + dataalignment remotecmd += " " + volume + "; " capacityGB += volumes[volume] - if len(volumes)>0: + if len(volumes)>0: remotecmd += "sudo vgcreate " physicalextentsize = fetch_config( config, ["glusterFS", "physicalextentsize"] ) - if not physicalextentsize is None: + if not physicalextentsize is None: remotecmd += " --physicalextentsize " + physicalextentsize; volumegroup = fetch_config_and_check( config, ["glusterFS", "volumegroup" ] ) remotecmd += " " + volumegroup for volume in volumes: - remotecmd += " " + volume; + remotecmd += " " + volume; remotecmd += "; " else: # The machine doesn't have any data disk, skip glusterFS setup - break; + break; volumesize = fetch_config_and_check( config, ["glusterFS", "volumesize" ] ) metasize = fetch_config_and_check(config, ["glusterFS", "metasize" ] ) metapoolname = fetch_config_and_check(config, ["glusterFS", "metapoolname" ] ) - # create metapool + # create metapool remotecmd += "sudo lvcreate -L %s --name %s %s ; " % ( metasize, metapoolname, volumegroup ) - # create datapool + # create datapool volumesize = fetch_config_and_check( config, ["glusterFS", "volumesize" ] ) datapoolname = fetch_config_and_check( config, ["glusterFS", "datapoolname" ] ) remotecmd += "sudo lvcreate -l %s --name %s %s ; " % ( volumesize, datapoolname, volumegroup ) @@ -2368,11 +2417,11 @@ def create_glusterFS_volume( nodesinfo, glusterFSargs ): for volume, volume_config in gluster_volumes.iteritems(): multiple = volume_config["multiple"] numnodes = len(othernodes) + 1 - # Find the number of subvolume needed. + # Find the number of subvolume needed. subvolumes = 1 while ( numnodes * subvolumes ) % multiple !=0: - subvolumes +=1; - if verbose: + subvolumes +=1; + if verbose: print( "Volume %s, multiple is %d, # of nodes = %d, make %d volumes ..." % (volume, multiple, numnodes, subvolumes) ) for sub in range(1, subvolumes + 1 ): remotecmd += "sudo mkdir -p %s; " % ( os.path.join( remotepath, volume ) + str(sub) ) @@ -2385,7 +2434,7 @@ def remove_glusterFS_volume( nodesinfo, glusterFSargs ): volumes = find_matched_volume( alldeviceinfo, regmatch ) print "................. Node %s ................." 
% node remotecmd = ""; - if len(volumes)>0: + if len(volumes)>0: volumegroup = fetch_config_and_check( config, ["glusterFS", "volumegroup" ] ) datapoolname = fetch_config_and_check( config, ["glusterFS", "datapoolname" ] ) volumename = fetch_config_and_check( config, ["glusterFS", "volumename" ] ) @@ -2394,7 +2443,7 @@ def remove_glusterFS_volume( nodesinfo, glusterFSargs ): remotecmd += "sudo vgremove -y %s ; " % volumegroup else: # The machine doesn't have any data disk, skip glusterFS removal - break; + break; for volume in volumes: remotecmd += "sudo pvremove -y %s; " % volume # print remotecmd @@ -2429,6 +2478,11 @@ def exec_on_all_with_output(nodes, args, supressWarning = False): print "Node: " + node print output +def exec_on_rand_master(args, supressWarning = False): + nodes = get_ETCD_master_nodes(config["clusterId"]) + master_node = random.choice(nodes) + exec_on_all_with_output([master_node], args, supressWarning) + # run a shell script on one remote node def run_script(node, args, sudo = False, supressWarning = False): if ".py" in args[0]: @@ -2456,12 +2510,17 @@ def run_script_on_all(nodes, args, sudo = False, supressWarning = False): for node in nodes: run_script( node, args, sudo = sudo, supressWarning = supressWarning) +def run_script_on_rand_master(nargs, args): + nodes = get_ETCD_master_nodes(config["clusterId"]) + master_node = random.choice(nodes) + run_script_on_all([master_node], nargs, sudo = args.sudo ) + def copy_to_all(nodes, src, dst): for node in nodes: rmt_cp(node, src, dst) def add_mac_dictionary( dic, name, mac): - mac = mac.lower() + mac = mac.lower() if mac in dic: if dic[mac] != name: print "Error, two mac entries " + mac + "for machine " + dic[mac] + ", " + name @@ -2585,7 +2644,7 @@ def update_config_nodes(): for node in nodes: update_config_node( node ) -# Running a kubectl commands. +# Running a kubectl commands. def run_kube( prog, commands ): one_command = " ".join(commands) kube_command = "" @@ -2605,7 +2664,7 @@ def run_kubectl( commands ): def kubernetes_get_node_name(node): kube_node_name = "" domain = get_domain() - if len(domain) < 2: + if len(domain) < 2: kube_node_name = node elif domain in node: # print "Remove domain %d" % len(domain) @@ -2627,7 +2686,7 @@ def render_service_templates(): # Additional parameter calculation set_zookeeper_cluster() generate_hdfs_containermounts() - # Multiple call of render_template will only render the directory once during execution. + # Multiple call of render_template will only render the directory once during execution. utils.render_template_directory( "./services/", "./deploy/services/", config) def get_all_services(): @@ -2657,7 +2716,7 @@ def get_all_services(): content = f.read() f.close() if content.find( "Deployment" )>=0 or content.find( "DaemonSet" )>=0 or content.find("ReplicaSet")>=0: - # Only add service if it is a daemonset. + # Only add service if it is a daemonset. servicedic[service] = yamlname return servicedic @@ -2695,7 +2754,7 @@ def kubernetes_label_node(cmdoptions, nodename, label): run_kubectl(["label nodes %s %s %s" % (cmdoptions, nodename, label)]) # Get the list of nodes for a particular service -# +# def get_node_lists_for_service(service): if "etcd_node" not in config or "worker_node" not in config: check_master_ETCD_status() @@ -2726,13 +2785,13 @@ def get_node_lists_for_service(service): nodes.append(node) return nodes -# Label kubernete nodes according to a service. +# Label kubernete nodes according to a service. 
# A service (usually a Kubernete daemon service) can request to be run on: # all: all nodes # etcd_node: all etcd node # etcd_node_n: a particular etcd node # worker_node: all worker node -# The kubernete node will be marked accordingly to facilitate the running of daemon service. +# The kubernete node will be marked accordingly to facilitate the running of daemon service. def kubernetes_label_nodes( verb, servicelists, force ): servicedic = get_all_services() # print servicedic @@ -2755,7 +2814,7 @@ def kubernetes_label_nodes( verb, servicelists, force ): # print servicelists for label in servicelists: nodes = get_node_lists_for_service(label) - if verbose: + if verbose: print "kubernetes: apply action %s to label %s to nodes: %s" %(verb, label, nodes) if force: cmdoptions = "--overwrite" @@ -2770,6 +2829,14 @@ def kubernetes_label_nodes( verb, servicelists, force ): elif verb == "remove": kubernetes_label_node(cmdoptions, nodename, label+"-") + +# Label kubernete nodes with gpu types.skip for CPU workers +def kubernetes_label_GpuTypes(): + for nodename,nodeInfo in config["machines"].items(): + if nodeInfo["role"] == "worker" and nodeInfo["gpu-type"] != "NULL": + kubernetes_label_node("--overwrite", nodename, "gpuType="+nodeInfo["gpu-type"]) + + def kubernetes_patch_nodes_provider (provider, scaledOnly): nodes = [] if scaledOnly: @@ -2783,7 +2850,7 @@ def kubernetes_patch_nodes_provider (provider, scaledOnly): # Label kubernete nodes according to property of node (usually specified in config.yaml or cluster.yaml) # Certain property of node: -# E.g., rack +# E.g., rack def kubernetes_mark_nodes( marklist, bMark ): if marklist == []: marklist = config["kubemarks"] @@ -2805,7 +2872,7 @@ def kubernetes_mark_nodes( marklist, bMark ): def start_one_kube_service(fname): if verbose: - # use try/except because yaml.load cannot load yaml file with multiple documents. + # use try/except because yaml.load cannot load yaml file with multiple documents. try: f = open(fname) service_yaml = yaml.load(f) @@ -2815,6 +2882,10 @@ def start_one_kube_service(fname): except Exception as e: pass + if fname == "./deploy/services/jobmanager/jobmanager.yaml": + # recreate the configmap dlws-scripts + run_kubectl( ["create configmap dlws-scripts --from-file=../Jobs_Templete/ -o yaml --dry-run | ./deploy/bin/kubectl apply -f -"] ) + run_kubectl( ["create", "-f", fname ] ) def stop_one_kube_service(fname): @@ -2905,16 +2976,22 @@ def run_docker_image( imagename, native = False, sudo = False ): matches = find_dockers( imagename ) if len( matches ) == 0: print "Error: can't find any docker image built by name %s, you may need to build the relevant docker first..." % imagename - elif len( matches ) > 1: + elif len( matches ) > 1: print "Error: find multiple dockers by name %s as %s, you may need to be more specific on which docker image to run " % ( imagename, str(matches)) else: - if native: + if native: os.system( "docker run --rm -ti " + matches[0] ) else: run_docker( matches[0], prompt = imagename, dockerConfig = dockerConfig, sudo = sudo ) +def gen_dns_config_script(): + utils.render_template("./template/dns/dns.sh.template", "deploy/kubeconfig/kubeconfig.yaml", config) + +def gen_pass_secret_script(): + utils.render_template("./template/secret/pass_secret.sh.template", "scripts/pass_secret.sh", config) + def run_command( args, command, nargs, parser ): - # If necessary, show parsed arguments. + # If necessary, show parsed arguments. 
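A sketch of the machines entries the new kubernetes_label_GpuTypes above expects; the keys ("role", "gpu-type") and the "NULL" sentinel come from the loop, node names and GPU type are examples:

config["machines"] = {
    "mycluster-worker01": {"role": "worker", "gpu-type": "P100"},   # gets labeled gpuType=P100
    "mycluster-worker02": {"role": "worker", "gpu-type": "NULL"},   # CPU-only worker, skipped
    "mycluster-infra01":  {"role": "infrastructure"},               # non-workers are ignored
}
# afterwards, jobs can target GPU models with a nodeSelector on the gpuType label.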
# print args global discoverserver global homeinserver @@ -2931,7 +3008,7 @@ def run_command( args, command, nargs, parser ): discoverserver = args.discoverserver homeinserver = args.homeinserver - if args.verbose: + if args.verbose: verbose = True utils.verbose = True print "Args = {0}".format(args) @@ -2945,7 +3022,7 @@ def run_command( args, command, nargs, parser ): config_cluster = os.path.join(dirpath,"cluster.yaml") if os.path.exists(config_cluster): merge_config( config, yaml.load(open(config_cluster))) - + config_file = os.path.join(dirpath,"config.yaml") # print "Config file: " + config_file @@ -2964,7 +3041,6 @@ def run_command( args, command, nargs, parser ): f.close() if "clusterId" in tmp: config["clusterId"] = tmp["clusterId"] - if "copy_sshtemp" in config and config["copy_sshtemp"]: if "ssh_origfile" not in config: config["ssh_origfile"] = config["ssh_cert"] @@ -2982,20 +3058,16 @@ def run_command( args, command, nargs, parser ): else: print "SSH Key {0} not found using original".format(sshfile) # exit() - add_acs_config(command) if verbose and config["isacs"]: print "Using Azure Container Services" - if os.path.exists("./deploy/clusterID.yml"): update_config() else: apply_config_mapping(config, default_config_mapping) update_docker_image_config() - # additional glusterfs launch parameter. config["launch-glusterfs-opt"] = args.glusterfs; - get_ssh_config() configuration( config, verbose ) if args.yes: @@ -3007,7 +3079,7 @@ def run_command( args, command, nargs, parser ): ipAddrMetaname = "clientIP" - if verbose: + if verbose: print "deploy " + command + " " + (" ".join(nargs)) print "PlatformScripts = {0}".format(config["platform-scripts"]) @@ -3034,15 +3106,15 @@ def run_command( args, command, nargs, parser ): elif command == "connect": check_master_ETCD_status() - if len(nargs) < 1 or nargs[0] == "master": + role2connect = nargs[0] + # print(role2connect, config["ssh_cert"], config["admin_username"]) + if len(nargs) < 1 or role2connect == "master": nodes = config["kubernetes_master_node"] - elif nargs[0] == "etcd": - nodes = config["etcd_node"] - elif nargs[0] == "worker": - nodes = config["worker_node"] + elif role2connect in ["etcd","worker","nfs"]: + nodes = config["{}_node".format(role2connect)] else: parser.print_help() - print "ERROR: must connect to either master, etcd or worker nodes" + print "ERROR: must connect to either master, etcd, nfs or worker nodes" exit() if len(nodes) == 0: parser.print_help() @@ -3066,10 +3138,10 @@ def run_command( args, command, nargs, parser ): if nargs[0] == "create": create_nfs_server() else: - print "Error: subcommand %s is not recognized for nfs-server. " % nargs[0] + print "Error: subcommand %s is not recognized for nfs-server. " % nargs[0] exit() else: - print "Error: nfs-server need a subcommand (create) " % nargs[0] + print "Error: nfs-server need a subcommand (create) " % nargs[0] exit() elif command == "build": @@ -3090,15 +3162,20 @@ def run_command( args, command, nargs, parser ): create_PXE_ubuntu() else: parser.print_help() - print "Error: build target %s is not recognized. " % nargs[0] + print "Error: build target %s is not recognized. " % nargs[0] exit() + elif command == "dnssetup": + os.system("./gene_loc_dns.sh") + nodes = get_nodes(config["clusterId"]) + run_script_on_all(nodes, "./scripts/dns.sh", sudo = args.sudo ) + elif command == "sshkey": if len(nargs) >=1 and nargs[0] == "install": install_ssh_key(nargs[1:]) else: parser.print_help() - print "Error: build target %s is not recognized. 
" % nargs[0] + print "Error: build target %s is not recognized. " % nargs[0] exit() elif command == "scan": @@ -3109,6 +3186,88 @@ def run_command( args, command, nargs, parser ): print "Error: scan need one parameter with format x.x.x.x/n. " exit() + elif command == "admin": + if len(nargs) >= 1: + if nargs[0] == "vc": + if len(nargs) >= 2: + if nargs[1] == "add": + url = "http://%s:%s/AddVC?vcName=%s"a=%s&metadata=%s&userName=Administrator" % (config["kubernetes_master_node"][0],config["restfulapiport"], nargs[2], nargs[3], nargs[4]) + response = requests.get(url) + print(response) + elif nargs[1] == "update": + url = "http://%s:%s/UpdateVC?vcName=%s"a=%s&metadata=%s&userName=Administrator" \ + % (config["kubernetes_master_node"][0],config["restfulapiport"], nargs[2], nargs[3], nargs[4]) + response = requests.get(url) + print(response) + elif nargs[1] == "delete": + url = "http://%s:%s/DeleteVC?vcName=%s&userName=Administrator" % (config["kubernetes_master_node"][0],config["restfulapiport"], nargs[2]) + response = requests.get(url) + print(response) + elif nargs[1] == "list": + url = "http://%s:%s/ListVCs?userName=Administrator" % (config["kubernetes_master_node"][0],config["restfulapiport"]) + response = requests.get(url) + print(response.text) + elif nargs[0] == "storage": + if len(nargs) >= 2: + if nargs[1] == "add": + url = "http://%s:%s/AddStorage?vcName=%s&url=%s&storageType=%s&metadata=%s&defaultMountPath=%s&userName=Administrator" % (config["kubernetes_master_node"][0],config["restfulapiport"], nargs[2], nargs[3], nargs[4], nargs[5], nargs[6]) + response = requests.get(url) + print(response) + elif nargs[1] == "update": + url = "http://%s:%s/UpdateStorage?vcName=%s&url=%s&storageType=%s&metadata=%s&defaultMountPath=%s&userName=Administrator" % (config["kubernetes_master_node"][0],config["restfulapiport"], nargs[2], nargs[3], nargs[4], nargs[5], nargs[6]) + response = requests.get(url) + print(response) + elif nargs[1] == "delete": + url = "http://%s:%s/DeleteStorage?vcName=%s&url=%s&userName=Administrator" % (config["kubernetes_master_node"][0],config["restfulapiport"], nargs[2], nargs[3]) + response = requests.get(url) + print(response) + elif nargs[1] == "list": + url = "http://%s:%s/ListStorages?vcName=%s&userName=Administrator" % (config["kubernetes_master_node"][0],config["restfulapiport"], nargs[2]) + response = requests.get(url) + print(response.text) + elif nargs[0] == "acl": + if len(nargs) >= 2: + if nargs[1] == "update": + url = "http://%s:%s/UpdateAce?identityName=%s&resourceType=%s&resourceName=%s&permissions=%s&userName=Administrator" % (config["kubernetes_master_node"][0],config["restfulapiport"], nargs[2], nargs[3], nargs[4], nargs[5]) + response = requests.get(url) + print(response) + elif nargs[1] == "list": + url = "http://%s:%s/GetACL?userName=Administrator" % (config["kubernetes_master_node"][0],config["restfulapiport"]) + response = requests.get(url) + print(response.text) + elif nargs[1] == "delete": + url = "http://%s:%s/DeleteAce?identityName=%s&resourceType=%s&resourceName=%s&userName=Administrator" % (config["kubernetes_master_node"][0],config["restfulapiport"],nargs[2],nargs[3],nargs[4]) + response = requests.get(url) + print(response.text) + elif nargs[0] == "job": + if len(nargs) >= 2: + if nargs[1] == "add": + url = "http://%s:%s/SubmitJob?jobName=%s&vcName=%s&resourcegpu=%s&gpuType=%s&dataPath=%s&workPath=%s&image=%s&jobType=%s&preemptionAllowed=%s&userName=Administrator" \ + % (config["kubernetes_master_node"][0],config["restfulapiport"], 
nargs[2], nargs[3], nargs[4], nargs[5], nargs[6], nargs[7], nargs[8], nargs[9], nargs[10]) + response = requests.get(url) + print(response.text) + elif nargs[1] == "delete": + url = "http://%s:%s/KillJob?jobId=%s&userName=Administrator" \ + % (config["kubernetes_master_node"][0],config["restfulapiport"], nargs[2]) + response = requests.get(url) + print(response.text) + elif nargs[1] == "list": + url = "http://%s:%s/ListJobs?vcName=%s&jobOwner=%s&num=%s&userName=Administrator" \ + % (config["kubernetes_master_node"][0],config["restfulapiport"], nargs[2], nargs[3], nargs[4]) + response = requests.get(url) + print(response.text) + elif nargs[0] == "user": + if len(nargs) >= 2: + if nargs[1] == "add": + if len(nargs) <= 3: + url = "http://%s:%s/AddUser?userName=%s" % (config["kubernetes_master_node"][0],config["restfulapiport"], nargs[2]) + response = requests.get(url) + print(response) + else: + url = "http://%s:%s/AddUser?userName=%s&uid=%s&gid=%s&groups=%s" % (config["kubernetes_master_node"][0],config["restfulapiport"], nargs[2], nargs[3], nargs[4], nargs[5]) + response = requests.get(url) + print(response) + elif command == "updateworker": response = raw_input_with_default("Deploy Worker Nodes (y/n)?") @@ -3173,7 +3332,7 @@ def run_command( args, command, nargs, parser ): elif command == "partition" and len(nargs) >= 1: nodes = get_nodes(config["clusterId"]) if nargs[0] == "ls": - # Display parititons. + # Display parititons. print "Show partition on data disk: " + config["data-disk"] nodesinfo = show_partitions(nodes, config["data-disk"] ) @@ -3199,7 +3358,7 @@ def run_command( args, command, nargs, parser ): elif command == "glusterFS_heketi" and len(nargs) >= 1: # nodes = get_nodes(config["clusterId"]) - # ToDo: change pending, schedule glusterFS on master & ETCD nodes, + # ToDo: change pending, schedule glusterFS on master & ETCD nodes, if nargs[0] == "start" or nargs[0] == "update" or nargs[0] == "stop" or nargs[0] == "clear": nodes = get_worker_nodes(config["clusterId"], False) nodesinfo = get_partitions(nodes, config["data-disk"] ) @@ -3227,7 +3386,7 @@ def run_command( args, command, nargs, parser ): elif command == "glusterfs" and len(nargs) >= 1: allnodes = get_nodes(config["clusterId"]) - # ToDo: change pending, schedule glusterFS on master & ETCD nodes, + # ToDo: change pending, schedule glusterFS on master & ETCD nodes, nodes = get_node_lists_for_service("glusterfs") glusterFSargs = fetch_config( config, ["glusterFS", "partitions"] ) if nargs[0] == "display": @@ -3250,7 +3409,7 @@ def run_command( args, command, nargs, parser ): if response == "REMOVE": remove_glusterFS_volume( nodesinfo, glusterFSargs ) elif nargs[0] == "config": - write_glusterFS_configuration( nodesinfo, glusterFSargs ) + write_glusterFS_configuration( nodesinfo, glusterFSargs ) dockername = fetch_config_and_check(config, ["glusterFS", "glusterfs_docker"]) push_docker_images( [dockername] ) elif nargs[0] == "start": @@ -3292,13 +3451,30 @@ def run_command( args, command, nargs, parser ): elif command == "execonall" and len(nargs)>=1: nodes = get_nodes(config["clusterId"]) - print "Exec on all: " + str(nodes) + print "Exec on all: " + str(nodes) exec_on_all_with_output(nodes, nargs) elif command == "runscriptonall" and len(nargs)>=1: nodes = get_nodes(config["clusterId"]) + # print(nodes) run_script_on_all(nodes, nargs, sudo = args.sudo ) + elif command == "runscriptonroles": + assert len(nargs)>=1 + nodeset, scripts_start = [], 0 + for ni, arg in enumerate(nargs): + scripts_start = ni + if arg in allroles: 
+ nodeset += arg, + else: + break + nodes = get_nodes_by_roles(nodeset) + # print(nodes) + run_script_on_all(nodes, nargs[scripts_start:], sudo = args.sudo ) + + elif command == "runscriptonrandmaster" and len(nargs)>=1: + run_script_on_rand_master(nargs, args) + elif command == "runscriptonscaleup" and len(nargs)>=1: nodes = get_scaled_nodes(config["clusterId"]) run_script_on_all(nodes, nargs, sudo = args.sudo ) @@ -3364,7 +3540,7 @@ def run_command( args, command, nargs, parser ): elif command == "production": set_host_names_by_lookup() success = deploy_ETCD_master() - if success: + if success: update_worker_nodes( [] ) elif command == "azure": @@ -3439,7 +3615,7 @@ def run_command( args, command, nargs, parser ): elif command == "kubernetes": configuration( config, verbose ) - if len(nargs) >= 1: + if len(nargs) >= 1: if len(nargs)>=2: servicenames = nargs[1:] else: @@ -3456,7 +3632,7 @@ def run_command( args, command, nargs, parser ): response = raw_input ("Please type (WIPEOUT) in ALL CAPITALS to confirm the operation ---> ") if response == "WIPEOUT": config["hdfsconfig"]["formatoptions"] = "--force " - # Start a kubelet service. + # Start a kubelet service. for servicename in servicenames: start_kube_service(servicename) elif nargs[0] == "stop": @@ -3490,6 +3666,8 @@ def run_command( args, command, nargs, parser ): kubernetes_mark_nodes( nargs[1:], False) elif nargs[0] == "cordon" or nargs[0] == "uncordon": run_kube_command_on_nodes(nargs) + elif nargs[0] == "labelvc": + kubernetes_label_vc(True) else: parser.print_help() print "Error: Unknown kubernetes subcommand " + nargs[0] @@ -3498,6 +3676,17 @@ def run_command( args, command, nargs, parser ): print "Error: kubernetes need a subcommand." exit() + elif command == "gpulabel": + kubernetes_label_GpuTypes() + + elif command == "genscripts": + # print(config["azure_cluster"].keys()) + gen_dns_config_script() + gen_pass_secret_script() + + elif command == "setconfigmap": + os.system('./deploy/bin/kubectl create configmap dlws-scripts --from-file=../Jobs_Templete -o yaml --dry-run | ./deploy.py kubectl apply -f -') + elif command == "download": if len(nargs)>=1: if nargs[0] == "kubectl" or nargs[0] == "kubelet": @@ -3546,7 +3735,7 @@ def run_command( args, command, nargs, parser ): push_docker_images(nargs[1:]) elif nargs[0] == "run": if len(nargs)>=2: - run_docker_image( nargs[1], args.native, sudo = args.sudo ) + run_docker_image( nargs[1], args.native, sudo = args.sudo ) else: parser.print_help() print "Error: docker run expects an image name " @@ -3566,6 +3755,16 @@ def run_command( args, command, nargs, parser ): template_file = nargs[0] target_file = nargs[1] utils.render_template(template_file, target_file,config) + elif command == "upgrade_masters": + gen_configs() + upgrade_masters() + elif command == "upgrade_workers": + gen_configs() + upgrade_workers(nargs) + elif command == "upgrade": + gen_configs() + upgrade_masters() + upgrade_workers(nargs) elif command in scriptblocks: run_script_blocks(args.verbose, scriptblocks[command]) else: @@ -3588,8 +3787,116 @@ def run_script_blocks( verbose, script_collection ): args.verbose = verbose run_command( args, command, nargs, parser ) +def upgrade_worker_node(nodeIP): + print "===============================================" + print "upgrading worker node: %s ..." 
% nodeIP + + worker_ssh_user = config["admin_username"] + utils.SSH_exec_script(config["ssh_cert"],worker_ssh_user, nodeIP, "./deploy/kubelet/pre-worker-upgrade.sh") + + with open("./deploy/kubelet/upgrade.list", "r") as f: + deploy_files = [s.split(",") for s in f.readlines() if len(s.split(",")) == 2] + for (source, target) in deploy_files: + if (os.path.isfile(source.strip()) or os.path.exists(source.strip())): + utils.sudo_scp(config["ssh_cert"],source.strip(),target.strip(),worker_ssh_user, nodeIP) + + utils.SSH_exec_script(config["ssh_cert"],worker_ssh_user, nodeIP, "./deploy/kubelet/post-worker-upgrade.sh") + +def upgrade_workers(nargs, hypekube_url="gcr.io/google-containers/hyperkube:v1.15.2"): + config["dockers"]["external"]["hyperkube"]["fullname"] = hypekube_url + config["dockers"]["container"]["hyperkube"]["fullname"] = hypekube_url + + utils.render_template_directory("./template/kubelet", "./deploy/kubelet", config) + write_nodelist_yaml() + + os.system('sed "s/##etcd_endpoints##/%s/" "./deploy/kubelet/options.env.template" > "./deploy/kubelet/options.env"' % config["etcd_endpoints"].replace("/","\\/")) + os.system('sed "s/##api_servers##/%s/" ./deploy/kubelet/kubelet.service.template > ./deploy/kubelet/kubelet.service' % config["api_servers"].replace("/","\\/")) + os.system('sed "s/##api_servers##/%s/" ./deploy/kubelet/worker-kubeconfig.yaml.template > ./deploy/kubelet/worker-kubeconfig.yaml' % config["api_servers"].replace("/","\\/")) + + get_hyperkube_docker() + + workerNodes = get_worker_nodes(config["clusterId"], False) + workerNodes = limit_nodes(workerNodes) + for node in workerNodes: + if in_list(node, nargs): + upgrade_worker_node(node) + + os.system("rm ./deploy/kubelet/options.env") + os.system("rm ./deploy/kubelet/kubelet.service") + os.system("rm ./deploy/kubelet/worker-kubeconfig.yaml") + +def upgrade_master(kubernetes_master): + print "===============================================" + kubernetes_master_user = config["kubernetes_master_ssh_user"] + print "starting kubernetes master on %s..." 
% kubernetes_master + + config["master_ip"] = utils.getIP(kubernetes_master) + utils.render_template("./template/master/kube-apiserver.yaml","./deploy/master/kube-apiserver.yaml",config) + utils.render_template("./template/master/dns-kubeconfig.yaml","./deploy/master/dns-kubeconfig.yaml",config) + utils.render_template("./template/master/kubelet.service","./deploy/master/kubelet.service",config) + utils.render_template("./template/master/pre-upgrade.sh", "./deploy/master/pre-upgrade.sh", config) + utils.render_template("./template/master/post-upgrade.sh", "./deploy/master/post-upgrade.sh", config) + + utils.SSH_exec_script(config["ssh_cert"],kubernetes_master_user, kubernetes_master, "./deploy/master/pre-upgrade.sh") + + with open("./deploy/master/upgrade.list", "r") as f: + deploy_files = [s.split(",") for s in f.readlines() if len(s.split(",")) == 2] + + for (source, target) in deploy_files: + if (os.path.isfile(source.strip()) or os.path.exists(source.strip())): + utils.sudo_scp(config["ssh_cert"],source.strip(),target.strip(),kubernetes_master_user,kubernetes_master, verbose=verbose) + + utils.SSH_exec_script(config["ssh_cert"],kubernetes_master_user, kubernetes_master, "./deploy/master/post-upgrade.sh") + +def upgrade_masters(hypekube_url="gcr.io/google-containers/hyperkube:v1.15.2"): + config["dockers"]["external"]["hyperkube"]["fullname"] = hypekube_url + config["dockers"]["container"]["hyperkube"]["fullname"] = hypekube_url + + kubernetes_masters = config["kubernetes_master_node"] + kubernetes_master_user = config["kubernetes_master_ssh_user"] + + get_kubectl_binary(force=True) + + utils.render_template_directory("./template/master", "./deploy/master",config) + utils.render_template_directory("./template/kube-addons", "./deploy/kube-addons",config) + + for kubernetes_master in kubernetes_masters: + upgrade_master(kubernetes_master) + deploy_cmd = """ + until curl -q http://127.0.0.1:8080/version/ ; do + sleep 5; + echo 'waiting for master...'; + done; + + until sudo /opt/bin/kubectl apply -f /opt/addons/kube-addons/weave.yaml --validate=false ; do + sleep 5; + echo 'waiting for master...'; + done ; + + until sudo /opt/bin/kubectl apply -f /opt/addons/kube-addons/dashboard.yaml --validate=false ; do + sleep 5; + echo 'waiting for master...'; + done ; + + until sudo /opt/bin/kubectl apply -f /opt/addons/kube-addons/dns-addon.yml --validate=false ; do + sleep 5; + echo 'waiting for master...'; + done ; + + until sudo /opt/bin/kubectl apply -f /opt/addons/kube-addons/kube-proxy.json --validate=false ; do + sleep 5; + echo 'waiting for master...'; + done ; + + until sudo /opt/bin/kubectl apply -f /etc/kubernetes/clusterroles/ ; do + sleep 5; + echo 'waiting for master...'; + done ; + """ + utils.SSH_exec_cmd(config["ssh_cert"], kubernetes_master_user, kubernetes_masters[0], deploy_cmd , False) + if __name__ == '__main__': - # the program always run at the current directory. + # the program always run at the current directory. dirpath = os.path.dirname(os.path.abspath(os.path.realpath(__file__))) # print "Directory: " + dirpath os.chdir(dirpath) @@ -3603,133 +3910,136 @@ def run_script_blocks( verbose, script_collection ): * Metadata of deployed cluster is stored at deploy. Command: - scriptblocks Execute a block of scripts. + scriptblocks Execute a block of scripts. azure - build [arg] Build deployment environment + build [arg] Build deployment environment arg="": should be executed first, generate keys for the cluster arg=iso-coreos: build ISO image fore CoreOS deployment. 
- arg=pxe-coreos: build PXE server for CoreOS deployment. + arg=pxe-coreos: build PXE server for CoreOS deployment. arg=pxe-ubuntu: build PXE server for Ubuntu deployment. [We use standard Ubuntu ISO for Ubuntu ISO deployment. ] - nfs-server create: Create NFS-server. - sshkey install: [Ubuntu] install sshkey to Ubuntu cluster. + nfs-server create: Create NFS-server. + sshkey install: [Ubuntu] install sshkey to Ubuntu cluster. production [nodes] Deploy a production cluster, with tasks of: - set hostname, deploy etcd/master nodes, deploy worker nodes, uncordon master nodes. + set hostname, deploy etcd/master nodes, deploy worker nodes, uncordon master nodes. deploy Deploy DL workspace cluster. - updateworker [nodes] Update the worker nodes. If no additional node is specified, all nodes will be updated. + updateworker [nodes] Update the worker nodes. If no additional node is specified, all nodes will be updated. clean Clean away a failed deployment. - update [args] Update cluster. - config: update cloud-config of each deployed node. + update [args] Update cluster. + config: update cloud-config of each deployed node. connect [master|etcd|worker] num: Connect to either master, etcd or worker node (with an index number). hostname [args] manage hostname on the cluster set: set hostname - uncordon allow etcd/master nodes to be scheduled jobs - partition [args] Manage data partitions. - ls: show all existing partitions. + uncordon allow etcd/master nodes to be scheduled jobs + partition [args] Manage data partitions. + ls: show all existing partitions. create n: create n partitions of equal size. create s1 s2 ... sn: create n partitions; - if s_i < 0, the partition is s_i GB, - if s_i > 0, the partition is in portitional to s_i. - We use parted mkpart percentage% to create partitions. As such, the minimum partition is 1% of a disk. + if s_i < 0, the partition is s_i GB, + if s_i > 0, the partition is in portitional to s_i. + We use parted mkpart percentage% to create partitions. As such, the minimum partition is 1% of a disk. mount install | start | stop | link - start: mount all fileshares + start: mount all fileshares install: install all client components related to the fileshare stop: unmount all fileshares nolink: mount all fileshares, but doesnot symbolic link to the mount share - glusterfs [args] manage glusterFS on the cluster. - display: display lvm information on each node of the cluster. - create: formatting and create lvm for used by glusterfs. - remove: deletel and remove glusterfs volumes. + glusterfs [args] manage glusterFS on the cluster. + display: display lvm information on each node of the cluster. + create: formatting and create lvm for used by glusterfs. + remove: deletel and remove glusterfs volumes. config: generate configuration file, build and push glusterfs docker. - start: start glusterfs service and endpoints. - stop: stop glusterfs service and endpoints. - hdfs [args] manage HDFS on the cluster. - create: formatting and create local drive for use by HDFS. - mount: mount local drive for use by HDFS. + start: start glusterfs service and endpoints. + stop: stop glusterfs service and endpoints. + hdfs [args] manage HDFS on the cluster. + create: formatting and create local drive for use by HDFS. + mount: mount local drive for use by HDFS. umount: unmount local drive that is used for HDFS. download [args] Manage download kubectl: download kubelet/kubectl. kubelet: download kubelet/kubectl. 
- backup [fname] [key] Backup configuration & encrypt, fname is the backup file without surfix. - If key exists, the backup file will be encrypted. - restore [fname] [key] Decrypt & restore configuration, fname is the backup file with surfix. - If the backup file is encrypted, a key needs to be provided to decrypt the configuration. + backup [fname] [key] Backup configuration & encrypt, fname is the backup file without surfix. + If key exists, the backup file will be encrypted. + restore [fname] [key] Decrypt & restore configuration, fname is the backup file with surfix. + If the backup file is encrypted, a key needs to be provided to decrypt the configuration. etcd [args] manage etcd server. check: check ETCD service. - kubernetes [args] manage kubelet services on the cluster. - start: launch a certain kubelet service. - stop: stop a certain kubelet service. - restart: replace a certain kubelet service. - cordon [node]: cordon certain nodes. If no node, cordon all etcd nodes. - uncordon [node]: uncordon certain nodes. If no node, uncordon all etcd nodes. - labels verb [services]: applying labels to node according to service (usually daemon) setup. + kubernetes [args] manage kubelet services on the cluster. + start: launch a certain kubelet service. + stop: stop a certain kubelet service. + restart: replace a certain kubelet service. + cordon [node]: cordon certain nodes. If no node, cordon all etcd nodes. + uncordon [node]: uncordon certain nodes. If no node, uncordon all etcd nodes. + labels verb [services]: applying labels to node according to service (usually daemon) setup. -y: overwrite existing value verb: active, inactive, remove (default=on) services: if none, apply to all services in the service directory mark [properties]: applying labels on node according to node property (usually in cluster.yaml) unmark [properties]: removing labels on node according to node property (usually in cluster.yaml) - kubectl [args] run a native kubectl command. - docker [args] manage docker images. - build: build one or more docker images associated with the current deployment. + kubectl [args] run a native kubectl command. + docker [args] manage docker images. + build: build one or more docker images associated with the current deployment. push: build and push one or more docker images to register run [--sudo]: run a docker image (--sudo: in super user mode) nginx [args] manage nginx reverse proxy config: config nginx node, mainly install file that specify how to direct traffic fqdn: config nginx node, install FQDN for each node - execonall [cmd ... ] Execute the command on all nodes and print the output. - doonall [cmd ... ] Execute the command on all nodes. - runscriptonall [script] Execute the shell/python script on all nodes. + execonall [cmd ... ] Execute the command on all nodes and print the output. + doonall [cmd ... ] Execute the command on all nodes. + runscriptonall [script] Execute the shell/python script on all nodes. listmac display mac address of the cluster notes checkconfig display config items rendertemplate template_file target_file + upgrade_masters Upgrade the master nodes. + upgrade_workers [nodes] Upgrade the worker nodes. If no additional node is specified, all nodes will be updated. + upgrade [nodes] Upgrade the cluster and nodes. If no additional node is specified, all nodes will be updated. 
''') ) - parser.add_argument("-y", "--yes", - help="Answer yes automatically for all prompt", + parser.add_argument("-y", "--yes", + help="Answer yes automatically for all prompt", action="store_true" ) - parser.add_argument("--force", - help="Force perform certain operation", + parser.add_argument("--force", + help="Force perform certain operation", action="store_true" ) - parser.add_argument("--native", - help="Run docker in native mode (in how it is built)", + parser.add_argument("--native", + help="Run docker in native mode (in how it is built)", action="store_true" ) - parser.add_argument("-p", "--public", - help="Use public IP address to deploy/connect [e.g., Azure, AWS]", + parser.add_argument("-p", "--public", + help="Use public IP address to deploy/connect [e.g., Azure, AWS]", action="store_true") - parser.add_argument("-s", "--sudo", - help = "Execute scripts in sudo", + parser.add_argument("-s", "--sudo", + help = "Execute scripts in sudo", action="store_true" ) - parser.add_argument("--discoverserver", - help = "Specify an alternative discover server, default = " + default_config_parameters["discoverserver"], - action="store", + parser.add_argument("--discoverserver", + help = "Specify an alternative discover server, default = " + default_config_parameters["discoverserver"], + action="store", default=default_config_parameters["discoverserver"]) - parser.add_argument("--homeinserver", - help = "Specify an alternative home in server, default = " + default_config_parameters["homeinserver"], - action="store", + parser.add_argument("--homeinserver", + help = "Specify an alternative home in server, default = " + default_config_parameters["homeinserver"], + action="store", default=default_config_parameters["homeinserver"]) - parser.add_argument("-v", "--verbose", - help = "verbose print", + parser.add_argument("-v", "--verbose", + help = "verbose print", action="store_true") - parser.add_argument("--nocache", - help = "Build docker without cache", + parser.add_argument("--nocache", + help = "Build docker without cache", action="store_true") - parser.add_argument("--glusterfs", + parser.add_argument("--glusterfs", help = textwrap.dedent('''"Additional glusterfs launch parameter, \ - detach: detach all glusterfs nodes (to rebuild cluster), + detach: detach all glusterfs nodes (to rebuild cluster), start: initiate cluster (all nodes need to be operative during start stage to construct the cluster), - run: continuous operation, - ''' ), - action="store", + run: continuous operation, + ''' ), + action="store", default="run" ) - parser.add_argument("--nodes", - help = "Specify an python regular expression that limit the nodes that the operation is applied.", + parser.add_argument("--nodes", + help = "Specify an python regular expression that limit the nodes that the operation is applied.", action="store", default=None ) - parser.add_argument("command", + parser.add_argument("command", help = "See above for the list of valid command" ) - parser.add_argument('nargs', nargs=argparse.REMAINDER, - help="Additional command argument", + parser.add_argument('nargs', nargs=argparse.REMAINDER, + help="Additional command argument", ) args = parser.parse_args() command = args.command @@ -3752,4 +4062,3 @@ def run_script_blocks( verbose, script_collection ): print "Error: Unknown scriptblocks " + nargs[0] else: run_command( args, command, nargs, parser) - diff --git a/src/ClusterBootstrap/install_prerequisites.sh b/src/ClusterBootstrap/install_prerequisites.sh index 314628915..81ffdf3b8 100755 --- 
a/src/ClusterBootstrap/install_prerequisites.sh +++ b/src/ClusterBootstrap/install_prerequisites.sh @@ -1,31 +1,42 @@ -#!/bin/bash - -sudo apt-get update -sudo apt-get install -y --no-install-recommends \ - apt-utils \ - software-properties-common \ - git \ - curl \ - python-pip \ - wget \ - cpio \ - mkisofs \ - apt-transport-https \ - openssh-client \ - ca-certificates - -# Install docker -curl -fsSL https://yum.dockerproject.org/gpg | apt-key add - -sudo add-apt-repository \ - "deb https://apt.dockerproject.org/repo/ \ - ubuntu-$(lsb_release -cs) \ - main" -sudo apt-get update -sudo apt-get install -y --no-install-recommends \ - docker-engine - -pip install --upgrade pip -pip install setuptools && pip install pyyaml && pip install jinja2 - -sudo echo "dockerd > /dev/null 2>&1 &" | cat >> /etc/bash.bashrc - +#!/bin/bash +sudo apt-get update +sudo apt-get install -y --no-install-recommends \ + apt-utils \ + software-properties-common \ + git \ + curl \ + python-dev \ + python-pip \ + wget \ + cpio \ + mkisofs \ + apt-transport-https \ + openssh-client \ + ca-certificates \ + network-manager + +sudo apt-get install libcurl4-openssl-dev libssl-dev gcc libnss3-dev libgnutls28-dev +sudo apt-get install -y python-subprocess32 +# Install docker +# curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - + +# sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" docker-ce +sudo apt install docker.io + +AZ_REPO=$(lsb_release -cs) +echo "deb [arch=amd64] https://packages.microsoft.com/repos/azure-cli/ $AZ_REPO main" | \ + sudo tee /etc/apt/sources.list.d/azure-cli.list + +sudo apt-key --keyring /etc/apt/trusted.gpg.d/Microsoft.gpg adv \ + --keyserver packages.microsoft.com \ + --recv-keys BC528686B50D79E339D3721CEB3E94ADBE1229CF + +sudo apt-get update +sudo apt-get install -y --no-install-recommends +curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash + +# pip install --upgrade pip +sudo pip install setuptools && pip install pyyaml && pip install jinja2 && pip install requests && pip install tzlocal && pip install pycurl + +sudo echo "dockerd > /dev/null 2>&1 &" | cat >> /etc/bash.bashrc +./gene_loc_dns.sh diff --git a/src/ClusterBootstrap/params.py b/src/ClusterBootstrap/params.py index 178995008..0979bde91 100755 --- a/src/ClusterBootstrap/params.py +++ b/src/ClusterBootstrap/params.py @@ -19,12 +19,31 @@ "influxdb_rpc_port": "8088", "influxdb_data_path": "/var/lib/influxdb", + "prometheus": { "port": 9091, "reporter": {"port": 9092} }, + "job-exporter": { "port": 9102 }, + "node-exporter": { "port": 9100 }, + "watchdog": { "port": 9101 }, + "grafana": { "port": 3000, "prometheus-ip": "localhost" }, + "alert-manager": { + "port": 9093, + "configured": False, + "alert_users": False, + # If want to deploy with alert-manager, should config + # configured with True, and fill appropriate value to: + # smtp_url, smtp_from, smtp_auth_username, smtp_auth_password and receiver + "reaper": { + "dry-run": True, + "port": "9500", + "restful-url": "http://localhost:5000", + } + }, + "mysql_port": "3306", "mysql_username": "root", "mysql_data_path": "/var/lib/mysql", "datasource": "AzureSQL", - + "defalt_virtual_cluster_name": "platform", # Discover server is used to find IP address of the host, it need to be a well-known IP address # that is pingable. 
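    # A minimal sketch of why a well-known, pingable address is needed here (an
    # illustration only, under the assumption that the usual "connect a UDP socket
    # and read back the local address" trick is what the deploy scripts rely on;
    # the real lookup logic lives in those scripts):
    #
    #   import socket
    #   s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    #   s.connect(("4.2.2.1", 53))       # UDP connect sends no packet; it only selects a route
    #   local_ip = s.getsockname()[0]    # the host IP that faces the discover server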
"discoverserver": "4.2.2.1", @@ -74,6 +93,8 @@ ".js": True, ".swf": True, ".gzip": True, + ".rules": True, + ".tmpl": True, }, "render-by-copy": { # The following file will be copied (not rendered for configuration) @@ -87,9 +108,11 @@ "collectd.graphite.conf.tpl": True, "collectd.influxdb.conf.tpl": True, "collectd.riemann.conf.tpl": True, + "prometheus-alerting.yaml": True, + "alert-templates.yaml": True, # "nginx": True, "RecogServer": True, - + # This template will be rendered inside container, but not at build stage # "hdfs-site.xml.template": True, }, @@ -198,7 +221,9 @@ "jobmanager": "etcd_node_1", "FragmentGPUJob": "all", "grafana": "etcd_node_1", - "influxdb": "etcd_node_1", + "prometheus": "etcd_node_1", + "alert-manager": "etcd_node_1", + "watchdog": "etcd_node_1", "elasticsearch": "etcd_node_1", "kibana": "etcd_node_1", "mysql": "etcd_node_1", @@ -334,7 +359,7 @@ "CCSAdmins": { # The match is in C# Regex Language, please refer to : # https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx - "Allowed": ["jinl@microsoft.com", "hongzl@microsoft.com", "sanjeevm@microsoft.com"], + "Allowed": ["hongzl@microsoft.com", "anbhu@microsoft.com", "jachzh@microsoft.com","zhexu@microsoft.com","dixu@microsoft.com","qixcheng@microsoft.com","jingzhao@microsoft.com","hayua@microsoft.com"], "uid": "900000000-999999999", "gid": "508953967" }, @@ -372,7 +397,6 @@ "DeployAuthentications": ["Corp", "Live", "Gmail"], # You should remove WinBindServers if you will use # UserGroups for authentication. - "WinbindServers": ["http://onenet40.redmond.corp.microsoft.com/domaininfo/GetUserId?userName={0}"], "workFolderAccessPoint": "", "dataFolderAccessPoint": "", @@ -547,26 +571,26 @@ "tutorial-nlp": {}, "tutorial-fastai": {}, "tutorial-imagenet18": {}, - "gobld": {}, - "kubernetes": {}, + "gobld": {}, + "kubernetes": {}, }, "external": { # These dockers are to be built by additional add ons. 
- "hyperkube": {"fullname":"dlws/hyperkube:v1.9.0"}, - "freeflow": {"fullname":"dlws/freeflow:0.16"}, + "hyperkube": {"fullname":"gcr.io/google-containers/hyperkube:v1.15.2"}, + "freeflow": {"fullname":"dlws/freeflow:0.18"}, "podinfra": {"fullname":"dlws/pause-amd64:3.0"}, "nvidiadriver": {"fullname":"dlws/nvidia_driver:375.20"}, - "weave":{"fullname":"dlws/weave:2.2.0"}, - "weave-npc":{"fullname":"dlws/weave-npc:2.2.0"}, + "weave":{"fullname":"docker.io/weaveworks/weave-kube:2.5.2"}, + "weave-npc":{"fullname":"docker.io/weaveworks/weave-npc:2.5.2"}, "k8s-dashboard":{"fullname":"dlws/kubernetes-dashboard-amd64:v1.5.1"}, "kube-dns":{"fullname":"dlws/k8s-dns-kube-dns-amd64:1.14.8"}, "kube-dnsmasq":{"fullname":"dlws/k8s-dns-dnsmasq-nanny-amd64:1.14.8"}, "kube-dns-sidecar":{"fullname":"dlws/k8s-dns-sidecar-amd64:1.14.8"}, - "heapster":{"fullname":"dlws/heapster-amd64:v1.4.0"}, - "etcd":{"fullname":"dlws/etcd:3.1.10"}, - "mysql":{"fullname":"dlws/mysql:5.6"}, + "heapster":{"fullname":"dlws/heapster-amd64:v1.4.0"}, + "etcd":{"fullname":"dlws/etcd:3.1.10"}, + "mysql":{"fullname":"dlws/mysql:5.6"}, "phpmyadmin":{"fullname":"dlws/phpmyadmin:4.7.6"}, - "fluentd-elasticsearch":{"fullname":"dlws/fluentd-elasticsearch:v2.0.2"}, + "fluentd-elasticsearch":{"fullname":"dlws/fluentd-elasticsearch:v2.0.2"}, }, "infrastructure": { @@ -581,42 +605,90 @@ "cloud_config": { "vnet_range": "192.168.0.0/16", "default_admin_username": "dlwsadmin", - "tcp_port_for_pods": "30000-32767", - "tcp_port_ranges": "80 443 30000-32767 25826", + "tcp_port_for_pods": "30000-49999", + "tcp_port_ranges": "80 443 30000-49999 25826", "udp_port_ranges": "25826", + "inter_connect": { + "tcp_port_ranges": "22 1443 2379 3306 5000 8086 10250", + # Need to white list dev machines to connect + # "source_addresses_prefixes": [ "52.151.0.0/16"] + }, "dev_network": { "tcp_port_ranges": "22 1443 2379 3306 5000 8086", # Need to white list dev machines to connect # "source_addresses_prefixes": [ "52.151.0.0/16"] - } + }, + "nfs_share": { + "source_ips": ["104.44.112.0/24", "131.107.0.0/16"], + }, + "nfs_ssh": { + "source_ips": ["131.107.0.0/16", "104.44.0.0/16"], + "port": "22", + }, + "nfs_suffixes":[], + "nfs_svr_setup": [ + { + "mnt_point": {"rootshare":{"curphysicalmountpoint":"/mntdlws/infranfs","filesharename":"/infradata/share","mountpoints":""}}} + ], + "samba_range": "104.44.112.0/24", + }, + "vc_config":{ + "VC-Default":["*"], }, + "sku_mapping": { + "Standard_ND6s":{"gpu-type": "P40","gpu-count": 1}, + "Standard_NV24": {"gpu-type": "M60", "gpu-count": 4}, + "Standard_ND12s": {"gpu-type": "P40", "gpu-count": 2}, + "Standard_ND24rs": {"gpu-type": "P40", "gpu-count": 4}, + "Standard_NV12": {"gpu-type": "M60", "gpu-count": 2}, + "Standard_NV48s_v3": {"gpu-type": "M60", "gpu-count": 4}, + "Standard_ND40s_v2": {"gpu-type": "V100", "gpu-count": 8}, + "Standard_NC6s_v3": {"gpu-type": "V100", "gpu-count": 1}, + "Standard_NC6s_v2": {"gpu-type": "P100", "gpu-count": 1}, + "Standard_ND24s": {"gpu-type": "P40", "gpu-count": 4}, + "Standard_NV24s_v3": {"gpu-type": "M60", "gpu-count": 2}, + "Standard_NV6": {"gpu-type": "M60", "gpu-count": 1}, + "Standard_NV12s_v3": {"gpu-type": "M60", "gpu-count": 1}, + "Standard_NC24s_v2": {"gpu-type": "P100", "gpu-count": 4}, + "Standard_NC24s_v3": {"gpu-type": "V100", "gpu-count": 4}, + "Standard_NC12s_v3": {"gpu-type": "V100", "gpu-count": 2}, + "Standard_NC12s_v2": {"gpu-type": "P100", "gpu-count": 2}, + "Standard_NC24rs_v3": {"gpu-type": "V100", "gpu-count": 4}, + "Standard_NC24rs_v2": {"gpu-type": "P100", 
"gpu-count": 4}, + } } # These are super scripts scriptblocks = { "azure": [ - "runscriptonall ./scripts/prepare_vm_disk.sh", + "runscriptonroles infra worker ./scripts/prepare_vm_disk.sh", "nfs-server create", - "runscriptonall ./scripts/prepare_ubuntu.sh", + "runscriptonroles infra worker ./scripts/prepare_ubuntu.sh", + "genscripts", + "runscriptonroles infra worker ./scripts/dns.sh", "-y deploy", "-y updateworker", "-y kubernetes labels", + "-y gpulabel", + "kubernetes start nvidia-device-plugin", "webui", "docker push restfulapi", "docker push webui", - "nginx fqdn", - "nginx config", + # "nginx fqdn", + # "nginx config", "mount", "kubernetes start mysql", "kubernetes start jobmanager", "kubernetes start restfulapi", "kubernetes start webportal", "kubernetes start cloudmonitor", - "kubernetes start nginx", + # "kubernetes start nginx", "kubernetes start custommetrics", # TODO(harry): we cannot distinguish gce aws from azure, so add the same providerID # This will not break current deployment. - "-y kubernetes patchprovider aztools" + "-y kubernetes patchprovider aztools", + "setconfigmap", + "--sudo runscriptonrandmaster ./scripts/pass_secret.sh", ], "azure_uncordon": [ "runscriptonall ./scripts/prepare_vm_disk.sh", @@ -626,6 +698,7 @@ "-y updateworker", "kubernetes uncordon", "-y kubernetes labels", + "kubernetes start nvidia-device-plugin", "webui", "docker push restfulapi", "docker push webui", @@ -646,6 +719,7 @@ "-y deploy", "-y updateworker", "-y kubernetes labels", + "kubernetes start nvidia-device-plugin", "kubernetes uncordon", "mount", "webui", @@ -663,6 +737,7 @@ "runscriptonall ./scripts/prepare_ubuntu.sh", "-y deploy", "-y kubernetes labels", + "kubernetes start nvidia-device-plugin", "kubernetes uncordon", "sleep 60", "-y updateworker", @@ -729,6 +804,7 @@ "-y deploy", "-y updateworker", "-y kubernetes labels", + "kubernetes start nvidia-device-plugin", "mount", "webui", "docker push restfulapi", diff --git a/src/ClusterBootstrap/scripts/disable_kernel_auto_updates.sh b/src/ClusterBootstrap/scripts/disable_kernel_auto_updates.sh new file mode 100644 index 000000000..6a2969b04 --- /dev/null +++ b/src/ClusterBootstrap/scripts/disable_kernel_auto_updates.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +set -x + +# Remove kernel updates + +sed 's/"${distro_id}:${distro_codename}";/\/\/"${distro_id}:${distro_codename}";/g' /etc/apt/apt.conf.d/50unattended-upgrades | sed 's/"${distro_id}:${distro_codename}-security";/\/\/"${distro_id}:${distro_codename}-security";/g' | sed 's/"${distro_id}ESM:${distro_codename}";/\/\/"${distro_id}ESM:${distro_codename}";/g' > /tmp/50unattended-upgrades + +sudo cp /tmp/50unattended-upgrades /etc/apt/apt.conf.d/50unattended-upgrades + +# Disable periodic unattended-update +sed 's/APT::Periodic::Unattended-Upgrade "1";/APT::Periodic::Unattended-Upgrade "0";/g' /etc/apt/apt.conf.d/20auto-upgrades > /tmp/20auto-upgrades + +sudo cp /tmp/20auto-upgrades /etc/apt/apt.conf.d/20auto-upgrades diff --git a/src/ClusterBootstrap/scripts/dns.sh b/src/ClusterBootstrap/scripts/dns.sh new file mode 100644 index 000000000..743bf2caa --- /dev/null +++ b/src/ClusterBootstrap/scripts/dns.sh @@ -0,0 +1,8 @@ +sudo systemctl disable systemd-resolved.service +sudo systemctl stop systemd-resolved +echo "dns=default" | sudo tee -a /etc/NetworkManager/NetworkManager.conf +sudo rm /etc/resolv.conf +echo "nameserver 8.8.8.8" | sudo tee -a /etc/resolv.conf +#echo 'search {{cnf["network"]["domain"]}}' | sudo tee -a /etc/resolv.conf +echo "search eastus.cloudapp.azure.com" | sudo tee -a 
/etc/resolv.conf +sudo service network-manager restart diff --git a/src/ClusterBootstrap/scripts/prepare_ubuntu.sh b/src/ClusterBootstrap/scripts/prepare_ubuntu.sh index 99dea6c33..c69f14d4a 100755 --- a/src/ClusterBootstrap/scripts/prepare_ubuntu.sh +++ b/src/ClusterBootstrap/scripts/prepare_ubuntu.sh @@ -1,9 +1,18 @@ #!/bin/bash + +set -x + +# https://unix.stackexchange.com/questions/146283/how-to-prevent-prompt-that-ask-to-restart-services-when-installing-libpq-dev +export DEBIAN_FRONTEND=noninteractive + +sudo killall apt-get +sudo killall dpkg +sudo dpkg --configure -a + # Install python on CoreOS base image # Docker environment for development of DL workspace sudo apt-get update -y -sudo apt-get upgrade -y -sudo apt-get install -y --no-install-recommends \ +yes | sudo apt-get install -y --no-install-recommends \ apt-utils \ software-properties-common \ build-essential \ @@ -24,7 +33,7 @@ sudo apt-get install -y --no-install-recommends \ nfs-common -sudo apt-get install -y bison curl parted +yes | sudo apt-get install -y bison curl parted # Install docker which docker @@ -33,10 +42,17 @@ then docker --version ## docker already installed else -curl -q https://get.docker.com/ | sudo bash +sudo apt-get remove docker docker-engine docker.io +curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - +sudo add-apt-repository \ + "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ + $(lsb_release -cs) \ + stable" +sudo apt-get update +yes | sudo apt-get install -y docker-ce fi -sudo pip install --upgrade pip +yes | sudo pip install --upgrade pip # pip doesn't install python for root account, causing issues. # sudo pip install setuptools # sudo pip install pyyaml jinja2 argparse @@ -72,19 +88,52 @@ if lspci | grep -qE "[0-9a-fA-F][0-9a-fA-F]:[0-9a-fA-F][0-9a-fA-F].[0-9] (3D|VG # chmod +x /tmp/NVIDIA-Linux-x86_64-$NVIDIA_VERSION.run # sudo bash /tmp/NVIDIA-Linux-x86_64-$NVIDIA_VERSION.run -a -s - sudo apt-get purge -y nvidia* - sudo apt-get install -y nvidia-384 - + sudo systemctl stop kubelet + + echo kill all containers so we could remove old nvidia drivers + timeout 10 docker kill $(docker ps -a -q) + + lsmod | grep -qE "^nvidia" && + { + echo ======== NVIDIA driver is running, uninstall it ========= + DEP_MODS=`lsmod | tr -s " " | grep -E "^nvidia" | cut -f 4 -d " "` + for mod in ${DEP_MODS//,/ } + do + sudo rmmod $mod || + { + echo "The driver $mod is still in use, can't unload it." + exit 1 + } + done + sudo rmmod nvidia || + { + echo "The driver nvidia is still in use, can't unload it." + exit 1 + } + } + + sudo add-apt-repository -y ppa:graphics-drivers/ppa + + sudo apt-get purge -y nvidia* + sudo apt-get update + yes | sudo apt-get install -y nvidia-driver-430 + + yes | sudo apt install -y nvidia-modprobe + sudo rm -r /opt/nvidia-driver || true - sudo apt install -y nvidia-modprobe + # Install nvidia-docker and nvidia-docker-plugin ( Upgrade to nvidia-docker2) + # rm /tmp/nvidia-docker*.deb + # wget -P /tmp https://github.com/NVIDIA/nvidia-docker/releases/download/v1.0.1/nvidia-docker_1.0.1-1_amd64.deb + # sudo dpkg -i /tmp/nvidia-docker*.deb && rm /tmp/nvidia-docker*.deb - sudo rm -r /opt/nvidia-driver || true + curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - + distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID) + curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list + sudo apt-get update - # Install nvidia-docker and nvidia-docker-plugin - rm /tmp/nvidia-docker*.deb - wget -P /tmp https://github.com/NVIDIA/nvidia-docker/releases/download/v1.0.1/nvidia-docker_1.0.1-1_amd64.deb - sudo dpkg -i /tmp/nvidia-docker*.deb && rm /tmp/nvidia-docker*.deb + yes | sudo apt-get install -y nvidia-docker2 + sudo pkill -SIGHUP dockerd # Test nvidia-smi sudo nvidia-docker run --rm dlws/cuda nvidia-smi @@ -96,3 +145,6 @@ if lspci | grep -qE "[0-9a-fA-F][0-9a-fA-F]:[0-9a-fA-F][0-9a-fA-F].[0-9] (3D|VG NV_DRIVER=/opt/nvidia-driver/$NVIDIA_VERSION sudo ln -s $NV_DRIVER /opt/nvidia-driver/current fi + +# https://github.com/kubernetes/kubeadm/issues/610 +sudo swapoff -a diff --git a/src/ClusterBootstrap/scripts/prepare_vm_disk.sh b/src/ClusterBootstrap/scripts/prepare_vm_disk.sh index 5fc33f144..47de44cf9 100755 --- a/src/ClusterBootstrap/scripts/prepare_vm_disk.sh +++ b/src/ClusterBootstrap/scripts/prepare_vm_disk.sh @@ -1,21 +1,31 @@ #!/bin/bash + +printf "o\nn\np\n1\n\n\nw\n" | sudo fdisk /dev/sdc +sudo mkfs.ext4 /dev/sdc1 +sleep 10 +sudo mkdir /data +uuid=$(ls -l /dev/disk/by-uuid/ | grep sdc1 | awk '{print $9}') +echo "UUID=$uuid /data ext4 defaults,discard 0 0" | sudo tee -a /etc/fstab +sudo mount /data + sudo mv /var/log /var/log.bak -sudo mkdir -p /mnt/log -sudo rm -r /var/log ; sudo ln -s /mnt/log /var/log -sudo mv /var/log.bak/* /mnt/log +sudo mkdir -p /data/log +sudo rm -r /var/log ; sudo ln -s /data/log /var/log +sudo mv /var/log.bak/* /data/log sudo rm -r /var/log.bak -sudo mkdir -p /mnt/lib/docker -sudo mkdir -p /mnt/lib/mysql -sudo mkdir -p /mnt/lib/influxdb +sudo mkdir -p /data/lib/docker +sudo mkdir -p /data/lib/mysql +sudo mkdir -p /data/lib/influxdb if [ ! -L /var/lib/docker ]; then - sudo ln -s /mnt/lib/docker /var/lib/docker + sudo ln -s /data/lib/docker /var/lib/docker fi if [ ! -L /var/lib/mysql ]; then # It is a symlink! # Symbolic link specific commands go here. - sudo ln -s /mnt/lib/mysql /var/lib/mysql + sudo ln -s /data/lib/mysql /var/lib/mysql fi -if [ ! -L /var/lib/influxdb ]; then - sudo ln -s /mnt/lib/influxdb /var/lib/influxdb +if [ ! -L /var/lib/influxdb ]; then + sudo ln -s /data/lib/influxdb /var/lib/influxdb fi + diff --git a/src/ClusterBootstrap/scripts/remove_nvidia_docker1.sh b/src/ClusterBootstrap/scripts/remove_nvidia_docker1.sh new file mode 100755 index 000000000..d0cc8210f --- /dev/null +++ b/src/ClusterBootstrap/scripts/remove_nvidia_docker1.sh @@ -0,0 +1,10 @@ +docker volume ls -q -f driver=nvidia-docker | xargs -r -I{} -n1 docker ps -q -a -f volume={} | xargs -r docker rm -f +sudo apt-get purge -y nvidia-docker + +curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - +distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID) +curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list +sudo apt-get update + +sudo apt-get install -y nvidia-docker2 +sudo pkill -SIGHUP dockerd \ No newline at end of file diff --git a/src/ClusterBootstrap/scripts/setup_nfs_server.sh b/src/ClusterBootstrap/scripts/setup_nfs_server.sh index 0bcd0ddcc..48cc90586 100755 --- a/src/ClusterBootstrap/scripts/setup_nfs_server.sh +++ b/src/ClusterBootstrap/scripts/setup_nfs_server.sh @@ -1,10 +1,10 @@ -sudo apt-get update -sudo apt-get install -y nfs-kernel-server - -sudo mkdir -p /mnt/share -sudo chown nobody:nogroup /mnt/share - -echo "/mnt/share {{cnf["cloud_config"]["vnet_range"]}}(rw,sync,no_subtree_check,no_root_squash)" | sudo tee /etc/exports -sudo systemctl restart nfs-kernel-server - - +sudo apt-get update +sudo apt-get install -y nfs-kernel-server + +sudo mkdir -p /data/share +sudo chown nobody:nogroup /data/share + +echo "/data/share {{cnf["cloud_config"]["vnet_range"]}}(rw,sync,no_subtree_check,no_root_squash)" | sudo tee /etc/exports +sudo systemctl restart nfs-kernel-server + + diff --git a/src/ClusterBootstrap/services/cloudmonitor/collectd.yaml b/src/ClusterBootstrap/services/cloudmonitor/collectd.yaml index 8ccc78808..983c3aac4 100755 --- a/src/ClusterBootstrap/services/cloudmonitor/collectd.yaml +++ b/src/ClusterBootstrap/services/cloudmonitor/collectd.yaml @@ -57,8 +57,8 @@ spec: readOnly: true - name: run mountPath: /var/run/docker.sock - - name: nvidia-driver - mountPath: /usr/local/nvidia + #- name: nvidia-driver + # mountPath: /usr/local/nvidia volumes: #- name: collectd-config # configMap: @@ -84,9 +84,9 @@ spec: - name: run hostPath: path: /var/run/docker.sock - - name: nvidia-driver - hostPath: - path: {{cnf["nvidia-driver-path"]}} + #- name: nvidia-driver + # hostPath: + # path: {{cnf["nvidia-driver-path"]}} tolerations: - key: CriticalAddonsOnly operator: Exists diff --git a/src/ClusterBootstrap/services/custommetrics/custom_metrics.yaml b/src/ClusterBootstrap/services/custommetrics/custom_metrics.yaml index 3f97b0bbf..f67adaafc 100644 --- a/src/ClusterBootstrap/services/custommetrics/custom_metrics.yaml +++ b/src/ClusterBootstrap/services/custommetrics/custom_metrics.yaml @@ -76,7 +76,7 @@ rules: verbs: - "*" --- -apiVersion: apps/v1beta2 +apiVersion: apps/v1 kind: Deployment metadata: name: custom-metrics-apiserver @@ -180,4 +180,4 @@ roleRef: subjects: - kind: ServiceAccount name: horizontal-pod-autoscaler - namespace: kube-system \ No newline at end of file + namespace: kube-system diff --git a/src/ClusterBootstrap/services/custommetrics/prometheus_operator.yaml b/src/ClusterBootstrap/services/custommetrics/prometheus_operator.yaml index 7b1503c55..290887bc3 100644 --- a/src/ClusterBootstrap/services/custommetrics/prometheus_operator.yaml +++ b/src/ClusterBootstrap/services/custommetrics/prometheus_operator.yaml @@ -69,7 +69,7 @@ subjects: name: prometheus-operator namespace: default --- -apiVersion: apps/v1beta2 +apiVersion: apps/v1 kind: Deployment metadata: name: prometheus-operator diff --git a/src/ClusterBootstrap/services/detectron/detectron.yaml b/src/ClusterBootstrap/services/detectron/detectron.yaml index e464c0bee..b3295f850 100755 --- a/src/ClusterBootstrap/services/detectron/detectron.yaml +++ b/src/ClusterBootstrap/services/detectron/detectron.yaml @@ -25,7 +25,7 @@ spec: image: {{cnf["dockers"]["container"]["tutorial-caffe2"]["fullname"]}} resources: limits: - 
alpha.kubernetes.io/nvidia-gpu: 1 + nvidia.com/gpu: 1 imagePullPolicy: Always command: ["/run.sh"] livenessProbe: diff --git a/src/ClusterBootstrap/services/jobmanager/jobmanager.yaml b/src/ClusterBootstrap/services/jobmanager/jobmanager.yaml index 32dff1fa1..54ccf8c67 100755 --- a/src/ClusterBootstrap/services/jobmanager/jobmanager.yaml +++ b/src/ClusterBootstrap/services/jobmanager/jobmanager.yaml @@ -12,6 +12,10 @@ spec: name: jobmanager labels: jobmanager-node: pod + app: jobmanager + annotations: + prometheus.io/scrape: "true" + prometheus.io/path: "/metrics" spec: {% if cnf["dnsPolicy"] %} dnsPolicy: {{cnf["dnsPolicy"]}} @@ -38,7 +42,40 @@ spec: - mountPath: {{cnf["storage-mount-path"]}}/jobfiles name: dlwsdatajobfiles - mountPath: /var/log/dlworkspace - name: log + name: log + ports: + - containerPort: 9200 + hostPort: 9200 + name: job-mgr + protocol: TCP + - containerPort: 9201 + hostPort: 9201 + name: user-mgr + protocol: TCP + - containerPort: 9202 + hostPort: 9202 + name: node-mgr + protocol: TCP + - containerPort: 9203 + hostPort: 9203 + name: joblog-mgr + protocol: TCP + - containerPort: 9204 + hostPort: 9204 + name: cmd-mgr + protocol: TCP + - containerPort: 9205 + hostPort: 9205 + name: endpoint-mgr + protocol: TCP + readinessProbe: + failureThreshold: 3 + initialDelaySeconds: 3 + periodSeconds: 30 + successThreshold: 1 + tcpSocket: + port: 9200 + timeoutSeconds: 10 volumes: - name: certs hostPath: diff --git a/src/ClusterBootstrap/services/jobmanager/launch_order b/src/ClusterBootstrap/services/jobmanager/launch_order new file mode 100755 index 000000000..526994c7c --- /dev/null +++ b/src/ClusterBootstrap/services/jobmanager/launch_order @@ -0,0 +1,2 @@ +dlws-scripts.yaml +jobmanager.yaml diff --git a/src/ClusterBootstrap/services/jobmanager/pre-render.sh b/src/ClusterBootstrap/services/jobmanager/pre-render.sh new file mode 100755 index 000000000..898c3d143 --- /dev/null +++ b/src/ClusterBootstrap/services/jobmanager/pre-render.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +dir=`dirname $0` + +dlws_scripts_file_name=${dir}/dlws-scripts.yaml + +${dir}/../../deploy/bin/kubectl create configmap dlws-scripts --from-file=${dir}/../../../init-scripts --dry-run -o yaml > $dlws_scripts_file_name diff --git a/src/ClusterBootstrap/services/monitor/alert-manager.yaml b/src/ClusterBootstrap/services/monitor/alert-manager.yaml new file mode 100644 index 000000000..a15534e3f --- /dev/null +++ b/src/ClusterBootstrap/services/monitor/alert-manager.yaml @@ -0,0 +1,181 @@ +{% if cnf["alert-manager"]["configured"] %} + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: alert-manager + namespace: kube-system +spec: + replicas: 1 + selector: + matchLabels: + app: alert-manager + template: + metadata: + name: alert-manager + labels: + app: alert-manager + annotations: + prometheus.io/alert: "true" + prometheus.io/port: "{{ cnf['alert-manager']['port'] }}" + spec: + nodeSelector: + alert-manager: active + hostNetwork: true + containers: + - name: alert-manager + image: prom/alertmanager:v0.18.0 + args: + - '--config.file=/etc/alertmanager/config.yml' + - '--storage.path=/alertmanager' + - '--web.external-url=http://localhost/alert-manager/' + - '--web.route-prefix=alert-manager' + ports: + - name: alert-manager + containerPort: {{ cnf["alert-manager"]["port"] }} + volumeMounts: + - name: config-volume + mountPath: /etc/alertmanager + - name: storage + mountPath: /alertmanager + - name: templates-volume + mountPath: /etc/alertmanager/template + {% if cnf["alert-manager"]["reaper"] %} + - name: reaper 
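+        # How the reaper sidecar is wired (see the receivers section below): Alertmanager's
+        # "reaper" receiver posts firing idle-job alerts to http://localhost:<reaper port>/kill,
+        # and /reaper/main.py is expected to call the DLWS REST API at --restful_url to stop the
+        # job; with --dry_run (the default from params.py) it should only report what it would kill.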
+ image: {{cnf["worker-dockerregistry"]}}{{cnf["dockerprefix"]}}reaper:{{cnf["dockertag"]}} + command: + - 'python' + - '/reaper/main.py' + - '--port' + - '{{ cnf["alert-manager"]["reaper"]["port"] }}' + - '--restful_url' + - '{{ cnf["alert-manager"]["reaper"]["restful-url"] }}' + {% if cnf["alert-manager"]["reaper"]["dry-run"] %} + - '--dry_run' + {% endif %} + ports: + - name: alert-manager + containerPort: {{ cnf["alert-manager"]["reaper"]["port"] }} + {% endif %} + volumes: + - name: config-volume + configMap: + name: alert-manager + - name: templates-volume + configMap: + name: alert-templates + - name: storage + emptyDir: {} + tolerations: + - key: node.kubernetes.io/memory-pressure + operator: "Exists" + - key: node.kubernetes.io/disk-pressure + operator: "Exists" + - key: node-role.kubernetes.io/master + effect: NoSchedule +--- +{% endif %} +kind: ConfigMap +apiVersion: v1 +metadata: + name: alert-manager + namespace: kube-system +data: + config.yml: |- +{% if cnf["alert-manager"]["configured"] %} +{% set alert_info = cnf["alert-manager"] %} + global: + resolve_timeout: 5m + smtp_smarthost: {{ alert_info["smtp_url"] }} + smtp_from: {{ alert_info["smtp_from"] }} + smtp_auth_username: {{ alert_info["smtp_auth_username"] }} + smtp_auth_password: {{ alert_info["smtp_auth_password"] }} + templates: + - "/etc/alertmanager/template/*.tmpl" + route: + repeat_interval: 24h + receiver: alert-email + group_wait: 30s + group_interval: 5m + group_by: [alertname, cluster] + routes: + - receiver: idle_gpu_receiver + repeat_interval: 4h + group_by: [alertname, user_email, cluster] + match_re: + type: idle_gpu + alertname: "zero-gpu-usage" + - receiver: job_state_change_receiver + group_by: [alertname, user_email, cluster, subject] + match_re: + type: user_alert + alertname: "job-state-changed" + - receiver: reaper + group_by: [alertname, user_email, job_name] + group_wait: 0s + match_re: + type: reaper + - receiver: kill_idle_job_email + group_by: [alertname, user_email, cluster] + group_wait: 0s + match_re: + type: kill_idle_job_email + alertname: "kill-idle-jobs-email" + receivers: + - name: "alert-email" + email_configs: + - to: {{ alert_info["receiver"] }} + html: '{{ "{{" }} template "email.html" . {{ "}}" }}' + headers: + subject: '{{ "{{" }} .GroupLabels.cluster {{ "}}" }}: {{ "{{" }} template "__subject" . {{ "}}" }}' + - name: "idle_gpu_receiver" + email_configs: + {% if cnf["alert-manager"]["alert_users"] %} + - to: '{{ "{{" }} .GroupLabels.user_email {{ "}}" }},{{ alert_info["receiver"] }}' + {% else %} + - to: '{{ alert_info["receiver"] }}' + {% endif %} + html: '{{ "{{" }} template "idle_gpu.html" . {{ "}}" }}' + headers: + {% if cnf["alert-manager"]["alert_users"] %} + To: '{{ "{{" }} .GroupLabels.user_email {{ "}}" }}' + CC: '{{ alert_info["receiver"] }}' + {% endif %} + subject: '{{ "{{" }} .GroupLabels.cluster {{ "}}" }}: {{ "{{" }} template "__subject" . {{ "}}" }}' + - name: "job_state_change_receiver" + email_configs: + {% if cnf["alert-manager"]["alert_users"] %} + - to: '{{ "{{" }} .GroupLabels.user_email {{ "}}" }},{{ alert_info["receiver"] }}' + {% else %} + - to: '{{ alert_info["receiver"] }}' + {% endif %} + html: '{{ "{{" }} template "job_state.html" . {{ "}}" }}' + headers: + {% if cnf["alert-manager"]["alert_users"] %} + To: '{{ "{{" }} .GroupLabels.user_email {{ "}}" }}' + CC: '{{ alert_info["receiver"] }}' + {% endif %} + subject: '{{ "{{" }} .GroupLabels.cluster {{ "}}" }}: {{ "{{" }} template "__subject" . 
{{ "}}" }}' + - name: "reaper" + {% if cnf["alert-manager"]["reaper"] %} + webhook_configs: + - send_resolved: False + url: 'http://localhost:{{ cnf["alert-manager"]["reaper"]["port"] }}/kill' + http_config: + bearer_token: 'shinigami' + - name: "kill_idle_job_email" + email_configs: + {% if cnf["alert-manager"]["alert_users"] %} + - to: '{{ "{{" }} .GroupLabels.user_email {{ "}}" }},{{ alert_info["receiver"] }}' + {% else %} + - to: '{{ alert_info["receiver"] }}' + {% endif %} + html: '{{ "{{" }} template "kill_idle.html" . {{ "}}" }}' + headers: + {% if cnf["alert-manager"]["alert_users"] %} + To: '{{ "{{" }} .GroupLabels.user_email {{ "}}" }}' + CC: '{{ alert_info["receiver"] }}' + {% endif %} + subject: '{{ "{{" }} .GroupLabels.cluster {{ "}}" }}: {{ "{{" }} template "__subject" . {{ "}}" }}' + {% endif %} +{% endif %} diff --git a/src/ClusterBootstrap/services/monitor/alert-templates/email.tmpl b/src/ClusterBootstrap/services/monitor/alert-templates/email.tmpl new file mode 100644 index 000000000..b9e411dde --- /dev/null +++ b/src/ClusterBootstrap/services/monitor/alert-templates/email.tmpl @@ -0,0 +1,128 @@ +{{ define "email.html" }} + + + + + + +{{ template "__subject" . }} + + + + + + + + + + + +
+
+ + + {{ if gt (len .Alerts.Firing) 0 }} + + + + + +
+ {{ else }} + + {{ end }} + {{ .Alerts | len }} alert{{ if gt (len .Alerts) 1 }}s{{ end }} for {{ range .GroupLabels.SortedPairs }} + {{ .Name }}={{ .Value }} + {{ end }} +
+ + + + + {{ if gt (len .Alerts.Firing) 0 }} + + + + {{ end }} + {{ range .Alerts.Firing }} + + + + {{ end }} + + {{ if gt (len .Alerts.Resolved) 0 }} + + + + {{ end }} + {{ range .Alerts.Resolved }} + + + + {{ end }} +
+ View in {{ template "__alertmanager" . }} +
+ [{{ .Alerts.Firing | len }}] Firing +
+ {{ if .Annotations.summary }} + {{ .Annotations.summary }} + {{ else }} + Labels
+ {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + {{ if gt (len .Annotations) 0 }}Annotations
{{ end }} + {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + {{ end }} + Source
+
+ [{{ .Alerts.Resolved | len }}] Resolved +
+ {{ if .Annotations.summary }} + {{ .Annotations.summary }} + {{ else }} + Labels
+ {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + {{ if gt (len .Annotations) 0 }}Annotations
{{ end }} + {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + {{ end }} + Source
+
+
+ +
+
+ + + +{{ end }} diff --git a/src/ClusterBootstrap/services/monitor/alert-templates/idle-gpu.tmpl b/src/ClusterBootstrap/services/monitor/alert-templates/idle-gpu.tmpl new file mode 100644 index 000000000..02b6169af --- /dev/null +++ b/src/ClusterBootstrap/services/monitor/alert-templates/idle-gpu.tmpl @@ -0,0 +1,72 @@ +{{ define "idle_gpu.html" }} + + + + + + +{{ template "__subject" . }} + + + + + + + + + + + +
+
+ + + + +
+ + {{ range .Alerts.Firing }} + + + + {{ end }} +
+Your job + +{{.Labels.job_name}} + from cluster '{{.Labels.cluster}}' VC '{{.Labels.vc_name}}' has been idle for too long. +Please kill the job if you do not need it anymore. +
+
+ +
+
+ + + +{{ end }} diff --git a/src/ClusterBootstrap/services/monitor/alert-templates/job_state.tmpl b/src/ClusterBootstrap/services/monitor/alert-templates/job_state.tmpl new file mode 100644 index 000000000..2da286cc1 --- /dev/null +++ b/src/ClusterBootstrap/services/monitor/alert-templates/job_state.tmpl @@ -0,0 +1,71 @@ +{{ define "job_state.html" }} + + + + + + +{{ template "__subject" . }} + + + + + + + + + + + +
+
+ + + + +
+ + {{ range .Alerts.Firing }} + + + + {{ end }} +
+Your job + +{{.Labels.job_name}} + from cluster '{{.Labels.cluster}}' has changed to the state {{.Labels.job_state}}. +
+
+ +
+
+ + + +{{ end }} diff --git a/src/ClusterBootstrap/services/monitor/alert-templates/kill-idle.tmpl b/src/ClusterBootstrap/services/monitor/alert-templates/kill-idle.tmpl new file mode 100644 index 000000000..e29dc993c --- /dev/null +++ b/src/ClusterBootstrap/services/monitor/alert-templates/kill-idle.tmpl @@ -0,0 +1,71 @@ +{{ define "kill_idle.html" }} + + + + + + +{{ template "__subject" . }} + + + + + + + + + + + +
+
+ + + + +
+ + {{ range .Alerts.Firing }} + + + + {{ end }} +
+Your job + +{{.Labels.job_name}} + from cluster '{{.Labels.cluster}}' VC '{{.Labels.vc_name}}' was killed because it had been idle for too long. +
+
+ +
+
+ + + +{{ end }} diff --git a/src/ClusterBootstrap/services/monitor/alerting/gpu.rules b/src/ClusterBootstrap/services/monitor/alerting/gpu.rules new file mode 100644 index 000000000..1c6f7b0dd --- /dev/null +++ b/src/ClusterBootstrap/services/monitor/alerting/gpu.rules @@ -0,0 +1,32 @@ +groups: + - name: gpu_related + rules: + - alert: NvidiaSmiLatencyTooLarge + expr: histogram_quantile(0.95, sum(rate(cmd_nvidia_smi_latency_seconds_bucket[5m])) by (le, instance)) > 40 + for: 30m + annotations: + summary: "95th nvidia-smi call latency is larger than 40s in {{$labels.instance}}, should check the gpu status manually" + + - alert: NvidiaSmiEccError + expr: nvidiasmi_ecc_error_count{type="volatile_double"} > 0 + for: 30m + annotations: + summary: "nvidia card from {{$labels.instance}} minor number {{$labels.minor_number}} has {{$labels.type}} ecc error, count {{$value}}" + + - alert: NvidiaMemoryLeak + expr: nvidiasmi_memory_leak_count > 0 + for: 30m + annotations: + summary: "found nvidia memory leak from {{$labels.instance}} minor number {{$labels.minor_number}}" + + - alert: NvidiaZombieProcess + expr: zombie_process_count{command="nvidia-smi"} > 0 + for: 30m + annotations: + summary: "found nvidia zombie process in {{$labels.instance}}" + + - alert: NvidiaRetiredPage + expr: sum (nvidiasmi_retired_page_count) by (instance, minor_number) > 60 + for: 30m + annotations: + summary: "gpu retired page from {{$labels.instance}} {{$labels.minor_number}} exceed threshold, may need to replace this gpu" diff --git a/src/ClusterBootstrap/services/monitor/alerting/jobs.rules b/src/ClusterBootstrap/services/monitor/alerting/jobs.rules new file mode 100644 index 000000000..7f095f2e4 --- /dev/null +++ b/src/ClusterBootstrap/services/monitor/alerting/jobs.rules @@ -0,0 +1,8 @@ +groups: + - name: idle-job + rules: + - alert: zero-gpu-usage + expr: avg(task_gpu_percent) by (user_email, job_name, vc_name) == 0 + for: 4h + labels: + type: idle_gpu diff --git a/src/ClusterBootstrap/services/monitor/alerting/k8s.rules b/src/ClusterBootstrap/services/monitor/alerting/k8s.rules new file mode 100644 index 000000000..17c3ba277 --- /dev/null +++ b/src/ClusterBootstrap/services/monitor/alerting/k8s.rules @@ -0,0 +1,14 @@ +groups: + - name: k8s_component + rules: + - alert: k8sApiServerNotOk + expr: k8s_api_server_count{error!="ok"} > 0 + for: 30m + annotations: + summary: "api server in {{$labels.host_ip}} is {{$labels.error}}" + + - alert: k8sDockerDaemonNotOk + expr: docker_daemon_count{error!="ok"} > 0 + for: 30m + annotations: + summary: "docker daemon in {{$labels.ip}} is {{$labels.error}}" diff --git a/src/ClusterBootstrap/services/monitor/alerting/node.rules b/src/ClusterBootstrap/services/monitor/alerting/node.rules new file mode 100644 index 000000000..d182ca241 --- /dev/null +++ b/src/ClusterBootstrap/services/monitor/alerting/node.rules @@ -0,0 +1,44 @@ +groups: + - name: node-rules + rules: + - alert: NodeFilesystemUsage + expr: node_filesystem_avail_bytes{mountpoint=~"/host-root.*", device=~"/dev.*"} / node_filesystem_size_bytes * 100 <= 20 + for: 30m + annotations: + summary: "Free space in {{$labels.device}} from {{$labels.instance}} is less than 20% (current value is: {{ $value }})" + + - alert: NodeMemoryUsage + expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes )) / node_memory_MemTotal_bytes * 100 > 95 + for: 30m + annotations: + summary: "Memory usage in {{$labels.instance}} is above 95% (current value is: {{ $value }})" + + - alert: 
NodeCPUUsage + expr: (100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)) > 98 + for: 30m + annotations: + summary: "CPU usage in {{$labels.instance}} is above 98% (current value is: {{ $value }})" + + - alert: NodeDiskPressure + expr: pai_node_count{disk_pressure="true"} > 0 + for: 10m + annotations: + summary: "{{$labels.name}} is under disk pressure" + + - alert: NodeOutOfDisk + expr: pai_node_count{out_of_disk="true"} > 0 + for: 30m + annotations: + summary: "{{$labels.name}} is out of disk" + + - alert: NodeNotReady + expr: pai_node_count{ready!="true"} > 0 + for: 10m + annotations: + summary: "{{$labels.name}} is not ready" + + - alert: AzureAgentConsumeTooMuchMem + expr: process_mem_usage_byte{cmd=~".*om[is]agent.*"} > 1073741824 # 1G + for: 10m + annotations: + summary: "{{$labels.cmd}} with pid {{$labels.pid}} in {{$labels.instance}} consume more than 1G of memory" diff --git a/src/ClusterBootstrap/services/monitor/alerting/services.rules b/src/ClusterBootstrap/services/monitor/alerting/services.rules new file mode 100644 index 000000000..288477496 --- /dev/null +++ b/src/ClusterBootstrap/services/monitor/alerting/services.rules @@ -0,0 +1,24 @@ +groups: + - name: services + rules: + - alert: ServicePodNotRunning + expr: pai_pod_count{phase!="running"} > 0 + for: 30m + annotations: + summary: "{{$labels.name}} in {{$labels.host_ip}} not running detected" + + - alert: ServicePodNotReady + expr: pai_pod_count{phase="running", ready="false"} > 0 + for: 30m + labels: + type: pai_service + annotations: + summary: "{{$labels.name}} in {{$labels.host_ip}} not ready detected" + + - alert: ServiceNotUp + expr: up != 1 + for: 30m + labels: + type: pai_service + annotations: + summary: "{{$labels.pai_service_name}} in {{$labels.instance}} not up detected" diff --git a/src/ClusterBootstrap/services/monitor/collectd.yaml b/src/ClusterBootstrap/services/monitor/collectd.yaml deleted file mode 100755 index 9784fe88f..000000000 --- a/src/ClusterBootstrap/services/monitor/collectd.yaml +++ /dev/null @@ -1,100 +0,0 @@ -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: collectd-node-agent - namespace: kube-system - labels: - app: collectd-node-agent -spec: - selector: - matchLabels: - name: collectd-node-agent - template: - metadata: - labels: - name: collectd-node-agent - spec: - hostNetwork: true - hostPID: true - hostIPC: true - dnsPolicy: Default - imagePullSecrets: - - name: acrregkey - containers: - - name: collectd - image: {{cnf["dockers"]["container"]["collectd"]["fullname"]}} - imagePullPolicy: Always - securityContext: - privileged: true - env: - - name: CONFIG_TYPE - value: influxdb - - name: EP_HOST - value: {{cnf["influxdb_node"]}} - - name: EP_PORT - value: "{{cnf["influxdb_tp_port"]}}" - - name: K8SAPI - value: "https://127.0.0.1:{{cnf["k8sAPIport"]}}" - - name: HOST_NAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - volumeMounts: - #- name: collectd-config - # mountPath: /etc/collectd - - mountPath: /etc/kubernetes/ssl - name: certs - - mountPath: /root/.kube/config - name: kubeconfig - - name: proc - mountPath: /mnt/proc - readOnly: true - - name: root - mountPath: /hostfs - readOnly: true - - name: varlog - mountPath: /hostfs/var/log - readOnly: true - - name: etc - mountPath: /mnt/etc - readOnly: true - - name: run - mountPath: /var/run/docker.sock - - name: nvidia-driver - mountPath: /usr/local/nvidia - volumes: - #- name: collectd-config - # configMap: - # name: collectd-config - # items: - # - key: node-collectd.conf - # path: 
collectd.conf - - name: certs - hostPath: - path: /etc/kubernetes/ssl - - name: kubeconfig - hostPath: - path: /etc/kubernetes/restapi-kubeconfig.yaml - - name: proc - hostPath: - path: /proc - - name: root - hostPath: - path: / - - name: varlog - hostPath: - path: /var/log - - name: etc - hostPath: - path: /etc - - name: run - hostPath: - path: /var/run/docker.sock - - name: nvidia-driver - hostPath: - path: {{cnf["nvidia-driver-path"]}} - tolerations: - - key: CriticalAddonsOnly - operator: Exists - - key: node-role.kubernetes.io/master - effect: NoSchedule diff --git a/src/ClusterBootstrap/services/monitor/config_alerting.py b/src/ClusterBootstrap/services/monitor/config_alerting.py new file mode 100755 index 000000000..5efff9963 --- /dev/null +++ b/src/ClusterBootstrap/services/monitor/config_alerting.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python + +import os +import sys +import yaml + +headers = """ +groups: + - name: kill-idle + rules: +""" + +kill_template = """ + - alert: kill-idle-jobs-email-%s + for: %dh + expr: avg(task_gpu_percent{vc_name="%s"}) by (user_email, job_name, vc_name) == 0 + labels: + type: kill_idle_job_email + - alert: kill-idle-jobs-%s + for: %dh + expr: avg(task_gpu_percent{vc_name="%s"}) by (user_email, job_name, vc_name) == 0 + labels: + type: reaper +""" + +def config_kill_rule(m): + for vc_name, hour in m.items(): + print(kill_template % (vc_name, hour, vc_name, vc_name, hour, vc_name)) + +def extract_relevant_config(config_map): + return config_map.get("prometheus", {}).get("alerting", {}).get("kill-idle", {}) + +if __name__ == "__main__": + with open(sys.argv[1]) as f: + config = yaml.load(f.read()) + + print(headers) + config_kill_rule(extract_relevant_config(config)) diff --git a/src/ClusterBootstrap/services/monitor/grafana-config/cluster-gpu-statistic-dashboard.json b/src/ClusterBootstrap/services/monitor/grafana-config/cluster-gpu-statistic-dashboard.json new file mode 100644 index 000000000..685277ee2 --- /dev/null +++ b/src/ClusterBootstrap/services/monitor/grafana-config/cluster-gpu-statistic-dashboard.json @@ -0,0 +1,535 @@ +{ + "dashboard": { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 0, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "count (task_gpu_percent) by (username)", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{ '{{' }} username {{ '}}' }}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Per user allocation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + 
"values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 0, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "count (task_gpu_percent) by (vc_name)", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{ '{{' }} vc_name {{ '}}' }} allocation", + "refId": "A" + }, + { + "expr": "count(task_gpu_percent)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Cluster wide allocation", + "refId": "B" + }, + { + "expr": "k8s_vc_gpu_total", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ '{{' }} vc_name {{ '}}' }} quota", + "refId": "C" + }, + { + "expr": "sum(k8s_vc_gpu_total)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Cluster wide quota", + "refId": "D" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Per VC allocation & quota", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 0, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "count (task_gpu_percent) by (vc_name) / sum (k8s_vc_gpu_total) by (vc_name)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ '{{' }} vc_name {{ '}}' }}", + "refId": "A" + }, + { + "expr": "count (task_gpu_percent) / sum (k8s_vc_gpu_total)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Cluster wide", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Per VC allocation rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + 
"xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": "1", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 0, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg (task_gpu_percent) by (username)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ '{{' }} username {{ '}}' }}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Per user GPU utils", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 0, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg (task_gpu_percent) by (vc_name)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ '{{' }} vc_name {{ '}}' }}", + "refId": "A" + }, + { + "expr": "avg (task_gpu_percent)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Cluster wide", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Per VC GPU utils", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + } + 
], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Cluster GPU statistic", + "version": 0 + } +} diff --git a/src/ClusterBootstrap/services/monitor/grafana-config/cluster-status-dashboard.json b/src/ClusterBootstrap/services/monitor/grafana-config/cluster-status-dashboard.json new file mode 100644 index 000000000..6e3297d31 --- /dev/null +++ b/src/ClusterBootstrap/services/monitor/grafana-config/cluster-status-dashboard.json @@ -0,0 +1,625 @@ +{ + "dashboard": { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": "30s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": null, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 2, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(k8s_node_gpu_total) - sum(k8s_node_gpu_available) - sum(k8s_node_gpu_reserved)", + "format": "time_series", + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "", + "title": "Used GPUs", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": null, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 3, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": 
"sum(k8s_node_gpu_available)", + "format": "time_series", + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "", + "title": "Avaliable GPUs", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": null, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 7, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(k8s_node_gpu_reserved)", + "format": "time_series", + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "", + "title": "Reserved GPU", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": null, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 1, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(k8s_node_gpu_total)", + "format": "time_series", + "hide": false, + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Total GPUs", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": null, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 4, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + 
"value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "count(task_cpu_percent)", + "format": "time_series", + "hide": false, + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "", + "title": "Active Jobs", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": null, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 6, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "count(pai_node_count{ready=\"true\", unschedulable=\"false\"})", + "format": "time_series", + "hide": false, + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "", + "title": "Active Nodes", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": null, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 5, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "count(pai_node_count)", + "format": "time_series", + "hide": false, + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "", + "title": "Total Node", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + 
"title": "Dashboard Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Cluster Status", + "version": 3 + } +} diff --git a/src/ClusterBootstrap/services/monitor/grafana-config/gpu-fragmentation-dashboard.json b/src/ClusterBootstrap/services/monitor/grafana-config/gpu-fragmentation-dashboard.json new file mode 100644 index 000000000..7f6264bf0 --- /dev/null +++ b/src/ClusterBootstrap/services/monitor/grafana-config/gpu-fragmentation-dashboard.json @@ -0,0 +1,149 @@ +{"dashboard": { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": "30s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 0, + "id": 1, + "legend": { + "avg": false, + "current": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "count_values(\"gpu_available\", k8s_node_gpu_available)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{'{{'}} gpu_available {{'}}'}} available gpus", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Node count with certain number of available gpu", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Node count", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "GPU fragmentation", + "version": 0 +}} diff --git a/src/ClusterBootstrap/services/monitor/grafana-config/gpu-usage-history-dashboard.json b/src/ClusterBootstrap/services/monitor/grafana-config/gpu-usage-history-dashboard.json new file mode 100644 index 000000000..3e093f35f --- /dev/null +++ 
b/src/ClusterBootstrap/services/monitor/grafana-config/gpu-usage-history-dashboard.json @@ -0,0 +1,164 @@ +{"dashboard": { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": "30s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 0, + "id": 1, + "legend": { + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(k8s_node_gpu_available)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Available", + "refId": "A" + }, + { + "expr": "sum(k8s_node_gpu_total)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Total", + "refId": "B" + }, + { + "expr": "sum(k8s_node_gpu_reserved)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Reserved", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "GPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "GPU Usage", + "version": 0 +}} diff --git a/src/ClusterBootstrap/services/monitor/grafana-config/job-status-dashboard.json b/src/ClusterBootstrap/services/monitor/grafana-config/job-status-dashboard.json new file mode 100644 index 000000000..ae5b3d1b3 --- /dev/null +++ b/src/ClusterBootstrap/services/monitor/grafana-config/job-status-dashboard.json @@ -0,0 +1,592 @@ +{"dashboard": { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Dashboard to view job metrics", + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": "30s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + 
"dashLength": 10, + "dashes": false, + "datasource": "PM", + "fill": 0, + "height": "400px", + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "task_cpu_percent{job_name=\"$job_name\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{'{{'}}instance{{'}}'}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "CPU", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "PM", + "fill": 0, + "height": "400px", + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "task_mem_usage_byte{job_name=\"$job_name\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{'{{'}}instance{{'}}'}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "PM", + "fill": 0, + "height": "400px", + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "task_net_in_byte{job_name=\"$job_name\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{'{{'}}instance{{'}}'}} Inbound", + "refId": "A" + }, + { + "expr": "task_net_out_byte{job_name=\"$job_name\"}", + "format": "time_series", + 
"intervalFactor": 2, + "legendFormat": "{{'{{'}}instance{{'}}'}} Outbound", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "PM", + "fill": 0, + "height": "400px", + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(task_block_in_byte{job_name=\"$job_name\"}[300s])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{'{{'}}instance{{'}}'}} Read", + "refId": "A" + }, + { + "expr": "irate(task_block_out_byte{job_name=\"$job_name\"}[300s])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{'{{'}}instance{{'}}'}} Write", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Block IO", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "task_gpu_percent{job_name=\"$job_name\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "GPU {{'{{'}}minor_number{{'}}'}} on {{'{{'}}instance{{'}}'}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "GPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 
1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "task_gpu_mem_percent{job_name=\"$job_name\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "GPU {{'{{'}}minor_number{{'}}'}} on {{'{{'}}instance{{'}}'}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "GPU Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": {}, + "datasource": "PM", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "job_name", + "options": [], + "query": "label_values(task_cpu_percent, job_name)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "now": true, + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Job Status", + "version": 1 +}} diff --git a/src/ClusterBootstrap/services/monitor/grafana-config/node-status-dashboard.json b/src/ClusterBootstrap/services/monitor/grafana-config/node-status-dashboard.json new file mode 100644 index 000000000..45f4e5e13 --- /dev/null +++ b/src/ClusterBootstrap/services/monitor/grafana-config/node-status-dashboard.json @@ -0,0 +1,934 @@ +{"dashboard": { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Dashboard to view multiple Node resource usage", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": true, + "id": null, + "links": [], + "refresh": "30s", + "rows": [ + { + "collapse": false, + "height": 266, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "PM", + "decimals": 3, + "editable": true, + "error": false, + "fill": 0, + "grid": {}, + "id": 7, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "max": false, + "min": false, + "rightSide": false, + "show": 
false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "minSpan": null, + "nullPointMode": "connected", + "percentage": true, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "100 - (avg by (instance)(irate(node_cpu_seconds_total{mode=\"idle\",instance=~\"$node(:[0-9]*)?$\"}[300s])) * 100)", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "usage", + "refId": "B" + } + ], + "thresholds": [ + { + "colorMode": "custom", + "fill": false, + "fillColor": "rgba(216, 200, 27, 0.27)", + "op": "gt", + "value": 0 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transparent": false, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": "%", + "logBase": 1, + "max": "100", + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "Slab": "#E5A8E2", + "Swap": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "PM", + "decimals": 2, + "editable": true, + "error": false, + "fill": 0, + "grid": {}, + "id": 17, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "minSpan": 2, + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/Apps|Buffers|Cached|Free|Slab|SwapCached|PageTables|VmallocUsed/" + }, + { + "alias": "Swap" + } + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_MemTotal_bytes{instance=~'$node(:[0-9]*)?$'} - node_memory_MemFree_bytes - node_memory_Buffers_bytes - node_memory_Cached_bytes", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "usage", + "metric": "", + "refId": "A", + "step": 600, + "target": "" + }, + { + "expr": "node_memory_Buffers_bytes{instance=~'$node(:[0-9]*)?$'} + node_memory_Cached_bytes", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "buff/cache", + "refId": "B" + }, + { + "expr": "node_memory_MemTotal_bytes{instance=~'$node(:[0-9]*)?$'}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "total", + "refId": "C", + "step": 600 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "GB", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "PM", + "editable": true, + "error": false, + "fill": 0, + "grid": {}, + "id": 12, 
+ "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "minSpan": null, + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/.*_in/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_network_receive_bytes_total{instance=~\"$node(:[0-9]*)?$\"}[300s]))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "in", + "refId": "A", + "step": 600 + }, + { + "expr": "sum(rate(node_network_transmit_bytes_total{instance=~\"$node(:[0-9]*)?$\"}[300s]))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "out", + "refId": "B", + "step": 600 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network Traffic", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "CPU", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 262, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "PM", + "decimals": 3, + "editable": true, + "error": false, + "fill": 0, + "grid": {}, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "minSpan": null, + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_disk_read_bytes_total{instance=~\"$node(:[0-9]*)?$\"}[300s]))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "read", + "metric": "", + "refId": "A", + "step": 600, + "target": "" + }, + { + "expr": "sum(rate(node_disk_written_bytes_total{instance=~\"$node(:[0-9]*)?$\"}[300s]))", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "write", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Disk", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "PM", + "decimals": 3, + "editable": true, + "error": false, + "fill": 0, + "grid": {}, + "id": 18, + "legend": { + "avg": 
false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "minSpan": null, + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "nvidiasmi_utilization_gpu{instance=~\"$node(:[0-9]*)?$\"}", + "format": "time_series", + "legendFormat": "{{'{{'}}minor_number{{'}}'}}", + "interval": "", + "intervalFactor": 2, + "metric": "", + "refId": "A", + "step": 600, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "GPU Utilization", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "logBase": 1, + "max": 100, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "PM", + "decimals": 3, + "editable": true, + "error": false, + "fill": 0, + "grid": {}, + "id": 23, + "legend": { + "show": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "minSpan": null, + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "nvidiasmi_utilization_memory{instance=~\"$node(:[0-9]*)?$\"}", + "format": "time_series", + "legendFormat": "{{'{{'}}minor_number{{'}}'}}", + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "GPU Memory", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "logBase": 1, + "max": 100, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 0, + "id": 24, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg by (device) (node_filesystem_free_bytes{instance=~\"$node(:[0-9]*)?$\", device=~\"/dev/.*\"} / node_filesystem_size_bytes) * 100", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{'{{'}}device{{'}}'}}", + 
"refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Filesystem free space percent", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 0, + "id": 26, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg by (device) (node_filesystem_files_free{instance=~\"$node(:[0-9]*)?$\", device=~\"/dev/.*\"} / node_filesystem_files) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{'{{'}}device{{'}}'}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Filesystem free inode percent", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 0, + "id": 25, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_load1{instance=~\"$node(:[0-9]*)?$\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "load1", + "refId": "A" + }, + { + "expr": "node_load5{instance=~\"$node(:[0-9]*)?$\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "load5", + "refId": "B" + }, + { + "expr": "node_load15{instance=~\"$node(:[0-9]*)?$\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "load15", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Node load", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { 
+ "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allFormat": "glob", + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "PM", + "hide": 0, + "includeAll": false, + "label": "", + "multi": false, + "multiFormat": "regex values", + "name": "node", + "options": [], + "query": "label_values(node_uname_info, instance)", + "refresh": 1, + "regex": "/([^:]*)/", + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "now": true, + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Node Status", + "version": 1 +}} diff --git a/src/ClusterBootstrap/services/monitor/grafana-config/per-vc-gpu-usage-dashboard.json b/src/ClusterBootstrap/services/monitor/grafana-config/per-vc-gpu-usage-dashboard.json new file mode 100644 index 000000000..f5b6737c8 --- /dev/null +++ b/src/ClusterBootstrap/services/monitor/grafana-config/per-vc-gpu-usage-dashboard.json @@ -0,0 +1,190 @@ +{ + "dashboard": { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": "30s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 0, + "id": 1, + "legend": { + "avg": false, + "current": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(k8s_vc_gpu_total{vc_name=\"$vc_name\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Total", + "refId": "A" + }, + { + "expr": "sum(k8s_vc_gpu_available{vc_name=\"$vc_name\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Available", + "refId": "B" + }, + { + "expr": "sum(k8s_vc_gpu_preemptive_availabe{vc_name=\"$vc_name\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Preemptive Available", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Per VC GPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, 
+ "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "tags": [], + "text": "platform", + "value": "platform" + }, + "datasource": "PM", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "vc_name", + "options": [], + "query": "label_values(k8s_vc_gpu_total, vc_name)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Per VC Gpu statistic", + "version": 0 + } +} diff --git a/src/ClusterBootstrap/services/monitor/grafana-config/perf-dashboard.json b/src/ClusterBootstrap/services/monitor/grafana-config/perf-dashboard.json new file mode 100644 index 000000000..c78fab7dd --- /dev/null +++ b/src/ClusterBootstrap/services/monitor/grafana-config/perf-dashboard.json @@ -0,0 +1,389 @@ +{ + "dashboard": { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": "30s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 0, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.9, sum(rate(datahandler_fn_latency_seconds_bucket{scraped_from=~\"jobmanager.*\"}[5m])) by (le, fn_name))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{'{{'}} fn_name {{'}}'}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Datahandler 90th percentile latency per function from jobmanager", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 0, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + 
"nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.9, sum(rate(datahandler_fn_latency_seconds_bucket{scraped_from=~\"restfulapi.*\"}[5m])) by (le, fn_name))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{'{{'}} fn_name {{'}}'}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Datahandler 90th percentile latency per function from restfulapi", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 0, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.9, sum(rate(db_connect_latency_seconds_bucket{scraped_from=~\"jobmanager.*\"}[5m])) by (le))", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Connection Latency", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "90th percentile DB connection latency from jobmanager", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 0, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.9, sum(rate(db_connect_latency_seconds_bucket{scraped_from=~\"restfulapi.*\"}[5m])) by (le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Connection Latency", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "90th percentile DB connection latency from restfulapi", + 
"tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Performance dashboard", + "version": 0 + } +} diff --git a/src/ClusterBootstrap/services/monitor/grafana-config/prom-datasource.json b/src/ClusterBootstrap/services/monitor/grafana-config/prom-datasource.json new file mode 100644 index 000000000..027e5ea6c --- /dev/null +++ b/src/ClusterBootstrap/services/monitor/grafana-config/prom-datasource.json @@ -0,0 +1,8 @@ +{ + "name": "PM", + "url": "http://{{cnf['grafana']['prometheus-ip']}}:9091/prometheus", + "basicAuth": false, + "access": "proxy", + "type": "prometheus", + "isDefault": true +} diff --git a/src/ClusterBootstrap/services/monitor/grafana-config/service-status-dashboard.json b/src/ClusterBootstrap/services/monitor/grafana-config/service-status-dashboard.json new file mode 100644 index 000000000..75807fbdb --- /dev/null +++ b/src/ClusterBootstrap/services/monitor/grafana-config/service-status-dashboard.json @@ -0,0 +1,429 @@ +{"dashboard": { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Dashboard to view service metrics", + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": "30s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "PM", + "fill": 0, + "height": "400px", + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "service_cpu_percent{name=\"$service_name\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{'{{'}}instance{{'}}'}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "CPU", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + 
"format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "PM", + "fill": 0, + "height": "400px", + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "service_mem_usage_byte{name=\"$service_name\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{'{{'}}instance{{'}}'}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "PM", + "fill": 0, + "height": "400px", + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "service_net_in_byte{name=\"$service_name\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{'{{'}}instance{{'}}'}} Inbound", + "refId": "A" + }, + { + "expr": "service_net_out_byte{name=\"$service_name\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{'{{'}}instance{{'}}'}} Outbound", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Network", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "PM", + "fill": 0, + "height": "400px", + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 
10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(service_block_in_byte{name=\"$service_name\"}[300s])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{'{{'}}instance{{'}}'}} Read", + "refId": "A" + }, + { + "expr": "irate(service_block_out_byte{name=\"$service_name\"}[300s])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{'{{'}}instance{{'}}'}} Write", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Block IO", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": {}, + "datasource": "PM", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "service_name", + "options": [], + "query": "label_values(service_cpu_percent, name)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "now": true, + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Service Status", + "version": 1 +}} diff --git a/src/ClusterBootstrap/services/monitor/grafana-config/storage-usage-dashboard.json b/src/ClusterBootstrap/services/monitor/grafana-config/storage-usage-dashboard.json new file mode 100644 index 000000000..85b1c8c56 --- /dev/null +++ b/src/ClusterBootstrap/services/monitor/grafana-config/storage-usage-dashboard.json @@ -0,0 +1,149 @@ +{"dashboard": { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 0, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "min ((node_filesystem_size_bytes - node_filesystem_free_bytes{fstype=\"nfs4\"}) / node_filesystem_size_bytes) by (device) * 100", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": 
"{{ '{{' }} device {{ '}}' }}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Storage Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Storage usage", + "version": 0 +}} diff --git a/src/ClusterBootstrap/services/monitor/grafana-config/user-status-dashboard.json b/src/ClusterBootstrap/services/monitor/grafana-config/user-status-dashboard.json new file mode 100644 index 000000000..ab2a93946 --- /dev/null +++ b/src/ClusterBootstrap/services/monitor/grafana-config/user-status-dashboard.json @@ -0,0 +1,157 @@ +{ + "dashboard": { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": "30s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "columns": [], + "datasource": null, + "fontSize": "100%", + "id": 1, + "links": [], + "pageSize": null, + "scroll": true, + "showHeader": true, + "sort": { + "col": 0, + "desc": true + }, + "span": 12, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "", + "thresholds": [], + "type": "number", + "unit": "short" + }, + { + "alias": "used GPU", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": null, + "pattern": "Value", + "thresholds": [], + "type": "number", + "unit": "none" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "decimals": 2, + "pattern": "/.*/", + "thresholds": [], + "type": "number", + "unit": "short" + } + ], + "targets": [ + { + "expr": "count by (username) (task_gpu_percent)", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "refId": "A" + } + ], + "title": "User Status", + "transform": "table", + "type": "table" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + } + ], + 
"schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "User Status", + "version": 1 + } +} diff --git a/src/ClusterBootstrap/services/monitor/grafana.yaml b/src/ClusterBootstrap/services/monitor/grafana.yaml old mode 100755 new mode 100644 index 5820f7f5e..076be35eb --- a/src/ClusterBootstrap/services/monitor/grafana.yaml +++ b/src/ClusterBootstrap/services/monitor/grafana.yaml @@ -1,37 +1,50 @@ -kind: DaemonSet apiVersion: apps/v1 +kind: Deployment metadata: - name: dlws-grafana + name: grafana namespace: kube-system spec: + replicas: 1 selector: matchLabels: - k8s-app: grafana + app: grafana template: metadata: labels: task: monitoring - k8s-app: grafana + app: grafana spec: nodeSelector: grafana: active hostNetwork: true + hostPID: false + dnsPolicy: ClusterFirstWithHostNet containers: - name: grafana - image: {{cnf["dockers"]["container"]["grafana"]["fullname"]}} + image: openpai/grafana:dlws imagePullPolicy: Always + ports: + - containerPort: {{cnf["grafana"]["port"]}} + protocol: TCP volumeMounts: + - mountPath: /grafana-configuration + name: grafana-confg-volume - mountPath: /etc/ssl/certs name: ca-certificates readOnly: true - mountPath: /etc/hostname-fqdn name: hostname-fqdn readOnly: true + resources: + limits: + memory: "256Mi" env: - - name: INFLUXDB_HOST - value: {{cnf["influxdb_node"]}} + - name : GRAFANA_URL + value: 'http://127.0.0.1:{{cnf["grafana"]["port"]}}' # for script to install dashboard only + - name: GF_AUTH_ANONYMOUS_ENABLED + value: "true" - name: GF_SERVER_HTTP_PORT - value: "3000" + value: '{{cnf["grafana"]["port"]}}' # The following env variables are required to make Grafana accessible via # the kubernetes api-server proxy. 
On production clusters, we recommend # removing these env variables, setup auth for grafana, and expose the grafana @@ -46,9 +59,12 @@ spec: - name: GF_SERVER_ROOT_URL # If you're only using the API Server proxy, set this value instead: #value: /api/v1/proxy/namespaces/kube-system/services/monitoring-grafana/ - value: /grafana/ - #value: / + #value: /grafana/ + value: / volumes: + - name: grafana-confg-volume + configMap: + name: grafana-configuration - name: ca-certificates hostPath: path: /etc/ssl/certs @@ -59,4 +75,4 @@ spec: - key: CriticalAddonsOnly operator: Exists - key: node-role.kubernetes.io/master - effect: NoSchedule \ No newline at end of file + effect: NoSchedule diff --git a/src/ClusterBootstrap/services/monitor/heapster.yaml b/src/ClusterBootstrap/services/monitor/heapster.yaml deleted file mode 100755 index c43f5adf2..000000000 --- a/src/ClusterBootstrap/services/monitor/heapster.yaml +++ /dev/null @@ -1,42 +0,0 @@ -# Service Account for heapster ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: heapster - namespace: kube-system ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: heapster - namespace: kube-system -spec: - selector: - matchLabels: - k8s-app: heapster - replicas: 1 - template: - metadata: - labels: - task: monitoring - k8s-app: heapster - spec: - nodeSelector: - infrastructure: active - serviceAccountName: heapster - containers: - - name: heapster - image: {{cnf["dockers"]["container"]["heapster"]["fullname"]}} - imagePullPolicy: IfNotPresent - command: - - /heapster - - --source=kubernetes:https://{{cnf["api-server-ip"]}}:443 #this is using k8s endpoint, port should be 443 no matter what port is configed for external access. - - --sink=influxdb:http://{{cnf["influxdb_node"]}}:{{cnf["influxdb_port"]}} - tolerations: - - key: CriticalAddonsOnly - operator: Exists - - key: node-role.kubernetes.io/master - effect: NoSchedule - -# TODO merge this with cloudmonitor/heapster \ No newline at end of file diff --git a/src/ClusterBootstrap/services/monitor/influxdb.yaml b/src/ClusterBootstrap/services/monitor/influxdb.yaml deleted file mode 100755 index 4a47e3d44..000000000 --- a/src/ClusterBootstrap/services/monitor/influxdb.yaml +++ /dev/null @@ -1,56 +0,0 @@ -kind: DaemonSet -apiVersion: apps/v1 -metadata: - name: dlws-influxdb - namespace: kube-system -spec: - selector: - matchLabels: - k8s-app: influxdb - template: - metadata: - labels: - task: monitoring - k8s-app: influxdb - spec: - nodeSelector: - influxdb: active - hostNetwork: true - containers: - - name: influxdb - image: {{cnf["dockers"]["container"]["influxdb"]["fullname"]}} - imagePullPolicy: Always - volumeMounts: - - name: influxdb - mountPath: /var/lib/influxdb - volumes: - - name: influxdb - hostPath: - path: {{cnf["influxdb_data_path"]}} - tolerations: - - key: CriticalAddonsOnly - operator: Exists - - key: node-role.kubernetes.io/master - effect: NoSchedule ---- -apiVersion: v1 -kind: Service -metadata: - labels: - task: monitoring - # For use as a Cluster add-on (https://github.com/kubernetes/kubernetes/tree/master/cluster/addons) - # If you are NOT using this as an addon, you should comment out this line. 
- kubernetes.io/cluster-service: 'true' - kubernetes.io/name: dlws-influxdb - name: dlws-influxdb - namespace: kube-system -spec: - ports: - - name: influxdb-html - port: {{cnf["influxdb_port"]}} - targetPort: {{cnf["influxdb_port"]}} - - name: influxdb-colletcd - port: {{cnf["influxdb_tp_port"]}} - targetPort: {{cnf["influxdb_tp_port"]}} - selector: - k8s-app: influxdb \ No newline at end of file diff --git a/src/ClusterBootstrap/services/monitor/job-exporter.yaml b/src/ClusterBootstrap/services/monitor/job-exporter.yaml new file mode 100644 index 000000000..90df9fd62 --- /dev/null +++ b/src/ClusterBootstrap/services/monitor/job-exporter.yaml @@ -0,0 +1,86 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: job-exporter + namespace: kube-system +spec: + selector: + matchLabels: + app: job-exporter + template: + metadata: + labels: + app: job-exporter + task: monitoring + annotations: + prometheus.io/scrape: "true" + prometheus.io/path: "/metrics" + prometheus.io/port: "{{cnf['job-exporter']['port']}}" + name: job-exporter + spec: + containers: + - image: openpai/job-exporter:dlws + imagePullPolicy: Always + livenessProbe: # in case job-exporter hangs + httpGet: + path: '/healthz' + port: {{cnf['job-exporter']['port']}} + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 10 + command: + - "python" + - "/job_exporter/main.py" + - "--port" + - "{{cnf['job-exporter']['port']}}" + - "--interval" + - "30" + - "--interface" + - "eth0" + resources: + limits: + memory: "512Mi" + securityContext: + privileged: true # this is required since job-exporter will call setns to other containers + env: + - name: HOST_IP + valueFrom: + fieldRef: + fieldPath: status.hostIP + - name: LOGGING_LEVEL + value: INFO + - name: NV_DRIVER + value: /var/drivers/nvidia/current + - name: NVIDIA_VISIBLE_DEVICES + value: all + volumeMounts: + - mountPath: /var/run/docker.sock + name: docker-socket + - mountPath: /dev + name: device-mount + - mountPath: /var/drivers/nvidia/current + name: driver-path + name: job-exporter + ports: + - containerPort: {{cnf['job-exporter']['port']}} + hostPort: {{cnf['job-exporter']['port']}} + name: main + volumes: + - name: docker-socket + hostPath: + path: /var/run/docker.sock + - name: device-mount + hostPath: + path: /dev + - name: driver-path + hostPath: + path: /var/drivers/nvidia/current + hostNetwork: true + hostPID: true # This is required since job-exporter should get list of pid in container + tolerations: + - key: node.kubernetes.io/memory-pressure + operator: "Exists" + - key: node.kubernetes.io/disk-pressure + operator: "Exists" + - key: node-role.kubernetes.io/master + operator: "Exists" diff --git a/src/ClusterBootstrap/services/monitor/launch_order b/src/ClusterBootstrap/services/monitor/launch_order index d83bcb3e6..1e17557f9 100755 --- a/src/ClusterBootstrap/services/monitor/launch_order +++ b/src/ClusterBootstrap/services/monitor/launch_order @@ -1,4 +1,9 @@ -influxdb.yaml -collectd.yaml +node-exporter.yaml +job-exporter.yaml +watchdog.yaml +prometheus-alerting.yaml +prometheus.yaml +grafana-config.yaml grafana.yaml -heapster.yaml \ No newline at end of file +alert-templates.yaml +alert-manager.yaml diff --git a/src/ClusterBootstrap/services/monitor/node-exporter.yaml b/src/ClusterBootstrap/services/monitor/node-exporter.yaml new file mode 100644 index 000000000..26fb8633e --- /dev/null +++ b/src/ClusterBootstrap/services/monitor/node-exporter.yaml @@ -0,0 +1,82 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: node-exporter + namespace: 
kube-system +spec: + selector: + matchLabels: + app: node-exporter + template: + metadata: + labels: + app: node-exporter + annotations: + prometheus.io/scrape: "true" + prometheus.io/path: "/metrics" + prometheus.io/port: "{{ cnf['node-exporter']['port'] }}" + name: node-exporter + spec: + containers: + - image: prom/node-exporter:v0.16.0 + imagePullPolicy: Always + readinessProbe: + tcpSocket: + port: {{ cnf["node-exporter"]["port"] }} + initialDelaySeconds: 3 + periodSeconds: 30 + timeoutSeconds: 10 + resources: + limits: + memory: "128Mi" + name: node-exporter + args: + - '--no-collector.arp' + - '--no-collector.bcache' + - '--no-collector.bonding' + - '--no-collector.conntrack' +#- '--no-collector.cpu' Exposes CPU statistics. +#- '--no-collector.diskstats' Exposes disk I/O statistics. + - '--no-collector.edac' + - '--no-collector.entropy' +#- '--no-collector.filefd' Exposes file descriptor statistics from /proc/sys/fs/file-nr +#- '--no-collector.filesystem' Exposes filesystem statistics, such as disk space used. + - '--no-collector.hwmon' + - '--no-collector.infiniband' + - '--no-collector.ipvs' +#- '--no-collector.loadavg' Exposes load average. + - '--no-collector.mdadm' +#- '--no-collector.meminfo' Exposes memory statistics. +#- '--no-collector.netdev' Exposes network interface statistics such as bytes transferred. +#- '--no-collector.netstat' Exposes network statistics from /proc/net/netstat. This is the same information as netstat -s. + - '--no-collector.nfs' + - '--no-collector.nfsd' + - '--no-collector.sockstat' + - '--no-collector.stat' + - '--no-collector.time' + - '--no-collector.timex' +#- '--no-collector.uname' + - '--no-collector.vmstat' + - '--no-collector.wifi' + - '--no-collector.xfs' + - '--no-collector.zfs' + ports: + - containerPort: {{ cnf["node-exporter"]["port"] }} + hostPort: {{ cnf["node-exporter"]["port"] }} + name: scrape + volumeMounts: + - name: host-root + mountPath: /host-root/ + readOnly: true + volumes: + - name: host-root + hostPath: + path: / + hostNetwork: true + tolerations: + - key: node.kubernetes.io/memory-pressure + operator: "Exists" + - key: node.kubernetes.io/disk-pressure + operator: "Exists" + - key: node-role.kubernetes.io/master + operator: "Exists" diff --git a/src/ClusterBootstrap/services/monitor/pre-render.sh b/src/ClusterBootstrap/services/monitor/pre-render.sh new file mode 100755 index 000000000..cf783b746 --- /dev/null +++ b/src/ClusterBootstrap/services/monitor/pre-render.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +dir=`dirname $0` + +kill_idle_rule=${dir}/alerting/kill-idle.rules + +grafana_file_name=${dir}/grafana-config.yaml +alert_tmpl_file_name=${dir}/alert-templates.yaml +prometheus_file_name=${dir}/prometheus-alerting.yaml + +rm $kill_idle_rule $grafana_file_name $alert_tmpl_file_name $prometheus_file_name 2> /dev/null + +# config kill rules +${dir}/config_alerting.py "${dir}/../../config.yaml" > $kill_idle_rule + +# create configmap +for i in `find ${dir}/grafana-config/ -type f -regex ".*json" ` ; do + echo --from-file=$i +done | xargs ${dir}/../../deploy/bin/kubectl --namespace=kube-system create configmap grafana-configuration --dry-run -o yaml >> $grafana_file_name + +${dir}/../../deploy/bin/kubectl --namespace=kube-system create configmap alert-templates --from-file=${dir}/alert-templates --dry-run -o yaml > $alert_tmpl_file_name + +${dir}/../../deploy/bin/kubectl --namespace=kube-system create configmap prometheus-alert --from-file=${dir}/alerting --dry-run -o yaml > $prometheus_file_name diff --git 
a/src/ClusterBootstrap/services/monitor/prometheus.yaml b/src/ClusterBootstrap/services/monitor/prometheus.yaml new file mode 100644 index 000000000..4920b9520 --- /dev/null +++ b/src/ClusterBootstrap/services/monitor/prometheus.yaml @@ -0,0 +1,150 @@ +{% if cnf["prometheus"]["cluster_name"] %} +{% set cluster_name = cnf["prometheus"]["cluster_name"] %} +{% else %} +{% set cluster_name = cnf["cluster_name"] %} +{% endif %} + +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-configmap + namespace: kube-system +data: + prometheus.yml: |- + global: + external_labels: + cluster: {{ cluster_name }} + rule_files: + - "/etc/prometheus-alert/*.rules" + scrape_configs: + - job_name: 'serivce_exporter' + scrape_interval: '30s' + kubernetes_sd_configs: + - role: pod + tls_config: + ca_file: '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt' + bearer_token_file: '/var/run/secrets/kubernetes.io/serviceaccount/token' + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + regex: true + action: keep + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + regex: (.+) + action: replace + target_label: __metrics_path__ + - source_labels: [__meta_kubernetes_pod_host_ip, __meta_kubernetes_pod_annotation_prometheus_io_port] + regex: '([^;]+);(\d+)' + replacement: ${1}:${2} + action: replace + target_label: __address__ + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: scraped_from + - source_labels: [__meta_kubernetes_pod_label_app] + action: replace + target_label: exporter_name + alerting: + alertmanagers: + - path_prefix: alert-manager + tls_config: + ca_file: '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt' + bearer_token_file: '/var/run/secrets/kubernetes.io/serviceaccount/token' + kubernetes_sd_configs: + - role: pod + namespaces: + names: ["kube-system"] + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_alert] + regex: true + action: keep + - source_labels: [__meta_kubernetes_pod_host_ip, __meta_kubernetes_pod_annotation_prometheus_io_port] + regex: '([^;]+);(\d+)' + replacement: ${1}:${2} + action: replace + target_label: __address__ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: prometheus-deployment + namespace: kube-system +spec: + replicas: 1 + selector: + matchLabels: + app: prometheus + template: + metadata: + name: prometheus + labels: + task: monitoring + app: prometheus + annotations: + prometheus.io/scrape: "true" + prometheus.io/path: "/metrics" + prometheus.io/port: '{{cnf["prometheus"]["reporter"]["port"]}}' + spec: + nodeSelector: + prometheus: active + hostNetwork: true + initContainers: + - name: init + image: bash:4 + volumeMounts: + - name: prometheus-data + mountPath: /prometheus-data + command: ["chmod", "777", "/prometheus-data"] # newly create dir have permission 755, which makes prometheus container unable to write + containers: + - name: prometheus + image: prom/prometheus:v2.1.0 + resources: + limits: + memory: "10Gi" + requests: + memory: "256Mi" + args: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--web.listen-address=0.0.0.0:{{cnf["prometheus"]["port"]}}' + - '--web.external-url=http://localhost:{{cnf["prometheus"]["port"]}}/prometheus/' + - '--web.route-prefix=prometheus' + - '--storage.tsdb.path=/prometheus-data' + - '--storage.tsdb.retention=31d' + ports: + - name: web + containerPort: {{cnf["prometheus"]["port"]}} + volumeMounts: + - name: config-volume + mountPath: /etc/prometheus + - name: 
rules-volume + mountPath: /etc/prometheus-alert + - name: prometheus-data + mountPath: /prometheus-data + - name: gpu-reporter + image: {{cnf["worker-dockerregistry"]}}{{cnf["dockerprefix"]}}gpu-reporter:{{cnf["dockertag"]}} + args: + - 'python' + - '/gpu-reporter/reporter.py' + - '--prometheus_url' + - 'http://localhost:{{cnf["prometheus"]["port"]}}' + - '--port' + - '{{cnf["prometheus"]["reporter"]["port"]}}' + ports: + - name: reporter + containerPort: {{cnf["prometheus"]["reporter"]["port"]}} + volumes: + - name: config-volume + configMap: + name: prometheus-configmap + - name: rules-volume + configMap: + name: prometheus-alert + - name: prometheus-data + hostPath: + path: /data/prometheus/data + tolerations: + - key: node.kubernetes.io/memory-pressure + operator: "Exists" + - key: node.kubernetes.io/disk-pressure + operator: "Exists" + - key: node-role.kubernetes.io/master + effect: NoSchedule diff --git a/src/ClusterBootstrap/services/monitor/watchdog.yaml b/src/ClusterBootstrap/services/monitor/watchdog.yaml new file mode 100644 index 000000000..fb43e43dc --- /dev/null +++ b/src/ClusterBootstrap/services/monitor/watchdog.yaml @@ -0,0 +1,66 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: watchdog + namespace: kube-system +spec: + replicas: 1 + selector: + matchLabels: + app: watchdog + template: + metadata: + annotations: + prometheus.io/path: /metrics + prometheus.io/port: '{{cnf["watchdog"]["port"]}}' + prometheus.io/scrape: 'true' + labels: + app: watchdog + task: monitoring + name: watchdog + spec: + nodeSelector: + watchdog: active + containers: + - command: + - python + - /watchdog.py + - --interval + - '30' + - --port + - '{{cnf["watchdog"]["port"]}}' + - --ca + - /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + - --bearer + - /var/run/secrets/kubernetes.io/serviceaccount/token + - https://{{ cnf["api-server-ip"] }}:443 + {% if "vc_url" in cnf["watchdog"] %} + - --vc_url + - {{cnf["watchdog"]["vc_url"]}} + {% endif %} + env: + - name: LOGGING_LEVEL + value: INFO + image: openpai/watchdog:dlws + imagePullPolicy: Always + name: watchdog + readinessProbe: + httpGet: + path: /healthz + port: {{cnf["watchdog"]["port"]}} + initialDelaySeconds: 3 + periodSeconds: 30 + timeoutSeconds: 10 + resources: + limits: + memory: 256Mi + hostNetwork: true + imagePullSecrets: + - name: pai-secret + tolerations: + - key: node.kubernetes.io/memory-pressure + operator: Exists + - key: node.kubernetes.io/disk-pressure + operator: Exists + - key: node-role.kubernetes.io/master + effect: NoSchedule diff --git a/src/ClusterBootstrap/services/nvidia-device-plugin/nvidia-device-plugin.yaml b/src/ClusterBootstrap/services/nvidia-device-plugin/nvidia-device-plugin.yaml new file mode 100755 index 000000000..912cb7655 --- /dev/null +++ b/src/ClusterBootstrap/services/nvidia-device-plugin/nvidia-device-plugin.yaml @@ -0,0 +1,43 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvidia-device-plugin-daemonset + namespace: kube-system +spec: + selector: + matchLabels: + name: nvidia-device-plugin-ds + updateStrategy: + type: RollingUpdate + template: + metadata: + # Mark this pod as a critical add-on; when enabled, the critical add-on scheduler + # reserves resources for critical add-on pods so that they can be rescheduled after + # a failure. This annotation works in tandem with the toleration below. 
+ annotations: + scheduler.alpha.kubernetes.io/critical-pod: "" + labels: + name: nvidia-device-plugin-ds + spec: + tolerations: + # Allow this pod to be rescheduled while the node is in "critical add-ons only" mode. + # This, along with the annotation above marks this pod as a critical add-on. + - key: CriticalAddonsOnly + operator: Exists + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + containers: + - image: nvidia/k8s-device-plugin:1.11 + name: nvidia-device-plugin-ctr + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + volumes: + - name: device-plugin + hostPath: + path: /var/lib/kubelet/device-plugins diff --git a/src/ClusterBootstrap/services/restfulapi/restfulapi.yaml b/src/ClusterBootstrap/services/restfulapi/restfulapi.yaml index 0cdda35fe..d1e5a6ce4 100755 --- a/src/ClusterBootstrap/services/restfulapi/restfulapi.yaml +++ b/src/ClusterBootstrap/services/restfulapi/restfulapi.yaml @@ -4,7 +4,7 @@ metadata: name: restfulapi namespace: default labels: - run: dlwsrestfulapi + run: dlwsrestfulapi spec: selector: matchLabels: @@ -14,13 +14,18 @@ spec: name: restfulapi labels: restfulapi-node: pod + app: restfulapi + annotations: + prometheus.io/scrape: "true" + prometheus.io/path: "/metrics" + prometheus.io/port: "5000" spec: - {% if cnf["dnsPolicy"] %} + {% if cnf["dnsPolicy"] %} dnsPolicy: {{cnf["dnsPolicy"]}} {% endif %} nodeSelector: restfulapi: active - hostNetwork: true + hostNetwork: true containers: - name: restfulapi image: {{cnf["worker-dockerregistry"]}}{{cnf["dockerprefix"]}}{{cnf["restfulapi"]}}:{{cnf["dockertag"]}} @@ -30,6 +35,10 @@ spec: name: apiconfig - mountPath: /var/log/apache2 name: log + ports: + - containerPort: 5000 + hostPort: 5000 + name: main {% if False %} {% for volume in cnf["mountpoints"] %} {% if cnf["mountpoints"][volume]["mountpoints"] is string and cnf["mountpoints"][volume]["mountpoints"]!="" %} @@ -41,7 +50,7 @@ spec: name: {{mp}} {% endfor %} {% endif %} - {% endfor %} + {% endfor %} {% endif %} volumes: - name: apiconfig @@ -59,14 +68,14 @@ spec: {% else %} {% for mp in cnf["mountpoints"][volume]["mountpoints"] %} - name: {{mp}} - hostPath: + hostPath: path: {{cnf["storage-mount-path"]}}/{{mp}} {% endfor %} {% endif %} - {% endfor %} + {% endfor %} {% endif %} tolerations: - key: CriticalAddonsOnly operator: Exists - key: node-role.kubernetes.io/master - effect: NoSchedule + effect: NoSchedule diff --git a/src/ClusterBootstrap/storage/glusterFS/gk-deploy b/src/ClusterBootstrap/storage/glusterFS/gk-deploy index 3470d9532..67b38967c 100755 --- a/src/ClusterBootstrap/storage/glusterFS/gk-deploy +++ b/src/ClusterBootstrap/storage/glusterFS/gk-deploy @@ -207,7 +207,7 @@ check_pods() { break fi sleep 2 - pods=$(${CLI} get pod --no-headers --show-all --selector="${1}" 2>&1) + pods=$(${CLI} get pod --no-headers --selector="${1}" 2>&1) if [[ ${s} -ne 0 ]] && [[ ${VERBOSE} -eq 1 ]]; then podlines=$(echo "$pods" | wc -l) ((podlines+=1)) diff --git a/src/ClusterBootstrap/template/RestfulAPI/config.yaml b/src/ClusterBootstrap/template/RestfulAPI/config.yaml index 384b3ab86..f75ef7021 100755 --- a/src/ClusterBootstrap/template/RestfulAPI/config.yaml +++ b/src/ClusterBootstrap/template/RestfulAPI/config.yaml @@ -30,3 +30,21 @@ default-storage-folders : {{cnf["default-storage-folders"]}} webportal_node: {{cnf["webportal_node"]}} datasource : {{cnf["datasource"]}} kube_custom_scheduler: {{cnf["kube_custom_scheduler"]}} 
+WinbindServers: {{cnf["WinbindServers"]}} +azure_cluster : + worker_node_num : {{cnf["azure_cluster"][cnf["cluster_name"]]["worker_node_num"]}} + worker_vm_size : {{cnf["azure_cluster"][cnf["cluster_name"]]["worker_vm_size"]}} +sku_mapping: {{cnf["sku_mapping"]}} +defalt_virtual_cluster_name: {{cnf["defalt_virtual_cluster_name"]}} +{% if cnf["job-manager"] %} +job-manager: + {% if cnf["job-manager"]["notifier"] %} + notifier: + {% if cnf["job-manager"]["notifier"]["cluster"] %} + cluster: {{ cnf["job-manager"]["notifier"]["cluster"] }} + {% endif %} + {% if cnf["job-manager"]["notifier"]["alert-manager-url"] %} + alert-manager-url: {{ cnf["job-manager"]["notifier"]["alert-manager-url"] }} + {% endif %} + {% endif %} +{% endif %} diff --git a/src/ClusterBootstrap/template/WebUI/userconfig.json b/src/ClusterBootstrap/template/WebUI/userconfig.json index b0ac02965..b54603d14 100755 --- a/src/ClusterBootstrap/template/WebUI/userconfig.json +++ b/src/ClusterBootstrap/template/WebUI/userconfig.json @@ -21,6 +21,7 @@ "RegisterGroups": {{cnf["WebUIregisterGroups"]}}, "ClusterId": "{{cnf["clusterId"]}}", "Restapi": "{{cnf["restapi"]}}", + "Grafana": "", "WorkFolderAccessPoint": "{{cnf["workFolderAccessPoint"]}}", "DataFolderAccessPoint": "{{cnf["dataFolderAccessPoint"]}}", "smbUsername":"{{cnf["smbUsername"]}}", diff --git a/src/ClusterBootstrap/template/dns/dns.sh.template b/src/ClusterBootstrap/template/dns/dns.sh.template new file mode 100755 index 000000000..36c384c12 --- /dev/null +++ b/src/ClusterBootstrap/template/dns/dns.sh.template @@ -0,0 +1,7 @@ +sudo systemctl disable systemd-resolved.service +sudo systemctl stop systemd-resolved +echo "dns=default" | sudo tee -a /etc/NetworkManager/NetworkManager.conf +sudo rm /etc/resolv.conf +echo "nameserver 8.8.8.8" | sudo tee -a /etc/resolv.conf +echo 'search {{cnf["azure_cluster"][cnf["cluster_name"]]["azure_location"]}}.cloudapp.azure.com' | sudo tee -a /etc/resolv.conf +sudo service network-manager restart \ No newline at end of file diff --git a/src/ClusterBootstrap/template/kube-addons/heapster-deployment.json b/src/ClusterBootstrap/template/kube-addons/heapster-deployment.json deleted file mode 100755 index c08be655b..000000000 --- a/src/ClusterBootstrap/template/kube-addons/heapster-deployment.json +++ /dev/null @@ -1,100 +0,0 @@ -{ - "apiVersion": "apps/v1", - "kind": "Deployment", - "metadata": { - "labels": { - "k8s-app": "heapster", - "kubernetes.io/cluster-service": "true", - "version": "v1.2.0" - }, - "name": "heapster-v1.2.0", - "namespace": "kube-system" - }, - "spec": { - "replicas": 1, - "selector": { - "matchLabels": { - "k8s-app": "heapster", - "version": "v1.2.0" - } - }, - "template": { - "metadata": { - "annotations": { - "scheduler.alpha.kubernetes.io/critical-pod": "", - "scheduler.alpha.kubernetes.io/tolerations": "[{\"key\":\"CriticalAddonsOnly\", \"operator\":\"Exists\"}]" - }, - "labels": { - "k8s-app": "heapster", - "version": "v1.2.0" - } - }, - "spec": { - "containers": [ - { - "command": [ - "/heapster", - "--source=kubernetes.summary_api:''" - ], - "image": "{{cnf["dockers"]["container"]["heapster"]["fullname"]}}", - "name": "heapster", - "resources": { - "limits": { - "cpu": "80m", - "memory": "140Mi" - }, - "requests": { - "cpu": "80m", - "memory": "140Mi" - } - } - }, - { - "command": [ - "/pod_nanny", - "--cpu=80m", - "--extra-cpu=0.5m", - "--memory=140Mi", - "--extra-memory=4Mi", - "--threshold=5", - "--deployment=heapster-v1.2.0-beta.1", - "--container=heapster", - "--poll-period=300000", - 
"--estimator=exponential" - ], - "env": [ - { - "name": "MY_POD_NAME", - "valueFrom": { - "fieldRef": { - "fieldPath": "metadata.name" - } - } - }, - { - "name": "MY_POD_NAMESPACE", - "valueFrom": { - "fieldRef": { - "fieldPath": "metadata.namespace" - } - } - } - ], - "image": "dlws/addon-resizer:1.6", - "name": "heapster-nanny", - "resources": { - "limits": { - "cpu": "50m", - "memory": "90Mi" - }, - "requests": { - "cpu": "50m", - "memory": "90Mi" - } - } - } - ] - } - } - } -} diff --git a/src/ClusterBootstrap/template/kube-addons/heapster-svc.json b/src/ClusterBootstrap/template/kube-addons/heapster-svc.json deleted file mode 100644 index 8dd242862..000000000 --- a/src/ClusterBootstrap/template/kube-addons/heapster-svc.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "apiVersion": "v1", - "kind": "Service", - "metadata": { - "labels": { - "kubernetes.io/cluster-service": "true", - "kubernetes.io/name": "Heapster" - }, - "name": "heapster", - "namespace": "kube-system" - }, - "spec": { - "ports": [ - { - "port": 80, - "targetPort": 8082 - } - ], - "selector": { - "k8s-app": "heapster" - } - } -} diff --git a/src/ClusterBootstrap/template/kube-addons/kube-proxy.json b/src/ClusterBootstrap/template/kube-addons/kube-proxy.json index f8538fcb8..194b4e33f 100755 --- a/src/ClusterBootstrap/template/kube-addons/kube-proxy.json +++ b/src/ClusterBootstrap/template/kube-addons/kube-proxy.json @@ -27,7 +27,7 @@ { "command": [ "/hyperkube", - "proxy", + "kube-proxy", "--kubeconfig=/etc/kubernetes/worker-kubeconfig.yaml" ], "image": "{{cnf["dockers"]["container"]["hyperkube"]["fullname"]}}", diff --git a/src/ClusterBootstrap/template/kube-addons/nvidia-driver.json b/src/ClusterBootstrap/template/kube-addons/nvidia-driver.json deleted file mode 100755 index 56fba5486..000000000 --- a/src/ClusterBootstrap/template/kube-addons/nvidia-driver.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "apiVersion": "apps/v1", - "kind": "DaemonSet", - "metadata": { - "labels": { - "component": "nvidia-driver", - "tier": "node" - }, - "name": "nvidia-driver", - "namespace": "kube-system" - }, - "spec": { - "selector": { - "matchLabels": { - "component": "nvidia-driver" - } - }, - "template": { - "metadata": { - "labels": { - "component": "nvidia-driver", - "tier": "node" - } - }, - "spec": { - "containers": [ - { - "command": [ - "/opt/init_service.sh" - ], - "image": "mlcloudreg.westus.cloudapp.azure.com:5000/nvidia_driver:GeForce375.20_kube", - "name": "kube-proxy", - "imagePullPolicy": "Always", - "securityContext": { - "privileged": true - }, - "volumeMounts": [ - { - "mountPath": "/opt/nvidia-docker-volume", - "name": "nvidia-docker-volume" - }, - { - "mountPath": "/opt/nvidia-driver", - "name": "nvidia-driver" - }, - { - "mountPath": "/opt/nvidia-docker", - "name": "nvidia-docker" - }, - { - "mountPath": "/dev", - "name": "dev" - } - ] - } - ], - "hostNetwork": true, - "volumes": [ - { - "hostPath": { - "path": "/var/lib/nvidia-docker" - }, - "name": "nvidia-docker-volume" - }, - { - "hostPath": { - "path": "/opt/nvidia-driver" - }, - "name": "nvidia-driver" - }, - { - "hostPath": { - "path": "/opt/nvidia-docker" - }, - "name": "nvidia-docker" - }, - { - "hostPath": { - "path": "/dev" - }, - "name": "dev" - } - ] - } - } - } -} diff --git a/src/ClusterBootstrap/template/kube-addons/weave.yaml b/src/ClusterBootstrap/template/kube-addons/weave.yaml index 5dfa3ace1..9ef1d9021 100755 --- a/src/ClusterBootstrap/template/kube-addons/weave.yaml +++ b/src/ClusterBootstrap/template/kube-addons/weave.yaml @@ -9,8 +9,8 @@ items: 
cloud.weave.works/launcher-info: |- { "original-request": { - "url": "/k8s/v1.8/net.yaml?k8s-version=1.9", - "date": "Tue Mar 06 2018 01:14:36 GMT+0000 (UTC)" + "url": "/k8s/v1.10/net.yaml?k8s-version=1.15", + "date": "Thu Aug 15 2019 05:48:37 GMT+0000 (UTC)" }, "email-address": "support@weave.works" } @@ -25,14 +25,13 @@ items: cloud.weave.works/launcher-info: |- { "original-request": { - "url": "/k8s/v1.8/net.yaml?k8s-version=1.9", - "date": "Tue Mar 06 2018 01:14:36 GMT+0000 (UTC)" + "url": "/k8s/v1.10/net.yaml?k8s-version=1.15", + "date": "Thu Aug 15 2019 05:48:37 GMT+0000 (UTC)" }, "email-address": "support@weave.works" } labels: name: weave-net - namespace: kube-system rules: - apiGroups: - '' @@ -52,6 +51,13 @@ items: - get - list - watch + - apiGroups: + - '' + resources: + - nodes/status + verbs: + - patch + - update - apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRoleBinding metadata: @@ -60,14 +66,13 @@ items: cloud.weave.works/launcher-info: |- { "original-request": { - "url": "/k8s/v1.8/net.yaml?k8s-version=1.9", - "date": "Tue Mar 06 2018 01:14:36 GMT+0000 (UTC)" + "url": "/k8s/v1.10/net.yaml?k8s-version=1.15", + "date": "Thu Aug 15 2019 05:48:37 GMT+0000 (UTC)" }, "email-address": "support@weave.works" } labels: name: weave-net - namespace: kube-system roleRef: kind: ClusterRole name: weave-net @@ -84,8 +89,8 @@ items: cloud.weave.works/launcher-info: |- { "original-request": { - "url": "/k8s/v1.8/net.yaml?k8s-version=1.9", - "date": "Tue Mar 06 2018 01:14:36 GMT+0000 (UTC)" + "url": "/k8s/v1.10/net.yaml?k8s-version=1.15", + "date": "Thu Aug 15 2019 05:48:37 GMT+0000 (UTC)" }, "email-address": "support@weave.works" } @@ -116,8 +121,8 @@ items: cloud.weave.works/launcher-info: |- { "original-request": { - "url": "/k8s/v1.8/net.yaml?k8s-version=1.9", - "date": "Tue Mar 06 2018 01:14:36 GMT+0000 (UTC)" + "url": "/k8s/v1.10/net.yaml?k8s-version=1.15", + "date": "Thu Aug 15 2019 05:48:37 GMT+0000 (UTC)" }, "email-address": "support@weave.works" } @@ -132,7 +137,7 @@ items: - kind: ServiceAccount name: weave-net namespace: kube-system - - apiVersion: extensions/v1beta1 + - apiVersion: apps/v1 kind: DaemonSet metadata: name: weave-net @@ -140,8 +145,8 @@ items: cloud.weave.works/launcher-info: |- { "original-request": { - "url": "/k8s/v1.8/net.yaml?k8s-version=1.9", - "date": "Tue Mar 06 2018 01:14:36 GMT+0000 (UTC)" + "url": "/k8s/v1.10/net.yaml?k8s-version=1.15", + "date": "Thu Aug 15 2019 05:48:37 GMT+0000 (UTC)" }, "email-address": "support@weave.works" } @@ -149,6 +154,10 @@ items: name: weave-net namespace: kube-system spec: + selector: + matchLabels: + name: weave-net + minReadySeconds: 5 template: metadata: labels: @@ -159,20 +168,17 @@ items: command: - /home/weave/launch.sh env: - - name: IPALLOC_RANGE - value: {{cnf["pod_ip_range"]}} - name: HOSTNAME valueFrom: fieldRef: apiVersion: v1 fieldPath: spec.nodeName - image: '{{cnf["dockers"]["container"]["weave"]["fullname"]}}' - livenessProbe: + image: '{{cnf["dockers"]["container"]["weave"]["fullname"]}}' + readinessProbe: httpGet: host: 127.0.0.1 path: /status port: 6784 - initialDelaySeconds: 30 resources: requests: cpu: 10m @@ -194,7 +200,6 @@ items: - name: xtables-lock mountPath: /run/xtables.lock - name: weave-npc - args: [] env: - name: HOSTNAME valueFrom: @@ -241,5 +246,6 @@ items: - name: xtables-lock hostPath: path: /run/xtables.lock + type: FileOrCreate updateStrategy: - type: RollingUpdate \ No newline at end of file + type: RollingUpdate diff --git 
a/src/ClusterBootstrap/template/kubelet/10-weave.conf b/src/ClusterBootstrap/template/kubelet/10-weave.conf new file mode 100644 index 000000000..9eceb84da --- /dev/null +++ b/src/ClusterBootstrap/template/kubelet/10-weave.conf @@ -0,0 +1,5 @@ +{ + "name": "weave", + "type": "weave-net", + "hairpinMode": true +} diff --git a/src/ClusterBootstrap/template/kubelet/daemon.json b/src/ClusterBootstrap/template/kubelet/daemon.json new file mode 100755 index 000000000..f938c1b50 --- /dev/null +++ b/src/ClusterBootstrap/template/kubelet/daemon.json @@ -0,0 +1,9 @@ +{ + "default-runtime": "nvidia", + "runtimes": { + "nvidia": { + "path": "nvidia-container-runtime", + "runtimeArgs": [] + } + } +} \ No newline at end of file diff --git a/src/ClusterBootstrap/template/kubelet/kube-proxy.yaml b/src/ClusterBootstrap/template/kubelet/kube-proxy.yaml deleted file mode 100755 index 5880fa78b..000000000 --- a/src/ClusterBootstrap/template/kubelet/kube-proxy.yaml +++ /dev/null @@ -1,37 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: kube-proxy - namespace: kube-system -spec: - hostNetwork: true - containers: - - name: kube-proxy - image: {{cnf["dockers"]["container"]["hyperkube"]["fullname"]}} - command: - - /proxy - - --master={{cnf["api_servers"]}} - - --kubeconfig=/etc/kubernetes/worker-kubeconfig.yaml - - --proxy-mode=iptables - securityContext: - privileged: true - volumeMounts: - - mountPath: /etc/ssl/certs - name: "ssl-certs" - - mountPath: /etc/kubernetes/worker-kubeconfig.yaml - name: "kubeconfig" - readOnly: true - - mountPath: /etc/kubernetes/ssl - name: "etc-kube-ssl" - readOnly: true - volumes: - - name: "ssl-certs" - hostPath: - path: "/usr/share/ca-certificates" - - name: "kubeconfig" - hostPath: - path: "/etc/kubernetes/worker-kubeconfig.yaml" - - name: "etc-kube-ssl" - hostPath: - path: "/etc/kubernetes/ssl" - diff --git a/src/ClusterBootstrap/template/kubelet/kubelet.service.template b/src/ClusterBootstrap/template/kubelet/kubelet.service.template index 0640a367f..29a509a24 100755 --- a/src/ClusterBootstrap/template/kubelet/kubelet.service.template +++ b/src/ClusterBootstrap/template/kubelet/kubelet.service.template @@ -19,19 +19,17 @@ ExecStartPre=/bin/bash -c 'if lspci | grep -qE "[0-9a-fA-F][0-9a-fA-F]:[0-9a-fA # # https://github.com/kubernetes/kubernetes/issues/48937 # Glusterfs currently need docker-disable-shared-pid option, will evaluate in future kubernete release -# +# +# https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/ ExecStart=/opt/bin/kubelet \ - --require-kubeconfig=true \ --container-runtime={{'remote' if cnf["kube_custom_cri"] else 'docker'}} \ --enable-server=true \ --register-node=true \ - --feature-gates="Accelerators=true" \ - --allow-privileged=true \ + --feature-gates="DevicePlugins=true,PodShareProcessNamespace=true" \ --pod-manifest-path=/etc/kubernetes/manifests \ - --pod-infra-container-image {{cnf["dockers"]["container"]["podinfra"]["fullname"]}} \ + --pod-infra-container-image={{cnf["dockers"]["container"]["podinfra"]["fullname"]}} \ --network-plugin=cni \ --cluster_dns={{cnf["dns-server-ip"]}} \ - --docker-disable-shared-pid \ --cluster_domain=cluster.local \ --tls-cert-file=/etc/kubernetes/ssl/worker.pem \ --tls-private-key-file=/etc/kubernetes/ssl/worker-key.pem \ @@ -44,4 +42,4 @@ Restart=always RestartSec=10 [Install] -WantedBy=multi-user.target \ No newline at end of file +WantedBy=multi-user.target diff --git a/src/ClusterBootstrap/template/kubelet/post-worker-deploy.sh b/src/ClusterBootstrap/template/kubelet/post-worker-upgrade.sh 
similarity index 55% rename from src/ClusterBootstrap/template/kubelet/post-worker-deploy.sh rename to src/ClusterBootstrap/template/kubelet/post-worker-upgrade.sh index c1f0ba1c4..dffa9d6c8 100755 --- a/src/ClusterBootstrap/template/kubelet/post-worker-deploy.sh +++ b/src/ClusterBootstrap/template/kubelet/post-worker-upgrade.sh @@ -2,14 +2,8 @@ sudo cp /etc/kubernetes/ssl/ca.pem /etc/ssl/etcd/ca.pem sudo cp /etc/kubernetes/ssl/worker.pem /etc/ssl/etcd/worker.pem sudo cp /etc/kubernetes/ssl/worker-key.pem /etc/ssl/etcd/worker-key.pem -sudo chmod +x /opt/bin/kubelet +sudo chmod +x /opt/bin/kubelet sudo systemctl daemon-reload sudo systemctl stop kubelet -sudo systemctl stop docker -sudo systemctl stop flanneld -sudo systemctl start flanneld -sudo systemctl start docker sudo systemctl start kubelet -sudo systemctl start rpc-statd -sudo systemctl enable flanneld -sudo systemctl enable kubelet \ No newline at end of file +sudo systemctl enable kubelet diff --git a/src/ClusterBootstrap/template/kubelet/pre-worker-deploy.sh b/src/ClusterBootstrap/template/kubelet/pre-worker-deploy.sh deleted file mode 100755 index a2e322b22..000000000 --- a/src/ClusterBootstrap/template/kubelet/pre-worker-deploy.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash -sudo systemctl stop kubelet -sudo docker rm -f $(docker ps -a | grep 'k8s_kube\|k8s_POD' | awk '{print $1}') -sudo systemctl stop docker -sudo systemctl stop flanneld - -sudo rm /etc/systemd/system/flanneld.service.d/40-ExecStartPre-symlink.conf -sudo rm /etc/systemd/system/docker.service.d/40-flannel.conf -sudo rm /etc/flannel/options.env -sudo systemctl disable kubelet -sudo systemctl disable reportcluster -sudo systemctl disable bootstrap -sudo systemctl disable checkinternet -sudo systemctl disable nvidia-docker -sudo systemctl disable nvidia-driver - -sudo rm /etc/systemd/system/kubelet.service -sudo rm /etc/systemd/system/reportcluster.service -sudo rm /etc/systemd/system/bootstrap.service -sudo rm /etc/systemd/system/checkinternet.service -sudo rm /etc/systemd/system/nvidia-docker.service -sudo rm /etc/systemd/system/nvidia-driver.service - - -sudo rm -r /etc/kubernetes -sudo rm /opt/kubelet.sh -sudo rm /opt/bin/kubelet -sudo rm -r /etc/kubernetes -sudo rm -r /etc/systemd/system/reportcluster.service - -sudo mkdir -p /etc/kubernetes -sudo mkdir -p /etc/systemd/system/flanneld.service.d -sudo mkdir -p /etc/systemd/system/docker.service.d -sudo mkdir -p /etc/flannel -sudo mkdir -p /etc/kubernetes/manifests -sudo mkdir -p /etc/kubernetes/ssl/ -sudo mkdir -p /etc/ssl/etcd -sudo mkdir -p /opt/bin -{{'sudo mkdir -p '~cnf["kubeletlogdir"]~'/kubelet' if "kubeletlogdir" in cnf}} - diff --git a/src/ClusterBootstrap/template/kubelet/pre-worker-upgrade.sh b/src/ClusterBootstrap/template/kubelet/pre-worker-upgrade.sh new file mode 100755 index 000000000..58115a847 --- /dev/null +++ b/src/ClusterBootstrap/template/kubelet/pre-worker-upgrade.sh @@ -0,0 +1,21 @@ +#!/bin/bash +sudo systemctl stop kubelet +sudo docker rm -f $(docker ps -a | grep 'k8s_kube\|k8s_POD' | awk '{print $1}') + +sudo systemctl disable kubelet + +sudo rm /etc/systemd/system/kubelet.service + +sudo rm -r /etc/kubernetes +sudo rm /opt/kubelet.sh +sudo rm /opt/bin/kubelet +sudo rm -r /etc/kubernetes +sudo rm -rf /opt/cni + +sudo mkdir -p /etc/kubernetes +sudo mkdir -p /etc/kubernetes/manifests +sudo mkdir -p /etc/kubernetes/ssl/ +sudo mkdir -p /etc/ssl/etcd +sudo mkdir -p /opt/bin +sudo mkdir -p /opt/cni/bin +{{'sudo mkdir -p '~cnf["kubeletlogdir"]~'/kubelet' if "kubeletlogdir" in cnf}} diff 
--git a/src/ClusterBootstrap/template/kubelet/ubuntu/deploy.list b/src/ClusterBootstrap/template/kubelet/ubuntu/deploy.list index 5b1ca6cb1..74d608fef 100755 --- a/src/ClusterBootstrap/template/kubelet/ubuntu/deploy.list +++ b/src/ClusterBootstrap/template/kubelet/ubuntu/deploy.list @@ -20,7 +20,11 @@ ./deploy/bin/ipvlan,/opt/cni/bin/ipvlan ./deploy/bin/bridge,/opt/cni/bin/bridge ./deploy/bin/tuning,/opt/cni/bin/tuning -./deploy/bin/noop,/opt/cni/bin/noop ./deploy/bin/host-local,/opt/cni/bin/host-local -./deploy/bin/cnitool,/opt/cni/bin/cnitool -./deploy/bin/flannel,/opt/cni/bin/flannel \ No newline at end of file +./deploy/bin/flannel,/opt/cni/bin/flannel +./deploy/bin/host-device,/opt/cni/bin/host-device +./deploy/bin/portmap,/opt/cni/bin/portmap +./deploy/bin/sample,/opt/cni/bin/sample +./deploy/bin/vlan,/opt/cni/bin/vlan +./deploy/kubelet/daemon.json,/etc/docker/daemon.json +./deploy/kubelet/10-weave.conf,/etc/cni/net.d/10-weave.conf diff --git a/src/ClusterBootstrap/template/kubelet/ubuntu/post-worker-deploy.sh b/src/ClusterBootstrap/template/kubelet/ubuntu/post-worker-deploy.sh index f7196fe0d..3622dc7dc 100755 --- a/src/ClusterBootstrap/template/kubelet/ubuntu/post-worker-deploy.sh +++ b/src/ClusterBootstrap/template/kubelet/ubuntu/post-worker-deploy.sh @@ -6,6 +6,7 @@ sudo chmod +x /opt/bin/kubelet sudo systemctl daemon-reload sudo systemctl stop kubelet sudo systemctl stop kubecri +sudo systemctl restart docker {{'sudo systemctl start kubecri' if cnf["kube_custom_cri"]}} sudo systemctl start kubelet sudo systemctl start rpc-statd diff --git a/src/ClusterBootstrap/template/kubelet/ubuntu/pre-worker-deploy.sh b/src/ClusterBootstrap/template/kubelet/ubuntu/pre-worker-deploy.sh index 01fe7403a..d69bce752 100755 --- a/src/ClusterBootstrap/template/kubelet/ubuntu/pre-worker-deploy.sh +++ b/src/ClusterBootstrap/template/kubelet/ubuntu/pre-worker-deploy.sh @@ -1,5 +1,6 @@ sudo systemctl stop kubelet sudo systemctl stop kubecri +sudo systemctl restart docker sudo docker rm -f $(docker ps -a | grep 'k8s_kube\|k8s_POD' | awk '{print $1}') sudo mkdir -p /etc/kubernetes @@ -12,5 +13,6 @@ sudo rm -r /etc/kubernetes/manifests/* sudo rm -r /etc/kubernetes/ssl/* sudo rm -r /etc/ssl/etcd/* sudo rm -r /opt/addons/kube-addons/* +sudo rm -rf /etc/cni/net.d sudo chown -R $USER /etc/kubernetes sudo chown -R $USER /opt/addons/kube-addons diff --git a/src/ClusterBootstrap/template/kubelet/deploy.list b/src/ClusterBootstrap/template/kubelet/upgrade.list similarity index 50% rename from src/ClusterBootstrap/template/kubelet/deploy.list rename to src/ClusterBootstrap/template/kubelet/upgrade.list index 9e982cd36..69a332f42 100755 --- a/src/ClusterBootstrap/template/kubelet/deploy.list +++ b/src/ClusterBootstrap/template/kubelet/upgrade.list @@ -1,5 +1,3 @@ -./deploy/master/40-ExecStartPre-symlink.conf,/etc/systemd/system/flanneld.service.d/40-ExecStartPre-symlink.conf -./deploy/master/40-flannel.conf,/etc/systemd/system/docker.service.d/40-flannel.conf ./deploy/kubelet/options.env,/etc/flannel/options.env ./deploy/kubelet/kubelet.service,/etc/systemd/system/kubelet.service ./deploy/kubelet/worker-kubeconfig.yaml,/etc/kubernetes/worker-kubeconfig.yaml @@ -9,6 +7,18 @@ ./deploy/ssl/ca/ca.pem,/etc/kubernetes/ssl/ca.pem ./deploy/ssl/kubelet/apiserver.pem,/etc/kubernetes/ssl/worker.pem ./deploy/ssl/kubelet/apiserver-key.pem,/etc/kubernetes/ssl/worker-key.pem -./deploy/kubelet/report.sh,/opt/report.sh -./deploy/kubelet/reportcluster.service,/etc/systemd/system/reportcluster.service 
-./deploy/kubelet/nodelist.yaml,/etc/kubernetes/nodes/nodelist.yaml \ No newline at end of file +./deploy/kubelet/nodelist.yaml,/etc/kubernetes/nodes/nodelist.yaml +./deploy/bin/macvlan,/opt/cni/bin/macvlan +./deploy/bin/dhcp,/opt/cni/bin/dhcp +./deploy/bin/loopback,/opt/cni/bin/loopback +./deploy/bin/ptp,/opt/cni/bin/ptp +./deploy/bin/ipvlan,/opt/cni/bin/ipvlan +./deploy/bin/bridge,/opt/cni/bin/bridge +./deploy/bin/tuning,/opt/cni/bin/tuning +./deploy/bin/host-local,/opt/cni/bin/host-local +./deploy/bin/flannel,/opt/cni/bin/flannel +./deploy/bin/host-device,/opt/cni/bin/host-device +./deploy/bin/portmap,/opt/cni/bin/portmap +./deploy/bin/sample,/opt/cni/bin/sample +./deploy/bin/vlan,/opt/cni/bin/vlan +./deploy/kubelet/10-weave.conf,/etc/cni/net.d/10-weave.conf diff --git a/src/ClusterBootstrap/template/master/kube-apiserver.yaml b/src/ClusterBootstrap/template/master/kube-apiserver.yaml index a3ea51283..07f74282e 100755 --- a/src/ClusterBootstrap/template/master/kube-apiserver.yaml +++ b/src/ClusterBootstrap/template/master/kube-apiserver.yaml @@ -10,7 +10,7 @@ spec: image: {{cnf["dockers"]["container"]["hyperkube"]["fullname"]}} command: - /hyperkube - - apiserver + - kube-apiserver - --bind-address=0.0.0.0 - --etcd-servers={{cnf["etcd_endpoints"]}} - --etcd-cafile=/etc/kubernetes/ssl/ca.pem @@ -20,8 +20,8 @@ spec: - --service-cluster-ip-range={{cnf["service_cluster_ip_range"]}} - --secure-port={{cnf["k8sAPIport"]}} - --advertise-address={{cnf["master_ip"]}} - - --admission-control=NamespaceLifecycle,LimitRanger,ServiceAccount,DefaultStorageClass,ResourceQuota - #- --admission-control=NamespaceLifecycle,LimitRanger,SecurityContextDeny,ServiceAccount,ResourceQuota + - --enable-admission-plugins=NamespaceLifecycle,LimitRanger,ServiceAccount,DefaultStorageClass,ResourceQuota + #- --enable-admission-plugins=NamespaceLifecycle,LimitRanger,SecurityContextDeny,ServiceAccount,ResourceQuota - --tls-cert-file=/etc/kubernetes/ssl/apiserver.pem - --tls-private-key-file=/etc/kubernetes/ssl/apiserver-key.pem - --client-ca-file=/etc/kubernetes/ssl/ca.pem diff --git a/src/ClusterBootstrap/template/master/kube-controller-manager.yaml b/src/ClusterBootstrap/template/master/kube-controller-manager.yaml index a0d95a5c7..babce354d 100755 --- a/src/ClusterBootstrap/template/master/kube-controller-manager.yaml +++ b/src/ClusterBootstrap/template/master/kube-controller-manager.yaml @@ -10,7 +10,7 @@ spec: image: {{cnf["dockers"]["container"]["hyperkube"]["fullname"]}} command: - /hyperkube - - controller-manager + - kube-controller-manager - --master=http://127.0.0.1:8080 - --leader-elect=true - --service-account-private-key-file=/etc/kubernetes/ssl/apiserver-key.pem @@ -39,4 +39,4 @@ spec: name: ssl-certs-kubernetes - hostPath: path: /usr/share/ca-certificates - name: ssl-certs-host \ No newline at end of file + name: ssl-certs-host diff --git a/src/ClusterBootstrap/template/master/kube-proxy.yaml b/src/ClusterBootstrap/template/master/kube-proxy.yaml deleted file mode 100755 index 85e4f5332..000000000 --- a/src/ClusterBootstrap/template/master/kube-proxy.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: kube-proxy - namespace: kube-system -spec: - hostNetwork: true - containers: - - name: kube-proxy - image: {{cnf["dockers"]["container"]["hyperkube"]["fullname"]}} - command: - - /hyperkube - - proxy - - --master=http://127.0.0.1:8080 - - --proxy-mode=iptables - securityContext: - privileged: true - volumeMounts: - - mountPath: /etc/ssl/certs - name: ssl-certs-host - readOnly: 
true - volumes: - - hostPath: - path: /usr/share/ca-certificates - name: ssl-certs-host \ No newline at end of file diff --git a/src/ClusterBootstrap/template/master/kube-scheduler.yaml b/src/ClusterBootstrap/template/master/kube-scheduler.yaml index 47aff7542..49639615f 100755 --- a/src/ClusterBootstrap/template/master/kube-scheduler.yaml +++ b/src/ClusterBootstrap/template/master/kube-scheduler.yaml @@ -13,7 +13,7 @@ spec: - /kube-scheduler {% else %} - /hyperkube - - scheduler + - kube-scheduler {% endif %} - --master=http://127.0.0.1:8080 - --leader-elect=true diff --git a/src/ClusterBootstrap/template/master/kubelet.service b/src/ClusterBootstrap/template/master/kubelet.service index b7953cc70..380989d37 100755 --- a/src/ClusterBootstrap/template/master/kubelet.service +++ b/src/ClusterBootstrap/template/master/kubelet.service @@ -13,22 +13,20 @@ ExecStartPre=/bin/bash -c 'if lspci | grep -qE "[0-9a-fA-F][0-9a-fA-F]:[0-9a-fA- # https://github.com/kubernetes/kubernetes/issues/48937 # Glusterfs currently need docker-disable-shared-pid option, will evaluate in future kubernete release # +# https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/ ExecStart=/opt/bin/kubelet \ --kubeconfig=/etc/kubernetes/worker-kubeconfig.yaml \ - --require-kubeconfig=true \ --register-with-taints=node-role.kubernetes.io/master=:NoSchedule \ - --pod-infra-container-image {{cnf["dockers"]["container"]["podinfra"]["fullname"]}} \ + --pod-infra-container-image={{cnf["dockers"]["container"]["podinfra"]["fullname"]}} \ --container-runtime=docker \ - --allow-privileged=true \ - --feature-gates="Accelerators=true" \ + --feature-gates="DevicePlugins=true,PodShareProcessNamespace=true" \ --pod-manifest-path=/etc/kubernetes/manifests \ --network-plugin=cni \ --cluster_dns={{cnf["dns-server-ip"]}} \ - --docker-disable-shared-pid \ --cluster_domain=cluster.local #ExecStop=-/usr/bin/rkt stop --uuid-file=/var/run/kubelet-pod.uuid Restart=always RestartSec=10 [Install] -WantedBy=multi-user.target \ No newline at end of file +WantedBy=multi-user.target diff --git a/src/ClusterBootstrap/template/master/post-master-deploy.sh b/src/ClusterBootstrap/template/master/post-upgrade.sh similarity index 59% rename from src/ClusterBootstrap/template/master/post-master-deploy.sh rename to src/ClusterBootstrap/template/master/post-upgrade.sh index 5b43be742..7e1aa3838 100755 --- a/src/ClusterBootstrap/template/master/post-master-deploy.sh +++ b/src/ClusterBootstrap/template/master/post-upgrade.sh @@ -1,16 +1,10 @@ sudo cp /etc/kubernetes/ssl/ca.pem /etc/ssl/etcd/ca.pem -sudo cp /etc/kubernetes/ssl/ca-key.pem /etc/ssl/etcd//ca-key.pem +sudo cp /etc/kubernetes/ssl/ca-key.pem /etc/ssl/etcd/ca-key.pem sudo cp /etc/kubernetes/ssl/apiserver.pem /etc/ssl/etcd/apiserver.pem sudo cp /etc/kubernetes/ssl/apiserver-key.pem /etc/ssl/etcd/apiserver-key.pem -sudo chmod +x /opt/bin/* +sudo chmod +x /opt/bin/* sudo systemctl daemon-reload -sudo systemctl stop flanneld sudo systemctl stop kubelet -sudo systemctl start flanneld -sudo systemctl stop docker -sudo systemctl start docker sudo docker pull {{cnf["dockers"]["container"]["hyperkube"]["fullname"]}} sudo systemctl start kubelet -sudo systemctl start rpc-statd -sudo systemctl enable flanneld sudo systemctl enable kubelet diff --git a/src/ClusterBootstrap/template/master/pre-master-deploy.sh b/src/ClusterBootstrap/template/master/pre-upgrade.sh similarity index 61% rename from src/ClusterBootstrap/template/master/pre-master-deploy.sh rename to 
src/ClusterBootstrap/template/master/pre-upgrade.sh index a90bf7612..8de62e320 100755 --- a/src/ClusterBootstrap/template/master/pre-master-deploy.sh +++ b/src/ClusterBootstrap/template/master/pre-upgrade.sh @@ -1,13 +1,20 @@ +sudo systemctl stop kubelet +sudo timeout 10 docker rm -f $(docker ps -a | grep 'k8s_kube\|k8s_POD' | awk '{print $1}') +sudo mkdir -p /etc/kubernetes +sudo mkdir -p /opt/addons +sudo rm -r /etc/kubernetes +sudo rm -r /opt/addons +sudo rm -r /opt/cni +sudo systemctl daemon-reload + +# pre deployment sudo mkdir -p /etc/kubernetes -sudo mkdir -p /etc/systemd/system/flanneld.service.d -sudo mkdir -p /etc/systemd/system/docker.service.d -sudo mkdir -p /etc/flannel sudo mkdir -p /etc/kubernetes/manifests sudo mkdir -p /etc/kubernetes/ssl/ sudo mkdir -p /etc/kubernetes/pki/ -sudo mkdir -p /etc/ssl/etcd sudo mkdir -p /opt/addons sudo mkdir -p /opt/bin +sudo mkdir -p /opt/cni/bin sudo chown -R $USER /etc/kubernetes sudo chown -R $USER /etc/flannel sudo chown -R $USER /opt/bin diff --git a/src/ClusterBootstrap/template/master/ubuntu/deploy.list b/src/ClusterBootstrap/template/master/ubuntu/deploy.list index a6823776c..e6919ef38 100755 --- a/src/ClusterBootstrap/template/master/ubuntu/deploy.list +++ b/src/ClusterBootstrap/template/master/ubuntu/deploy.list @@ -19,6 +19,7 @@ ./deploy/WebUI/appsettings.json,/etc/WebUI/appsettings.json ./deploy/RestfulAPI/config.yaml,/etc/RestfulAPI/config.yaml ./deploy/master/restapi-kubeconfig.yaml,/etc/kubernetes/restapi-kubeconfig.yaml +./deploy/services/clusterroles/clusterrolebindings.yaml,/etc/kubernetes/clusterroles/clusterrolebindings.yaml ./deploy/bin/macvlan,/opt/cni/bin/macvlan ./deploy/bin/dhcp,/opt/cni/bin/dhcp ./deploy/bin/loopback,/opt/cni/bin/loopback @@ -26,9 +27,11 @@ ./deploy/bin/ipvlan,/opt/cni/bin/ipvlan ./deploy/bin/bridge,/opt/cni/bin/bridge ./deploy/bin/tuning,/opt/cni/bin/tuning -./deploy/bin/noop,/opt/cni/bin/noop ./deploy/bin/host-local,/opt/cni/bin/host-local -./deploy/bin/cnitool,/opt/cni/bin/cnitool ./deploy/bin/flannel,/opt/cni/bin/flannel -./deploy/services/clusterroles/clusterrolebindings.yaml,/etc/kubernetes/clusterroles/clusterrolebindings.yaml +./deploy/bin/host-device,/opt/cni/bin/host-device +./deploy/bin/portmap,/opt/cni/bin/portmap +./deploy/bin/sample,/opt/cni/bin/sample +./deploy/bin/vlan,/opt/cni/bin/vlan ./deploy/services/clusterroles/clusterroles.yaml,/etc/kubernetes/clusterroles/clusterroles.yaml +./deploy/kubelet/10-weave.conf,/etc/cni/net.d/10-weave.conf diff --git a/src/ClusterBootstrap/template/master/deploy.list b/src/ClusterBootstrap/template/master/upgrade.list similarity index 68% rename from src/ClusterBootstrap/template/master/deploy.list rename to src/ClusterBootstrap/template/master/upgrade.list index 1fae61239..c2507c6ab 100755 --- a/src/ClusterBootstrap/template/master/deploy.list +++ b/src/ClusterBootstrap/template/master/upgrade.list @@ -8,9 +8,6 @@ ./deploy/ssl/aggregator/proxy-client.key,/etc/kubernetes/pki/proxy-client.key ./deploy/master/basicauth,/etc/kubernetes/basicauth ./deploy/master/worker-kubeconfig.yaml,/etc/kubernetes/worker-kubeconfig.yaml -./deploy/master/40-ExecStartPre-symlink.conf,/etc/systemd/system/flanneld.service.d/40-ExecStartPre-symlink.conf -./deploy/master/40-flannel.conf,/etc/systemd/system/docker.service.d/40-flannel.conf -./deploy/master/options.env,/etc/flannel/options.env ./deploy/master/kubelet.service,/etc/systemd/system/kubelet.service ./deploy/master/kube-apiserver.yaml,/etc/kubernetes/manifests/kube-apiserver.yaml 
./deploy/master/kube-controller-manager.yaml,/etc/kubernetes/manifests/kube-controller-manager.yaml @@ -18,9 +15,20 @@ ./deploy/bin/kubelet,/opt/bin/kubelet ./deploy/bin/kubectl,/opt/bin/kubectl ./deploy/kube-addons,/opt/addons/kube-addons -./deploy/WebUI/appsettings.json,/etc/WebUI/appsettings.json -./deploy/RestfulAPI/config.yaml,/etc/RestfulAPI/config.yaml -./deploy/master/restapi-kubeconfig.yaml,/etc/kubernetes/restapi-kubeconfig.yaml ./deploy/master/dns-kubeconfig.yaml,/etc/kubernetes/dns-kubeconfig.yaml ./deploy/services/clusterroles/clusterrolebindings.yaml,/etc/kubernetes/clusterroles/clusterrolebindings.yaml -./deploy/services/clusterroles/clusterroles.yaml,/etc/kubernetes/clusterroles/clusterroles.yaml \ No newline at end of file +./deploy/services/clusterroles/clusterroles.yaml,/etc/kubernetes/clusterroles/clusterroles.yaml +./deploy/bin/macvlan,/opt/cni/bin/macvlan +./deploy/bin/dhcp,/opt/cni/bin/dhcp +./deploy/bin/loopback,/opt/cni/bin/loopback +./deploy/bin/ptp,/opt/cni/bin/ptp +./deploy/bin/ipvlan,/opt/cni/bin/ipvlan +./deploy/bin/bridge,/opt/cni/bin/bridge +./deploy/bin/tuning,/opt/cni/bin/tuning +./deploy/bin/host-local,/opt/cni/bin/host-local +./deploy/bin/flannel,/opt/cni/bin/flannel +./deploy/bin/host-device,/opt/cni/bin/host-device +./deploy/bin/portmap,/opt/cni/bin/portmap +./deploy/bin/sample,/opt/cni/bin/sample +./deploy/bin/vlan,/opt/cni/bin/vlan +./deploy/kubelet/10-weave.conf,/etc/cni/net.d/10-weave.conf diff --git a/src/ClusterBootstrap/template/nfs/nfs_config.sh.template b/src/ClusterBootstrap/template/nfs/nfs_config.sh.template new file mode 100755 index 000000000..0568baf20 --- /dev/null +++ b/src/ClusterBootstrap/template/nfs/nfs_config.sh.template @@ -0,0 +1,29 @@ +{% if "vg_disks" in cnf %} +{% for volgrp, disks in cnf["vg_disks"].items() %} +sudo vgcreate {{ volgrp }} {{disks}} +{% endfor %} +{% endif %} + +{% if "logical_vol" in cnf %} +{% for lv, lv_param in cnf["logical_vol"].items() %} +sudo lvcreate -l {{lv_param["percentage"]}}%FREE -n {{lv}} {{lv_param["volgrp"]}} +sudo mkfs.ext4 /dev/mapper/{{lv_param["volgrp"] | replace('-','--')}}-{{lv | replace('-','--')}} +sudo mkdir -p {{lv_param["mnt"]}} +sudo mount /dev/{{lv_param["volgrp"]}}/{{lv}} {{lv_param["mnt"]}} +echo "UUID=$(sudo blkid | grep {{lv | replace('-','--')}} | sed -n 's/.*UUID=\"\(.*\)\" TYPE.*/\1/p') {{lv_param["mnt"]}} ext4 defaults,discard 0 0" | sudo tee -a /etc/fstab +{% endfor %} +{% endif %} + +# setup NFS service +sudo apt-get update +sudo apt-get install -y nfs-kernel-server + +{% for mnt_name, mnt_setting in cnf["mnt_point"].items() %} +sudo mkdir -p {{mnt_setting["filesharename"]}} +sudo chown nobody:nogroup {{mnt_setting["filesharename"]}} +echo "{{mnt_setting["filesharename"]}} {{cnf["cloud_config"]["vnet_range"]}}(rw,sync,no_subtree_check,no_root_squash)" | sudo tee -a /etc/exports +echo "{{mnt_setting["filesharename"]}} {{cnf["cloud_config"]["samba_range"]}}(rw,fsid=1,nohide,insecure,sync,no_subtree_check,no_root_squash)" | sudo tee -a /etc/exports +{% endfor %} + +sudo systemctl restart nfs-kernel-server.service +sudo exportfs -a diff --git a/src/ClusterBootstrap/template/secret/pass_secret.sh.template b/src/ClusterBootstrap/template/secret/pass_secret.sh.template new file mode 100755 index 000000000..e04b3fc52 --- /dev/null +++ b/src/ClusterBootstrap/template/secret/pass_secret.sh.template @@ -0,0 +1,5 @@ +{% for regi_name, regi_cred in cnf["registry_credential"].items() %} +docker login {{ regi_name }} -u {{ regi_cred["username"] }} -p {{ regi_cred["password"] }} +{% 
endfor %} +chown -R {{cnf["cloud_config"]["default_admin_username"]}}:{{cnf["cloud_config"]["default_admin_username"]}} /home/{{cnf["cloud_config"]["default_admin_username"]}}/.docker/ +/opt/bin/kubectl create secret generic regcred --from-file=.dockerconfigjson=/home/{{cnf["cloud_config"]["default_admin_username"]}}/.docker/config.json --type=kubernetes.io/dockerconfigjson --dry-run -o yaml | /opt/bin/kubectl apply -f - \ No newline at end of file diff --git a/src/ClusterBootstrap/template/ssl/gencerts_aggregator.sh b/src/ClusterBootstrap/template/ssl/gencerts_aggregator.sh index fb5011685..d8c9d8e66 100755 --- a/src/ClusterBootstrap/template/ssl/gencerts_aggregator.sh +++ b/src/ClusterBootstrap/template/ssl/gencerts_aggregator.sh @@ -126,8 +126,11 @@ function generate-aggregator-certs { function setup-easyrsa { (set -x cd "${KUBE_TEMP}" - curl -L -O --connect-timeout 20 --retry 6 --retry-delay 2 https://storage.googleapis.com/kubernetes-release/easy-rsa/easy-rsa.tar.gz - tar xzf easy-rsa.tar.gz + # change away from using googleapis + curl -L -O --connect-timeout 20 --retry 6 --retry-delay 2 https://github.com/OpenVPN/easy-rsa/archive/v3.0.5.tar.gz + # tar to easy-rsa-v3.0.5 + tar xzf v3.0.5.tar.gz + mv easy-rsa-3.0.5 easy-rsa-master mkdir easy-rsa-master/kubelet cp -r easy-rsa-master/easyrsa3/* easy-rsa-master/kubelet mkdir easy-rsa-master/aggregator diff --git a/src/ClusterBootstrap/template/storage/auto_share/auto_share.py b/src/ClusterBootstrap/template/storage/auto_share/auto_share.py index 47f596e88..bf757a855 100755 --- a/src/ClusterBootstrap/template/storage/auto_share/auto_share.py +++ b/src/ClusterBootstrap/template/storage/auto_share/auto_share.py @@ -179,12 +179,15 @@ def mount_fileshare(verbose=True): for k,v in allmountpoints.iteritems(): if "curphysicalmountpoint" in v and istrue(v, "autoshare", True): physicalmountpoint = v["curphysicalmountpoint"] + # gives mounted information only, would not write anything or carry out mount action output = pipe_with_output("mount", "grep %s" % v["curphysicalmountpoint"], verbose=False) umounts = [] existmounts = [] for line in output.splitlines(): words = line.split() - if len(words)>3 and words[1]=="on": + # pitfall: words[2] might be prefix of v["curphysicalmountpoint"], then a mount point would be missed + # so we should check whether they are equal, if so, we know the specified path on NFS node was previously mounted to infra/worker. 
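+                    # e.g. (illustrative paths) with curphysicalmountpoint == "/dlwsdata/nfs", a mount line such as
+                    #   "nfsnode:/data/dlwsdata/nfs on /dlwsdata type nfs4 (rw,...)"
+                    # still matches the grep above, yet words[2] ("/dlwsdata") is only a prefix of the wanted path,
+                    # so the equality check below is what tells the real mount point apart.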
+ if len(words)>3 and words[1]=="on" and words[2] == v["curphysicalmountpoint"]: if verbose: logging.debug( "%s on %s" % (words[0], words[2]) ) # check if mount point exists, automatic create directory if non exist diff --git a/src/ClusterBootstrap/utils.py b/src/ClusterBootstrap/utils.py index d96bebab3..b700b5275 100755 --- a/src/ClusterBootstrap/utils.py +++ b/src/ClusterBootstrap/utils.py @@ -99,6 +99,9 @@ def render_template_directory(template_dir, target_dir,config, verbose=False, ex if not os.path.exists( markfile ): # print "Write DO_NOT_WRITE" open( markfile, 'w').close() + if os.path.isfile(os.path.join(template_dir, "pre-render.sh")): + pre_reder = os.path.join(template_dir, "pre-render.sh") + os.system("sh " + pre_reder) filenames = os.listdir(template_dir) for filename in filenames: if filename == "copy_dir": @@ -166,7 +169,7 @@ def sudo_scp (identity_file, source, target, user, host,changePermission=False, cmd += " ; sudo chmod +x %s" % target if verbose: print cmd - SSH_exec_cmd(identity_file, user, host, cmd, False) + SSH_exec_cmd(identity_file, user, host, cmd, verbose) # Execute a remote SSH cmd with identity file (private SSH key), user, host # Return the output of the remote command to local diff --git a/src/ClusterManager/ResourceInfo.py b/src/ClusterManager/ResourceInfo.py new file mode 100644 index 000000000..2b1c145ad --- /dev/null +++ b/src/ClusterManager/ResourceInfo.py @@ -0,0 +1,44 @@ +import math + +class ResourceInfo: + def __init__(self, res={}): + self.CategoryToCountMap = {} + for key in res: + self.CategoryToCountMap[key] = int(res[key]) + + def ToSerializable(self): + return self.CategoryToCountMap + + @staticmethod + def Difference(resInfo1, resInfo2): + diff = ResourceInfo() + diff.Add(resInfo1) + diff.Subtract(resInfo2) + return diff + + def GetFraction(self, numeratorResInfo, denominatorResInfo): + fraction = ResourceInfo() + for key in self.CategoryToCountMap: + if key in numeratorResInfo.CategoryToCountMap and key in denominatorResInfo.CategoryToCountMap: + fraction.Add(ResourceInfo({key : \ + int(math.ceil(float(self.CategoryToCountMap[key]) * numeratorResInfo.CategoryToCountMap[key] / denominatorResInfo.CategoryToCountMap[key]))})) + return fraction + + def Add(self, otherResourceInfo): + for key in otherResourceInfo.CategoryToCountMap: + if key not in self.CategoryToCountMap: + self.CategoryToCountMap[key] = 0 + self.CategoryToCountMap[key] += otherResourceInfo.CategoryToCountMap[key] + return self + + def CanSatisfy(self, otherResourceInfo): + for key in otherResourceInfo.CategoryToCountMap: + if (otherResourceInfo.CategoryToCountMap[key] > 0) and ((key not in self.CategoryToCountMap) or (self.CategoryToCountMap[key] < otherResourceInfo.CategoryToCountMap[key])): + return False + return True + + def Subtract(self, otherResourceInfo): + for key in otherResourceInfo.CategoryToCountMap: + if otherResourceInfo.CategoryToCountMap[key] > 0: + self.CategoryToCountMap[key] -= otherResourceInfo.CategoryToCountMap[key] + return self diff --git a/src/ClusterManager/cluster_manager.py b/src/ClusterManager/cluster_manager.py index 80a540560..87d7cecad 100755 --- a/src/ClusterManager/cluster_manager.py +++ b/src/ClusterManager/cluster_manager.py @@ -1,90 +1,156 @@ -import json +import yaml +import subprocess32 import os -import time -import argparse -import uuid -import subprocess +import logging +import logging.config import sys +import time import datetime - -import yaml -from jinja2 import Environment, FileSystemLoader, Template -import base64 - -import re 
- -import thread +import argparse import threading -import random - -import textwrap -import logging -import logging.config - -import job_manager -import user_manager -import node_manager -import joblog_manager -import command_manager - -from multiprocessing import Process, Manager - - -def create_log( logdir = '/var/log/dlworkspace' ): - if not os.path.exists( logdir ): - os.system("mkdir -p " + logdir ) - with open('logging.yaml') as f: +import traceback +import signal + +from prometheus_client.twisted import MetricsResource +from prometheus_client import Histogram + +from twisted.web.server import Site +from twisted.web.resource import Resource +from twisted.internet import reactor + +logger = logging.getLogger(__name__) + +manager_iteration_histogram = Histogram("manager_iteration_latency_seconds", + "latency for manager to iterate", + buckets=(2.5, 5.0, 10.0, 20.0, 40.0, 80.0, 160.0, float("inf")), + labelnames=("name",)) + + +class HealthResource(Resource): + def render_GET(self, request): + request.setHeader("Content-Type", "text/html; charset=utf-8") + return "Ok".encode("utf-8") + +def exporter_thread(port): + root = Resource() + root.putChild(b"metrics", MetricsResource()) + root.putChild(b"healthz", HealthResource()) + factory = Site(root) + reactor.listenTCP(port, factory) + reactor.run(installSignalHandlers=False) + +def setup_exporter_thread(port): + t = threading.Thread(target=exporter_thread, args=(port,), + name="exporter") + t.start() + return t + +def create_log(logdir="/var/log/dlworkspace"): + if not os.path.exists(logdir): + os.system("mkdir -p " + logdir) + with open("logging.yaml") as f: logging_config = yaml.load(f) + logging_config["handlers"]["file"]["filename"] = logdir + "/clustermanager.log" + logging.config.dictConfig(logging_config) + +def dumpstacks(signal, frame): + id2name = dict([(th.ident, th.name) for th in threading.enumerate()]) + code = [] + for threadId, stack in sys._current_frames().items(): + code.append("\n# Thread: %s(%d)" % (id2name.get(threadId,""), threadId)) + for filename, lineno, name, line in traceback.extract_stack(stack): + code.append('File: "%s", line %d, in %s' % (filename, lineno, name)) + if line: + code.append(" %s" % (line.strip())) + print "\n".join(code) + sys.stdout.flush() + sys.stderr.flush() + +def register_stack_trace_dump(): + signal.signal(signal.SIGTRAP, dumpstacks) + +def update_file_modification_time(path): + if not os.path.isfile(path): + f = open(path, "w") f.close() - logging_config["handlers"]["file"]["filename"] = logdir+"/clustermanager.log" - logging.config.dictConfig(logging_config) + mod_time = time.mktime(datetime.datetime.now().timetuple()) + os.utime(path, (mod_time, mod_time)) +def get_elapsed_seconds(path): + mtime = datetime.datetime.fromtimestamp(os.path.getmtime(path)) + return (datetime.datetime.now() - mtime).seconds -def Run(): +def Run(args): + register_stack_trace_dump() create_log() - - logging.info( "Starting job manager... " ) - proc_job = Process(target=job_manager.Run) - proc_job.start() - - logging.info( "Starting user manager... " ) - proc_user = Process(target=user_manager.Run) - proc_user.start() - - - logging.info( "Starting node manager... " ) - proc_node = Process(target=node_manager.Run) - proc_node.start() - - - logging.info( "Starting joblogging manager... " ) - proc_joblog = Process(target=joblog_manager.Run) - proc_joblog.start() - - logging.info( "Starting command manager... 
" ) - proc_command = Process(target=command_manager.Run) - proc_command.start() - - - proc_job.join() - proc_user.join() - proc_node.join() - proc_joblog.join() - proc_command.join() - pass - -if __name__ == '__main__': - - #parser = argparse.ArgumentParser( prog='cluster_manager.py', - # formatter_class=argparse.RawDescriptionHelpFormatter, - # description=textwrap.dedent('''\ - # ''') ) - #parser.add_argument("help", - # help = "Show the usage of this program" ) - - #args = parser.parse_args() - - - Run() \ No newline at end of file + cwd = os.path.dirname(__file__) + cmds = { + "job_manager": + ["python", os.path.join(cwd, "job_manager.py"), "--port", str(args.j)], + "user_manager": + ["python", os.path.join(cwd, "user_manager.py"), "--port", str(args.u)], + "node_manager": + ["python", os.path.join(cwd, "node_manager.py"), "--port", str(args.n)], + "joblog_manager": + ["python", os.path.join(cwd, "joblog_manager.py"), "--port", str(args.l)], + "command_manager": + ["python", os.path.join(cwd, "command_manager.py"), "--port", str(args.c)], + "endpoint_manager": + ["python", os.path.join(cwd, "endpoint_manager.py"), "--port", str(args.e)], + } + + FNULL = open(os.devnull, "w") + + childs = {} + + while True: + try: + work(cmds, childs, FNULL) + except Exception as e: + logger.exception("caught exception while doing work") + time.sleep(60) + +def work(cmds, childs, FNULL): + for key, cmd in cmds.items(): + child = childs.get(key) + need_start = False + + if child is None or child.poll() is not None: + if child is not None: + logger.info("%s is dead restart it", cmd) + need_start = True + else: + sec = get_elapsed_seconds(key) + if sec <= args.tictoc: + continue + logger.info("%s did not update file for %d seconds, restart it", + key, sec) + child.send_signal(signal.SIGTRAP) # try to print their stacktrace + time.sleep(1) + child.kill() + sys.stdout.flush() + sys.stderr.flush() + need_start = True + + if need_start: + update_file_modification_time(key) + try: + childs[key] = subprocess32.Popen(cmd, stdin=FNULL) + except Exception as e: + logger.exception("caught exception when trying to start %s, ignore", cmd) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--tictoc", help="how many seconds to wait until kill subprocess", type=int, default=600) + parser.add_argument("-j", help="port of job_manager", type=int, default=9200) + parser.add_argument("-u", help="port of user_manager", type=int, default=9201) + parser.add_argument("-n", help="port of node_manager", type=int, default=9202) + parser.add_argument("-l", help="port of joblog_manager", type=int, default=9203) + parser.add_argument("-c", help="port of command_manager", type=int, default=9204) + parser.add_argument("-e", help="port of endpoint_manager", type=int, default=9205) + args = parser.parse_args() + + sys.exit(Run(args)) diff --git a/src/ClusterManager/command_manager.py b/src/ClusterManager/command_manager.py index 6038c86c3..86458001a 100755 --- a/src/ClusterManager/command_manager.py +++ b/src/ClusterManager/command_manager.py @@ -8,7 +8,6 @@ import datetime import copy - sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../storage")) sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../utils")) @@ -21,7 +20,6 @@ from jinja2 import Environment, FileSystemLoader, Template from config import config, GetStoragePath from DataHandler import DataHandler -from node_manager import create_log from node_manager import get_cluster_status import 
base64 @@ -32,8 +30,10 @@ import random import logging -import logging.config +from cluster_manager import setup_exporter_thread, manager_iteration_histogram, register_stack_trace_dump, update_file_modification_time + +logger = logging.getLogger(__name__) def RunCommand(command): dataHandler = DataHandler() @@ -42,21 +42,40 @@ def RunCommand(command): dataHandler.Close() return True +def create_log(logdir = '/var/log/dlworkspace'): + if not os.path.exists(logdir): + os.system("mkdir -p " + logdir) + with open('logging.yaml') as f: + logging_config = yaml.full_load(f) + f.close() + logging_config["handlers"]["file"]["filename"] = logdir+"/command_manager.log" + logging.config.dictConfig(logging_config) def Run(): + register_stack_trace_dump() + create_log() + while True: - try: - dataHandler = DataHandler() - pendingCommands = dataHandler.GetPendingCommands() - for command in pendingCommands: - try: - print "Processing command: %s" % (command["id"]) - RunCommand(command) - except Exception as e: - print e - except Exception as e: - print e + update_file_modification_time("command_manager") + + with manager_iteration_histogram.labels("command_manager").time(): + try: + dataHandler = DataHandler() + pendingCommands = dataHandler.GetPendingCommands() + for command in pendingCommands: + try: + logger.info("Processing command: %s", command["id"]) + RunCommand(command) + except Exception as e: + logger.exception("run command failed") + except Exception as e: + logger.exception("getting command failed") time.sleep(1) if __name__ == '__main__': - Run() \ No newline at end of file + parser = argparse.ArgumentParser() + parser.add_argument("--port", "-p", help="port of exporter", type=int, default=9204) + args = parser.parse_args() + setup_exporter_thread(args.port) + + Run() diff --git a/src/ClusterManager/dist_pod_template.py b/src/ClusterManager/dist_pod_template.py new file mode 100644 index 000000000..c2d6ddff7 --- /dev/null +++ b/src/ClusterManager/dist_pod_template.py @@ -0,0 +1,157 @@ +import os +import sys +import uuid +import datetime +import random +import json +import copy +import yaml +from jinja2 import Template +from job import Job + +sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../utils")) +from config import config +from osUtils import mkdirsAsUser + + +class DistPodTemplate(): + def __init__(self, template, enable_custom_scheduler=False): + self.template = template + self.enable_custom_scheduler = enable_custom_scheduler + + @staticmethod + def generate_launch_script(dist_role, dist_role_idx, user_id, job_path, cmd): + # change ssh folder permission here because the setup permission + # script in launch_ps_job function may have race condition with init_user.sh script. 
+ # results in no such user error + + local_pod_path = os.path.join(config["storage-mount-path"], "work/", job_path, "{}-{}".format(dist_role, dist_role_idx)) + if not os.path.exists(local_pod_path): + mkdirsAsUser(local_pod_path, user_id) + file_name = "job_command.sh" + launch_script_file = os.path.join(local_pod_path, file_name) + with open(launch_script_file, 'w') as f: + f.write(cmd) + f.close() + + launchCMD = ["bash", "/pod/scripts/bootstrap.sh"] + return launchCMD + + def generate_pod(self, pod): + assert(isinstance(self.template, Template)) + + dist_id = pod["distId"] + job_id = pod["jobId"] + job_path = pod["jobPath"] + + pod["podName"] = "{}-{}".format(job_id, dist_id) + + random.seed(datetime.datetime.now()) + if "hostNetwork" in pod and pod["hostNetwork"]: + pod["sshPort"] = random.randint(40000, 49999) + else: + pod["sshPort"] = int(random.random() * 1000 + 3000) + + if (pod["distRole"] == "worker"): + pod["gpuLimit"] = pod["resourcegpu"] + else: + pod["gpuLimit"] = 0 + + if "envs" not in pod: + pod["envs"] = [] + pod["envs"].append({"name": "DLWS_ROLE_NAME", "value": pod["distRole"]}) + pod["envs"].append({"name": "DLWS_ROLE_IDX", "value": pod["distRoleIdx"]}) + + if "labels" not in pod: + pod["labels"] = [] + pod["labels"].append({"name": "distRole", "value": pod["distRole"]}) + pod["labels"].append({"name": "distRoleIdx", "value": pod["distRoleIdx"]}) + pod["labels"].append({"name": "sshPort", "value": pod["sshPort"]}) + + cmd = pod["cmd"] + pod["LaunchCMD"] = DistPodTemplate.generate_launch_script(pod["distRole"], pod["distRoleIdx"], pod["userId"], job_path, cmd) + + pod_yaml = self.template.render(job=pod) + return yaml.full_load(pod_yaml) + + def generate_pods(self, job): + """ + Return (pods, errors) + """ + assert(isinstance(job, Job)) + params = job.params + + if any(required_field not in params for required_field in + [ + "jobtrainingtype", + "jobName", + "jobPath", + "workPath", + "dataPath", + "cmd", + "userId", + "resourcegpu", + "userName", + ]): + return None, "Missing required parameters!" 
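+        # Only parameter-server style distributed jobs ("PSDistJob") are handled by this
+        # template; regular jobs are rendered through PodTemplate instead (see the dispatch in job_manager.py).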
+ assert(params["jobtrainingtype"] == "PSDistJob") + + job.job_path = params["jobPath"] + job.work_path = params["workPath"] + job.data_path = params["dataPath"] + # TODO user's mountpoints first, but should after 'job_path' + job.add_mountpoints(job.job_path_mountpoint()) + job.add_mountpoints({"name": "home", "containerPath": "/home/{}".format(job.get_alias()), "hostPath": job.get_homefolder_hostpath(), "enabled": True}) + if "mountpoints" in params: + job.add_mountpoints(params["mountpoints"]) + job.add_mountpoints(job.work_path_mountpoint()) + job.add_mountpoints(job.data_path_mountpoint()) + params["mountpoints"] = job.mountpoints + + params["user_email"] = params["userName"] + params["homeFolderHostpath"] = job.get_homefolder_hostpath() + params["pod_ip_range"] = job.get_pod_ip_range() + params["usefreeflow"] = job.is_freeflow_enabled() + params["jobNameLabel"] = ''.join(e for e in params["jobName"] if e.isalnum()) + params["rest-api"] = job.get_rest_api_url() + + if "nodeSelector" not in params: + params["nodeSelector"] = {} + if "gpuType" in params: + params["nodeSelector"]["gpuType"] = params["gpuType"] + assignedRack = job.get_rack() + if assignedRack is not None: + params["nodeSelector"]["rack"] = assignedRack + + params["numworker"] = int(params["numpsworker"]) + params["numps"] = int(params["numps"]) + + if "envs" not in params: + params["envs"] = [] + params["envs"].append({"name": "DLWS_NUM_GPU_PER_WORKER", "value": params["resourcegpu"]}) + + if "hostNetwork" in params and params["hostNetwork"]: + params["envs"].append({"name": "DLWS_HOST_NETWORK", "value": "enable"}) + params["envs"].append({"name": "DLWS_WORKER_NUM", "value": params["numworker"]}) + + pods = [] + nums = {"ps": int(params["numps"]), "worker": int(params["numpsworker"])} + for role in ["ps", "worker"]: + for idx in range(nums[role]): + pod = copy.deepcopy(params) + pod["distRole"] = role + pod["distRoleIdx"] = idx + pod["distId"] = "%s%d" % (role, idx) + # mount /pod + local_pod_path = job.get_hostpath(job.job_path, "%s-%d" % (role, idx)) + pod["mountpoints"].append({"name": "pod", "containerPath": "/pod", "hostPath": local_pod_path, "enabled": True}) + + + pods.append(pod) + + k8s_pods = [] + for pod in pods: + k8s_pod = self.generate_pod(pod) + k8s_pods.append(k8s_pod) + + return k8s_pods, None diff --git a/src/ClusterManager/endpoint_manager.py b/src/ClusterManager/endpoint_manager.py new file mode 100755 index 000000000..d6912ed35 --- /dev/null +++ b/src/ClusterManager/endpoint_manager.py @@ -0,0 +1,279 @@ + +import json +import os +import time +import sys +import datetime +import copy +import base64 +import traceback +import random +import re +import logging +import yaml +import logging.config + +import argparse +from cluster_manager import setup_exporter_thread, manager_iteration_histogram, register_stack_trace_dump, update_file_modification_time + +sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../utils")) +import k8sUtils +from config import config, GetStoragePath, GetWorkPath +from DataHandler import DataHandler + +from job_deployer import JobDeployer + +logger = logging.getLogger(__name__) +deployer = JobDeployer() + + +def is_ssh_server_ready(pod_name): + bash_script = "sudo service ssh status" + output = k8sUtils.kubectl_exec("exec %s %s" % (pod_name, " -- " + bash_script)) + if output == "": + return False + return True + + +def query_ssh_port(pod_name): + bash_script = "grep ^Port /etc/ssh/sshd_config | cut -d' ' -f2" + status_code, output = 
deployer.pod_exec(pod_name, ["/bin/bash", "-c", bash_script]) + if status_code != 0: + raise RuntimeError("Query ssh port failed: {}".format(pod_name)) + if not output: + return 22 + return int(output) + + +def start_ssh_server(pod_name, user_name, host_network=False, ssh_port=22): + '''Setup the ssh server in container, and return the listening port.''' + bash_script = "sudo bash -c 'apt-get update && apt-get install -y openssh-server && cd /home/" + user_name + " && (chown " + user_name + " -R .ssh; chmod 600 -R .ssh/*; chmod 700 .ssh; true) && service ssh restart'" + + # ssh_port = 22 + + # modify the script for HostNewtork + if host_network: + # if the ssh_port is default value 22, randomly choose one + if ssh_port == 22: + ssh_port = random.randint(40000, 49999) + # bash_script = "sed -i '/^Port 22/c Port "+str(ssh_port)+"' /etc/ssh/sshd_config && "+bash_script + # TODO refine the script later + bash_script = "sudo bash -c 'apt-get update && apt-get install -y openssh-server && sed -i \"s/^Port/#&/\" /etc/ssh/sshd_config && echo \"Port " + str(ssh_port) + "\" >> /etc/ssh/sshd_config && cd /home/" + user_name + " && (chown " + user_name + " -R .ssh; chmod 600 -R .ssh/*; chmod 700 .ssh; true) && service ssh restart'" + + # TODO setup reasonable timeout + # output = k8sUtils.kubectl_exec("exec %s %s" % (jobId, " -- " + bash_script), 1) + output = k8sUtils.kubectl_exec("exec %s %s" % (pod_name, " -- " + bash_script)) + if output == "": + raise Exception("Failed to setup ssh server in container. JobId: %s " % pod_name) + return ssh_port + + +def get_k8s_endpoint(endpoint_description_path): + endpoint_description_path = os.path.join(config["storage-mount-path"], endpoint_description_path) + return k8sUtils.kubectl_exec("get -o json -f %s" % endpoint_description_path) + + +def generate_node_port_service(job_id, pod_name, endpoint_id, name, target_port): + endpoint_description = """kind: Service +apiVersion: v1 +metadata: + name: {2} + labels: + run: {0} + jobId: {0} + podName: {1} +spec: + type: NodePort + selector: + podName: {1} + ports: + - name: {3} + protocol: "TCP" + targetPort: {4} + port: {4} +""".format(job_id, pod_name, endpoint_id, name, target_port) + logger.info("endpointDescription: %s", endpoint_description) + return endpoint_description + + +def create_node_port(endpoint): + endpoint_description = generate_node_port_service(endpoint["jobId"], endpoint["podName"], endpoint["id"], endpoint["name"], endpoint["podPort"]) + endpoint_description_path = os.path.join(config["storage-mount-path"], endpoint["endpointDescriptionPath"]) + logger.info("endpointDescriptionPath: %s", endpoint_description_path) + with open(endpoint_description_path, 'w') as f: + f.write(endpoint_description) + + result = k8sUtils.kubectl_create(endpoint_description_path) + if result == "": + raise Exception("Failed to create NodePort for ssh. JobId: %s " % endpoint["jobId"]) + + logger.info("Submitted endpoint %s to k8s, returned with status %s", endpoint["jobId"], result) + + +def setup_ssh_server(user_name, pod_name, host_network=False): + '''Setup ssh server on pod and return the port''' + # setup ssh server only is the ssh server is not up + if not is_ssh_server_ready(pod_name): + logger.info("Ssh server is not ready for pod: %s. Setup ...", pod_name) + ssh_port = start_ssh_server(pod_name, user_name, host_network) + else: + ssh_port = query_ssh_port(pod_name) + logger.info("Ssh server is ready for pod: %s. 
Ssh listen on %s", pod_name, ssh_port) + return ssh_port + + +def setup_jupyter_server(user_name, pod_name): + + jupyter_port = random.randint(40000, 49999) + bash_script = "sudo bash -c 'export DEBIAN_FRONTEND=noninteractive; apt-get update && apt-get install -y python3-pip && python3 -m pip install --upgrade pip && python3 -m pip install jupyter && cd /home/" + user_name + " && runuser -l " + user_name + " -c \"jupyter notebook --no-browser --ip=0.0.0.0 --NotebookApp.token= --port=" + str(jupyter_port) + " &>/dev/null &\"'" + output = k8sUtils.kubectl_exec("exec %s %s" % (pod_name, " -- " + bash_script)) + if output == "": + raise Exception("Failed to start jupyter server in container. JobId: %s " % pod_name) + return jupyter_port + + +def setup_tensorboard(user_name, pod_name): + tensorboard_port = random.randint(40000, 49999) + bash_script = "sudo bash -c 'export DEBIAN_FRONTEND=noninteractive; pip install tensorboard; runuser -l " + user_name + " -c \"mkdir -p ~/tensorboard/\${DLWS_JOB_ID}/logs; nohup tensorboard --logdir=~/tensorboard/\${DLWS_JOB_ID}/logs --port=" + str(tensorboard_port) + " &>/dev/null &\"'" + output = k8sUtils.kubectl_exec("exec %s %s" % (pod_name, " -- " + bash_script)) + if output == "": + raise Exception("Failed to start tensorboard in container. JobId: %s " % pod_name) + return tensorboard_port + + +def start_endpoint(endpoint): + # pending, running, stopped + logger.info("Starting endpoint: %s", endpoint) + + # podName + pod_name = endpoint["podName"] + user_name = endpoint["username"] + host_network = endpoint["hostNetwork"] + + port_name = endpoint["name"] + if port_name == "ssh": + endpoint["podPort"] = setup_ssh_server(user_name, pod_name, host_network) + elif port_name == "ipython": + endpoint["podPort"] = setup_jupyter_server(user_name, pod_name) + elif port_name == "tensorboard": + endpoint["podPort"] = setup_tensorboard(user_name, pod_name) + else: + endpoint["podPort"] = int(endpoint["podPort"]) + + # create NodePort + create_node_port(endpoint) + + +def start_endpoints(): + try: + data_handler = DataHandler() + try: + pending_endpoints = data_handler.GetPendingEndpoints() + + for endpoint_id, endpoint in pending_endpoints.items(): + try: + job = data_handler.GetJob(jobId=endpoint["jobId"])[0] + if job["jobStatus"] != "running": + continue + + # get endpointDescriptionPath + # job["jobDescriptionPath"] = "jobfiles/" + time.strftime("%y%m%d") + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml" + endpoint_description_dir = re.search("(.*/)[^/\.]+.yaml", job["jobDescriptionPath"]).group(1) + endpoint["endpointDescriptionPath"] = os.path.join(endpoint_description_dir, endpoint_id + ".yaml") + + logger.info("\n\n\n\n\n\n----------------Begin to start endpoint %s", endpoint["id"]) + output = get_k8s_endpoint(endpoint["endpointDescriptionPath"]) + if(output != ""): + endpoint_description = json.loads(output) + endpoint["endpointDescription"] = endpoint_description + endpoint["status"] = "running" + pod = k8sUtils.GetPod("podName=" + endpoint["podName"]) + if "items" in pod and len(pod["items"]) > 0: + endpoint["nodeName"] = pod["items"][0]["spec"]["nodeName"] + else: + start_endpoint(endpoint) + + endpoint["lastUpdated"] = datetime.datetime.now().isoformat() + data_handler.UpdateEndpoint(endpoint) + except Exception as e: + logger.warning("Process endpoint failed {}".format(endpoint), exc_info=True) + except Exception as e: + logger.exception("start endpoint failed") + finally: + data_handler.Close() + except Exception as e: + 
logger.exception("close data handler failed") + + +def cleanup_endpoints(): + try: + data_handler = DataHandler() + try: + dead_endpoints = data_handler.GetDeadEndpoints() + for endpoint_id, dead_endpoint in dead_endpoints.items(): + try: + logger.info("\n\n\n\n\n\n----------------Begin to cleanup endpoint %s", endpoint_id) + endpoint_description_path = os.path.join(config["storage-mount-path"], dead_endpoint["endpointDescriptionPath"]) + still_running = get_k8s_endpoint(endpoint_description_path) + # empty mean not existing + if still_running == "": + logger.info("Endpoint already gone %s", endpoint_id) + status = "stopped" + else: + output = k8sUtils.kubectl_delete(endpoint_description_path) + # 0 for success + if output == 0: + status = "stopped" + logger.info("Succeed cleanup endpoint %s", endpoint_id) + else: + # TODO will need to clean it up eventually + status = "unknown" + logger.info("Clean dead endpoint %s failed, endpoints: %s", endpoint_id, dead_endpoint) + + # we are not changing status from "pending", "pending" endpoints are planed to setup later + if dead_endpoint["status"] != "pending": + dead_endpoint["status"] = status + dead_endpoint["lastUpdated"] = datetime.datetime.now().isoformat() + data_handler.UpdateEndpoint(dead_endpoint) + except Exception as e: + logger.warning("Clanup endpoint failed {}".format(dead_endpoint), exc_info=True) + except Exception as e: + logger.exception("cleanup endpoint failed") + finally: + data_handler.Close() + except Exception as e: + logger.exception("close data handler failed") + +def create_log(logdir = '/var/log/dlworkspace'): + if not os.path.exists(logdir): + os.system("mkdir -p " + logdir) + with open('logging.yaml') as f: + logging_config = yaml.full_load(f) + f.close() + logging_config["handlers"]["file"]["filename"] = logdir+"/endpoint_manager.log" + logging.config.dictConfig(logging_config) + + +def Run(): + register_stack_trace_dump() + create_log() + + while True: + update_file_modification_time("endpoint_manager") + + with manager_iteration_histogram.labels("endpoint_manager").time(): + # start endpoints + start_endpoints() + time.sleep(1) + + # clean up endpoints for jobs which is NOT running + cleanup_endpoints() + time.sleep(1) + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--port", "-p", help="port of exporter", type=int, default=9205) + args = parser.parse_args() + setup_exporter_thread(args.port) + + Run() diff --git a/src/ClusterManager/job.py b/src/ClusterManager/job.py new file mode 100644 index 000000000..993f35b58 --- /dev/null +++ b/src/ClusterManager/job.py @@ -0,0 +1,169 @@ +import sys +import os +import random +from datetime import date +from marshmallow import Schema, fields, pprint, post_load, validate +from jinja2 import Environment, FileSystemLoader, Template + +import logging +import logging.config + +sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../utils")) +from osUtils import mkdirsAsUser + + +# TODO remove it latter +def create_log(logdir='.'): + if not os.path.exists(logdir): + os.system("mkdir -p " + logdir) + with open('logging.yaml') as f: + logging_config = yaml.full_load(f) + f.close() + logging_config["handlers"]["file"]["filename"] = logdir + "/jobmanager.log" + logging.config.dictConfig(logging_config) + + +class Job: + def __init__(self, + cluster, + job_id, + email, + mountpoints=None, + job_path="", + work_path="", + data_path="", + params=None, + ): + """ + job_id: an unique string for the job. + email: user's email. 
+ cluster: cluster config. + job_path: relative path, on shared storage, for example "user_alias/jobs/date/job_id". + work_path: relative path, on shared storage, for example "user_alias". + """ + self.cluster = cluster + self.job_id = job_id + self.email = email + self.mountpoints = mountpoints + self.job_path = job_path + self.work_path = work_path + self.data_path = data_path + self.params = params + + def add_mountpoints(self, mountpoint): + ''' + 1. Silently skip if the name/hostPath/containerPath duplicates with an existing one. + 2. Name would be normalized. + + Mountpoint example: + { + "enabled":true, + "containerPath":"/home/username", + "hostPath":"/dlwsdata/work/username", + "name":"homefolder" + } + ''' + if mountpoint is None: + return + if self.mountpoints is None: + self.mountpoints = [] + + # add each items in the list one by one + if isinstance(mountpoint, list): + for m in mountpoint: + self.add_mountpoints(m) + return + + # only allow alphanumeric in "name" + if "name" not in mountpoint or mountpoint["name"] == "": + mountpoint["name"] = mountpoint["containerPath"] + mountpoint["name"] = ''.join(c for c in mountpoint["name"] if c.isalnum()) + + # skip dulicate entry + for item in self.mountpoints: + if item["name"] == mountpoint["name"] or item["containerPath"] == mountpoint["containerPath"] or item["hostPath"] == mountpoint["hostPath"]: + logging.warn("Duplciate mountpoint: %s" % mountpoint) + return + + self.mountpoints.append(mountpoint) + + def get_alias(self): + return self.email.split("@")[0].strip() + + def get_hostpath(self, *path_relate_to_workpath): + """return os.path.join(self.cluster["storage-mount-path"], "work", *path_relate_to_workpath)""" + return os.path.join(self.cluster["storage-mount-path"], "work", *path_relate_to_workpath) + + def get_homefolder_hostpath(self): + return self.get_hostpath(self.get_alias()) + + def job_path_mountpoint(self): + assert(len(self.job_path) > 0) + job_host_path = self.get_hostpath(self.job_path) + return {"name": "job", "containerPath": "/job", "hostPath": job_host_path, "enabled": True} + + def work_path_mountpoint(self): + assert(len(self.work_path) > 0) + work_host_path = self.get_hostpath(self.work_path) + return {"name": "work", "containerPath": "/work", "hostPath": work_host_path, "enabled": True} + + def data_path_mountpoint(self): + assert(self.data_path is not None) + data_host_path = os.path.join(self.cluster["storage-mount-path"], "storage", self.data_path) + return {"name": "data", "containerPath": "/data", "hostPath": data_host_path, "enabled": True} + + def get_template(self): + """Return jinja template.""" + path = os.path.abspath(os.path.join(self.cluster["root-path"], "Jobs_Templete", "pod.yaml.template")) + ENV = Environment(loader=FileSystemLoader("/")) + template = ENV.get_template(path) + assert(isinstance(template, Template)) + return template + + def is_custom_scheduler_enabled(self): + return self._get_cluster_config("kube_custom_scheduler") + + def get_rest_api_url(self): + return self._get_cluster_config("rest-api") + + def get_pod_ip_range(self): + return self._get_cluster_config("pod_ip_range") + + def is_freeflow_enabled(self): + return self._get_cluster_config("usefreeflow") + + def get_rack(self): + racks = self._get_cluster_config("racks") + if racks is None or len(racks) == 0: + return None + # TODO why random.choice? 
+ return random.choice(racks) + + def _get_cluster_config(self, key): + if key in self.cluster: + return self.cluster[key] + return None + + +class JobSchema(Schema): + cluster = fields.Dict(required=True) + job_id = fields.String(required=True, + # Correctly mappging the name + dump_to="jobId", load_from="jobId", + # We use the id as "name" in k8s object. + # By convention, the "names" of Kubernetes resources should be + # up to maximum length of 253 characters and consist of lower case + # alphanumeric characters, -, and ., + # but certain resources have more specific restrictions. + validate=validate.Regexp(r'^[a-z0-9]([-a-z0-9]*[a-z0-9])?$', + error="'{input}' does not match expected pattern {regex}.")) + email = fields.Email(required=True, dump_to="userName", load_from="userName") + mountpoints = fields.Dict(required=False) + job_path = fields.String(required=False, dump_to="jobPath", load_from="jobPath") + work_path = fields.String(required=False, dump_to="workPath", load_from="workPath") + data_path = fields.String(required=False, dump_to="dataPath", load_from="dataPath") + params = fields.Dict(required=False) + + @post_load + def make_user(self, data, **kwargs): + return Job(**data) diff --git a/src/ClusterManager/job_deployer.py b/src/ClusterManager/job_deployer.py new file mode 100644 index 000000000..d50c337fa --- /dev/null +++ b/src/ClusterManager/job_deployer.py @@ -0,0 +1,195 @@ +import yaml +import os +import logging +import logging.config +import timeit +import functools + +from kubernetes import client, config +from kubernetes.client.rest import ApiException +from kubernetes.stream import stream +from kubernetes.stream.ws_client import ERROR_CHANNEL, STDERR_CHANNEL, STDOUT_CHANNEL + +from prometheus_client import Histogram + +job_deployer_fn_histogram = Histogram("job_deployer_fn_latency_seconds", + "latency for executing job deployer (seconds)", + buckets=(.05, .075, .1, .25, .5, .75, 1.0, 2.5, 5.0, + 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, float("inf")), + labelnames=("fn_name",)) + +def record(fn): + @functools.wraps(fn) + def wrapped(*args, **kwargs): + start = timeit.default_timer() + try: + return fn(*args, **kwargs) + finally: + elapsed = timeit.default_timer() - start + job_deployer_fn_histogram.labels(fn.__name__).observe(elapsed) + return wrapped + + +# The config will be loaded from default location. 
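+# (i.e. ~/.kube/config or $KUBECONFIG on the host running the manager; if this code were ever
+# run inside a pod, kubernetes.config.load_incluster_config() would be the in-cluster alternative.)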
+config.load_kube_config() +k8s_client = client.CoreV1Api() + + +class JobDeployer: + + def __init__(self): + self.v1 = k8s_client + self.namespace = "default" + self.pretty = "pretty_example" + + @record + def create_pod(self, body): + api_response = self.v1.create_namespaced_pod( + namespace=self.namespace, + body=body, + pretty=self.pretty, + ) + return api_response + + @record + def delete_pod(self, name, grace_period_seconds=None): + body = client.V1DeleteOptions() + body.grace_period_seconds = grace_period_seconds + api_response = self.v1.delete_namespaced_pod( + name=name, + namespace=self.namespace, + pretty=self.pretty, + body=body, + grace_period_seconds=grace_period_seconds, + ) + return api_response + + @record + def create_service(self, body): + api_response = self.v1.create_namespaced_service( + namespace=self.namespace, + body=body, + pretty=self.pretty, + ) + return api_response + + @record + def delete_service(self, name): + api_response = self.v1.delete_namespaced_service( + name=name, + namespace=self.namespace, + pretty=self.pretty, + body=client.V1DeleteOptions(), + ) + return api_response + + @record + def cleanup_pods(self, pod_names, force=False): + errors = [] + grace_period_seconds = 0 if force else None + for pod_name in pod_names: + try: + self.delete_pod(pod_name, grace_period_seconds) + except Exception as e: + if isinstance(e, ApiException) and 404 == e.status: + return [] + message = "Delete pod failed: {}".format(pod_name) + logging.warning(message, exc_info=True) + errors.append({"message": message, "exception": e}) + return errors + + @record + def cleanup_services(self, services): + errors = [] + for service in services: + assert(isinstance(service, client.V1Service)) + try: + service_name = service.metadata.name + self.delete_service(service_name) + except ApiException as e: + message = "Delete service failed: {}".format(service_name) + logging.warning(message, exc_info=True) + errors.append({"message": message, "exception": e}) + return errors + + @record + def create_pods(self, pods): + # TODO instead of delete, we could check update existiong ones. During refactoring, keeping the old way. 
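+        # i.e. delete any leftover pods that share these names first, then recreate them from scratch below.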
+ pod_names = [pod["metadata"]["name"] for pod in pods] + self.cleanup_pods(pod_names) + created = [] + for pod in pods: + created_pod = self.create_pod(pod) + created.append(created_pod) + logging.info("Create pod succeed: %s" % created_pod.metadata.name) + return created + + @record + def get_pods(self, field_selector="", label_selector=""): + api_response = self.v1.list_namespaced_pod( + namespace=self.namespace, + pretty=self.pretty, + field_selector=field_selector, + label_selector=label_selector, + ) + logging.debug("Get pods: {}".format(api_response)) + return api_response.items + + @record + def get_services_by_label(self, label_selector): + api_response = self.v1.list_namespaced_service( + namespace=self.namespace, + pretty=self.pretty, + label_selector=label_selector, + ) + return api_response.items + + @record + def delete_job(self, job_id, force=False): + label_selector = "run={}".format(job_id) + + # query pods then delete + pods = self.get_pods(label_selector=label_selector) + pod_names = [pod.metadata.name for pod in pods] + pod_errors = self.cleanup_pods(pod_names, force) + + # query services then delete + services = self.get_services_by_label(label_selector) + service_errors = self.cleanup_services(services) + + errors = pod_errors + service_errors + return errors + + @record + def pod_exec(self, pod_name, exec_command, timeout=60): + """work as the command (with timeout): kubectl exec 'pod_name' 'exec_command'""" + try: + logging.info("Exec on pod {}: {}".format(pod_name, exec_command)) + client = stream( + self.v1.connect_get_namespaced_pod_exec, + name=pod_name, + namespace=self.namespace, + command=exec_command, + stderr=True, + stdin=False, + stdout=True, + tty=False, + _preload_content=False, + ) + client.run_forever(timeout=timeout) + + err = yaml.full_load(client.read_channel(ERROR_CHANNEL)) + if err is None: + return [-1, "Timeout"] + + if err["status"] == "Success": + status_code = 0 + else: + logging.debug("Exec on pod {} failed. cmd: {}, err: {}.".format(pod_name, exec_command, err)) + status_code = int(err["details"]["causes"][0]["message"]) + output = client.read_all() + logging.info("Exec on pod {}, status: {}, cmd: {}, output: {}".format(pod_name, status_code, exec_command, output)) + return [status_code, output] + except ApiException as err: + logging.error("Exec on pod {} error. 
cmd: {}, err: {}.".format(pod_name, exec_command, err), exc_info=True) + return [-1, err.message] diff --git a/src/ClusterManager/job_manager.py b/src/ClusterManager/job_manager.py index 2a7fb83d5..86fe8433b 100755 --- a/src/ClusterManager/job_manager.py +++ b/src/ClusterManager/job_manager.py @@ -7,7 +7,7 @@ import sys import datetime import copy - +import traceback sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../storage")) sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../utils")) @@ -16,14 +16,15 @@ import k8sUtils import joblog_manager from osUtils import mkdirsAsUser +import notify import yaml from jinja2 import Environment, FileSystemLoader, Template from config import config, GetStoragePath, GetWorkPath from DataHandler import DataHandler -from node_manager import create_log from node_manager import get_cluster_status import base64 +from ResourceInfo import ResourceInfo import re @@ -33,814 +34,477 @@ import logging import logging.config +from job import Job, JobSchema +from pod_template import PodTemplate +from dist_pod_template import DistPodTemplate +from job_deployer import JobDeployer +from job_role import JobRole - -nvidiaDriverPath = config["nvidiaDriverPath"] - - - -def printlog(msg): - print "%s - %s" % (datetime.datetime.utcnow().strftime("%x %X"),msg) - -def LoadJobParams(jobParamsJsonStr): - return json.loads(jobParamsJsonStr) - -def cmd_exec(cmdStr): - try: - output = subprocess.check_output(["bash","-c", cmdStr]) - except Exception as e: - print e - output = "" - return output - - +from cluster_manager import setup_exporter_thread, manager_iteration_histogram, register_stack_trace_dump, update_file_modification_time +def all_pods_not_existing(job_id): + job_deployer = JobDeployer() + job_roles = JobRole.get_job_roles(job_id) + statuses = [job_role.status() for job_role in job_roles] + logging.info("Job: {}, status: {}".format(job_id, statuses)) + return all([status == "NotFound" for status in statuses]) def SubmitJob(job): - jobParams = json.loads(base64.b64decode(job["jobParams"])) - if jobParams["jobtrainingtype"] == "RegularJob": - SubmitRegularJob(job) - elif jobParams["jobtrainingtype"] == "PSDistJob": - SubmitPSDistJob(job) - -def CheckMountPoints(mplist, mp): - ret = True - for item in mplist: - if item["name"] == mp["name"] or item["containerPath"] == mp["containerPath"] or item["hostPath"] == mp["hostPath"]: - ret = False - return ret + # check if existing any pod with label: run=job_id + assert("jobId" in job) + job_id = job["jobId"] + if not all_pods_not_existing(job_id): + logging.warning("Waiting until previously pods are cleaned up! 
Job {}".format(job_id)) + job_deployer = JobDeployer() + errors = job_deployer.delete_job(job_id, force=True) + if errors: + logging.warning("Force delete job {}: {}".format(job_id, errors)) + return -def SubmitRegularJob(job): ret = {} dataHandler = DataHandler() try: - jobParams = json.loads(base64.b64decode(job["jobParams"])) - - jobParams["pvc_job"] = "jobs-" + jobParams["jobId"] - jobParams["pvc_work"] = "work-" + jobParams["jobId"] - jobParams["pvc_data"] = "storage-" + jobParams["jobId"] - - - if "jobPath" not in jobParams or len(jobParams["jobPath"].strip()) == 0: - dataHandler.SetJobError(jobParams["jobId"],"ERROR: job-path does not exist") + # TODO refine later + # before resubmit the job, reset the endpoints + # update all endpoint to status 'pending', so it would restart when job is ready + endpoints = dataHandler.GetJobEndpoints(job_id) + for endpoint_id, endpoint in endpoints.items(): + endpoint["status"] = "pending" + logging.info("Reset endpoint status to 'pending': {}".format(endpoint_id)) + dataHandler.UpdateEndpoint(endpoint) + + job["cluster"] = config + job_object, errors = JobSchema().load(job) + # TODO assert job_object is a Job + assert(isinstance(job_object, Job)) + + job_object.params = json.loads(base64.b64decode(job["jobParams"])) + + # inject gid, uid and user + # TODO it should return only one entry + user_info = dataHandler.GetIdentityInfo(job_object.params["userName"])[0] + job_object.params["gid"] = user_info["gid"] + job_object.params["uid"] = user_info["uid"] + job_object.params["user"] = job_object.get_alias() + + enable_custom_scheduler = job_object.is_custom_scheduler_enabled() + if job_object.params["jobtrainingtype"] == "RegularJob": + pod_template = PodTemplate(job_object.get_template(), enable_custom_scheduler) + elif job_object.params["jobtrainingtype"] == "PSDistJob": + pod_template = DistPodTemplate(job_object.get_template()) + else: + dataHandler.SetJobError(job_object.job_id, "ERROR: invalid jobtrainingtype: %s" % job_object.params["jobtrainingtype"]) return False - if "workPath" not in jobParams or len(jobParams["workPath"].strip()) == 0: - dataHandler.SetJobError(jobParams["jobId"],"ERROR: work-path does not exist") + pods, error = pod_template.generate_pods(job_object) + if error: + dataHandler.SetJobError(job_object.job_id, "ERROR: %s" % error) return False - #if "dataPath" not in jobParams or len(jobParams["dataPath"].strip()) == 0: - # dataHandler.SetJobError(jobParams["jobId"],"ERROR: data-path does not exist") - # return False - - - jobPath,workPath,dataPath = GetStoragePath(jobParams["jobPath"],jobParams["workPath"],jobParams["dataPath"]) - - - localJobPath = os.path.join(config["storage-mount-path"],jobPath) - - if not os.path.exists(localJobPath): - if "userId" in jobParams: - mkdirsAsUser(localJobPath,jobParams["userId"]) - mkdirsAsUser(os.path.join(localJobPath,"models"),jobParams["userId"]) - else: - mkdirsAsUser(localJobPath,"0") - mkdirsAsUser(os.path.join(localJobPath,"models"),"0") - - jobParams["LaunchCMD"] = "" - if "cmd" not in jobParams: - jobParams["cmd"] = "" - - if isinstance(jobParams["cmd"], basestring) and not jobParams["cmd"] == "": - launchScriptPath = os.path.join(localJobPath,"launch-%s.sh" % jobParams["jobId"]) - with open(launchScriptPath, 'w') as f: - f.write("#!/bin/bash -x\n") - f.write(jobParams["cmd"] + "\n") - f.close() - if "userId" in jobParams: - os.system("chown -R %s %s" % (jobParams["userId"], launchScriptPath)) - jobParams["LaunchCMD"] = "[\"bash\", \"/job/launch-%s.sh\"]" % jobParams["jobId"] - - 
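# --- Editor's sketch (not part of the patch): the legacy per-template rendering being
# removed in this hunk is replaced by the Job/PodTemplate/JobDeployer pipeline added
# above. A minimal, hedged sketch of that flow, using only interfaces that appear in
# this patch (JobSchema, PodTemplate, DistPodTemplate, JobDeployer); the function name
# and the job_record argument are illustrative only.
import base64
import json

from job import JobSchema
from pod_template import PodTemplate
from dist_pod_template import DistPodTemplate
from job_deployer import JobDeployer

def submit_sketch(job_record, cluster_config):
    # Deserialize the DB row into a Job object; the cluster config is attached so the
    # template can resolve storage paths and mount points.
    job_record["cluster"] = cluster_config
    job_object, errors = JobSchema().load(job_record)
    job_object.params = json.loads(base64.b64decode(job_record["jobParams"]))

    # Choose the pod template by training type, as SubmitJob does.
    if job_object.params["jobtrainingtype"] == "RegularJob":
        template = PodTemplate(job_object.get_template(),
                               job_object.is_custom_scheduler_enabled())
    else:
        template = DistPodTemplate(job_object.get_template())

    pods, error = template.generate_pods(job_object)
    if error:
        return None, error

    # Hand the rendered pod specs to Kubernetes through JobDeployer.
    return JobDeployer().create_pods(pods), None
# --- end sketch ---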
- jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime("%y%m%d") + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml" - - jobParams["jobNameLabel"] = ''.join(e for e in jobParams["jobName"] if e.isalnum()) - - ENV = Environment(loader=FileSystemLoader("/")) - - jobTempDir = os.path.join(config["root-path"],"Jobs_Templete") - jobTemp = os.path.join(jobTempDir, "RegularJob.yaml.template") - - jobParams["hostjobPath"] = os.path.join(config["storage-mount-path"], jobPath) - jobParams["hostworkPath"] = os.path.join(config["storage-mount-path"], workPath) - jobParams["hostdataPath"] = os.path.join(config["storage-mount-path"], dataPath) - jobParams["nvidiaDriverPath"] = nvidiaDriverPath - - - jobParams["userNameLabel"] = getAlias(jobParams["userName"]) - jobParams["rest-api"] = config["rest-api"] - - if "mountpoints" not in jobParams: - jobParams["mountpoints"] = [] - for onemount in jobParams["mountpoints"]: - onemount["name"] = onemount["containerPath"].replace("/","") + job_description = "\n---\n".join([yaml.dump(pod) for pod in pods]) + job_description_path = "jobfiles/" + time.strftime("%y%m%d") + "/" + job_object.job_id + "/" + job_object.job_id + ".yaml" + local_jobDescriptionPath = os.path.realpath(os.path.join(config["storage-mount-path"], job_description_path)) + if not os.path.exists(os.path.dirname(local_jobDescriptionPath)): + os.makedirs(os.path.dirname(local_jobDescriptionPath)) + with open(local_jobDescriptionPath, 'w') as f: + f.write(job_description) - mp = {"name":"nvidia-driver","containerPath":"/usr/local/nvidia","hostPath":nvidiaDriverPath, "enabled":True} - if CheckMountPoints(jobParams["mountpoints"],mp): - jobParams["mountpoints"].append(mp) - - mp = {"name":"job","containerPath":"/job","hostPath":jobParams["hostjobPath"], "enabled":True} - if CheckMountPoints(jobParams["mountpoints"],mp): - jobParams["mountpoints"].append(mp) - - mp = {"name":"work","containerPath":"/work","hostPath":jobParams["hostworkPath"], "enabled":True} - if CheckMountPoints(jobParams["mountpoints"],mp): - jobParams["mountpoints"].append(mp) - - mp = {"name":"data","containerPath":"/data","hostPath":jobParams["hostdataPath"], "enabled":True} - if CheckMountPoints(jobParams["mountpoints"],mp): - jobParams["mountpoints"].append(mp) - - userAlias = getAlias(jobParams["userName"]) - - mp = {"name":"sshkey","containerPath":"/home/%s/.ssh" % userAlias,"hostPath":os.path.join(config["storage-mount-path"], GetWorkPath(userAlias)+"/.ssh"), "readOnly":True, "enabled":True} - if CheckMountPoints(jobParams["mountpoints"],mp): - jobParams["mountpoints"].append(mp) + job_deployer = JobDeployer() + try: + pods = job_deployer.create_pods(pods) + ret["output"] = "Created pods: {}".format([pod.metadata.name for pod in pods]) + except Exception as e: + ret["output"] = "Error: %s" % e.message + logging.error(e, exc_info=True) + ret["jobId"] = job_object.job_id - jobParams["pod_ip_range"] = config["pod_ip_range"] - if "usefreeflow" in config: - jobParams["usefreeflow"] = config["usefreeflow"] - else: - jobParams["usefreeflow"] = False - - print ("Render Job: %s" % jobParams) - jobDescriptionList = [] - - pods = [] - if "hyperparametername" in jobParams and "hyperparameterstartvalue" in jobParams and "hyperparameterendvalue" in jobParams and "hyperparameterstep" in jobParams: - i = int(jobParams["hyperparameterstartvalue"]) - end = int(jobParams["hyperparameterendvalue"]) - step = int(jobParams["hyperparameterstep"]) - c = 0 - while (i <= end): - pod = {} - pod["podName"] = 
jobParams["jobId"]+"-pod-"+str(c) - pod["envs"] = [{"name":jobParams["hyperparametername"],"value":i}] - i += step - c += 1 - pods.append(pod) - else: - pod = {} - pod["podName"] = jobParams["jobId"] - pod["envs"] = [] - pods.append(pod) - - if "env" not in jobParams: - jobParams["env"] = [] - jobParams["commonenv"] = copy.copy(jobParams["env"]) - - - for pod in pods: - jobParams["podName"] = pod["podName"] - jobParams["env"] = jobParams["commonenv"] + pod["envs"] - - if "kube_custom_scheduler" in config and config["kube_custom_scheduler"]: - container = {} - container["requests"] = {"alpha.gpu/numgpu" : int(jobParams["resourcegpu"])} - podInfo = {} - podInfo["podname"] = jobParams["podName"] - if "useGPUTopology" in jobParams and jobParams["useGPUTopology"]: - # add topology constraints explicitly - for testing - # if (jobParams["resourcegpu"] >= 2): - # # both cards in same inner group - # container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/0/gpu/0/cards"] = 1 - # container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/0/gpu/1/cards"] = 1 - # if (jobParams["resourcegpu"] >= 3): - # container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/1/gpu/2/cards"] = 1 - # if (jobParams["resourcegpu"] >= 4): - # container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/1/gpu/3/cards"] = 1 - # if (jobParams["resourcegpu"] >= 5): - # container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/2/gpu/4/cards"] = 1 - # if (jobParams["resourcegpu"] >= 6): - # container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/2/gpu/5/cards"] = 1 - # if (jobParams["resourcegpu"] >= 7): - # container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/3/gpu/6/cards"] = 1 - # if (jobParams["resourcegpu"] >= 8): - # container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/3/gpu/7/cards"] = 1 - podInfo["requests"] = {"alpha.gpu/gpu-generate-topology" : 1} - else: - # for cases when desired topology is explictly given or not desired - podInfo["requests"] = {"alpha.gpu/gpu-generate-topology" : 0} - podInfo["runningcontainer"] = {jobParams["podName"] : container} - - if "annotations" not in jobParams: - jobParams["annotations"] = {} - jobParams["annotations"]["pod.alpha/DeviceInformation"] = "'" + json.dumps(podInfo) + "'" - jobParams["resourcegpu"] = 0 # gpu requests specified through annotation - - template = ENV.get_template(os.path.abspath(jobTemp)) - job_description = template.render(job=jobParams) - jobDescriptionList.append(job_description) - - if ("interactivePort" in jobParams and len(jobParams["interactivePort"].strip()) > 0): - ports = [p.strip() for p in re.split(",|;",jobParams["interactivePort"]) if len(p.strip()) > 0 and p.strip().isdigit()] - for portNum in ports: - jobParams["serviceId"] = "interactive-" + jobParams["podName"] + "-" + portNum - jobParams["port"] = portNum - jobParams["port-name"] = "interactive" - jobParams["port-type"] = "TCP" - - serviceTemplate = ENV.get_template(os.path.join(jobTempDir,"KubeSvc.yaml.template")) - - stemplate = ENV.get_template(serviceTemplate) - interactiveMeta = stemplate.render(svc=jobParams) - jobDescriptionList.append(interactiveMeta) - - - jobDescription = "\n---\n".join(jobDescriptionList) - - jobDescriptionPath = os.path.join(config["storage-mount-path"], jobParams["jobDescriptionPath"]) - if not os.path.exists(os.path.dirname(os.path.realpath(jobDescriptionPath))): - os.makedirs(os.path.dirname(os.path.realpath(jobDescriptionPath))) - if os.path.isfile(jobDescriptionPath): - output = k8sUtils.kubectl_delete(jobDescriptionPath) - - with 
open(jobDescriptionPath, 'w') as f: - f.write(jobDescription) - - output = k8sUtils.kubectl_create(jobDescriptionPath) - logging.info("Submitted job %s to k8s, returned with status %s" %(job["jobId"], output)) - - ret["output"] = output - - ret["jobId"] = jobParams["jobId"] - - - if "userName" not in jobParams: - jobParams["userName"] = "" - - dataHandler.UpdateJobTextField(jobParams["jobId"],"jobStatus","scheduling") - dataHandler.UpdateJobTextField(jobParams["jobId"],"jobDescriptionPath",jobParams["jobDescriptionPath"]) - dataHandler.UpdateJobTextField(jobParams["jobId"],"jobDescription",base64.b64encode(jobDescription)) - + dataHandler.UpdateJobTextField(job_object.job_id, "jobStatus", "scheduling") + dataHandler.UpdateJobTextField(job_object.job_id, "jobDescriptionPath", job_description_path) + dataHandler.UpdateJobTextField(job_object.job_id, "jobDescription", base64.b64encode(job_description)) + dataHandler.UpdateJobTextField(job_object.job_id, "lastUpdated", datetime.datetime.now().isoformat()) jobMeta = {} - jobMeta["jobDescriptionPath"] = jobParams["jobDescriptionPath"] - jobMeta["jobPath"] = jobParams["jobPath"] - jobMeta["workPath"] = jobParams["workPath"] - jobMeta["jobPath"] = jobParams["jobPath"] - jobMeta["LaunchCMD"] = jobParams["LaunchCMD"] + jobMeta["jobDescriptionPath"] = job_description_path + jobMeta["jobPath"] = job_object.job_path + jobMeta["workPath"] = job_object.work_path + # the command of the first container + jobMeta["LaunchCMD"] = pods[0].spec.containers[0].command jobMetaStr = base64.b64encode(json.dumps(jobMeta)) - dataHandler.UpdateJobTextField(jobParams["jobId"],"jobMeta",jobMetaStr) + dataHandler.UpdateJobTextField(job_object.job_id, "jobMeta", jobMetaStr) except Exception as e: - print e + logging.error("Submit job failed: %s" % job, exc_info=True) ret["error"] = str(e) - retries = dataHandler.AddandGetJobRetries(jobParams["jobId"]) + retries = dataHandler.AddandGetJobRetries(job["jobId"]) if retries >= 5: - dataHandler.UpdateJobTextField(jobParams["jobId"],"jobStatus","error") - dataHandler.UpdateJobTextField(jobParams["jobId"],"errorMsg","Cannot submit job!" + str(e)) - + dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "error") + dataHandler.UpdateJobTextField(job["jobId"], "errorMsg", "Cannot submit job!" 
+ str(e)) + dataHandler.Close() return ret - -def SubmitPSDistJob(job): - ret = {} +def KillJob(job_id, desiredState="killed"): dataHandler = DataHandler() - - try: - jobParams = json.loads(base64.b64decode(job["jobParams"])) - jobParams["rest-api"] = config["rest-api"] - distJobParams = {} - distJobParams["ps"] = [] - distJobParams["worker"] = [] - assignedRack = None - if len(config["racks"]) > 0: - assignedRack = random.choice(config["racks"]) - if jobParams["jobtrainingtype"] == "PSDistJob": - jobDescriptionList = [] - nums = {"ps":int(jobParams["numps"]),"worker":int(jobParams["numpsworker"])} - for role in ["ps","worker"]: - for i in range(nums[role]): - distJobParam=copy.deepcopy(jobParams) - distJobParam["distId"] = "%s%d" % (role,i) - distJobParam["distRole"] = role - - if "jobPath" not in distJobParam or len(distJobParam["jobPath"].strip()) == 0: - dataHandler.SetJobError(distJobParam["jobId"],"ERROR: job-path does not exist") - return False - - distJobParam["distJobPath"] = os.path.join(distJobParam["jobPath"],distJobParam["distId"]) - - if "workPath" not in distJobParam or len(distJobParam["workPath"].strip()) == 0: - dataHandler.SetJobError(distJobParam["jobId"],"ERROR: work-path does not exist") - return False - - if "dataPath" not in distJobParam or len(distJobParam["dataPath"].strip()) == 0: - dataHandler.SetJobError(distJobParam["jobId"],"ERROR: data-path does not exist") - return False - - jobPath,workPath,dataPath = GetStoragePath(distJobParam["distJobPath"],distJobParam["workPath"],distJobParam["dataPath"]) - - localJobPath = os.path.join(config["storage-mount-path"],jobPath) - if not os.path.exists(localJobPath): - if "userId" in distJobParam: - mkdirsAsUser(localJobPath,distJobParam["userId"]) - else: - mkdirsAsUser(localJobPath,0) - - - distJobParam["LaunchCMD"] = "" - if "cmd" not in distJobParam: - distJobParam["cmd"] = "" - -################One choice is that we only wait for certain time. -# launchCMD = """ -##!/bin/bash -#mkdir -p /opt -#echo "[DLWorkspace System]: Waiting for all containers are ready..." -## wait for at most 10 mins. -#for i in {1..200}; do -# if [ ! -f /opt/run_dist_job ] || [ ! -f /opt/run_dist_job.sh ]; then -# sleep 3 -# else -# break -# fi -#done -#if [ ! -f /opt/run_dist_job ] || [ ! -f /opt/run_dist_job.sh ]; then -# echo "[DLWorkspace System]: Waiting for containers: timeout! Restarting..." -# exit 1 -#else -# echo "[DLWorkspace System]: All containers are ready, launching training job..." -# chmod +x /opt/run_dist_job.sh -# /opt/run_dist_job.sh -#fi -#""" - - - launchCMD = """ -#!/bin/bash -mkdir -p /opt -echo "[DLWorkspace System]: Waiting for all containers are ready..." -while [ ! -f /opt/run_dist_job ] || [ ! -f /opt/run_dist_job.sh ]; do - sleep 3 -done -echo "[DLWorkspace System]: All containers are ready, launching training job..." 
-chmod +x /opt/run_dist_job.sh -/opt/run_dist_job.sh -""" - - launchScriptPath = os.path.join(localJobPath,"launch-%s.sh" % distJobParam["jobId"]) - with open(launchScriptPath, 'w') as f: - f.write(launchCMD) - f.close() - distJobParam["LaunchCMD"] = "[\"bash\", \"/job/launch-%s.sh\"]" % distJobParam["jobId"] - - - - distJobParam["jobNameLabel"] = ''.join(e for e in distJobParam["jobName"] if e.isalnum()) - distJobParam["userNameLabel"] = getAlias(jobParams["userName"]) - ENV = Environment(loader=FileSystemLoader("/")) - - jobTempDir = os.path.join(config["root-path"],"Jobs_Templete") - jobTemp = os.path.join(jobTempDir, "DistJob.yaml.template") - - distJobParam["hostjobPath"] = os.path.join(config["storage-mount-path"], jobPath) - distJobParam["hostworkPath"] = os.path.join(config["storage-mount-path"], workPath) - distJobParam["hostdataPath"] = os.path.join(config["storage-mount-path"], dataPath) - distJobParam["nvidiaDriverPath"] = nvidiaDriverPath - - if "mountpoints" not in distJobParam: - distJobParam["mountpoints"] = [] - - distJobParam["mountpoints"].append({"name":"nvidia-driver","containerPath":"/usr/local/nvidia","hostPath":nvidiaDriverPath}) - distJobParam["mountpoints"].append({"name":"job","containerPath":"/job","hostPath":distJobParam["hostjobPath"]}) - distJobParam["mountpoints"].append({"name":"work","containerPath":"/work","hostPath":distJobParam["hostworkPath"]}) - distJobParam["mountpoints"].append({"name":"data","containerPath":"/data","hostPath":distJobParam["hostdataPath"]}) - distJobParam["pod_ip_range"] = config["pod_ip_range"] - if "usefreeflow" in config and config["usefreeflow"] == "True": - distJobParam["usefreeflow"] = config["usefreeflow"] - else: - distJobParam["usefreeflow"] = False - - - random.seed(datetime.datetime.now()) - distJobParam["containerPort"] = int(random.random()*1000+3000) - - if assignedRack is not None: - if "nodeSelector" not in distJobParam: - distJobParam["nodeSelector"] = {} - distJobParam["nodeSelector"]["rack"] = assignedRack - - template = ENV.get_template(os.path.abspath(jobTemp)) - job_description = template.render(job=distJobParam) - - jobDescriptionList.append(job_description) - - distJobParams[role].append(distJobParam) - - jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime("%y%m%d") + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml" - jobDescription = "\n---\n".join(jobDescriptionList) - - - jobDescriptionPath = os.path.join(config["storage-mount-path"], jobParams["jobDescriptionPath"]) - if not os.path.exists(os.path.dirname(os.path.realpath(jobDescriptionPath))): - os.makedirs(os.path.dirname(os.path.realpath(jobDescriptionPath))) - if os.path.isfile(jobDescriptionPath): - output = k8sUtils.kubectl_delete(jobDescriptionPath) - - with open(jobDescriptionPath, 'w') as f: - f.write(jobDescription) - - output = k8sUtils.kubectl_create(jobDescriptionPath) - - ret["output"] = output - - ret["jobId"] = jobParams["jobId"] - - - if "userName" not in jobParams: - jobParams["userName"] = "" - - dataHandler.UpdateJobTextField(jobParams["jobId"],"jobStatus","scheduling") - dataHandler.UpdateJobTextField(jobParams["jobId"],"jobDescriptionPath",jobParams["jobDescriptionPath"]) - dataHandler.UpdateJobTextField(jobParams["jobId"],"jobDescription",base64.b64encode(jobDescription)) - - - jobMeta = {} - jobMeta["jobDescriptionPath"] = jobParams["jobDescriptionPath"] - jobMeta["jobPath"] = jobParams["jobPath"] - jobMeta["workPath"] = jobParams["workPath"] - jobMeta["jobPath"] = jobParams["jobPath"] - jobMeta["LaunchCMD"] 
= jobParams["LaunchCMD"] - jobMeta["distJobParams"] = distJobParams - - jobMetaStr = base64.b64encode(json.dumps(jobMeta)) - dataHandler.UpdateJobTextField(jobParams["jobId"],"jobMeta",jobMetaStr) - except Exception as e: - print e - ret["error"] = str(e) - retries = dataHandler.AddandGetJobRetries(jobParams["jobId"]) - if retries >= 5: - dataHandler.UpdateJobTextField(jobParams["jobId"],"jobStatus","error") - dataHandler.UpdateJobTextField(jobParams["jobId"],"errorMsg","Cannot submit job!" + str(e)) - - return ret - -def KillJob(job): - dataHandler = DataHandler() - result, detail = k8sUtils.GetJobStatus(job["jobId"]) - dataHandler.UpdateJobTextField(job["jobId"],"jobStatusDetail",base64.b64encode(json.dumps(detail))) - logging.info("Killing job %s, with status %s, %s" %(job["jobId"], result,detail)) - if "jobDescriptionPath" in job and job["jobDescriptionPath"] is not None: - jobDescriptionPath = os.path.join(config["storage-mount-path"], job["jobDescriptionPath"]) - if os.path.isfile(jobDescriptionPath): - if k8sUtils.kubectl_delete(jobDescriptionPath) == 0: - dataHandler.UpdateJobTextField(job["jobId"],"jobStatus","killed") - return True - else: - dataHandler.UpdateJobTextField(job["jobId"],"errorMsg","Cannot delete job from Kubernetes Cluster!") + result, detail = k8sUtils.GetJobStatus(job_id) + dataHandler.UpdateJobTextField(job_id, "jobStatusDetail", base64.b64encode(json.dumps(detail))) + logging.info("Killing job %s, with status %s, %s" % (job_id, result, detail)) + + job_deployer = JobDeployer() + errors = job_deployer.delete_job(job_id, force=True) + + if len(errors) == 0: + dataHandler.UpdateJobTextField(job_id, "jobStatus", desiredState) + dataHandler.UpdateJobTextField(job_id, "lastUpdated", datetime.datetime.now().isoformat()) + dataHandler.Close() + return True else: - dataHandler.UpdateJobTextField(job["jobId"],"errorMsg","Cannot find job description file!") + dataHandler.UpdateJobTextField(job_id, "jobStatus", "error") + dataHandler.UpdateJobTextField(job_id, "lastUpdated", datetime.datetime.now().isoformat()) + dataHandler.Close() + logging.error("Kill job failed with errors: {}".format(errors)) + return False - dataHandler.UpdateJobTextField(job["jobId"],"jobStatus","error") - return False - -def getAlias(username): - if "@" in username: - username = username.split("@")[0].strip() - - if "/" in username: - username = username.split("/")[1].strip() - - return username +def GetJobTotalGpu(jobParams): + numWorkers = 1 + if "numpsworker" in jobParams: + numWorkers = int(jobParams["numpsworker"]) + return int(jobParams["resourcegpu"]) * numWorkers def ApproveJob(job): - dataHandler = DataHandler() - dataHandler.ApproveJob(job["jobId"]) - dataHandler.Close() - return True - - - -def AutoApproveJob(job): - cluster_status = get_cluster_status() - jobUser = getAlias(job["userName"]) - jobParams = json.loads(base64.b64decode(job["jobParams"])) - jobGPU = int(jobParams["resourcegpu"]) - - currentGPU = 0 - for user in cluster_status["user_status"]: - if user["userName"] == jobUser: - currentGPU = int(user["userGPU"]) - - if currentGPU == 0 or currentGPU + jobGPU <= 4: - ApproveJob(job) + try: + job_id = job["jobId"] + vcName = job["vcName"] + jobParams = json.loads(base64.b64decode(job["jobParams"])) + job_total_gpus = GetJobTotalGpu(jobParams) + + dataHandler = DataHandler() + + if "preemptionAllowed" in jobParams and jobParams["preemptionAllowed"] is True: + logging.info("Job {} preemptible, approve!".format(job_id)) + dataHandler.UpdateJobTextField(job_id, "jobStatus", "queued") 
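# --- Editor's sketch (not part of the patch): a condensed view of the quota arithmetic
# used below in ApproveJob, assuming the same job record layout (jobParams carries
# "resourcegpu" and, for distributed jobs, "numpsworker"); helper names and the sample
# numbers are illustrative only.
def total_gpus(job_params):
    workers = int(job_params.get("numpsworker", 1))
    return int(job_params["resourcegpu"]) * workers

def within_user_quota(user_quota, running_job_params_list, new_job_params):
    # Preemptible jobs do not count against the quota, mirroring the loop below.
    used = sum(total_gpus(p) for p in running_job_params_list
               if not p.get("preemptionAllowed", False))
    requested = total_gpus(new_job_params)
    return requested == 0 or used + requested <= int(user_quota)

# Example: with user_quota=4 and 2 non-preemptible GPUs already running, a 2-GPU job is
# auto-queued, while a 4-GPU job stays unapproved and waits for an admin.
# --- end sketch ---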
+ return True + + vcList = dataHandler.ListVCs() + vc = None + for item in vcList: + if item["vcName"] == vcName: + vc = item + break + if vc is None: + logging.warning("Vc not exising! job {}, vc {}".format(job_id, vcName)) + return False + metadata = json.loads(vc["metadata"]) + + if "user_quota" in metadata: + user_running_jobs = dataHandler.GetJobList(job["userName"], vcName, status="running,queued,scheduling", op=("=", "or")) + running_gpus = 0 + for running_job in user_running_jobs: + running_jobParams = json.loads(base64.b64decode(running_job["jobParams"])) + # ignore preemptible GPUs + if "preemptionAllowed" in running_jobParams and running_jobParams["preemptionAllowed"] is True: + continue + running_job_total_gpus = GetJobTotalGpu(running_jobParams) + running_gpus += running_job_total_gpus + + logging.info("Job {} require {}, used quota (exclude preemptible GPUs) {}, with user quota of {}.".format(job_id, job_total_gpus, running_gpus, metadata["user_quota"])) + if job_total_gpus > 0 and int(metadata["user_quota"]) < (running_gpus + job_total_gpus): + logging.info("Job {} excesses the user quota: {} + {} > {}. Will need approve from admin.".format(job_id, running_gpus, job_total_gpus, metadata["user_quota"])) + return False + + dataHandler.UpdateJobTextField(job_id, "jobStatus", "queued") + return True + except Exception as e: + logging.warning(e, exc_info=True) + finally: + dataHandler.Close() UnusualJobs = {} -def UpdateJobStatus(job): +def UpdateJobStatus(job, notifier=None): + assert(job["jobStatus"] == "scheduling" or job["jobStatus"] == "running") dataHandler = DataHandler() jobParams = json.loads(base64.b64decode(job["jobParams"])) + result = check_job_status(job["jobId"]) + logging.info("++++++++ Job status: {} {}".format(job["jobId"], result)) - if job["jobStatus"] == "scheduling" and jobParams["jobtrainingtype"] == "PSDistJob": - launch_ps_dist_job(jobParams) - - - jobPath,workPath,dataPath = GetStoragePath(jobParams["jobPath"],jobParams["workPath"],jobParams["dataPath"]) - localJobPath = os.path.join(config["storage-mount-path"],jobPath) - logPath = os.path.join(localJobPath,"logs/joblog.txt") - + jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"], jobParams["workPath"], jobParams["dataPath"]) + localJobPath = os.path.join(config["storage-mount-path"], jobPath) + logPath = os.path.join(localJobPath, "logs/joblog.txt") - result, detail = k8sUtils.GetJobStatus(job["jobId"]) - dataHandler.UpdateJobTextField(job["jobId"],"jobStatusDetail",base64.b64encode(json.dumps(detail))) - - logging.info("job %s status: %s,%s" % (job["jobId"], result, json.dumps(detail))) - jobDescriptionPath = os.path.join(config["storage-mount-path"], job["jobDescriptionPath"]) if "jobDescriptionPath" in job else None if "userId" not in jobParams: - jobParams["userId"] = "0" - if result.strip() == "Succeeded": - joblog_manager.extract_job_log(job["jobId"],logPath,jobParams["userId"]) - dataHandler.UpdateJobTextField(job["jobId"],"jobStatus","finished") + jobParams["userId"] = "0" + + if result == "Succeeded": + joblog_manager.extract_job_log(job["jobId"], logPath, jobParams["userId"]) + dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "finished") if jobDescriptionPath is not None and os.path.isfile(jobDescriptionPath): - k8sUtils.kubectl_delete(jobDescriptionPath) + k8sUtils.kubectl_delete(jobDescriptionPath) - elif result.strip() == "Running": + + if notifier is not None: + notifier.notify(notify.new_job_state_change_message( + job["userName"], job["jobId"], result.strip())) + 
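# --- Editor's sketch (not part of the patch): how the notifier added in this patch is
# wired in. A single Notifier is built once in Run() and passed into UpdateJobStatus,
# which emits a message only on the terminal Succeeded/Failed transitions handled here;
# the function names below are illustrative, the notify calls mirror this patch.
import notify

def run_sketch(cfg, pending_jobs):
    notifier = notify.Notifier(cfg.get("job-manager"))
    notifier.start()
    for job in pending_jobs:
        update_status_sketch(job, notifier)

def update_status_sketch(job, notifier):
    result = check_job_status(job["jobId"])  # defined later in this module
    if notifier is not None and result in ("Succeeded", "Failed"):
        notifier.notify(notify.new_job_state_change_message(
            job["userName"], job["jobId"], result))
# --- end sketch ---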
elif result == "Running": if job["jobStatus"] != "running": - dataHandler.UpdateJobTextField(job["jobId"],"jobStatus","running") - - if "interactivePort" in jobParams: - serviceAddress = k8sUtils.GetServiceAddress(job["jobId"]) - serviceAddress = base64.b64encode(json.dumps(serviceAddress)) - dataHandler.UpdateJobTextField(job["jobId"],"endpoints",serviceAddress) - - elif result.strip() == "Failed": - printlog("Job %s fails, cleaning..." % job["jobId"]) - joblog_manager.extract_job_log(job["jobId"],logPath,jobParams["userId"]) - dataHandler.UpdateJobTextField(job["jobId"],"jobStatus","failed") - dataHandler.UpdateJobTextField(job["jobId"],"errorMsg",detail) + started_at = datetime.datetime.now().isoformat() + detail = [{"startedAt": started_at, "message": "started at: {}".format(started_at)}] + dataHandler.UpdateJobTextField(job["jobId"], "jobStatusDetail", base64.b64encode(json.dumps(detail))) + dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "running") + + elif result == "Failed": + logging.warning("Job %s fails, cleaning...", job["jobId"]) + + if notifier is not None: + notifier.notify(notify.new_job_state_change_message( + job["userName"], job["jobId"], result.strip())) + + joblog_manager.extract_job_log(job["jobId"], logPath, jobParams["userId"]) + dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "failed") + dataHandler.UpdateJobTextField(job["jobId"], "errorMsg", "pod failed") + if jobDescriptionPath is not None and os.path.isfile(jobDescriptionPath): - k8sUtils.kubectl_delete(jobDescriptionPath) + k8sUtils.kubectl_delete(jobDescriptionPath) - elif result.strip() == "Unknown": + elif result == "Unknown" or result == "NotFound": if job["jobId"] not in UnusualJobs: + logging.warning("!!! Job status ---{}---, job: {}".format(result, job["jobId"])) UnusualJobs[job["jobId"]] = datetime.datetime.now() - elif (datetime.datetime.now() - UnusualJobs[job["jobId"]]).seconds > 300: + # TODO + # 1) May need to reduce the timeout. + # It takes minutes before pod turns into "Unknown", we may don't need to wait so long. + # 2) If node resume before we resubmit the job, the job will end in status 'NotFound'. + elif (datetime.datetime.now() - UnusualJobs[job["jobId"]]).seconds > 30: del UnusualJobs[job["jobId"]] - retries = dataHandler.AddandGetJobRetries(job["jobId"]) - if retries >= 5: - printlog("Job %s fails for more than 5 times, abort" % job["jobId"]) - dataHandler.UpdateJobTextField(job["jobId"],"jobStatus","error") - dataHandler.UpdateJobTextField(job["jobId"],"errorMsg","cannot launch the job.") - if jobDescriptionPath is not None and os.path.isfile(jobDescriptionPath): - k8sUtils.kubectl_delete(jobDescriptionPath) - else: - printlog("Job %s fails in Kubernetes, delete and re-submit the job. 
Retries %d" % (job["jobId"] , retries)) - SubmitJob(job) - elif result.strip() == "PendingHostPort": - printlog("Cannot find host ports for job :%s, re-launch the job with different host ports " % (job["jobId"])) - - SubmitJob(job) - - if result.strip() != "Unknown" and job["jobId"] in UnusualJobs: - del UnusualJobs[job["jobId"]] -def UpdateDistJobStatus(job): - dataHandler = DataHandler() - jobParams = json.loads(base64.b64decode(job["jobParams"])) - - if "userId" not in jobParams: - jobParams["userId"] = "0" + # TODO refine later + # before resubmit the job, reset the endpoints + # update all endpoint to status 'pending', so it would restart when job is ready + endpoints = dataHandler.GetJobEndpoints(job["jobId"]) + for endpoint_id, endpoint in endpoints.items(): + endpoint["status"] = "pending" + logging.info("Reset endpoint status to 'pending': {}".format(endpoint_id)) + dataHandler.UpdateEndpoint(endpoint) - jobPath,workPath,dataPath = GetStoragePath(jobParams["jobPath"],jobParams["workPath"],jobParams["dataPath"]) - localJobPath = os.path.join(config["storage-mount-path"],jobPath) - logPath = os.path.join(localJobPath,"logs/joblog.txt") - - - result, detail = k8sUtils.GetJobStatus(job["jobId"]) - dataHandler.UpdateJobTextField(job["jobId"],"jobStatusDetail",base64.b64encode(detail)) - - logging.info("job %s status: %s,%s" % (job["jobId"], result, json.dumps(detail))) - - jobDescriptionPath = os.path.join(config["storage-mount-path"], job["jobDescriptionPath"]) if "jobDescriptionPath" in job else None + logging.warning("Job {} fails in Kubernetes as {}, delete and re-submit.".format(job["jobId"], result)) + KillJob(job["jobId"], "queued") + if result != "Unknown" and result != "NotFound" and job["jobId"] in UnusualJobs: + del UnusualJobs[job["jobId"]] - jobId = jobParams["jobId"] - workerPodInfo = k8sUtils.GetPod("distRole=worker,run=" + jobId) - psPodInfo = k8sUtils.GetPod("distRole=ps,run=" + jobId) - if "items" in workerPodInfo and len(workerPodInfo["items"]) == int(jobParams["numpsworker"]) and "items" in psPodInfo and len(psPodInfo["items"]) == int(jobParams["numps"]): - if job["jobStatus"] == "scheduling" : - launch_ps_dist_job(jobParams) - if job["jobStatus"] == "running": - result, detail = GetDistJobStatus(job["jobId"]) - dataHandler.UpdateJobTextField(job["jobId"],"jobStatusDetail",base64.b64encode(detail)) - - printlog("job %s status: %s" % (job["jobId"], result)) - - jobDescriptionPath = os.path.join(config["storage-mount-path"], job["jobDescriptionPath"]) if "jobDescriptionPath" in job else None - - if result.strip() == "Succeeded": - joblog_manager.extract_job_log(job["jobId"],logPath,jobParams["userId"]) - dataHandler.UpdateJobTextField(job["jobId"],"jobStatus","finished") - if jobDescriptionPath is not None and os.path.isfile(jobDescriptionPath): - k8sUtils.kubectl_delete(jobDescriptionPath) - - elif result.strip() == "Running": - joblog_manager.extract_job_log(job["jobId"],logPath,jobParams["userId"]) - if job["jobStatus"] != "running": - dataHandler.UpdateJobTextField(job["jobId"],"jobStatus","running") - if "interactivePort" in jobParams: - serviceAddress = k8sUtils.GetServiceAddress(job["jobId"]) - serviceAddress = base64.b64encode(json.dumps(serviceAddress)) - dataHandler.UpdateJobTextField(job["jobId"],"endpoints",serviceAddress) - - elif result.strip() == "Failed": - printlog("Job %s fails, cleaning..." 
% job["jobId"]) - joblog_manager.extract_job_log(job["jobId"],logPath,jobParams["userId"]) - dataHandler.UpdateJobTextField(job["jobId"],"jobStatus","failed") - dataHandler.UpdateJobTextField(job["jobId"],"errorMsg",detail) - if jobDescriptionPath is not None and os.path.isfile(jobDescriptionPath): - k8sUtils.kubectl_delete(jobDescriptionPath) - - elif result.strip() == "Unknown": - if job["jobId"] not in UnusualJobs: - UnusualJobs[job["jobId"]] = datetime.datetime.now() - elif (datetime.datetime.now() - UnusualJobs[job["jobId"]]).seconds > 300: - del UnusualJobs[job["jobId"]] - retries = dataHandler.AddandGetJobRetries(job["jobId"]) - if retries >= 5: - printlog("Job %s fails for more than 5 times, abort" % job["jobId"]) - dataHandler.UpdateJobTextField(job["jobId"],"jobStatus","error") - dataHandler.UpdateJobTextField(job["jobId"],"errorMsg","cannot launch the job.") - if jobDescriptionPath is not None and os.path.isfile(jobDescriptionPath): - k8sUtils.kubectl_delete(jobDescriptionPath) - else: - printlog("Job %s fails in Kubernetes, delete and re-submit the job. Retries %d" % (job["jobId"] , retries)) - SubmitJob(job) - - if result.strip() != "Unknown" and job["jobId"] in UnusualJobs: - del UnusualJobs[job["jobId"]] - - pass - - - - -def run_dist_cmd_on_pod(podId, cmd, outputfile): - remotecmd = "exec %s -- %s" % (podId,cmd) - print remotecmd - k8sUtils.kubectl_exec_output_to_file(remotecmd,outputfile) - - - -class Kube_RemoteCMD_Thread(threading.Thread): - def __init__(self, jobId, podId, cmd, outputfile): - threading.Thread.__init__(self) - self.jobId = jobId - self.podId = podId - self.cmd = cmd - self.outputfile = outputfile - def run(self): - run_dist_cmd_on_pod(self.podId, self.cmd, self.outputfile) - - -def launch_ps_dist_job(jobParams): - jobId = jobParams["jobId"] - workerPodInfo = k8sUtils.GetPod("distRole=worker,run=" + jobId) - psPodInfo = k8sUtils.GetPod("distRole=ps,run=" + jobId) - if "items" in workerPodInfo and len(workerPodInfo["items"]) == int(jobParams["numpsworker"]) and "items" in psPodInfo and len(psPodInfo["items"]) == int(jobParams["numps"]): - podStatus = [k8sUtils.check_pod_status(pod) for pod in workerPodInfo["items"] + psPodInfo["items"] ] - if all([status == "Running" for status in podStatus]): - ps_pod_names = [pod["metadata"]["name"] for pod in psPodInfo["items"]] - worker_pod_names = [pod["metadata"]["name"] for pod in workerPodInfo["items"]] - - ps_pod_ips = [pod["status"]["podIP"] for pod in psPodInfo["items"]] - worker_pod_ips = [pod["status"]["podIP"] for pod in workerPodInfo["items"]] - - ps_num = len(psPodInfo["items"]) - worker_num = len(workerPodInfo["items"]) - - ps_ports = [int(item["metadata"]["labels"]["distPort"]) for item in psPodInfo["items"]] - worker_ports = [int(item["metadata"]["labels"]["distPort"]) for item in workerPodInfo["items"]] - - #port range: 30000~31000 - #rndList = range(max(1000,ps_num + worker_num)) - #random.shuffle(rndList) - #ps_ports = [rndList[i] + 30000 for i in range(ps_num)] - #worker_ports = [rndList[i + ps_num] + 30000 for i in range(worker_num)] - - ps_hosts = ",".join(["%s:%s" % (ps_pod_ips[i],ps_ports[i]) for i in range(ps_num)]) - worker_hosts = ",".join(["%s:%s" % (worker_pod_ips[i],worker_ports[i]) for i in range(worker_num)]) - - ps_files = ["/tmp/" + str(uuid.uuid4()) for i in range(ps_num)] - worker_files = ["/tmp/" + str(uuid.uuid4()) for i in range(worker_num)] - - ps_cmd = ["%s --ps_hosts=%s --worker_hosts=%s --job_name=ps --task_index=%d 2>&1 | tee %s" % (jobParams["cmd"], 
ps_hosts,worker_hosts,i,ps_files[i]) for i in range(ps_num)] - worker_cmd = ["%s --ps_hosts=%s --worker_hosts=%s --job_name=worker --task_index=%d 2>&1 | tee %s" % (jobParams["cmd"], ps_hosts,worker_hosts,i,worker_files[i]) for i in range(worker_num)] - - - for i in range(ps_num): - os.system("mkdir -p %s" % ps_files[i]) - ps_files[i] = os.path.join(ps_files[i],"run_dist_job.sh") - with open(ps_files[i], 'w') as f: - f.write(ps_cmd[i] + "\n") - f.close() - if "userId" in jobParams: - os.system("chown -R %s %s" % (jobParams["userId"], ps_files[i])) - remotecmd = "cp %s %s:/opt/run_dist_job.sh" % (ps_files[i],ps_pod_names[i]) - k8sUtils.kubectl_exec(remotecmd) - k8sUtils.kubectl_exec("exec %s touch /opt/run_dist_job" % ps_pod_names[i]) - - - for i in range(worker_num): - os.system("mkdir -p %s" % worker_files[i]) - worker_files[i] = os.path.join(worker_files[i],"run_dist_job.sh") - with open(worker_files[i], 'w') as f: - f.write(worker_cmd[i] + "\n") - f.close() - if "userId" in jobParams: - os.system("chown -R %s %s" % (jobParams["userId"], worker_files[i])) - remotecmd = "cp %s %s:/opt/run_dist_job.sh" % (worker_files[i],worker_pod_names[i]) - k8sUtils.kubectl_exec(remotecmd) - k8sUtils.kubectl_exec("exec %s touch /opt/run_dist_job" % worker_pod_names[i]) + dataHandler.Close() - dataHandler = DataHandler() - dataHandler.UpdateJobTextField(jobParams["jobId"],"jobStatus","running") - #ps_threads = [Kube_RemoteCMD_Thread(jobId,ps_pod_names[i],ps_cmd[i],ps_logfiles[i]) for i in range(ps_num)] - #worker_threads = [Kube_RemoteCMD_Thread(jobId,worker_pod_names[i],worker_cmd[i],worker_logfiles[i]) for i in range(worker_num)] - - #for t in ps_threads: - # t.start() +# TODO refine later +def check_job_status(job_id): + job_deployer = JobDeployer() + job_roles = JobRole.get_job_roles(job_id) + + if len(job_roles) < 1: + return "NotFound" + + # role status in ["NotFound", "Pending", "Running", "Succeeded", "Failed", "Unknown"] + # TODO ??? 
when ps/master role "Succeeded", return Succeeded + for job_role in job_roles: + if job_role.role_name not in ["master", "ps"]: + continue + if job_role.status() == "Succeeded": + logging.info("Job: {}, Succeeded!".format(job_id)) + return "Succeeded" + + statuses = [job_role.status() for job_role in job_roles] + logging.info("Job: {}, status: {}".format(job_id, statuses)) + + details = [] + for job_role in job_roles: + details.append(job_role.pod_details().to_dict()) + logging.info("Job {}, details: {}".format(job_id, details)) + + if "Failed" in statuses: + return "Failed" + if "Unknown" in statuses: + return "Unknown" + if "NotFound" in statuses: + return "NotFound" + if "Pending" in statuses: + return "Pending" + + return "Running" + +def create_log(logdir = '/var/log/dlworkspace'): + if not os.path.exists(logdir): + os.system("mkdir -p " + logdir) + with open('logging.yaml') as f: + logging_config = yaml.full_load(f) + f.close() + logging_config["handlers"]["file"]["filename"] = logdir+"/jobmanager.log" + logging.config.dictConfig(logging_config) - #for t in worker_threads: - # t.start() +def get_priority_dict(): + try: + dataHandler = DataHandler() + priority_dict = dataHandler.get_job_priority() + return priority_dict + except Exception as e: + logging.warning("Fetch job priority dict failed!", exc_info=True) + return {} + finally: + dataHandler.Close() - #while (True): - #for t in ps_threads: - # print t.isAlive() - #time.sleep(5) - #cmd = "test" - #thread.start_new_thread( run_dist_cmd_on_pod, - #(workerPodInfo["items"][0]["metadata"]["name"], cmd) ) +def get_job_priority(priority_dict, job_id): + if job_id in priority_dict.keys(): + return priority_dict[job_id] + return 100 +def TakeJobActions(jobs): + dataHandler = DataHandler() + vcList = dataHandler.ListVCs() + clusterStatus, _ = dataHandler.GetClusterStatus() + dataHandler.Close() + cluster_gpu_capacity = clusterStatus["gpu_capacity"] + cluster_gpu_reserved = clusterStatus["gpu_reserved"] + globalTotalRes = ResourceInfo(cluster_gpu_capacity) + globalReservedRes = ResourceInfo(cluster_gpu_reserved) + + vc_resources = {} + localResInfo = ResourceInfo() + globalResInfo = ResourceInfo.Difference(globalTotalRes, globalReservedRes) + + priority_dict = get_priority_dict() + logging.info("Job priority dict: {}".format(priority_dict)) + + for vc in vcList: + vcTotalRes = ResourceInfo(json.loads(vc["quota"])) + clusterTotalRes = ResourceInfo(clusterStatus["gpu_capacity"]) + clusterReservedRes = ResourceInfo(clusterStatus["gpu_reserved"]) + vcReservedRes = clusterReservedRes.GetFraction(vcTotalRes, clusterTotalRes) + vc_resources[vc["vcName"]] = ResourceInfo.Difference(vcTotalRes, vcReservedRes) + + jobsInfo = [] + for job in jobs: + if job["jobStatus"] in ["queued", "scheduling", "running"]: + singleJobInfo = {} + singleJobInfo["job"] = job + job_params = json.loads(base64.b64decode(job["jobParams"])) + singleJobInfo["preemptionAllowed"] = job_params["preemptionAllowed"] + singleJobInfo["jobId"] = job_params["jobId"] + jobGpuType = "any" + if "gpuType" in job_params: + jobGpuType = job_params["gpuType"] + singleJobInfo["globalResInfo"] = ResourceInfo({jobGpuType : GetJobTotalGpu(job_params)}) + singleJobInfo["sortKey"] = str(job["jobTime"]) + priority = get_job_priority(priority_dict, singleJobInfo["jobId"]) + if singleJobInfo["preemptionAllowed"]: + singleJobInfo["sortKey"] = "1_{:06d}_{}".format(priority, singleJobInfo["sortKey"]) + else: + singleJobInfo["sortKey"] = "0_{:06d}_{}".format(priority, singleJobInfo["sortKey"]) + 
singleJobInfo["allowed"] = False + jobsInfo.append(singleJobInfo) + + jobsInfo.sort(key=lambda x: x["sortKey"]) + + logging.info("TakeJobActions : local resources : %s" % (vc_resources)) + logging.info("TakeJobActions : global resources : %s" % (globalResInfo.CategoryToCountMap)) + + for sji in jobsInfo: + logging.info("TakeJobActions : job : %s : %s : %s" % (sji["jobId"], sji["globalResInfo"].CategoryToCountMap, sji["sortKey"])) + vc_name = sji["job"]["vcName"] + vc_resource = vc_resources[vc_name] + + if (vc_resource.CanSatisfy(sji["globalResInfo"])): + vc_resource.Subtract(sji["globalResInfo"]) + globalResInfo.Subtract(sji["globalResInfo"]) + sji["allowed"] = True + logging.info("TakeJobActions : local assignment : %s : %s" % (sji["jobId"], sji["globalResInfo"].CategoryToCountMap)) + + for sji in jobsInfo: + if sji["preemptionAllowed"] and (sji["allowed"] is False): + if globalResInfo.CanSatisfy(sji["globalResInfo"]): + logging.info("TakeJobActions : job : %s : %s" % (sji["jobId"], sji["globalResInfo"].CategoryToCountMap)) + # Strict FIFO policy not required for global (bonus) tokens since these jobs are anyway pre-emptible. + globalResInfo.Subtract(sji["globalResInfo"]) + sji["allowed"] = True + logging.info("TakeJobActions : global assignment : %s : %s" % (sji["jobId"], sji["globalResInfo"].CategoryToCountMap)) + + logging.info("TakeJobActions : global resources : %s" % (globalResInfo.CategoryToCountMap)) + + for sji in jobsInfo: + try: + if sji["job"]["jobStatus"] == "queued" and (sji["allowed"] is True): + SubmitJob(sji["job"]) + logging.info("TakeJobActions : submitting job : %s : %s" % (sji["jobId"], sji["sortKey"])) + elif sji["preemptionAllowed"] and (sji["job"]["jobStatus"] == "scheduling" or sji["job"]["jobStatus"] == "running") and (sji["allowed"] is False): + KillJob(sji["job"]["jobId"], "queued") + logging.info("TakeJobActions : pre-empting job : %s : %s" % (sji["jobId"], sji["sortKey"])) + except Exception as e: + logging.error("Process job failed {}".format(sji["job"]), exc_info=True) -def create_log( logdir = '/var/log/dlworkspace' ): - if not os.path.exists( logdir ): - os.system("mkdir -p " + logdir ) - with open('logging.yaml') as f: - logging_config = yaml.load(f) - f.close() - logging_config["handlers"]["file"]["filename"] = logdir+"/jobmanager.log" - logging.config.dictConfig(logging_config) + logging.info("TakeJobActions : job desired actions taken") def Run(): + register_stack_trace_dump() + notifier = notify.Notifier(config.get("job-manager")) + notifier.start() + create_log() while True: - - try: - config["racks"] = k8sUtils.get_node_labels("rack") - config["skus"] = k8sUtils.get_node_labels("sku") - except Exception as e: - print e - - try: - dataHandler = DataHandler() - pendingJobs = dataHandler.GetPendingJobs() - printlog("updating status for %d jobs" % len(pendingJobs)) - for job in pendingJobs: + update_file_modification_time("job_manager") + + with manager_iteration_histogram.labels("job_manager").time(): + try: + config["racks"] = k8sUtils.get_node_labels("rack") + config["skus"] = k8sUtils.get_node_labels("sku") + except Exception as e: + logging.exception("get node labels failed") + + try: + dataHandler = DataHandler() + pendingJobs = dataHandler.GetPendingJobs() + TakeJobActions(pendingJobs) + + pendingJobs = dataHandler.GetPendingJobs() + logging.info("Updating status for %d jobs" % len(pendingJobs)) + for job in pendingJobs: + try: + logging.info("Processing job: %s, status: %s" % (job["jobId"], job["jobStatus"])) + if job["jobStatus"] == 
"killing": + KillJob(job["jobId"], "killed") + elif job["jobStatus"] == "pausing": + KillJob(job["jobId"], "paused") + elif job["jobStatus"] == "scheduling" or job["jobStatus"] == "running": + UpdateJobStatus(job, notifier) + elif job["jobStatus"] == "unapproved": + ApproveJob(job) + except Exception as e: + logging.warning(e, exc_info=True) + except Exception as e: + logging.warning("Process job failed!", exc_info=True) + finally: try: - print "Processing job: %s, status: %s" % (job["jobId"], job["jobStatus"]) - if job["jobStatus"] == "queued": - SubmitJob(job) - elif job["jobStatus"] == "killing": - KillJob(job) - elif job["jobStatus"] == "scheduling" or job["jobStatus"] == "running" : - UpdateJobStatus(job) - elif job["jobStatus"] == "unapproved" : - AutoApproveJob(job) - except Exception as e: - print e - except Exception as e: - print e + dataHandler.Close() + except: + pass time.sleep(1) + if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--port", "-p", help="port of exporter", type=int, default=9200) + args = parser.parse_args() + setup_exporter_thread(args.port) + Run() - #print k8sUtils.get_pod_events("d493d41c-45ea-4e85-8ca4-01c3533cd727") diff --git a/src/ClusterManager/job_role.py b/src/ClusterManager/job_role.py new file mode 100644 index 000000000..66ad9b2ad --- /dev/null +++ b/src/ClusterManager/job_role.py @@ -0,0 +1,69 @@ +import logging +import logging.config +from job_deployer import JobDeployer + + +class JobRole: + MARK_ROLE_READY_FILE = "/pod/running/ROLE_READY" + + @staticmethod + def get_job_roles(job_id): + deployer = JobDeployer() + pods = deployer.get_pods(label_selector="run={}".format(job_id)) + + job_roles = [] + for pod in pods: + pod_name = pod.metadata.name + if "distRole" in pod.metadata.labels: + role = pod.metadata.labels["distRole"] + else: + role = "master" + job_role = JobRole(role, pod_name) + job_roles.append(job_role) + return job_roles + + def __init__(self, role_name, pod_name): + self.role_name = role_name + self.pod_name = pod_name + + def status(self): + """ + Return role status in ["NotFound", "Pending", "Running", "Succeeded", "Failed", "Unknown"] + It's slightly different from pod phase, when pod is running: + CONTAINER_READY -> WORKER_READY -> JOB_READY (then the job finally in "Running" status.) + """ + # pod-phase: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase + # node condition: https://kubernetes.io/docs/concepts/architecture/nodes/#condition + deployer = JobDeployer() + pods = deployer.get_pods(field_selector="metadata.name={}".format(self.pod_name)) + logging.debug("Pods: {}".format(pods)) + if(len(pods) < 1): + return "NotFound" + + assert(len(pods) == 1) + self.pod = pods[0] + phase = self.pod.status.phase + + # !!! Pod is running, doesn't mean "Role" is ready and running. + if(phase == "Running"): + # Found that phase won't turn into "Unkonwn" even when we get 'unknown' from kubectl + if self.pod.status.reason == "NodeLost": + return "Unknown" + + # Check if the user command had been ran. 
+ if not self.isRoleReady(): + return "Pending" + + return phase + + # TODO should call after status(), or the self.pod would be None + def pod_details(self): + return self.pod + + def isFileExisting(self, file): + deployer = JobDeployer() + status_code, _ = deployer.pod_exec(self.pod_name, ["/bin/sh", "-c", "ls -lrt {}".format(file)]) + return status_code == 0 + + def isRoleReady(self): + return self.isFileExisting(JobRole.MARK_ROLE_READY_FILE) diff --git a/src/ClusterManager/job_status.pdf b/src/ClusterManager/job_status.pdf new file mode 100644 index 000000000..c9756f120 Binary files /dev/null and b/src/ClusterManager/job_status.pdf differ diff --git a/src/ClusterManager/joblog_manager.py b/src/ClusterManager/joblog_manager.py index b10630c1e..b43232a3b 100755 --- a/src/ClusterManager/joblog_manager.py +++ b/src/ClusterManager/joblog_manager.py @@ -23,8 +23,6 @@ from multiprocessing import Process, Manager - - sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../storage")) sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../utils")) @@ -34,10 +32,13 @@ from config import config, GetStoragePath from DataHandler import DataHandler +from cluster_manager import setup_exporter_thread, manager_iteration_histogram, register_stack_trace_dump, update_file_modification_time -def create_log( logdir = '/var/log/dlworkspace' ): +logger = logging.getLogger(__name__) + +def create_log(logdir = '/var/log/dlworkspace'): if not os.path.exists( logdir ): - os.system("mkdir -p " + logdir ) + os.system("mkdir -p " + logdir) with open('logging.yaml') as f: logging_config = yaml.load(f) f.close() @@ -109,7 +110,7 @@ def extract_job_log(jobId,logPath,userId): f.close() os.system("chown -R %s %s" % (userId, containerLogPath)) except Exception as e: - print e + logger.exception("write container log failed") if len(trimlogstr.strip()) > 0: @@ -149,15 +150,24 @@ def update_job_logs(): def Run(): + register_stack_trace_dump() create_log() logging.info("start to update job logs ...") while True: - try: - update_job_logs() - except Exception as e: - print e + update_file_modification_time("joblog_manager") + + with manager_iteration_histogram.labels("joblog_manager").time(): + try: + update_job_logs() + except Exception as e: + logger.exception("update job logs failed") time.sleep(1) if __name__ == '__main__': - Run() \ No newline at end of file + parser = argparse.ArgumentParser() + parser.add_argument("--port", "-p", help="port of exporter", type=int, default=9203) + args = parser.parse_args() + setup_exporter_thread(args.port) + + Run() diff --git a/src/ClusterManager/logging.yaml b/src/ClusterManager/logging.yaml index b276c6d8d..a486bc5aa 100755 --- a/src/ClusterManager/logging.yaml +++ b/src/ClusterManager/logging.yaml @@ -1,26 +1,27 @@ -version: 1 -formatters: - simple: - format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s' -handlers: - console: - class: logging.StreamHandler - level: DEBUG - formatter: simple - stream: ext://sys.stdout - file: - class : logging.handlers.RotatingFileHandler - formatter: simple - filename: /var/log/dlworkspace/clustermanager.log - # roll over at 10MB - maxBytes: 10240000 - # At most 10 logging files - backupCount: 10 -loggers: - basic: - level: DEBUG - handlers: ['console','file'] - propagate: no -root: - level: DEBUG - handlers: ['console','file'] \ No newline at end of file +version: 1 +disable_existing_loggers: False +formatters: + simple: + format: '%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s' 
+handlers: + console: + class: logging.StreamHandler + level: INFO + formatter: simple + stream: ext://sys.stdout + file: + class : logging.handlers.RotatingFileHandler + formatter: simple + filename: /var/log/dlworkspace/clustermanager.log + # roll over at 10MB + maxBytes: 10240000 + # At most 10 logging files + backupCount: 10 +loggers: + basic: + level: INFO + handlers: ['console','file'] + propagate: no +root: + level: INFO + handlers: ['console','file'] diff --git a/src/ClusterManager/node_manager.py b/src/ClusterManager/node_manager.py index 592d3cdab..fb0de3193 100755 --- a/src/ClusterManager/node_manager.py +++ b/src/ClusterManager/node_manager.py @@ -10,6 +10,7 @@ import yaml from jinja2 import Environment, FileSystemLoader, Template import base64 +from ResourceInfo import ResourceInfo import re @@ -38,11 +39,12 @@ from config import config from DataHandler import DataHandler +from cluster_manager import setup_exporter_thread, manager_iteration_histogram, register_stack_trace_dump, update_file_modification_time -def create_log( logdir = '/var/log/dlworkspace' ): - if not os.path.exists( logdir ): - os.system("mkdir -p " + logdir ) +def create_log(logdir = '/var/log/dlworkspace'): + if not os.path.exists(logdir): + os.system("mkdir -p " + logdir) with open('logging.yaml') as f: logging_config = yaml.load(f) f.close() @@ -64,11 +66,9 @@ def check_cluster_status_change(o_cluster_status,cluster_status): def get_job_gpu_usage(jobId): try: - if "webportal_node" in config: - hostaddress = config["webportal_node"] - else: - hostaddress = "127.0.0.1" - url = """http://"""+hostaddress+""":8086/query?db=collectd&epoch=ms&q=SELECT+max%28%22value%22%29+FROM+%22jobcuda_value%22+WHERE+%28%22host%22+%3D~+%2F%5E"""+jobId+"""%24%2F+AND+%22type%22+%3D+%27percent%27+AND+%22type_instance%22+%3D+%27gpu_util%27+AND+%22instance%22+%3D~+%2F%5Egpu0%24%2F%29+AND+time+%3E%3D+now%28%29+-+480m+fill%28null%29%3B""" + hostaddress = config.get("prometheus_node", "127.0.0.1") + + url = """http://"""+hostaddress+""":9091/prometheus/api/v1/query?query=avg%28avg_over_time%28task_gpu_percent%7Bpod_name%3D%22""" + jobId + """%22%7D%5B4h%5D%29%29+by+%28pod_name%2C+instance%2C+username%29""" curl = pycurl.Curl() curl.setopt(pycurl.URL, url) @@ -80,7 +80,7 @@ def get_job_gpu_usage(jobId): curl.perform() responseStr = buff.getvalue() curl.close() - gpuUsage = int(json.loads(responseStr)["results"][0]["series"][0]["values"][0][1]) + gpuUsage = int(float(json.loads(responseStr)["data"]["result"][0]["value"][1])) except Exception as e: gpuUsage = None @@ -88,7 +88,7 @@ def get_job_gpu_usage(jobId): def get_cluster_status(): cluster_status={} - gpuStr = "alpha.kubernetes.io/nvidia-gpu" + gpuStr = "nvidia.com/gpu" try: output = k8sUtils.kubectl_exec(" get nodes -o yaml") nodeInfo = yaml.load(output) @@ -100,47 +100,52 @@ def get_cluster_status(): node_status = {} node_status["name"] = node["metadata"]["name"] node_status["labels"] = node["metadata"]["labels"] + node_status["gpuType"] = "" + + node_status["scheduled_service"] = [] + for l,s in node_status["labels"].iteritems(): + if s == "active" and l != "all" and l != "default": + node_status["scheduled_service"].append(l) + if l == "gpuType": + node_status["scheduled_service"].append(s) + node_status["gpuType"] = s + if (gpuStr in node["status"]["allocatable"]): - node_status["gpu_allocatable"] = int(node["status"]["allocatable"][gpuStr]) + node_status["gpu_allocatable"] = ResourceInfo({node_status["gpuType"]: int(node["status"]["allocatable"][gpuStr])}).ToSerializable() 
else: - node_status["gpu_allocatable"] = 0 + node_status["gpu_allocatable"] = ResourceInfo().ToSerializable() if (gpuStr in node["status"]["capacity"]): - node_status["gpu_capacity"] = int(node["status"]["capacity"][gpuStr]) + node_status["gpu_capacity"] = ResourceInfo({node_status["gpuType"] : int(node["status"]["capacity"][gpuStr])}).ToSerializable() else: - node_status["gpu_capacity"] = 0 - node_status["gpu_used"] = 0 + node_status["gpu_capacity"] = ResourceInfo().ToSerializable() + node_status["gpu_used"] = ResourceInfo().ToSerializable() node_status["InternalIP"] = "unknown" node_status["pods"] = [] if "annotations" in node["metadata"]: if "node.alpha/DeviceInformation" in node["metadata"]["annotations"]: node_info = json.loads(node["metadata"]["annotations"]["node.alpha/DeviceInformation"]) - node_status["gpu_capacity"] = max(int(node_info["capacity"]["alpha.gpu/numgpu"]), node_status["gpu_capacity"]) - node_status["gpu_allocatable"] = max(int(node_info["allocatable"]["alpha.gpu/numgpu"]), node_status["gpu_allocatable"]) + if (int(node_info["capacity"]["alpha.gpu/numgpu"]) > ResourceInfo(node_status["gpu_capacity"]).TotalCount()): + node_status["gpu_capacity"] = ResourceInfo({node_status["gpuType"]: int(node_info["capacity"]["alpha.gpu/numgpu"])}).ToSerializable() + if (int(node_info["allocatable"]["alpha.gpu/numgpu"]) > ResourceInfo(node_status["gpu_allocatable"]).TotalCount()): + node_status["gpu_allocatable"] = ResourceInfo({node_status["gpuType"] : int(node_info["allocatable"]["alpha.gpu/numgpu"])}).ToSerializable() if "addresses" in node["status"]: for addr in node["status"]["addresses"]: if addr["type"] == "InternalIP": node_status["InternalIP"] = addr["address"] - node_status["scheduled_service"] = [] - for l,s in node_status["labels"].iteritems(): - if s == "active" and l != "all" and l != "default": - node_status["scheduled_service"].append(l) - if "unschedulable" in node["spec"] and node["spec"]["unschedulable"]: node_status["unschedulable"] = True else: node_status["unschedulable"] = False if "status" in node and "conditions" in node["status"]: - for condi in node["status"]: + for condi in node["status"]["conditions"]: if "type" in condi and condi["type"] == "Ready" and "status" in condi and condi["status"] == "Unknown": node_status["unschedulable"] = True - nodes_status[node_status["name"]] = node_status - output = k8sUtils.kubectl_exec(" get pods -o yaml") podsInfo = yaml.load(output) if "items" in podsInfo: @@ -180,72 +185,83 @@ def get_cluster_status(): pod_name += " (gpu #:" + str(containerGPUs) + ")" if node_name in nodes_status: - nodes_status[node_name]["gpu_used"] += gpus + nodes_status[node_name]["gpu_used"] = ResourceInfo(nodes_status[node_name]["gpu_used"]).Add(ResourceInfo({nodes_status[node_name]["gpuType"] : gpus})).ToSerializable() nodes_status[node_name]["pods"].append(pod_name) - if username is not None: - if username not in user_status: - user_status[username] = gpus - else: - user_status[username] += gpus - - gpu_avaliable = 0 - gpu_reserved = 0 - gpu_capacity = 0 - gpu_unschedulable = 0 - gpu_schedulable = 0 - gpu_used = 0 + if username is not None: + if username not in user_status: + user_status[username] = ResourceInfo({nodes_status[node_name]["gpuType"] : gpus}) + else: + user_status[username].Add(ResourceInfo({nodes_status[node_name]["gpuType"] : gpus})) + gpu_avaliable = ResourceInfo() + gpu_reserved = ResourceInfo() + gpu_capacity = ResourceInfo() + gpu_unschedulable = ResourceInfo() + gpu_schedulable = ResourceInfo() + gpu_used = ResourceInfo() 
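# --- Editor's sketch (not part of the patch): the per-GPU-type accounting that replaces
# the old integer counters in this function, using only ResourceInfo operations that
# appear in this patch (Add, Difference, ToSerializable); the node dicts are illustrative.
from ResourceInfo import ResourceInfo

def summarize_nodes(nodes):
    capacity = ResourceInfo()
    available = ResourceInfo()
    for node in nodes:
        capacity.Add(ResourceInfo(node["gpu_capacity"]))
        available.Add(ResourceInfo.Difference(ResourceInfo(node["gpu_allocatable"]),
                                              ResourceInfo(node["gpu_used"])))
    return capacity.ToSerializable(), available.ToSerializable()

# Example input/output (illustrative GPU type):
#   [{"gpu_capacity": {"P100": 4}, "gpu_allocatable": {"P100": 4}, "gpu_used": {"P100": 1}}]
#   -> ({"P100": 4}, {"P100": 3})
# --- end sketch ---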
for node_name, node_status in nodes_status.iteritems(): if node_status["unschedulable"]: - gpu_unschedulable += node_status["gpu_capacity"] + gpu_unschedulable.Add(ResourceInfo(node_status["gpu_capacity"])) + gpu_reserved.Add(ResourceInfo.Difference(ResourceInfo(node_status["gpu_capacity"]), ResourceInfo(node_status["gpu_used"]))) else: - gpu_avaliable += (node_status["gpu_allocatable"] - node_status["gpu_used"]) - gpu_schedulable += node_status["gpu_capacity"] - gpu_unschedulable += (node_status["gpu_capacity"] - node_status["gpu_allocatable"]) + gpu_avaliable.Add(ResourceInfo.Difference(ResourceInfo(node_status["gpu_allocatable"]), ResourceInfo(node_status["gpu_used"]))) + gpu_schedulable.Add(ResourceInfo(node_status["gpu_capacity"])) + gpu_unschedulable.Add(ResourceInfo.Difference(ResourceInfo(node_status["gpu_capacity"]), ResourceInfo(node_status["gpu_allocatable"]))) + gpu_reserved.Add(ResourceInfo.Difference(ResourceInfo(node_status["gpu_capacity"]), ResourceInfo(node_status["gpu_allocatable"]))) - gpu_reserved += (node_status["gpu_capacity"] - node_status["gpu_allocatable"]) - gpu_used +=node_status["gpu_used"] - gpu_capacity += node_status["gpu_capacity"] + gpu_used.Add(ResourceInfo(node_status["gpu_used"])) + gpu_capacity.Add(ResourceInfo(node_status["gpu_capacity"])) cluster_status["user_status"] = [] for user_name, user_gpu in user_status.iteritems(): - cluster_status["user_status"].append({"userName":user_name, "userGPU":user_gpu}) + cluster_status["user_status"].append({"userName":user_name, "userGPU":user_gpu.ToSerializable()}) - cluster_status["gpu_avaliable"] = gpu_avaliable - cluster_status["gpu_capacity"] = gpu_capacity - cluster_status["gpu_unschedulable"] = gpu_unschedulable - cluster_status["gpu_used"] = gpu_used - cluster_status["gpu_reserved"] = gpu_reserved + cluster_status["gpu_avaliable"] = gpu_avaliable.ToSerializable() + cluster_status["gpu_capacity"] = gpu_capacity.ToSerializable() + cluster_status["gpu_unschedulable"] = gpu_unschedulable.ToSerializable() + cluster_status["gpu_used"] = gpu_used.ToSerializable() + cluster_status["gpu_reserved"] = gpu_reserved.ToSerializable() cluster_status["node_status"] = [node_status for node_name, node_status in nodes_status.iteritems()] except Exception as e: - print e + logging.exception("get cluster status") + dataHandler = DataHandler() cluster_status["AvaliableJobNum"] = dataHandler.GetActiveJobsCount() - cluster_status["TotalJobNum"] = dataHandler.GetALLJobsCount() + if "cluster_status" in config and check_cluster_status_change(config["cluster_status"],cluster_status): logging.info("updating the cluster status...") dataHandler.UpdateClusterStatus(cluster_status) else: logging.info("nothing changed in cluster, skipping the cluster status update...") + config["cluster_status"] = copy.deepcopy(cluster_status) dataHandler.Close() return cluster_status def Run(): + register_stack_trace_dump() create_log() logging.info("start to update nodes usage information ...") config["cluster_status"] = None + while True: - try: - get_cluster_status() - except Exception as e: - print e - logging.info(str(e)) + update_file_modification_time("node_manager") + + with manager_iteration_histogram.labels("node_manager").time(): + try: + get_cluster_status() + except Exception as e: + logging.exception("get cluster status failed") time.sleep(30) if __name__ == '__main__': - Run() \ No newline at end of file + parser = argparse.ArgumentParser() + parser.add_argument("--port", "-p", help="port of exporter", type=int, default=9202) + args = 
parser.parse_args() + setup_exporter_thread(args.port) + + Run() diff --git a/src/ClusterManager/pod_template.py b/src/ClusterManager/pod_template.py new file mode 100644 index 000000000..ecd228fcc --- /dev/null +++ b/src/ClusterManager/pod_template.py @@ -0,0 +1,145 @@ +import os +import sys +import json +import yaml +from jinja2 import Template +from job import Job + +sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../utils")) +from osUtils import mkdirsAsUser + + +class PodTemplate(): + def __init__(self, template, enable_custom_scheduler=False): + self.template = template + self.enable_custom_scheduler = enable_custom_scheduler + + @staticmethod + def generate_launch_script(job_id, path_to_save, user_id, gpu_num, user_script): + if not os.path.exists(path_to_save): + mkdirsAsUser(path_to_save, user_id) + + file_name = "job_command.sh" + launch_script_file = os.path.join(path_to_save, file_name) + with open(launch_script_file, 'w') as f: + f.write(user_script) + os.system("sudo chown %s %s" % (user_id, launch_script_file)) + luanch_cmd = ["bash", "/pod/scripts/bootstrap.sh"] + return luanch_cmd + + def generate_pod(self, pod): + assert(isinstance(self.template, Template)) + if self.enable_custom_scheduler: + if "useGPUTopology" in pod and pod["useGPUTopology"]: + gpu_topology_flag = 1 + else: + # for cases when desired topology is explictly given or not desired + gpu_topology_flag = 0 + pod_name = pod["podName"] + request_gpu = int(pod["gpuLimit"]) + + podInfo = { + "podname": pod_name, + "requests": { + "alpha.gpu/gpu-generate-topology": gpu_topology_flag + }, + "runningcontainer": { + pod_name: { + "requests": {"alpha.gpu/numgpu": request_gpu} + }, + }, + } + + if "annotations" not in pod: + pod["annotations"] = {} + pod["annotations"]["pod.alpha/DeviceInformation"] = "'" + json.dumps(podInfo) + "'" + # gpu requests specified through annotation + pod["gpuLimit"] = 0 + + pod_yaml = self.template.render(job=pod) + return yaml.full_load(pod_yaml) + + def generate_pods(self, job): + """ + Return (pods, errors) + """ + + assert(isinstance(job, Job)) + params = job.params + if any(required_field not in params for required_field in + [ + "jobtrainingtype", + "jobName", + "jobPath", + "workPath", + "dataPath", + "cmd", + "userId", + "resourcegpu", + "userName", + ]): + return None, "Missing required parameters!" 
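For reference, when `enable_custom_scheduler` is set, `generate_pod()` above does not request GPUs through the normal resource limit: it zeroes `gpuLimit` and instead encodes the request in the `pod.alpha/DeviceInformation` annotation. The standalone sketch below only reproduces that payload so the annotation format is easy to see; the pod name and GPU count are illustrative, the real values come from the `pod` dict passed to `generate_pod()`.

```python
import json

# Hypothetical inputs; in generate_pod() these come from pod["podName"],
# pod["gpuLimit"] and pod["useGPUTopology"].
pod_name = "790a6b30-560f-44a4-a9f0-5d1458dcb0d1-pod-0"
request_gpu = 2
use_gpu_topology = False

pod_info = {
    "podname": pod_name,
    # 1 asks the custom scheduler to generate a GPU topology, 0 disables it
    "requests": {"alpha.gpu/gpu-generate-topology": 1 if use_gpu_topology else 0},
    "runningcontainer": {
        pod_name: {"requests": {"alpha.gpu/numgpu": request_gpu}},
    },
}

# This JSON ends up (single-quoted) in
# metadata.annotations["pod.alpha/DeviceInformation"], while
# spec.containers[0].resources.limits["nvidia.com/gpu"] is set to 0.
print(json.dumps(pod_info))
```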
+ + job.job_path = params["jobPath"] + job.work_path = params["workPath"] + job.data_path = params["dataPath"] + # TODO user's mountpoints first, but should after 'job_path' + job.add_mountpoints(job.job_path_mountpoint()) + job.add_mountpoints({"name": "home", "containerPath": "/home/{}".format(job.get_alias()), "hostPath": job.get_homefolder_hostpath(), "enabled": True}) + if "mountpoints" in params: + job.add_mountpoints(params["mountpoints"]) + job.add_mountpoints(job.work_path_mountpoint()) + job.add_mountpoints(job.data_path_mountpoint()) + params["mountpoints"] = job.mountpoints + + params["user_email"] = params["userName"] + params["homeFolderHostpath"] = job.get_homefolder_hostpath() + params["pod_ip_range"] = job.get_pod_ip_range() + params["usefreeflow"] = job.is_freeflow_enabled() + params["jobNameLabel"] = ''.join(e for e in params["jobName"] if e.isalnum()) + params["rest-api"] = job.get_rest_api_url() + + if "nodeSelector" not in params: + params["nodeSelector"] = {} + if "gpuType" in params: + params["nodeSelector"]["gpuType"] = params["gpuType"] + + local_pod_path = job.get_hostpath(job.job_path, "master") + params["LaunchCMD"] = PodTemplate.generate_launch_script(params["jobId"], local_pod_path, params["userId"], params["resourcegpu"], params["cmd"]) + + if "envs" not in params: + params["envs"] =[] + params["envs"].append({"name": "DLWS_ROLE_NAME", "value": "master"}) + params["envs"].append({"name": "DLWS_NUM_GPU_PER_WORKER", "value": params["resourcegpu"]}) + + pods = [] + if all(hyper_parameter in params for hyper_parameter in ["hyperparametername", "hyperparameterstartvalue", "hyperparameterendvalue", "hyperparameterstep"]): + env_name = params["hyperparametername"] + start = int(params["hyperparameterstartvalue"]) + end = int(params["hyperparameterendvalue"]) + step = int(params["hyperparameterstep"]) + + for idx, val in enumerate(range(start, end, step)): + pod = params.copy() + pod["podName"] = "{0}-pod-{1}".format(job.job_id, idx) + pod["envs"].append({"name": env_name, "value": val}) + pods.append(pod) + else: + pod = params.copy() + pod["podName"] = job.job_id + pods.append(pod) + + k8s_pods = [] + for pod in pods: + pod["numps"] = 0 + pod["numworker"] = 1 + pod["fragmentGpuJob"] = True + pod["gpuLimit"] = pod["resourcegpu"] + + # mount /pod + pod_path = job.get_hostpath(job.job_path, "master") + pod["mountpoints"].append({"name": "pod", "containerPath": "/pod", "hostPath": pod_path, "enabled": True}) + + k8s_pod = self.generate_pod(pod) + k8s_pods.append(k8s_pod) + return k8s_pods, None diff --git a/src/ClusterManager/requirements.txt b/src/ClusterManager/requirements.txt new file mode 100644 index 000000000..07245f179 --- /dev/null +++ b/src/ClusterManager/requirements.txt @@ -0,0 +1,5 @@ +marshmallow==2.19.5 +kubernetes==10.0.0 +PyYAML>=5.1.1 +prometheus-client==0.7.1 +twisted==19.2.1 diff --git a/src/ClusterManager/test_job.py b/src/ClusterManager/test_job.py new file mode 100644 index 000000000..76fa8e299 --- /dev/null +++ b/src/ClusterManager/test_job.py @@ -0,0 +1,176 @@ +import unittest +import json +import sys +import os +from job import Job, JobSchema + +sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../utils")) +from config import config + + +VALID_JOB_ATTRIBUTES = { + "cluster": config, + "jobId": "ce7dca49-28df-450a-a03b-51b9c2ecc69c", + "userName": "user@foo.com", + "jobPath": "user_alias/jobs/date/job_id", +} + + +class TestJobSchema(unittest.TestCase): + + def test_loads(self): + job_json = 
json.dumps(VALID_JOB_ATTRIBUTES) + + job, errors = JobSchema().loads(job_json) + self.assertFalse(errors) + self.assertEqual(job.job_id, VALID_JOB_ATTRIBUTES["jobId"]) + self.assertEqual(job.email, VALID_JOB_ATTRIBUTES["userName"]) + + def test_job_id_schema(self): + job, errors = JobSchema().load(VALID_JOB_ATTRIBUTES) + self.assertFalse(errors) + + # uppercase + attrs = VALID_JOB_ATTRIBUTES.copy() + attrs.update({"jobId": "First-job"}) + job, errors = JobSchema().load(attrs) + self.assertTrue("jobId" in errors) + + # space + attrs = VALID_JOB_ATTRIBUTES.copy() + attrs.update({"jobId": "first job"}) + job, errors = JobSchema().load(attrs) + self.assertTrue("jobId" in errors) + + def test_dump(self): + job = Job( + cluster=config, + job_id="test-job", + email="user@foo.com" + ) + + result, errors = JobSchema().dump(job) + + self.assertFalse(errors) + self.assertEqual(result["jobId"], "test-job") + self.assertEqual(result["userName"], "user@foo.com") + + +class TestJob(unittest.TestCase): + + def create_a_job(self): + job, errors = JobSchema().load(VALID_JOB_ATTRIBUTES) + self.assertFalse(errors) + return job + + def test_add_mountpoints_with_none(self): + job = self.create_a_job() + job.add_mountpoints(None) + + def test_add_mountpoints_without_name(self): + job = self.create_a_job() + + # add one mountpoint without "name" + mountpoint1 = { + "enabled": True, + "containerPath": "/home/username", + "hostPath": "/dlwsdata/work/username", + } + job.add_mountpoints(mountpoint1) + self.assertEqual(1, len(job.mountpoints)) + + def test_add_mountpoints(self): + job = self.create_a_job() + + # add one mountpoint + mountpoint1 = { + "enabled": True, + "containerPath": "/home/username", + "hostPath": "/dlwsdata/work/username", + "name": "homefolder" + } + job.add_mountpoints(mountpoint1) + self.assertEqual(1, len(job.mountpoints)) + + # would silently skip + job.add_mountpoints(mountpoint1) + self.assertEqual(1, len(job.mountpoints)) + + # name would be normalized, only allow alphanumeric, so it would be a duplicate + mountpoint1a = { + "enabled": True, + "containerPath": "/home/path", + "hostPath": "/dlwsdata/work/path", + "name": "homefolder-" + } + job.add_mountpoints(mountpoint1a) + self.assertEqual(1, len(job.mountpoints)) + + # add another mountpoint + mountpoint2 = { + "enabled": True, + "containerPath": "/home/path1", + "hostPath": "/dlwsdata/work/path1", + "name": "homepath1" + } + job.add_mountpoints(mountpoint2) + self.assertEqual(2, len(job.mountpoints)) + + # add a list + mountpoints = [{ + "enabled": True, + "containerPath": "/home/path2", + "hostPath": "/dlwsdata/work/path2", + "name": "homepath2" + }] + job.add_mountpoints(mountpoints) + self.assertEqual(3, len(job.mountpoints)) + + def test_get_homefolder_hostpath(self): + job = self.create_a_job() + self.assertEqual("/dlwsdata/work/user", job.get_homefolder_hostpath()) + + def test_get_hostpath(self): + job = self.create_a_job() + self.assertEqual("user_alias/jobs/date/job_id", job.job_path) + self.assertEqual("/dlwsdata/work/user_alias/jobs/date/job_id", job.get_hostpath(job.job_path)) + + def test_job_work_data_mountpoints(self): + job = self.create_a_job() + + job.job_path = "user_alias/jobs/date/job_id" + job.work_path = "user_alias" + job.data_path = "" + + self.assertEqual("/dlwsdata/work/user_alias/jobs/date/job_id", job.job_path_mountpoint()["hostPath"]) + self.assertEqual("/dlwsdata/work/user_alias", job.work_path_mountpoint()["hostPath"]) + self.assertEqual("/dlwsdata/storage/", job.data_path_mountpoint()["hostPath"]) + + 
job.add_mountpoints(job.job_path_mountpoint()) + job.add_mountpoints(job.work_path_mountpoint()) + job.add_mountpoints(job.data_path_mountpoint()) + self.assertEquals(3, len(job.mountpoints)) + + def test_get_template(self): + job = self.create_a_job() + + self.assertIsNotNone(job.get_template()) + + def test_is_custom_scheduler_enabled(self): + job = self.create_a_job() + + self.assertFalse(job.is_custom_scheduler_enabled()) + + # TODO !!! notice, it would change all the 'cluster' settings + job.cluster["kube_custom_scheduler"] = True + self.assertTrue(job.is_custom_scheduler_enabled()) + + def test_get_rest_api_url(self): + job = self.create_a_job() + + self.assertEqual("http://faked.uri/", job.get_rest_api_url()) + + def test_get_rack(self): + job = self.create_a_job() + + self.assertEqual(None, job.get_rack()) diff --git a/src/ClusterManager/test_job_deployer.py b/src/ClusterManager/test_job_deployer.py new file mode 100644 index 000000000..b5f0ff9df --- /dev/null +++ b/src/ClusterManager/test_job_deployer.py @@ -0,0 +1,135 @@ +import unittest +import kubernetes +import yaml +import string +import random +import time +from kubernetes.client.rest import ApiException + +from job_deployer import JobDeployer + +import logging +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s] %(message)s", + handlers=[ + logging.StreamHandler() + ] +) + + +class TestJobDeployer(unittest.TestCase): + + def create_job_deployer(self): + job_deployer = JobDeployer() + self.assertIsNotNone(job_deployer) + return job_deployer + + def create_pod(self, pod_name): + job_deployer = self.create_job_deployer() + raw_yaml = """ +apiVersion: v1 +kind: Pod +metadata: + name: {} +spec: + containers: + - name: busybox + image: busybox + args: + - sleep + - "1000000" + """.format(pod_name) + body = yaml.full_load(raw_yaml) + + # with self.assertRaises(ApiException): + job_deployer.create_pod(body) + + def test_delete_pod(self): + pod_name = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(16)) + self.create_pod(pod_name) + + job_deployer = self.create_job_deployer() + + job_deployer.delete_pod(pod_name) + + def test_cleanup_pods(self): + job_deployer = self.create_job_deployer() + pod_names = ["pod-1", "pod-2"] + + job_deployer.cleanup_pods(pod_names) + + def test_get_pod_by_label(self): + job_deployer = self.create_job_deployer() + label_selector = "run=some_job_id" + + pods = job_deployer.get_pods(label_selector=label_selector) + + self.assertEqual(0, len(pods)) + + def test_get_services_by_label(self): + job_deployer = self.create_job_deployer() + label_selector = "run=some_job_id" + + services = job_deployer.get_services_by_label(label_selector) + + self.assertEqual(0, len(services)) + + def test_create_endpoint(self): + job_deployer = self.create_job_deployer() + raw_yaml = """ +apiVersion: v1 +kind: Service +metadata: + name: test-service +spec: + selector: + app: MyApp + ports: + - protocol: TCP + port: 80 + targetPort: 9376 + """ + body = yaml.full_load(raw_yaml) + + # with self.assertRaises(ApiException): + job_deployer.create_service(body) + + def test_delete_service(self): + job_deployer = self.create_job_deployer() + + job_deployer.delete_service("test-service") + + def test_pod_exec(self): + job_deployer = self.create_job_deployer() + + pod_name = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(16)) + self.create_pod(pod_name) + time.sleep(3) + + exec_command = [ + '/bin/sh', + '-c', + 'echo This 
message goes to stderr >&2 && echo This message goes to stdout' + ] + + status_code, ouput = job_deployer.pod_exec(pod_name, exec_command) + self.assertEqual(0, status_code) + + bad_command = [ + '/bin/sh', + '-c', + 'echo This message goes to stderr >&2 && xecho This message goes to stdout; sleep 3; exit 8' + ] + status_code, ouput = job_deployer.pod_exec(pod_name, bad_command) + self.assertEqual(8, status_code) + + bad_command = [ + '/bin/sh', + '-c', + 'echo This message goes to stderr >&2 && xecho This message goes to stdout; sleep 3; exit 8' + ] + status_code, ouput = job_deployer.pod_exec(pod_name, bad_command, 1) + self.assertEqual(-1, status_code) + + job_deployer.delete_pod(pod_name) diff --git a/src/ClusterManager/test_job_role.py b/src/ClusterManager/test_job_role.py new file mode 100644 index 000000000..919332053 --- /dev/null +++ b/src/ClusterManager/test_job_role.py @@ -0,0 +1,34 @@ +import unittest +from job_role import JobRole + + +class TestJobRole(unittest.TestCase): + + def test_status_Running(self): + job_role = JobRole("master", "bd3d090a-53b6-4616-9b6c-fe4a86fd68ea-ps0") + + role_status = job_role.status() + self.assertEqual("Running", role_status) + + def test_status_NotFound(self): + job_role = JobRole("master", "bd3d090a-53b6-4616-9b6c-fe4a86fd68ea-ps0-not-found") + + role_status = job_role.status() + self.assertEqual("NotFound", role_status) + + def test_status_Pending(self): + # Pod is running, but mark file not existing: JobRole.MARK_POD_READY_FILE + job_role = JobRole("master", "nginx-cm7kf") + + role_status = job_role.status() + self.assertEqual("Pending", role_status) + + def test_get_job_roles_dist_job(self): + job_roles = JobRole.get_job_roles("bd3d090a-53b6-4616-9b6c-fe4a86fd68ea") + + self.assertEqual(3, len(job_roles)) + + def test_get_job_roles_regular_job(self): + job_roles = JobRole.get_job_roles("8ca7fcdf-c4e7-4687-a3fa-1eeea97415c4") + + self.assertEqual(1, len(job_roles)) diff --git a/src/ClusterManager/test_pod_template.py b/src/ClusterManager/test_pod_template.py new file mode 100644 index 000000000..f7b00537b --- /dev/null +++ b/src/ClusterManager/test_pod_template.py @@ -0,0 +1,196 @@ +import unittest +import json +import yaml +import sys +import os +from job import Job, JobSchema +from pod_template import PodTemplate + +sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../utils")) +from config import config + +VALID_JOB_ATTRIBUTES = { + "cluster": config, + "jobId": "ce7dca49-28df-450a-a03b-51b9c2ecc69c", + "userName": "user@foo.com", + "jobPath": "user_alias/jobs/date/job_id" +} + +job, errors = JobSchema().load(VALID_JOB_ATTRIBUTES) +assert(not errors) + + +class TestPodTemplate(unittest.TestCase): + + def test_generate_launch_script(self): + job_id = "ce7dca49-28df-450a-a03b-51b9c2ecc69c" + path_to_save = "/tmp" + user_id = "20000" + gpu_num = 3 + user_script = "sleep infinity" + + script_file = PodTemplate.generate_launch_script(job_id, path_to_save, user_id, gpu_num, user_script) + + # return the container command + self.assertListEqual(["bash", "/pod/scripts/bootstrap.sh"], script_file) + + def test_pod_template_without_custer_scheduler(self): + enable_custom_scheduler = False + pod_template = PodTemplate(job.get_template(), enable_custom_scheduler) + + pod = {"gpuLimit": 2} + data = pod_template.generate_pod(pod) + + # not eanbled custom scheduler, set the resource limits: spec.containers[].resources.limits + self.assertEqual(pod["gpuLimit"], data["spec"]["containers"][0]["resources"]["limits"]["nvidia.com/gpu"]) 
+ # metadata.annotations["pod.alpha/DeviceInformation"] should be empty + self.assertTrue(("annotations" not in data["metadata"]) or ("pod.alpha/DeviceInformation" not in data["metadata"]["annotations"])) + + def test_generate_pod_with_envs(self): + enable_custom_scheduler = False + pod_template = PodTemplate(job.get_template(), enable_custom_scheduler) + + pod = { + "gpuLimit": 2, + "envs": [{"name": "my_env_name", "value": "my_env_value"}], + } + data = pod_template.generate_pod(pod) + + self.assertIn({"name": "my_env_name", "value": "my_env_value"}, data["spec"]["containers"][0]["env"]) + + def test_generate_pod_with_labels(self): + enable_custom_scheduler = False + pod_template = PodTemplate(job.get_template(), enable_custom_scheduler) + + pod = { + "gpuLimit": 2, + "labels": [{"name": "my_label_name", "value": "my_label_value"}], + } + data = pod_template.generate_pod(pod) + + self.assertEqual("my_label_value", data["metadata"]["labels"]["my_label_name"]) + + def test_pod_template_with_custom_scheduler(self): + enable_custom_scheduler = True + pod_template = PodTemplate(job.get_template(), enable_custom_scheduler) + + gpu_num = 2 + pod = { + "podName": "790a6b30-560f-44a4-a9f0-5d1458dcb0d1-pod-0", + "gpuLimit": gpu_num, + } + data = pod_template.generate_pod(pod) + + # eanbled custom scheduler would clear the resource limits: spec.containers[].resources.limits + self.assertEqual(0, data["spec"]["containers"][0]["resources"]["limits"]["nvidia.com/gpu"]) + + # metadata.annotations["pod.alpha/DeviceInformation"] should be set + # annotations = data["metadata"]["annotations"] + device_annotation = json.loads(data["metadata"]["annotations"]["pod.alpha/DeviceInformation"]) + self.assertEqual(gpu_num, device_annotation["runningcontainer"][pod["podName"]]["requests"]["alpha.gpu/numgpu"]) + # disabled topology + self.assertEqual(0, device_annotation["requests"]["alpha.gpu/gpu-generate-topology"]) + + def test_pod_template_with_custom_scheduler_use_topology(self): + enable_custom_scheduler = True + pod_template = PodTemplate(job.get_template(), enable_custom_scheduler) + + gpu_num = 2 + pod = { + "podName": "790a6b30-560f-44a4-a9f0-5d1458dcb0d1-pod-0", + "gpuLimit": gpu_num, + "useGPUTopology": True + } + data = pod_template.generate_pod(pod) + + # eanbled custom scheduler, clear the resource limits: spec.containers[].resources.limits + self.assertEqual(0, data["spec"]["containers"][0]["resources"]["limits"]["nvidia.com/gpu"]) + + # metadata.annotations["pod.alpha/DeviceInformation"] should be set: + # { + # "requests":{ + # "alpha.gpu/gpu-generate-topology":1 + # }, + # "runningcontainer":{ + # "790a6b30-560f-44a4-a9f0-5d1458dcb0d1-pod-0":{ + # "requests":{ + # "alpha.gpu/numgpu":2 + # } + # } + # }, + # "podname":"790a6b30-560f-44a4-a9f0-5d1458dcb0d1-pod-0" + # } + + # annotations = data["metadata"]["annotations"] + device_annotation = json.loads(data["metadata"]["annotations"]["pod.alpha/DeviceInformation"]) + self.assertEqual(gpu_num, device_annotation["runningcontainer"][pod["podName"]]["requests"]["alpha.gpu/numgpu"]) + # enabled topology + self.assertEqual(1, device_annotation["requests"]["alpha.gpu/gpu-generate-topology"]) + + def test_generate_pods_missing_required_params(self): + enable_custom_scheduler = True + pod_template = PodTemplate(job.get_template(), enable_custom_scheduler) + + job.params = {} + job_description, error = pod_template.generate_pods(job) + + self.assertIsNone(job_description) + self.assertTrue(error) + self.assertEqual("Missing required parameters!", error) + + 
def test_generate_pods(self): + enable_custom_scheduler = True + pod_template = PodTemplate(job.get_template(), enable_custom_scheduler) + + job.params = { + "gid": "20000", + "uid": "20000", + "user": "user", + "mountpoints": [ + { + "description": "NFS (remote file share)", + "enabled": True, + "containerPath": "/home/user", + "hostPath": "/dlwsdata/work/user", + "name": "homefolder" + } + ], + "image": "indexserveregistry.azurecr.io/deepscale:1.0", + "userId": "20000", + "dataPath": "", + "jobId": "140782a0-7f6d-4039-9801-fd6294c7c88a", + "isParent": 1, + "jobType": "training", + "jobPath": "user/jobs/190627/140782a0-7f6d-4039-9801-fd6294c7c88a", + "containerUserId": "0", + "resourcegpu": 1, + "env": [ + ], + "enabledatapath": True, + "runningasroot": True, + "interactivePorts": [ + + ], + "preemptionAllowed": False, + "jobtrainingtype": "RegularJob", + "do_log": False, + "is_interactive": False, + "familyToken": "72fc61265bcb4416b68b44c82d120b3b", + "enableworkpath": True, + "vcName": "vc1", + "userName": "user@foo.com", + "workPath": "user", + "cmd": "sleep infinity", + "jobName": "test-job", + "enablejobpath": True, + "gpuType": "P40", + "ssh": True + } + + pods, error = pod_template.generate_pods(job) + + self.assertFalse(error) + # generate list of pod yamls + self.assertTrue(list, type(pods)) + self.assertEqual(1, len(pods)) + self.assertIsNotNone(pods[0]["spec"]["containers"][0]["command"]) diff --git a/src/ClusterManager/user_manager.py b/src/ClusterManager/user_manager.py index 1357fcd9a..eb85a79a1 100755 --- a/src/ClusterManager/user_manager.py +++ b/src/ClusterManager/user_manager.py @@ -34,11 +34,12 @@ from config import config from DataHandler import DataHandler +from cluster_manager import setup_exporter_thread, manager_iteration_histogram, register_stack_trace_dump, update_file_modification_time -def create_log( logdir = '/var/log/dlworkspace' ): - if not os.path.exists( logdir ): - os.system("mkdir -p " + logdir ) +def create_log(logdir = '/var/log/dlworkspace'): + if not os.path.exists(logdir): + os.system("mkdir -p " + logdir) with open('logging.yaml') as f: logging_config = yaml.load(f) f.close() @@ -61,7 +62,7 @@ def set_user_directory(): logging.info("Found a new user %s" %username) logging.info("Creating home directory %s for user %s" % (userpath, username)) os.system("mkdir -p "+userpath) - os.system("chown -R "+userid+":"+"500000513 "+userpath) + os.system("chown -R "+str(userid)+":"+"500000513 "+userpath) sshkeypath = os.path.join(userpath,".ssh/id_rsa") pubkeypath = os.path.join(userpath,".ssh/id_rsa.pub") @@ -70,25 +71,35 @@ def set_user_directory(): logging.info("Creating sshkey for user %s" % (username)) os.system("mkdir -p "+os.path.dirname(sshkeypath)) os.system("ssh-keygen -t rsa -b 4096 -f %s -P ''" % sshkeypath) - os.system("chown -R "+userid+":"+"500000513 "+userpath) + os.system("chown -R "+str(userid)+":"+"500000513 "+userpath) os.system("chmod 700 -R "+os.path.dirname(sshkeypath)) if not os.path.exists(authorized_keyspath): logging.info("Creating authorized_keys for user %s" % (username)) - os.system("chown -R "+userid+":"+"500000513 "+authorized_keyspath) + os.system("chown -R "+str(userid)+":"+"500000513 "+authorized_keyspath) os.system("cat "+pubkeypath+" >> "+authorized_keyspath) os.system("chmod 644 "+authorized_keyspath) def Run(): + register_stack_trace_dump() create_log() logging.info("start to update user directory...") + while True: - try: - set_user_directory() - except Exception as e: - print e + 
update_file_modification_time("user_manager") + + with manager_iteration_histogram.labels("user_manager").time(): + try: + set_user_directory() + except Exception as e: + logging.exception("set user directory failed") time.sleep(1) if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--port", "-p", help="port of exporter", type=int, default=9201) + args = parser.parse_args() + setup_exporter_thread(args.port) + Run() diff --git a/src/Jobs_Templete/DistJob.yaml.template b/src/Jobs_Templete/DistJob.yaml.template deleted file mode 100755 index be2022f0b..000000000 --- a/src/Jobs_Templete/DistJob.yaml.template +++ /dev/null @@ -1,112 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: {{ job["jobId"] }}-{{ job["distId"] }} - labels: - run: {{ job["jobId"] }} - jobName: {{ job["jobNameLabel"] }} - distRole: {{ job["distRole"] }} - distPort: "{{job["containerPort"]}}" - userName: {{ job["userNameLabel"] }} -spec: - #hostNetwork: true - {% if job["nodeSelector"]|length > 0 %} - nodeSelector: - {% for key, value in job["nodeSelector"].items() %} - {{key}}: {{value}} - {% endfor %} - {% endif %} - {% if job["dnsPolicy"] %} - dnsPolicy: {{ job["dnsPolicy" ]}} - {% endif %} - {% if job["hostNetwork"] %} - hostNetwork: true - {% endif %} - {% if job["hostIPC"] %} - hostIPC: true - {% endif %} - containers: - - name: {{ job["jobId"] }} - image: {{ job["image"] }} - imagePullPolicy: Always - command: {{ job["LaunchCMD"] }} - #container port and host port should be same. - {% if job["isPrivileged"] %} - securityContext: - privileged: true - {% endif %} - ports: - - containerPort: {{job["containerPort"]}} - hostPort: {{job["containerPort"]}} - {% if job["distRole"] =="worker" %} - resources: - limits: - alpha.kubernetes.io/nvidia-gpu: {{ job["resourcegpu"] }} - {% if not job["cpurequest"] %} - requests: - cpu: 1.0 - {% else %} - requests: - cpu: job["cpurequest"] - {% endif %} - {% if job["memoryrequest"] %} - requests: - memory: job["memoryrequest"] - {% endif %} - {% endif %} - volumeMounts: - - mountPath: /freeflow - name: freeflow - {% for mp in job["mountpoints"] %} - - mountPath: {{ mp.containerPath }} - name: {{ mp.name }} - {% endfor %} - {% if not job["dnsPolicy"] %} - - mountPath: /etc/resolv.conf - name: resolv - {% endif %} - env: - - name: FAMILY_TOKEN - value: {{ job["familyToken"] }} - - name: DLWS_REST_API - value: {{ job["rest-api"] }} - - name: VNET_PREFIX - value: {{ job["pod_ip_range"] }} - - name: LD_PRELOAD - value: "/freeflow/libfsocket.so" - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - {% for env in job["env"] %} - - name: {{ env.name }} - value: {{ env.value }} - {% endfor %} - - restartPolicy: Never - volumes: - - name: freeflow - hostPath: - path: /freeflow - {% if not job["dnsPolicy"] %} - - name: resolv - hostPath: - path: /etc/resolv.conf - {% endif %} - - {% for mp in job["mountpoints"] %} - - name: {{ mp.name }} - {% if mp.emptydir %} - emptyDir: {} - {% else %} - hostPath: - path: {{ mp.hostPath }} - {% if mp.type %} - type: {{ mp.type }} - {% endif %} - {% endif %} - {% endfor %} \ No newline at end of file diff --git a/src/Jobs_Templete/RegularJob.yaml.template b/src/Jobs_Templete/RegularJob.yaml.template deleted file mode 100755 index b2ece2cb7..000000000 --- a/src/Jobs_Templete/RegularJob.yaml.template +++ /dev/null @@ -1,124 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: {{ job["podName"] }} - labels: - run: {{ job["jobId"] }} - 
podName: {{ job["podName"] }} - jobName: {{ job["jobNameLabel"] }} - jobId: {{job["jobId"]}} - userName: {{ job["userNameLabel"] }} - {% if "annotations" in job %} - annotations: - {% for annotationKey,annotationVal in job["annotations"].items() %} - {{ annotationKey }}: {{ annotationVal }} - {% endfor %} - {% endif %} -spec: - {% if job["resourcegpu"]|int < 8 %} - nodeSelector: - FragmentGPUJob: active - {% endif %} - {% if job["dnsPolicy"] %} - dnsPolicy: {{ job["dnsPolicy" ]}} - {% endif %} - {% if job["hostNetwork"] %} - hostNetwork: true - {% endif %} - {% if job["hostIPC"] %} - hostIPC: true - {% endif %} - containers: - - name: {{ job["podName"] }} - image: {{ job["image"] }} - imagePullPolicy: Always - command: {{ job["LaunchCMD"] }} - securityContext: - runAsUser: {{ job["containerUserId"] }} - {% if job["isPrivileged"] %} - privileged: true - {% endif %} - resources: - limits: - alpha.kubernetes.io/nvidia-gpu: {{ job["resourcegpu"] }} - {% if not job["cpurequest"] %} - requests: - cpu: 1.0 - {% else %} - requests: - cpu: job["cpurequest"] - {% endif %} - {% if job["memoryrequest"] %} - requests: - memory: job["memoryrequest"] - {% endif %} - - volumeMounts: - {% if not job["dnsPolicy"] %} - - mountPath: /etc/resolv.conf - name: resolv - {% endif %} - {% for mp in job["mountpoints"] %} - {% if mp.enabled %} - - mountPath: {{ mp.containerPath }} - name: {{ mp.name }} - {% if mp.readOnly %} - readOnly: true - {% endif %} - {% endif %} - - {% endfor %} - {% if job["usefreeflow"] %} - - mountPath: /freeflow - name: freeflow - {% endif %} - env: - - name: FAMILY_TOKEN - value: {{ job["familyToken"] }} - - name: DLWS_REST_API - value: {{ job["rest-api"] }} - {% if job["usefreeflow"] %} - - name: VNET_PREFIX - value: {{ job["pod_ip_range"] }} - - name: LD_PRELOAD - value: "/freeflow/libfsocket.so" - {% endif %} - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - {% for env in job["env"] %} - - name: {{ env.name }} - value: "{{ env.value }}" - {% endfor %} - - restartPolicy: Never - volumes: - {% if not job["dnsPolicy"] %} - - name: resolv - hostPath: - path: /etc/resolv.conf - {% endif %} - {% for mp in job["mountpoints"] %} - {% if mp.enabled %} - - name: {{ mp.name }} - {% if mp.emptydir %} - emptyDir: {} - {% else %} - hostPath: - path: {{ mp.hostPath }} - {% if mp.type %} - type: {{ mp.type }} - {% endif %} - {% endif %} - {% endif %} - {% endfor %} - {% if job["usefreeflow"] %} - - name: freeflow - hostPath: - path: /freeflow - {% endif %} \ No newline at end of file diff --git a/src/Jobs_Templete/pod.yaml.template b/src/Jobs_Templete/pod.yaml.template new file mode 100755 index 000000000..af3cc2652 --- /dev/null +++ b/src/Jobs_Templete/pod.yaml.template @@ -0,0 +1,258 @@ +{% if job["distRole"] %} +{% set jobRole = job["distRole"] %} +{% else %} +{% set jobRole = "worker" %} # treat regular job's pod as worker role +{% endif %} + +apiVersion: v1 +kind: Pod +metadata: + name: {{ job["podName"] }} + labels: + run: {{ job["jobId"] }} + podName: {{ job["podName"] }} + jobName: {{ job["jobNameLabel"] }} + jobId: {{ job["jobId"] }} + jobRole: {{ jobRole }} + userName: {{ job["user"] }} + vcName: {{ job["vcName"] }} + type: job + 'gpu-request': '{{ job["gpuLimit"]|int }}' + + {% for label in job["labels"] %} + {{label.name}}: "{{label.value}}" + {% endfor %} + + {% if "gpuType" in job %} + {% if job["gpuType"]|length > 0 %} + gpuType: {{ job["gpuType"] }} + {% endif %} + {% endif %} + 
preemptionAllowed: "{{ job["preemptionAllowed"] }}" + + {% if "annotations" in job %} + annotations: + {% for annotationKey,annotationVal in job["annotations"].items() %} + {{ annotationKey }}: {{ annotationVal }} + {% endfor %} + {% endif %} + +spec: + nodeSelector: + worker: active + {% if job["nodeSelector"]|length > 0 %} + {% for key, value in job["nodeSelector"].items() %} + {{key}}: {{value}} + {% endfor %} + {% endif %} + {% if job["fragmentGpuJob"] %} + FragmentGPUJob: active + {% endif %} + affinity: + podAffinity: + {% if jobRole == "ps" %} + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: # try to put worker & ps in same node + matchExpressions: + - key: "jobId" + operator: In + values: + - "{{ job["jobId"] }}" + - key: "jobRole" + operator: In + values: + - "worker" + topologyKey: "kubernetes.io/hostname" + {% else %} + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 50 # For regular jobs, distributed jobs will consume all gpus in node + podAffinityTerm: + labelSelector: + matchExpressions: + - key: "type" + operator: In + values: + - "job" + topologyKey: "kubernetes.io/hostname" + - weight: 100 # For distributed jobs, try to cluster pod of same job into one region + podAffinityTerm: + labelSelector: + matchExpressions: + - key: jobId + operator: In + values: + - "{{ job["jobId"] }}" + topologyKey: "failure-domain.beta.kubernetes.io/region" + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 50 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: jobId + operator: In + values: + - "{{ job["jobId"] }}" + topologyKey: "failure-domain.beta.kubernetes.io/zone" + {% endif %} + {% if job["dnsPolicy"] %} + dnsPolicy: {{ job["dnsPolicy" ]}} + {% endif %} + {% if job["hostNetwork"] %} + hostNetwork: true + {% endif %} + {% if job["hostIPC"] %} + hostIPC: true + {% endif %} + containers: + - name: {{ job["podName"] }} + image: {{ job["image"] }} + imagePullPolicy: Always + command: {{ job["LaunchCMD"] }} + securityContext: + runAsUser: {{ job["containerUserId"] }} + {% if job["isPrivileged"] %} + privileged: true + {% endif %} + capabilities: + add: + - IPC_LOCK + - SYS_ADMIN + resources: + limits: + nvidia.com/gpu: {{ job["gpuLimit"] }} + {% if not job["cpurequest"] %} + requests: + cpu: 1.0 + {% else %} + requests: + cpu: job["cpurequest"] + {% endif %} + {% if job["memoryrequest"] %} + requests: + memory: job["memoryrequest"] + {% endif %} + volumeMounts: + - name: "dlws-scripts" + mountPath: /pod/scripts + readOnly: true + - name: ssh-volume + mountPath: /home/{{ job["user"] }}/.ssh + - name: id-rsa-volume + mountPath: /home/{{ job["user"] }}/.ssh/id_rsa + readOnly: true + - name: id-rsa-pub-volume + mountPath: /home/{{ job["user"] }}/.ssh/id_rsa.pub + readOnly: true + - name: authorized-keys-volume + mountPath: /home/{{ job["user"] }}/.ssh/authorized_keys + readOnly: true + {% if not job["dnsPolicy"] %} + - mountPath: /etc/resolv.conf + name: resolv + {% endif %} + {% for mp in job["mountpoints"] %} + {% if mp.enabled %} + - mountPath: {{ mp.containerPath }} + name: {{ mp.name }} + {% if mp.readOnly %} + readOnly: true + {% endif %} + {% endif %} + {% endfor %} + {% if job["usefreeflow"] %} + - mountPath: /freeflow + name: freeflow + {% endif %} + - mountPath: /dev/shm + name: dshm + env: + - name: FAMILY_TOKEN + value: {{ job["familyToken"] }} + - name: DLWS_REST_API + value: {{ job["rest-api"] }} + - name: DLWS_JOB_ID + value: {{ job["jobId"] }} + - name: DLWS_NUM_PS + value: "{{ job["numps"] }}" + - name: 
DLWS_NUM_WORKER + value: "{{ job["numworker"] }}" + {% if job["gpuLimit"]|int < 1 %} + - name: NVIDIA_VISIBLE_DEVICES + value: "" + {% endif %} + {% if job["usefreeflow"] %} + - name: VNET_PREFIX + value: {{ job["pod_ip_range"] }} + - name: LD_PRELOAD + value: "/freeflow/libfsocket.so" + {% endif %} + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: DLWS_GID + value: "{{ job["gid"] }}" + - name: DLWS_UID + value: "{{ job["uid"] }}" + - name: DLWS_USER_NAME + value: "{{ job["user"] }}" + - name: DLWS_USER_EMAIL + value: "{{ job["user_email"] }}" + - name: DLWS_VC_NAME + value: {{ job["vcName"] }} + {% for env in job["envs"] %} + - name: {{env.name}} + value: "{{env.value}}" + {% endfor %} + + imagePullSecrets: + - name: regcred + + restartPolicy: Never + volumes: + - name: "dlws-scripts" + configMap: + name: "dlws-scripts" + - name: ssh-volume + emptyDir: {} + - name: id-rsa-volume + hostPath: + path: {{ job["homeFolderHostpath"] }}/.ssh/id_rsa + - name: id-rsa-pub-volume + hostPath: + path: {{ job["homeFolderHostpath"] }}/.ssh/id_rsa.pub + - name: authorized-keys-volume + hostPath: + path: {{ job["homeFolderHostpath"] }}/.ssh/authorized_keys + {% if not job["dnsPolicy"] %} + - name: resolv + hostPath: + path: /etc/resolv.conf + {% endif %} + {% for mp in job["mountpoints"] %} + {% if mp.enabled %} + - name: {{ mp.name }} + {% if mp.emptydir %} + emptyDir: {} + {% else %} + hostPath: + path: {{ mp.hostPath }} + {% if mp.type %} + type: {{ mp.type }} + {% endif %} + {% endif %} + {% endif %} + {% endfor %} + {% if job["usefreeflow"] %} + - name: freeflow + hostPath: + path: /freeflow + {% endif %} + - name: dshm + emptyDir: + medium: Memory diff --git a/src/RestAPI/dlwsrestapi.py b/src/RestAPI/dlwsrestapi.py index a3cc6be68..b1d59af86 100755 --- a/src/RestAPI/dlwsrestapi.py +++ b/src/RestAPI/dlwsrestapi.py @@ -2,37 +2,78 @@ import json import os -from flask import Flask +from flask import Flask, Response from flask_restful import reqparse, abort, Api, Resource from flask import request, jsonify import base64 import yaml +import uuid import logging +import timeit from logging.config import dictConfig +import thread sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../utils")) #from JobRestAPIUtils import SubmitDistJob, GetJobList, GetJobStatus, DeleteJob, GetTensorboard, GetServiceAddress, GetLog, GetJob import JobRestAPIUtils +from authorization import ResourceType, Permission, AuthorizationManager from config import config from config import global_vars +import authorization +from DataHandler import DataHandler + +import time +import sys +import traceback +import threading + +import prometheus_client + +CONTENT_TYPE_LATEST = str("text/plain; version=0.0.4; charset=utf-8") dir_path = os.path.dirname(os.path.realpath(__file__)) with open(os.path.join(dir_path, 'logging.yaml'), 'r') as f: logging_config = yaml.load(f) dictConfig(logging_config) logger = logging.getLogger('restfulapi') -global_vars["logger"] = logger app = Flask(__name__) api = Api(app) verbose = True logger.info( "------------------- Restful API started ------------------------------------- ") +logger.info("%s", config) + +if "initAdminAccess" not in global_vars or not global_vars["initAdminAccess"]: + logger.info("===========Init Admin Access===============") + global_vars["initAdminAccess"] = True + logger.info('setting admin access!') + AuthorizationManager.UpdateAce("Administrator", 
AuthorizationManager.GetResourceAclPath("", ResourceType.Cluster), Permission.Admin, False) + logger.info('admin access given!') + + +def _stacktraces(): + code = [] + for threadId, stack in sys._current_frames().items(): + code.append("\n# ThreadID: %s" % threadId) + for filename, lineno, name, line in traceback.extract_stack(stack): + code.append('File: "%s", line %d, in %s' % (filename, lineno, name)) + if line: + code.append(" %s" % (line.strip())) -logger.info("%s" % config ) + for line in code: + print("_stacktrace: " + line) -parser = reqparse.RequestParser() +def _WorkerThreadFunc(): + while True: + _stacktraces() + time.sleep(60) + +#workerThread = threading.Thread(target=_WorkerThreadFunc, args=()) +#workerThread.daemon = True +#workerThread.start() + def istrue(value): if isinstance(value, bool): @@ -45,7 +86,7 @@ def istrue(value): def tolist(value): if isinstance( value, basestring): if len(value)>0: - return [value] + return [value] else: return [] else: @@ -60,8 +101,10 @@ def getAlias(username): class SubmitJob(Resource): def get(self): + parser = reqparse.RequestParser() parser.add_argument('jobName') parser.add_argument('resourcegpu') + parser.add_argument('gpuType') parser.add_argument('workPath') parser.add_argument('dataPath') parser.add_argument('jobPath') @@ -70,17 +113,19 @@ def get(self): parser.add_argument('logDir') parser.add_argument('interactivePort') parser.add_argument('userName') + parser.add_argument('vcName') + parser.add_argument('preemptionAllowed') parser.add_argument('userId') parser.add_argument('runningasroot') parser.add_argument('containerUserId') - + parser.add_argument('familyToken') parser.add_argument('isParent') parser.add_argument('jobType') - + parser.add_argument('nodeSelector') + parser.add_argument('jobtrainingtype') - parser.add_argument('numps') parser.add_argument('numpsworker') parser.add_argument('nummpiworker') @@ -96,27 +141,34 @@ def get(self): if args["jobName"] is None or len(args["jobName"].strip()) == 0: ret["error"] = "job name cannot be empty" + elif args["vcName"] is None or len(args["vcName"].strip()) == 0: + ret["error"] = "vc name cannot be empty" elif args["resourcegpu"] is None or len(args["resourcegpu"].strip()) == 0: - ret["error"] = "Number of GPU cannot be empty" + ret["error"] = "Number of GPU cannot be empty" + elif args["gpuType"] is None or len(args["gpuType"].strip()) == 0: + ret["error"] = "GPU Type cannot be empty" elif args["dataPath"] is None or len(args["dataPath"].strip()) == 0: - ret["error"] = "datapath cannot be empty" + ret["error"] = "datapath cannot be empty" elif args["image"] is None or len(args["image"].strip()) == 0: - ret["error"] = "docker image cannot be empty" + ret["error"] = "docker image cannot be empty" elif args["jobType"] is None or len(args["jobType"].strip()) == 0: - ret["error"] = "jobType cannot be empty" + ret["error"] = "jobType cannot be empty" else: params["jobName"] = args["jobName"] + params["vcName"] = args["vcName"] params["resourcegpu"] = args["resourcegpu"] + params["gpuType"] = args["gpuType"] params["workPath"] = args["workPath"] params["dataPath"] = args["dataPath"] params["image"] = args["image"] params["cmd"] = args["cmd"] params["jobType"] = args["jobType"] + params["preemptionAllowed"] = args["preemptionAllowed"] params["jobtrainingtype"] = args["jobtrainingtype"] if args["jobtrainingtype"] == "PSDistJob": - params["numps"] = args["numps"] + params["numps"] = 1 params["numpsworker"] = args["numpsworker"] if args["jobtrainingtype"] == "MPIDistJob": @@ -131,9 
+183,13 @@ def get(self): if args["userId"] is not None and len(args["userId"].strip()) > 0: params["userId"] = args["userId"] else: - # !! note: if userId is not provided, the container will be running as root. There shouldn't be any security concern since all the resources in docker container should be user's own property. Also, we plan to allow user to choose "run as root". + # !! note: if userId is not provided, the container will be running as root. There shouldn't be any security concern since all the resources in docker container should be user's own property. Also, we plan to allow user to choose "run as root". params["userId"] = "0" + if args["nodeSelector"] is not None and len(args["nodeSelector"].strip()) > 0: + params["nodeSelector"] = {args["nodeSelector"]:"active"} + + if args["interactivePort"] is not None and len(args["interactivePort"].strip()) > 0: params["interactivePort"] = args["interactivePort"] @@ -155,12 +211,12 @@ def get(self): else: params["isParent"] = "1" params["mountpoints"] = [] - addcmd = "" + addcmd = "" if "mounthomefolder" in config and istrue(config["mounthomefolder"]) and "storage-mount-path" in config: alias = getAlias(params["userName"]) params["mountpoints"].append({"name":"homeholder","containerPath":os.path.join("/home", alias),"hostPath":os.path.join(config["storage-mount-path"], "work", alias)}) if "mountpoints" in config and "storage-mount-path" in config: - # see link_fileshares in deploy.py + # see link_fileshares in deploy.py for k, v in config["mountpoints"].iteritems(): if "mountpoints" in v: for basename in tolist(v["mountpoints"]): @@ -181,7 +237,7 @@ def get(self): if v["type"]=="emptyDir": params["mountpoints"].append({"name":basealias+"-"+oneshare, "containerPath": containerPath, - "hostPath": "/emptydir", + "hostPath": "/emptydir", "emptydir": "yes" }) else: params["mountpoints"].append({"name":basealias+"-"+oneshare, @@ -193,7 +249,7 @@ def get(self): if not os.path.exists(hostPath): cmd = "sudo mkdir -m 0777 -p %s; " % hostPath os.system( cmd ) - logger.info( cmd ) + logger.info( cmd ) if oneshare==alias: cmd = "sudo chown %s:%s %s; " % (params["containerUserId"], "500000513", hostPath ) os.system(cmd ) @@ -202,13 +258,13 @@ def get(self): if oneshare==alias: addcmd += "chown %s:%s %s ; " % ( params["userId"], "500000513", containerPath ) if verbose and len(params["mountpoints"]) > 0: - logger.info("Mount path for job %s" % params ) + logger.info("Mount path for job %s", params ) for mounts in params["mountpoints"]: - logger.info( "Share %s, mount %s at %s" % (mounts["name"], mounts["hostPath"], mounts["containerPath"]) ) + logger.info( "Share %s, mount %s at %s", mounts["name"], mounts["hostPath"], mounts["containerPath"]) if len(addcmd) > 0: params["cmd"] = addcmd + params["cmd"] output = JobRestAPIUtils.SubmitJob(json.dumps(params)) - + if "jobId" in output: ret["jobId"] = output["jobId"] else: @@ -219,23 +275,25 @@ def get(self): resp = jsonify(ret) resp.headers["Access-Control-Allow-Origin"] = "*" - resp.headers["dataType"] = "json" + resp.headers["dataType"] = "json" return resp ## ## Actually setup the Api resource routing here ## api.add_resource(SubmitJob, '/SubmitJob') + + class PostJob(Resource): def post(self): params = request.get_json(force=True) monitor = yaml.safe_dump(params, default_flow_style=False) - logger.info("Post Job" ) - logger.info(monitor ) + logger.info("Post Job") + logger.info(monitor) ret = {} if True: output = JobRestAPIUtils.SubmitJob(json.dumps(params)) - + if "jobId" in output: ret["jobId"] = 
output["jobId"] else: @@ -243,10 +301,10 @@ def post(self): ret["error"] = "Cannot create job!" + output["error"] else: ret["error"] = "Cannot create job!" - logger.info("Submit job through restapi, output is %s, ret is %s" %(output, ret) ) + logger.info("Submit job through restapi, output is %s, ret is %s", output, ret) resp = jsonify(ret) resp.headers["Access-Control-Allow-Origin"] = "*" - resp.headers["dataType"] = "json" + resp.headers["dataType"] = "json" return resp ## ## Actually setup the Api resource routing here @@ -258,19 +316,20 @@ def post(self): # shows a list of all todos, and lets you POST to add new tasks class ListJobs(Resource): def get(self): + parser = reqparse.RequestParser() parser.add_argument('userName') parser.add_argument('num') - args = parser.parse_args() + parser.add_argument('vcName') + parser.add_argument('jobOwner') + args = parser.parse_args() num = None if args["num"] is not None: try: num = int(args["num"]) except: pass - if args["userName"] is not None and len(args["userName"].strip()) > 0: - jobs = JobRestAPIUtils.GetJobList(args["userName"],num) - else: - jobs = [] + jobs = JobRestAPIUtils.GetJobList(args["userName"], args["vcName"], args["jobOwner"], num) + jobList = [] queuedJobs = [] runningJobs = [] @@ -282,10 +341,10 @@ def get(self): job["jobParams"] = json.loads(base64.b64decode(job["jobParams"])) - if "endpoints" in job and job["endpoints"] is not None and (job["endpoints"].strip()) > 0: - job["endpoints"] = json.loads(base64.b64decode(job["endpoints"])) + if "endpoints" in job and job["endpoints"] is not None and len(job["endpoints"].strip()) > 0: + job["endpoints"] = json.loads(job["endpoints"]) - if "jobStatusDetail" in job and job["jobStatusDetail"] is not None and (job["jobStatusDetail"].strip()) > 0: + if "jobStatusDetail" in job and job["jobStatusDetail"] is not None and len(job["jobStatusDetail"].strip()) > 0: try: s = job["jobStatusDetail"] s = base64.b64decode(s) @@ -305,7 +364,6 @@ def get(self): else: finishedJobs.append(job) - ret = {} ret["queuedJobs"] = queuedJobs ret["runningJobs"] = runningJobs @@ -317,7 +375,6 @@ def get(self): resp.headers["dataType"] = "json" return resp - ## ## Actually setup the Api resource routing here ## @@ -327,12 +384,17 @@ def get(self): class KillJob(Resource): def get(self): + parser = reqparse.RequestParser() parser.add_argument('jobId') - args = parser.parse_args() + parser.add_argument('userName') + args = parser.parse_args() jobId = args["jobId"] - result = JobRestAPIUtils.KillJob(jobId) + userName = args["userName"] + result = JobRestAPIUtils.KillJob(userName, jobId) ret = {} if result: + # NOTE "Success" prefix is used in reaper, please also update reaper code + # if need to change it. ret["result"] = "Success, the job is scheduled to be terminated." else: ret["result"] = "Cannot Kill the job. Job ID:" + jobId @@ -341,7 +403,6 @@ def get(self): resp.headers["Access-Control-Allow-Origin"] = "*" resp.headers["dataType"] = "json" - return resp ## ## Actually setup the Api resource routing here @@ -350,12 +411,96 @@ def get(self): +class PauseJob(Resource): + def get(self): + parser = reqparse.RequestParser() + parser.add_argument('jobId') + parser.add_argument('userName') + args = parser.parse_args() + jobId = args["jobId"] + userName = args["userName"] + result = JobRestAPIUtils.PauseJob(userName, jobId) + ret = {} + if result: + ret["result"] = "Success, the job is scheduled to be paused." + else: + ret["result"] = "Cannot pause the job. 
Job ID:" + jobId + + resp = jsonify(ret) + resp.headers["Access-Control-Allow-Origin"] = "*" + resp.headers["dataType"] = "json" + + return resp +## +## Actually setup the Api resource routing here +## +api.add_resource(PauseJob, '/PauseJob') + + + +class ResumeJob(Resource): + def get(self): + parser = reqparse.RequestParser() + parser.add_argument('jobId') + parser.add_argument('userName') + args = parser.parse_args() + jobId = args["jobId"] + userName = args["userName"] + result = JobRestAPIUtils.ResumeJob(userName, jobId) + ret = {} + if result: + ret["result"] = "Success, the job is scheduled to be resumed." + else: + ret["result"] = "Cannot resume the job. Job ID:" + jobId + + resp = jsonify(ret) + resp.headers["Access-Control-Allow-Origin"] = "*" + resp.headers["dataType"] = "json" + + return resp +## +## Actually setup the Api resource routing here +## +api.add_resource(ResumeJob, '/ResumeJob') + + + +class CloneJob(Resource): + def get(self): + parser = reqparse.RequestParser() + parser.add_argument('jobId') + parser.add_argument('userName') + args = parser.parse_args() + jobId = args["jobId"] + userName = args["userName"] + result = JobRestAPIUtils.CloneJob(userName, jobId) + ret = {} + if result: + ret["result"] = "Success, the job is scheduled to be cloned." + else: + ret["result"] = "Cannot clone the job. Job ID:" + jobId + + resp = jsonify(ret) + resp.headers["Access-Control-Allow-Origin"] = "*" + resp.headers["dataType"] = "json" + + return resp +## +## Actually setup the Api resource routing here +## +api.add_resource(CloneJob, '/CloneJob') + + + class ApproveJob(Resource): def get(self): + parser = reqparse.RequestParser() parser.add_argument('jobId') - args = parser.parse_args() + parser.add_argument('userName') + args = parser.parse_args() jobId = args["jobId"] - result = JobRestAPIUtils.ApproveJob(jobId) + userName = args["userName"] + result = JobRestAPIUtils.ApproveJob(userName, jobId) ret = {} if result: ret["result"] = "Success, the job has been approved." 
@@ -366,7 +511,6 @@ def get(self): resp.headers["Access-Control-Allow-Origin"] = "*" resp.headers["dataType"] = "json" - return resp ## ## Actually setup the Api resource routing here @@ -377,10 +521,13 @@ def get(self): class GetCommands(Resource): def get(self): + parser = reqparse.RequestParser() parser.add_argument('jobId') - args = parser.parse_args() + parser.add_argument('userName') + args = parser.parse_args() jobId = args["jobId"] - commands = JobRestAPIUtils.GetCommands(jobId) + userName = args["userName"] + commands = JobRestAPIUtils.GetCommands(userName, jobId) resp = jsonify(commands) resp.headers["Access-Control-Allow-Origin"] = "*" resp.headers["dataType"] = "json" @@ -395,16 +542,19 @@ def get(self): class GetJobDetail(Resource): def get(self): + parser = reqparse.RequestParser() parser.add_argument('jobId') - args = parser.parse_args() + parser.add_argument('userName') + args = parser.parse_args() jobId = args["jobId"] - job = JobRestAPIUtils.GetJobDetail(jobId) + userName = args["userName"] + job = JobRestAPIUtils.GetJobDetail(userName, jobId) job["jobParams"] = json.loads(base64.b64decode(job["jobParams"])) - if "endpoints" in job and job["endpoints"] is not None and (job["endpoints"].strip()) > 0: - job["endpoints"] = json.loads(base64.b64decode(job["endpoints"])) - if "jobStatusDetail" in job and job["jobStatusDetail"] is not None and (job["jobStatusDetail"].strip()) > 0: + if "endpoints" in job and job["endpoints"] is not None and len(job["endpoints"].strip()) > 0: + job["endpoints"] = json.loads(job["endpoints"]) + if "jobStatusDetail" in job and job["jobStatusDetail"] is not None and len(job["jobStatusDetail"].strip()) > 0: try: - job["jobStatusDetail"] = Json.loads(base64.b64decode(job["jobStatusDetail"])) + job["jobStatusDetail"] = Json.loads(base64.b64decode(job["jobStatusDetail"])) except Exception as e: pass if "jobMeta" in job: @@ -420,9 +570,30 @@ def get(self): api.add_resource(GetJobDetail, '/GetJobDetail') +class GetJobStatus(Resource): + def get(self): + parser = reqparse.RequestParser() + parser.add_argument('jobId') + args = parser.parse_args() + jobId = args["jobId"] + job = JobRestAPIUtils.GetJobStatus(jobId) + resp = jsonify(job) + resp.headers["Access-Control-Allow-Origin"] = "*" + resp.headers["dataType"] = "json" + + return resp +## +## Actually setup the Api resource routing here +## +api.add_resource(GetJobStatus, '/GetJobStatus') + class GetClusterStatus(Resource): def get(self): + parser = reqparse.RequestParser() + parser.add_argument('userName') + args = parser.parse_args() + userName = args["userName"] cluster_status, last_updated_time = JobRestAPIUtils.GetClusterStatus() cluster_status["last_updated_time"] = last_updated_time resp = jsonify(cluster_status) @@ -436,20 +607,25 @@ def get(self): api.add_resource(GetClusterStatus, '/GetClusterStatus') - class AddCommand(Resource): def get(self): + parser = reqparse.RequestParser() parser.add_argument('jobId') parser.add_argument('command') - args = parser.parse_args() + parser.add_argument('userName') + args = parser.parse_args() + userName = args["userName"] jobId = args["jobId"] command = args["command"] - result = JobRestAPIUtils.AddCommand(jobId, command) ret = {} - if result: - ret["result"] = "Success, the command is scheduled to be run." + if command is None or len(command) == 0: + ret["result"] = "Cannot Run empty Command. Job ID:" + jobId else: - ret["result"] = "Cannot Run the Command. 
Job ID:" + jobId + result = JobRestAPIUtils.AddCommand(userName, jobId, command) + if result: + ret["result"] = "Success, the command is scheduled to be run." + else: + ret["result"] = "Cannot Run the Command. Job ID:" + jobId resp = jsonify(ret) resp.headers["Access-Control-Allow-Origin"] = "*" @@ -465,13 +641,31 @@ def get(self): class AddUser(Resource): def get(self): + parser = reqparse.RequestParser() parser.add_argument('userName') - parser.add_argument('userId') + parser.add_argument('uid') + parser.add_argument('gid') + parser.add_argument('groups') args = parser.parse_args() - username = args["userName"] - userId = args["userId"] + ret = {} - ret["status"] = JobRestAPIUtils.AddUser(username,userId) + userName = args["userName"] + if args["uid"] is None or len(args["uid"].strip()) == 0: + uid = authorization.INVALID_ID + else: + uid = args["uid"] + + if args["gid"] is None or len(args["gid"].strip()) == 0: + gid = authorization.INVALID_ID + else: + gid = args["gid"] + + if args["groups"] is None or len(args["groups"].strip()) == 0: + groups = [] + else: + groups = args["groups"] + + ret["status"] = JobRestAPIUtils.AddUser(userName, uid, gid, groups) resp = jsonify(ret) resp.headers["Access-Control-Allow-Origin"] = "*" resp.headers["dataType"] = "json" @@ -483,7 +677,664 @@ def get(self): api.add_resource(AddUser, '/AddUser') +class UpdateAce(Resource): + def get(self): + parser = reqparse.RequestParser() + parser.add_argument('userName') + parser.add_argument('identityName') + parser.add_argument('resourceType') + parser.add_argument('resourceName') + parser.add_argument('permissions') + args = parser.parse_args() + username = args["userName"] + identityName = str(args["identityName"]) + resourceType = int(args["resourceType"]) + resourceName = str(args["resourceName"]) + permissions = int(args["permissions"]) + ret = {} + ret["result"] = JobRestAPIUtils.UpdateAce(username, identityName, resourceType, resourceName, permissions) + resp = jsonify(ret) + resp.headers["Access-Control-Allow-Origin"] = "*" + resp.headers["dataType"] = "json" + + return resp +## +## Actually setup the Api resource routing here +## +api.add_resource(UpdateAce, '/UpdateAce') + + +class DeleteAce(Resource): + def get(self): + parser = reqparse.RequestParser() + parser.add_argument('userName') + parser.add_argument('identityName') + parser.add_argument('resourceType') + parser.add_argument('resourceName') + args = parser.parse_args() + username = args["userName"] + identityName = str(args["identityName"]) + resourceType = int(args["resourceType"]) + resourceName = str(args["resourceName"]) + ret = {} + ret["result"] = JobRestAPIUtils.DeleteAce(username, identityName, resourceType, resourceName) + resp = jsonify(ret) + resp.headers["Access-Control-Allow-Origin"] = "*" + resp.headers["dataType"] = "json" + + return resp +## +## Actually setup the Api resource routing here +## +api.add_resource(DeleteAce, '/DeleteAce') + + +class IsClusterAdmin(Resource): + def get(self): + parser = reqparse.RequestParser() + parser.add_argument('userName') + args = parser.parse_args() + username = args["userName"] + ret = {} + ret["result"] = AuthorizationManager.IsClusterAdmin(username) + resp = jsonify(ret) + resp.headers["Access-Control-Allow-Origin"] = "*" + resp.headers["dataType"] = "json" + + return resp +## +## Actually setup the Api resource routing here +## +api.add_resource(IsClusterAdmin, '/IsClusterAdmin') + + +class GetACL(Resource): + def get(self): + parser = reqparse.RequestParser() + 
parser.add_argument('userName') + args = parser.parse_args() + username = args["userName"] + ret = {} + ret["result"] = AuthorizationManager.GetAcl(username) + resp = jsonify(ret) + resp.headers["Access-Control-Allow-Origin"] = "*" + resp.headers["dataType"] = "json" + + return resp +## +## Actually setup the Api resource routing here +## +api.add_resource(GetACL, '/GetACL') + + +class ListVCs(Resource): + def get(self): + parser = reqparse.RequestParser() + parser.add_argument('userName') + args = parser.parse_args() + userName = args["userName"] + ret = {} + ret["result"] = JobRestAPIUtils.ListVCs(userName) + + resp = jsonify(ret) + resp.headers["Access-Control-Allow-Origin"] = "*" + resp.headers["dataType"] = "json" + + return resp + +## +## Actually setup the Api resource routing here +## +api.add_resource(ListVCs, '/ListVCs') + + +class GetVC(Resource): + def get(self): + parser = reqparse.RequestParser() + parser.add_argument('userName') + parser.add_argument('vcName') + args = parser.parse_args() + userName = args["userName"] + vcName = args["vcName"] + ret = JobRestAPIUtils.GetVC(userName, vcName) + + resp = jsonify(ret) + resp.headers["Access-Control-Allow-Origin"] = "*" + resp.headers["dataType"] = "json" + + return resp + +## +## Actually setup the Api resource routing here +## +api.add_resource(GetVC, '/GetVC') + + +class AddVC(Resource): + def get(self): + parser = reqparse.RequestParser() + parser.add_argument('vcName') + parser.add_argument('quota') + parser.add_argument('metadata') + parser.add_argument('userName') + args = parser.parse_args() + vcName = args["vcName"] + quota = args["quota"] + metadata = args["metadata"] + userName = args["userName"] + ret = {} + ret["result"] = JobRestAPIUtils.AddVC(userName, vcName, quota, metadata) + + resp = jsonify(ret) + resp.headers["Access-Control-Allow-Origin"] = "*" + resp.headers["dataType"] = "json" + + return resp +## +## Actually setup the Api resource routing here +## +api.add_resource(AddVC, '/AddVC') + + +class DeleteVC(Resource): + def get(self): + parser = reqparse.RequestParser() + parser.add_argument('vcName') + parser.add_argument('userName') + args = parser.parse_args() + vcName = args["vcName"] + userName = args["userName"] + ret = {} + ret["result"] = JobRestAPIUtils.DeleteVC(userName, vcName) + resp = jsonify(ret) + resp.headers["Access-Control-Allow-Origin"] = "*" + resp.headers["dataType"] = "json" + + return resp +## +## Actually setup the Api resource routing here +## +api.add_resource(DeleteVC, '/DeleteVC') + + +class UpdateVC(Resource): + def get(self): + parser = reqparse.RequestParser() + parser.add_argument('vcName') + parser.add_argument('quota') + parser.add_argument('metadata') + parser.add_argument('userName') + args = parser.parse_args() + vcName = args["vcName"] + quota = args["quota"] + metadata = args["metadata"] + userName = args["userName"] + ret = {} + ret["result"] = JobRestAPIUtils.UpdateVC(userName, vcName, quota, metadata) + + resp = jsonify(ret) + resp.headers["Access-Control-Allow-Origin"] = "*" + resp.headers["dataType"] = "json" + + return resp +## +## Actually setup the Api resource routing here +## +api.add_resource(UpdateVC, '/UpdateVC') + + +class ListStorages(Resource): + def get(self): + parser = reqparse.RequestParser() + parser.add_argument('vcName') + parser.add_argument('userName') + args = parser.parse_args() + vcName = args["vcName"] + userName = args["userName"] + ret = {} + ret["result"] = JobRestAPIUtils.ListStorages(userName, vcName) + resp = jsonify(ret) + 
resp.headers["Access-Control-Allow-Origin"] = "*" + resp.headers["dataType"] = "json" + + return resp +## +## Actually setup the Api resource routing here +## +api.add_resource(ListStorages, '/ListStorages') + + +class AddStorage(Resource): + def get(self): + parser = reqparse.RequestParser() + parser.add_argument('vcName') + parser.add_argument('storageType') + parser.add_argument('url') + parser.add_argument('metadata') + + parser.add_argument('defaultMountPath') + parser.add_argument('userName') + args = parser.parse_args() + vcName = args["vcName"] + storageType = args["storageType"] + url = args["url"] + + metadata = args["metadata"] + defaultMountPath = args["defaultMountPath"] + userName = args["userName"] + ret = {} + ret["result"] = JobRestAPIUtils.AddStorage(userName, vcName, url, storageType, metadata, defaultMountPath) + resp = jsonify(ret) + resp.headers["Access-Control-Allow-Origin"] = "*" + resp.headers["dataType"] = "json" + + return resp +## +## Actually setup the Api resource routing here +## +api.add_resource(AddStorage, '/AddStorage') + + +class DeleteStorage(Resource): + def get(self): + parser = reqparse.RequestParser() + parser.add_argument('vcName') + parser.add_argument('userName') + parser.add_argument('url') + args = parser.parse_args() + vcName = args["vcName"] + userName = args["userName"] + url = args["url"] + ret = {} + ret["result"] = JobRestAPIUtils.DeleteStorage(userName, vcName, url) + resp = jsonify(ret) + resp.headers["Access-Control-Allow-Origin"] = "*" + resp.headers["dataType"] = "json" + + return resp +## +## Actually setup the Api resource routing here +## +api.add_resource(DeleteStorage, '/DeleteStorage') + +class UpdateStorage(Resource): + def get(self): + parser = reqparse.RequestParser() + parser.add_argument('vcName') + parser.add_argument('storageType') + parser.add_argument('url') + parser.add_argument('metadata') + + parser.add_argument('defaultMountPath') + parser.add_argument('userName') + args = parser.parse_args() + vcName = args["vcName"] + storageType = args["storageType"] + url = args["url"] + metadata = args["metadata"] + defaultMountPath = args["defaultMountPath"] + userName = args["userName"] + ret = {} + ret["result"] = JobRestAPIUtils.UpdateStorage(userName, vcName, url, storageType, metadata, defaultMountPath) + + resp = jsonify(ret) + resp.headers["Access-Control-Allow-Origin"] = "*" + resp.headers["dataType"] = "json" + + return resp +## +## Actually setup the Api resource routing here +## +api.add_resource(UpdateStorage, '/UpdateStorage') + +def getAlias(username): + if "@" in username: + return username.split("@")[0].strip() + if "/" in username: + return username.split("/")[1].strip() + return username + + +class Endpoint(Resource): + def get(self): + '''return job["endpoints"]: curl -X GET /endpoints?jobId=...&userName=...''' + parser = reqparse.RequestParser() + parser.add_argument('jobId') + parser.add_argument('userName') + args = parser.parse_args() + jobId = args["jobId"] + username = args["userName"] + job = JobRestAPIUtils.GetJobDetail(username, jobId) + + rets = [] + try: + endpoints = json.loads(job["endpoints"]) + except: + endpoints = {} + + for [_, endpoint] in endpoints.items(): + ret = { + "id": endpoint["id"], + "name": endpoint["name"], + "username": endpoint["username"], + "status": endpoint["status"], + "hostNetwork": endpoint["hostNetwork"], + "podName": endpoint["podName"], + "domain": config["domain"], + } + if "podPort" in endpoint: + ret["podPort"] = endpoint["podPort"] + if endpoint["status"] == 
"running": + if endpoint["hostNetwork"]: + port = int(endpoint["endpointDescription"]["spec"]["ports"][0]["port"]) + else: + port = int(endpoint["endpointDescription"]["spec"]["ports"][0]["nodePort"]) + ret["port"] = port + if "nodeName" in endpoint: + ret["nodeName"] = endpoint["nodeName"] + rets.append(ret) + + resp = jsonify(rets) + resp.headers["Access-Control-Allow-Origin"] = "*" + resp.headers["dataType"] = "json" + return resp + + def post(self): + '''set job["endpoints"]: curl -X POST -H "Content-Type: application/json" /endpoints --data "{'jobId': ..., 'endpoints': ['ssh', 'ipython'] }"''' + params = request.get_json(silent=True) + job_id = params["jobId"] + requested_endpoints = params["endpoints"] + + # get the job + job = JobRestAPIUtils.get_job(job_id) + job_params = json.loads(base64.b64decode(job["jobParams"])) + job_type = job_params["jobtrainingtype"] + + # get pods + pod_names = [] + if job_type == "RegularJob": + pod_names.append(job_id) + else: + nums = {"ps": int(job_params["numps"]), "worker": int(job_params["numpsworker"])} + for role in ["ps", "worker"]: + for i in range(nums[role]): + pod_names.append(job_id + "-" + role + str(i)) + + interactive_ports = [] + # endpoints should be ["ssh", "ipython", "tensorboard", {"name": "port name", "podPort": "port on pod in 40000-49999"}] + for interactive_port in [ elem for elem in requested_endpoints if elem not in ["ssh", "ipython", "tensorboard"] ]: + if any(required_field not in interactive_port for required_field in ["name", "podPort"]): + # if ["name", "port"] not in interactive_port: + return ("Bad request, interactive port should have \"name\" and \"podPort\"]: %s" % requested_endpoints), 400 + if int(interactive_port["podPort"]) < 40000 or int(interactive_port["podPort"]) > 49999: + return ("Bad request, interactive podPort should in range 40000-49999: %s" % requested_endpoints), 400 + if len(interactive_port["name"]) > 16: + return ("Bad request, interactive port name length shoule be less than 16: %s" % requested_endpoints), 400 + interactive_ports.append(interactive_port) + + # HostNetwork + if "hostNetwork" in job_params and job_params["hostNetwork"] == True: + host_network = True + else: + host_network = False + + # username + username = getAlias(job["userName"]) + + endpoints = {} + + def endpoint_exist(endpoint_id): + try: + curr_endpoints = json.loads(job["endpoints"]) + except: + curr_endpoints = {} + + if endpoint_id in curr_endpoints: + return True + return False + + if "ssh" in requested_endpoints: + # setup ssh for each pod + for pod_name in pod_names: + endpoint_id = "e-" + pod_name + "-ssh" + + if endpoint_exist(endpoint_id=endpoint_id): + logger.info("Endpoint %s exists. Skip.", endpoint_id) + continue + logger.info("Endpoint %s does not exist. Add.", endpoint_id) + + endpoint = { + "id": endpoint_id, + "jobId": job_id, + "podName": pod_name, + "username": username, + "name": "ssh", + "status": "pending", + "hostNetwork": host_network + } + endpoints[endpoint_id] = endpoint + + # Only open Jupyter on the master + if 'ipython' in requested_endpoints: + if job_type == "RegularJob": + pod_name = pod_names[0] + else: + # For a distributed job, we set up jupyter on first worker node. + # PS node does not have GPU access. + # TODO: Simplify code logic after removing PS + pod_name = pod_names[1] + + endpoint_id = "e-" + job_id + "-ipython" + + if not endpoint_exist(endpoint_id=endpoint_id): + logger.info("Endpoint %s does not exist. 
Add.", endpoint_id) + endpoint = { + "id": endpoint_id, + "jobId": job_id, + "podName": pod_name, + "username": username, + "name": "ipython", + "status": "pending", + "hostNetwork": host_network + } + endpoints[endpoint_id] = endpoint + else: + logger.info("Endpoint %s exists. Skip.", endpoint_id) + + # Only open tensorboard on the master + if 'tensorboard' in requested_endpoints: + if job_type == "RegularJob": + pod_name = pod_names[0] + else: + # For a distributed job, we set up tensorboard on the first worker node. + # PS node does not have GPU access. + # TODO: Simplify code logic after removing PS + pod_name = pod_names[1] + + endpoint_id = "e-" + job_id + "-tensorboard" + + if not endpoint_exist(endpoint_id=endpoint_id): + logger.info("Endpoint %s does not exist. Add.", endpoint_id) + endpoint = { + "id": endpoint_id, + "jobId": job_id, + "podName": pod_name, + "username": username, + "name": "tensorboard", + "status": "pending", + "hostNetwork": host_network + } + endpoints[endpoint_id] = endpoint + else: + logger.info("Endpoint %s exists. Skip.", endpoint_id) + + # interactive port + for interactive_port in interactive_ports: + if job_type == "RegularJob": + pod_name = pod_names[0] + else: + # For a distributed job, we set up the interactive endpoint on the first worker node. + # PS node does not have GPU access. + # TODO: Simplify code logic after removing PS + pod_name = pod_names[1] + + endpoint_id = "e-" + job_id + "-" + interactive_port["name"] + if not endpoint_exist(endpoint_id=endpoint_id): + logger.info("Endpoint %s does not exist. Add.", endpoint_id) + endpoint = { + "id": endpoint_id, + "jobId": job_id, + "podName": pod_name, + "username": username, + "name": interactive_port["name"], + "podPort": interactive_port["podPort"], + "status": "pending", + "hostNetwork": host_network + } + endpoints[endpoint_id] = endpoint + else: + logger.info("Endpoint %s exists. 
Skip.", endpoint_id) + + data_handler = DataHandler() + for [_, endpoint] in endpoints.items(): + data_handler.UpdateEndpoint(endpoint) + + resp = jsonify(endpoints) + resp.headers["Access-Control-Allow-Origin"] = "*" + resp.headers["dataType"] = "json" + return resp + + +## +## Actually setup the Endpoint resource routing here +## +api.add_resource(Endpoint, '/endpoints') + + +class Templates(Resource): + def get(self): + parser = reqparse.RequestParser() + parser.add_argument('vcName', location="args") + parser.add_argument('userName', location="args") + args = parser.parse_args() + vcName = args["vcName"] + userName = args["userName"] + + dataHandler = DataHandler() + ret = dataHandler.GetTemplates("master") or [] + ret += dataHandler.GetTemplates("vc:" + vcName) or [] + ret += dataHandler.GetTemplates("user:" + userName) or [] + resp = jsonify(ret) + resp.headers["Access-Control-Allow-Origin"] = "*" + resp.headers["dataType"] = "json" + + return resp + + def post(self): + parser = reqparse.RequestParser() + parser.add_argument('vcName', location="args") + parser.add_argument('userName', location="args") + parser.add_argument('database', location="args") + parser.add_argument('templateName', location="args") + args = parser.parse_args() + vcName = args["vcName"] + userName = args["userName"] + database = args["database"] + templateName = args["templateName"] + + if database == 'master': + if AuthorizationManager.HasAccess(userName, ResourceType.Cluster, "", Permission.Admin): + scope = 'master' + else: + return 'access denied', 403; + elif database == 'vc': + if AuthorizationManager.HasAccess(userName, ResourceType.VC, vcName, Permission.Admin): + scope = 'vc:' + vcName + else: + return 'access denied', 403; + else: + scope = 'user:' + userName + template_json = request.json + + if template_json is None: + return jsonify(result=False, message="Invalid JSON") + + dataHandler = DataHandler() + ret = {} + ret["result"] = dataHandler.UpdateTemplate(templateName, scope, json.dumps(template_json)) + resp = jsonify(ret) + resp.headers["Access-Control-Allow-Origin"] = "*" + resp.headers["dataType"] = "json" + + return resp + + def delete(self): + parser = reqparse.RequestParser() + parser.add_argument('vcName', location="args") + parser.add_argument('userName', location="args") + parser.add_argument('database', location="args") + parser.add_argument('templateName', location="args") + args = parser.parse_args() + vcName = args["vcName"] + userName = args["userName"] + database = args["database"] + templateName = args["templateName"] + + if database == 'master': + if AuthorizationManager.HasAccess(userName, ResourceType.Cluster, "", Permission.Admin): + scope = 'master' + else: + return 'access denied', 403; + elif database == 'vc': + if AuthorizationManager.HasAccess(userName, ResourceType.VC, vcName, Permission.Admin): + scope = 'vc:' + vcName + else: + return 'access denied', 403; + else: + scope = 'user:' + userName + + dataHandler = DataHandler() + ret = {} + ret["result"] = dataHandler.DeleteTemplate(templateName, scope) + resp = jsonify(ret) + resp.headers["Access-Control-Allow-Origin"] = "*" + resp.headers["dataType"] = "json" + + return resp + +api.add_resource(Templates, '/templates') + + +class JobPriority(Resource): + def get(self): + job_priorites = JobRestAPIUtils.get_job_priorities() + resp = jsonify(job_priorites) + resp.headers["Access-Control-Allow-Origin"] = "*" + resp.headers["dataType"] = "json" + return resp + + def post(self): + payload = request.get_json(silent=True) + 
success = JobRestAPIUtils.update_job_priorites(payload) + http_status = 200 if success else 400 + + job_priorites = JobRestAPIUtils.get_job_priorities() + resp = jsonify(job_priorites) + resp.headers["Access-Control-Allow-Origin"] = "*" + resp.headers["dataType"] = "json" + resp.status_code = http_status + return resp + +## +## Actually setup the Api resource routing here +## +api.add_resource(JobPriority, '/jobs/priorities') + +@app.route("/metrics") +def metrics(): + return Response(prometheus_client.generate_latest(), mimetype=CONTENT_TYPE_LATEST) if __name__ == '__main__': app.run(debug=False,host="0.0.0.0",threaded=True) + diff --git a/src/RestAPI/logging.yaml b/src/RestAPI/logging.yaml index d42108867..fea884c5c 100755 --- a/src/RestAPI/logging.yaml +++ b/src/RestAPI/logging.yaml @@ -1,26 +1,27 @@ -version: 1 -formatters: - simple: - format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s' -handlers: - console: - class: logging.StreamHandler - level: DEBUG - formatter: simple - stream: ext://sys.stdout - file: - class : logging.handlers.RotatingFileHandler - formatter: simple - filename: /var/log/apache2/restfulapi.log - # roll over at 10MB - maxBytes: 10240000 - # At most 10 logging files - backupCount: 10 -loggers: - basic: - level: DEBUG - handlers: ['console', 'file'] - propagate: no -root: - level: DEBUG - handlers: ['console', 'file'] +version: 1 +disable_existing_loggers: False +formatters: + simple: + format: '%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s' +handlers: + console: + class: logging.StreamHandler + level: INFO + formatter: simple + stream: ext://sys.stdout + file: + class : logging.handlers.RotatingFileHandler + formatter: simple + filename: /var/log/apache2/restfulapi.log + # roll over at 10MB + maxBytes: 10240000 + # At most 10 logging files + backupCount: 10 +loggers: + basic: + level: INFO + handlers: ['console', 'file'] + propagate: no +root: + level: INFO + handlers: ['console', 'file'] diff --git a/src/WebUI/dotnet/WebPortal/Controllers/AccountController.cs b/src/WebUI/dotnet/WebPortal/Controllers/AccountController.cs index 04f814393..50cbdb31a 100755 --- a/src/WebUI/dotnet/WebPortal/Controllers/AccountController.cs +++ b/src/WebUI/dotnet/WebPortal/Controllers/AccountController.cs @@ -78,14 +78,16 @@ public async Task LogOff() HttpContext.Session.Remove("Email"); HttpContext.Session.Remove("TenantID"); HttpContext.Session.Remove("uid"); - HttpContext.Session.Remove("gid"); - HttpContext.Session.Remove("Restapi"); - HttpContext.Session.Remove("WorkFolderAccessPoint"); + HttpContext.Session.Remove("gid"); + HttpContext.Session.Remove("WorkFolderAccessPoint"); HttpContext.Session.Remove("DataFolderAccessPoint"); HttpContext.Session.Remove("AuthorizedClusters"); HttpContext.Session.Remove("CurrentClusters"); HttpContext.Session.Remove("Username"); HttpContext.Session.Remove("ClustersList"); + HttpContext.Session.Remove("Teams"); + HttpContext.Session.Remove("Team"); + HttpContext.Session.Remove("TeamClusters"); await HttpContext.Authentication.SignOutAsync(CookieAuthenticationDefaults.AuthenticationScheme); } diff --git a/src/WebUI/dotnet/WebPortal/Controllers/HomeController.cs b/src/WebUI/dotnet/WebPortal/Controllers/HomeController.cs old mode 100755 new mode 100644 index 63ccb3ab9..466a74e4e --- a/src/WebUI/dotnet/WebPortal/Controllers/HomeController.cs +++ b/src/WebUI/dotnet/WebPortal/Controllers/HomeController.cs @@ -39,6 +39,12 @@ public class HomeController : Controller private readonly AppSettings _appSettings; private 
readonly ILogger _logger; private IAzureAdTokenService _tokenCache; + private string GetClusterHostname() + { + var cluster = HttpContext.Request.Query["cluster"]; + var restapi = Startup.Clusters[cluster].Restapi; + return new Uri(restapi).Host; + } public HomeController(IOptions appSettings, IAzureAdTokenService tokenCache, ILoggerFactory logger) { @@ -67,14 +73,11 @@ private void UserUnauthorized() HttpContext.Session.SetString("gid", "9999999"); HttpContext.Session.SetString("isAdmin", "false"); HttpContext.Session.SetString("isAuthorized", "false"); - HttpContext.Session.SetString("Restapi", ""); HttpContext.Session.SetString("WorkFolderAccessPoint", ""); HttpContext.Session.SetString("DataFolderAccessPoint", ""); - } - // Add user to the system, with a list of clusters that the user is authorized for - private async Task AddUser(UserEntry userEntry, string clusterName) + private void AddUserToSession(UserEntry userEntry, string clusterName) { var email = userEntry.Alias; HttpContext.Session.SetString("Email", userEntry.Alias); @@ -85,24 +88,29 @@ private async Task AddUser(UserEntry userEntry, string clusterName) HttpContext.Session.SetString("isAdmin", userEntry.isAdmin); HttpContext.Session.SetString("isAuthorized", userEntry.isAuthorized); var clusterInfo = Startup.Clusters[clusterName]; - HttpContext.Session.SetString("Restapi", clusterInfo.Restapi); HttpContext.Session.SetString("WorkFolderAccessPoint", clusterInfo.WorkFolderAccessPoint); HttpContext.Session.SetString("DataFolderAccessPoint", clusterInfo.DataFolderAccessPoint); HttpContext.Session.SetString("smbUsername", clusterInfo.smbUsername); HttpContext.Session.SetString("smbUserPassword", clusterInfo.smbUserPassword); + + _logger.LogInformation("User {0} log in, Uid {1}, Gid {2}, isAdmin {3}, isAuthorized {4}", + email, userEntry.uid, userEntry.gid, userEntry.isAdmin, userEntry.isAuthorized); + } - - if (userEntry.isAuthorized == "true") + // Add user to the system, with a list of clusters that the user is authorized for + private async Task AddUser(UserID userID, List groups, string clusterName) + { + var clusterInfo = Startup.Clusters[clusterName]; + var url = clusterInfo.Restapi + "/AddUser?userName=" + HttpContext.Session.GetString("Email") + + "&userId=" + userID.uid + + "&uid=" + userID.uid + + "&gid=" + userID.gid + + "&groups=" + JsonConvert.SerializeObject(groups); + using (var httpClient1 = new HttpClient()) { - var url = clusterInfo.Restapi + "/AddUser?userName=" + HttpContext.Session.GetString("Email") + "&userId=" + userEntry.uid; - using (var httpClient1 = new HttpClient()) - { - var response2 = await httpClient1.GetAsync(url); - var content1 = await response2.Content.ReadAsStringAsync(); - } + var response2 = await httpClient1.GetAsync(url); + var content1 = await response2.Content.ReadAsStringAsync(); } - _logger.LogInformation("User {0} log in, Uid {1}, Gid {2}, isAdmin {3}, isAuthorized {4}", - email, userEntry.uid, userEntry.gid, userEntry.isAdmin, userEntry.isAuthorized); return true; } @@ -439,7 +447,7 @@ private async Task AuthenticateByOneDB(string email, string tenantID, UserEntry ret = null; // Prior entry exists? await priorEntrys.ForEachAsync(entry => - { + { // We will not update existing entry in database. 
// db.Entry(entry).CurrentValues.SetValues(userEntry); ret = entry; @@ -452,8 +460,8 @@ await priorEntrys.ForEachAsync(entry => { string password = Guid.NewGuid().ToString().Substring(0, 8); UserEntry userEntry = new UserEntry(userID, email, email, password); - await db.User.AddAsync(userEntry); - await db.SaveChangesAsync(); + db.User.Add(userEntry); + db.SaveChanges(); return userEntry; } else @@ -630,9 +638,62 @@ public async Task UpdateUserToAll(string email, UserID userID) return 0; }*/ -#region ASP Controllers + private async Task GetTeams() + { + var teams = new HashSet(); + var authorizedClusters = JsonConvert.DeserializeObject>(HttpContext.Session.GetString("AuthorizedClusters")); + + foreach (var cluster in authorizedClusters) + { + var restapi = Startup.Clusters[cluster].Restapi; + var url = restapi + "/ListVCs?userName=" + HttpContext.Session.GetString("Email"); + using (var httpClient = new HttpClient()) + { + var response = await httpClient.GetAsync(url); + var content = await response.Content.ReadAsStringAsync(); + var jContent = JObject.Parse(content); + var jResult = jContent["result"] as JArray; + foreach (var jVC in jResult) + { + teams.Add(jVC["vcName"].Value()); + } + } + } + return teams.ToArray(); + } + + public static async Task GetTeamClusters(HttpContext HttpContext, string team) + { + var clusters = new List(); + var authorizedClusters = JsonConvert.DeserializeObject>(HttpContext.Session.GetString("AuthorizedClusters")); + + foreach (var cluster in authorizedClusters) + { + var restapi = Startup.Clusters[cluster].Restapi; + var url = restapi + "/ListVCs?userName=" + HttpContext.Session.GetString("Email"); + using (var httpClient = new HttpClient()) + { + var response = await httpClient.GetAsync(url); + var content = await response.Content.ReadAsStringAsync(); + var jContent = JObject.Parse(content); + var jResult = jContent["result"] as JArray; + foreach (var jVC in jResult) + { + if (team == jVC["vcName"].Value()) + { + clusters.Add(cluster); + break; + } + } + } + } + return clusters.ToArray(); + } + + #region ASP Controllers public async Task Index() { + ViewData["AddGroupLink"] = ConfigurationParser.GetConfiguration("AddGroupLink"); if (User.Identity.IsAuthenticated && !HttpContext.Session.Keys.Contains("uid")) { string userObjectID = null; @@ -668,7 +729,7 @@ public async Task Index() } } - if (!String.IsNullOrEmpty(useServer)) + if (!String.IsNullOrEmpty(useServer)) { _logger.LogDebug($"Attempt to contact WinBind server {useServer} for membershhip"); var userID = await FindGroupMembershipByServer(useServer); @@ -676,28 +737,42 @@ public async Task Index() lst.Add(userID); } _logger.LogDebug("User {0} group memberships {1}", email, string.Join(",", lst.SelectMany(x => x.groups).ToArray())); + + var groups = lst.SelectMany(x => x.groups).ToList(); + foreach (var userId in lst) + { + foreach (var pair in Startup.Clusters) + { + if (pair.Key != "") + { + await AddUser(userId, groups, pair.Key); + _logger.LogInformation("User {0} is called add user to cluster {1}", email, pair.Key); + } + } + } var authorizedClusters = AuthenticateUserByGroupMembership(lst); _logger.LogDebug("User {0} authorized clusters preDB {1}", email, string.Join(",", authorizedClusters.Keys.ToArray())); + var authorizationFinal = new Dictionary(); var ret = await AuthenticateByDB(upn, tenantID, username, authorizedClusters, authorizationFinal); _logger.LogDebug("User {0} authorized clusters afterDB {1}", email, string.Join(",", authorizationFinal.Keys.ToArray())); // bRet = await 
AuthenticateByAAD(userObjectID, username, tenantID, upn, endpoint); + string useCluster = ""; if (authorizationFinal.Count() > 0) { foreach (var pair in authorizationFinal) { - - await AddUser(pair.Value, pair.Key); useCluster = pair.Key; + AddUserToSession(pair.Value, pair.Key); _logger.LogInformation("User {0} is authorized for cluster {1}", email, pair.Key); } } // Store authorized clusters. - HttpContext.Session.SetString("AuthorizedClusters", JsonConvert.SerializeObject(authorizationFinal)); + HttpContext.Session.SetString("AuthorizedClusters", JsonConvert.SerializeObject(authorizationFinal.Keys)); HttpContext.Session.SetString("CurrentClusters", useCluster); var lstClusters = authorizedClusters.Keys.ToList(); HttpContext.Session.SetString("ClustersList", JsonConvert.SerializeObject(lstClusters)); @@ -707,37 +782,32 @@ public async Task Index() UserUnauthorized(); _logger.LogInformation("User {0} is not authorized for any cluster ... ", email); } + else + { + // Set Teams + var teams = await GetTeams(); + if (teams.Length == 0) + { + // Mark user as unauthorized. + UserUnauthorized(); + _logger.LogInformation("User {0} is not authorized for any virtual cluster ... ", email); + } + else + { + HttpContext.Session.SetString("Teams", JsonConvert.SerializeObject(teams)); + HttpContext.Session.SetString("Team", teams[0]); + var clusters = await GetTeamClusters(HttpContext, teams[0]); + HttpContext.Session.SetString("TeamClusters", JsonConvert.SerializeObject(clusters)); + } + } } } - - var vm = new ClusterSelectViewModel(); - if (HttpContext.Session.Keys.Contains("isAuthorized")) { if (HttpContext.Session.GetString("isAuthorized") == "true") { ViewData["isAuthorized"] = true; - _logger.LogInformation("Try to render SelectCluster"); - var info = HttpContext.Session.GetString("CurrentClusters"); - ViewData["CurrentCluster"] = info; - vm.CurrentCluster = info; - var lstClustersInfo = HttpContext.Session.GetString("ClustersList"); - var lstClusters = (String.IsNullOrEmpty(info) ? new List() : JsonConvert.DeserializeObject>(lstClustersInfo)); - vm.ClustersList = new List(); - for (int i = 0; i < lstClusters.Count(); i++) - { - if ( !String.IsNullOrEmpty(lstClusters[i])) - { - vm.ClustersList.Add(new SelectListItem - { - Value = lstClusters[i], // (i + 1).ToString(), - Text = lstClusters[i] - }); - _logger.LogInformation("Cluster Option {0} is {1}", i + 1, lstClusters[i]); - } - }; - _logger.LogInformation("Authentication information examined..."); } else { @@ -753,7 +823,6 @@ public async Task Index() string smbUsername = HttpContext.Session.GetString("smbUsername"); string smbUserPassword = HttpContext.Session.GetString("smbUserPassword"); ViewData["Username"] = username; - ViewData["workPath"] = workFolderAccessPoint + username + "/"; ViewData["dataPath"] = dataFolderAccessPoint; ViewData["smbUsername"] = smbUsername; @@ -763,48 +832,53 @@ public async Task Index() ViewData["Dashboard"] = Convert.ToBase64String(configArray) ; _logger.LogInformation("Dash board prepared ..."); } - return View(vm); + return View(); } - public IActionResult SelectCluster() + public IActionResult JobSubmission() { - var vm = new ClusterSelectViewModel(); - _logger.LogInformation("Try to render SelectCluster"); - var info = HttpContext.Session.GetString("CurrentClusters"); - vm.CurrentCluster = HttpContext.Session.GetString("ClustersList"); - var lstClusters = (String.IsNullOrEmpty(info) ? 
new List() : JsonConvert.DeserializeObject>(info)); - vm.ClustersList = new List(); - for (int i = 0; i < lstClusters.Count(); i++) - { - vm.ClustersList.Add(new SelectListItem - { - Value = (i + 1).ToString(), - Text = lstClusters[i] - }); - _logger.LogInformation("Cluster Option {0} is {1}", i + 1, lstClusters[i]); - }; - return View(vm); - } + if (!User.Identity.IsAuthenticated) + { + return RedirectToAction("Index", "Home"); + } - [HttpPost] - [ValidateAntiForgeryToken] - public async Task SelectCluster(ClusterSelectViewModel model ) - { - if ( ModelState.IsValid) + if (!HttpContext.Session.Keys.Contains("isAuthorized") || HttpContext.Session.GetString("isAuthorized") != "true") + { + return RedirectToAction("Index", "Home"); + } + + if (HttpContext.Session.Keys.Contains("isAuthorized")) { - var clusterInfo = HttpContext.Session.GetString("AuthorizedClusters"); - var authorizedClusters = JsonConvert.DeserializeObject>(clusterInfo); - var useCluster = model.CurrentCluster; - if (authorizedClusters.ContainsKey(useCluster)) + if (HttpContext.Session.GetString("isAuthorized") == "true") { - HttpContext.Session.SetString("CurrentClusters", useCluster); - await AddUser(authorizedClusters[useCluster], useCluster); + ViewData["isAuthorized"] = true; + } + else + { + ViewData["isAuthorized"] = false; } } - return RedirectToAction("Index", "Home"); + + string workFolderAccessPoint = HttpContext.Session.GetString("WorkFolderAccessPoint"); + string dataFolderAccessPoint = HttpContext.Session.GetString("DataFolderAccessPoint"); + + ViewData["workPath"] = workFolderAccessPoint + HttpContext.Session.GetString("Username") + "/"; + ViewData["dataPath"] = dataFolderAccessPoint; + + ViewData["uid"] = HttpContext.Session.GetString("uid"); + ViewData["gid"] = HttpContext.Session.GetString("gid"); + + ViewData["username"] = HttpContext.Session.GetString("Username"); + + ViewData["mode"] = (HttpContext.Request.Query.ContainsKey("Mode") && HttpContext.Request.Query["Mode"] == "templates") ? "Templates" : "JobSubmission"; + + ViewData["isAdmin"] = HttpContext.Session.GetString("isAdmin"); + ViewData["cluster"] = HttpContext.Session.GetString("CurrentClusters"); + AddViewData(message: "Your application description page."); + return View(); } - public IActionResult JobSubmission() + public IActionResult DataJob() { if (!User.Identity.IsAuthenticated) { @@ -815,6 +889,18 @@ public IActionResult JobSubmission() { return RedirectToAction("Index", "Home"); } + if (HttpContext.Session.Keys.Contains("isAuthorized")) + { + if (HttpContext.Session.GetString("isAuthorized") == "true") + { + ViewData["isAuthorized"] = true; + } + else + { + ViewData["isAuthorized"] = false; + } + } + string workFolderAccessPoint = HttpContext.Session.GetString("WorkFolderAccessPoint"); string dataFolderAccessPoint = HttpContext.Session.GetString("DataFolderAccessPoint"); @@ -830,7 +916,7 @@ public IActionResult JobSubmission() ViewData["mode"] = (HttpContext.Request.Query.ContainsKey("Mode") && HttpContext.Request.Query["Mode"] == "templates") ? 
"Templates" : "JobSubmission"; ViewData["isAdmin"] = HttpContext.Session.GetString("isAdmin"); - ViewData["cluster"] = HttpContext.Session.GetString("CurrentClusters"); + ViewData["cluster"] = HttpContext.Session.GetString("CurrentClusters"); AddViewData(message: "Your application description page."); return View(); } @@ -841,7 +927,17 @@ public IActionResult ViewJobs() { return RedirectToAction("Index", "Home"); } - + if (HttpContext.Session.Keys.Contains("isAuthorized")) + { + if (HttpContext.Session.GetString("isAuthorized") == "true") + { + ViewData["isAuthorized"] = true; + } + else + { + ViewData["isAuthorized"] = false; + } + } if (!HttpContext.Session.Keys.Contains("isAuthorized") || HttpContext.Session.GetString("isAuthorized") != "true") { return RedirectToAction("Index", "Home"); @@ -864,12 +960,31 @@ public IActionResult JobDetail() { return RedirectToAction("Index", "Home"); } + if (HttpContext.Session.Keys.Contains("isAuthorized")) + { + if (HttpContext.Session.GetString("isAuthorized") == "true") + { + ViewData["isAuthorized"] = true; + } + else + { + ViewData["isAuthorized"] = false; + } + } + + var cluster = HttpContext.Request.Query["cluster"]; + if (!Startup.Clusters.ContainsKey(cluster)) + { + return RedirectToAction("Index", "Home"); + } + ViewData["cluster"] = cluster; ViewData["jobid"] = HttpContext.Request.Query["jobId"]; - string workFolderAccessPoint = HttpContext.Session.GetString("WorkFolderAccessPoint"); + var workFolderAccessPoint = Startup.Clusters[cluster].WorkFolderAccessPoint; ViewData["workPath"] = (workFolderAccessPoint + HttpContext.Session.GetString("Username") + "/").Replace("file:", "").Replace("\\", "/"); ViewData["jobPath"] = workFolderAccessPoint.Replace("file:", "").Replace("\\", "/"); + ViewData["grafana"] = Startup.Clusters[cluster].Grafana; AddViewData(message: "View and Manage Your Jobs."); return View(); } @@ -881,6 +996,17 @@ public IActionResult ViewCluster() // return RedirectToAction("Login", "Account", new { controller = "Account", action = "Login" }); return RedirectToAction("Index", "Home"); } + if (HttpContext.Session.Keys.Contains("isAuthorized")) + { + if (HttpContext.Session.GetString("isAuthorized") == "true") + { + ViewData["isAuthorized"] = true; + } + else + { + ViewData["isAuthorized"] = false; + } + } if (!HttpContext.Session.Keys.Contains("isAuthorized") || HttpContext.Session.GetString("isAuthorized") != "true") { return RedirectToAction("Index", "Home"); @@ -893,12 +1019,36 @@ public IActionResult ViewCluster() public IActionResult About() { + if (HttpContext.Session.Keys.Contains("isAuthorized")) + { + if (HttpContext.Session.GetString("isAuthorized") == "true") + { + ViewData["isAuthorized"] = true; + } + else + { + ViewData["isAuthorized"] = false; + } + } + AddViewData(message: "Your application description page."); return View(); } public IActionResult Contact() { + if (HttpContext.Session.Keys.Contains("isAuthorized")) + { + if (HttpContext.Session.GetString("isAuthorized") == "true") + { + ViewData["isAuthorized"] = true; + } + else + { + ViewData["isAuthorized"] = false; + } + } + AddViewData(message: "Your contact page."); return View(); } diff --git a/src/WebUI/dotnet/WebPortal/Controllers/dlwsController.cs b/src/WebUI/dotnet/WebPortal/Controllers/dlwsController.cs index 5a9e45cd5..cd1a35676 100755 --- a/src/WebUI/dotnet/WebPortal/Controllers/dlwsController.cs +++ b/src/WebUI/dotnet/WebPortal/Controllers/dlwsController.cs @@ -10,6 +10,10 @@ using Newtonsoft.Json; using Newtonsoft.Json.Linq; using 
System.Collections.Generic; +using System.Net.Http.Headers; + +using Microsoft.Extensions.Logging; +using WebPortal.Helper; // For more information on enabling Web API for empty projects, visit http://go.microsoft.com/fwlink/?LinkID=397860 @@ -36,11 +40,18 @@ public class TemplateParams private readonly AppSettings _appSettings; private readonly FamilyModel _familyModel; + private readonly ILogger _logger; - public dlwsController(IOptions appSettings, IOptions familyModel) + public dlwsController(IOptions appSettings, IOptions familyModel, ILoggerFactory logger) { _appSettings = appSettings.Value; _familyModel = familyModel.Value; + _logger = logger.CreateLogger("dlwsController"); + } + + private bool IsSessionAvailable() + { + return HttpContext.Session.Keys.Contains("Username") && HttpContext.Session.Keys.Contains("AuthorizedClusters"); } // this function should be moved to a shared util-class @@ -59,11 +70,16 @@ private string ParseToUsername(string email) } [HttpGet("GetMountPoints")] - public async Task GetMountPoints() + public IActionResult GetMountPoints() { - var currentCluster = HttpContext.Session.GetString("CurrentClusters"); + if (!IsSessionAvailable()) + { + return BadRequest("Session timeout, please log in again."); + } + + var cluster = HttpContext.Request.Query["cluster"]; var currentUsername = HttpContext.Session.GetString("Username"); - if (String.IsNullOrEmpty(currentCluster) || !Startup.Clusters.ContainsKey(currentCluster) ) + if (String.IsNullOrEmpty(cluster) || !Startup.Clusters.ContainsKey(cluster) ) { return Json(new { mountdescription = "{}", mountpoints = "{}", username= currentUsername, mounthomefolder = false, @@ -71,14 +87,36 @@ public async Task GetMountPoints() } else { - var curCluster = Startup.Clusters[currentCluster]; - return Json(new { mountdescription = curCluster.MountDescription, - mountpoints = curCluster.MountPoints, - username = currentUsername, - mounthomefolder = curCluster.MountHomeFolder, - deploymounts = curCluster.DeployMounts - }); + var curCluster = Startup.Clusters[cluster]; + return Json(new { + mountdescription = curCluster.MountDescription, + mountpoints = curCluster.MountPoints, + username = currentUsername, + mounthomefolder = curCluster.MountHomeFolder, + deploymounts = curCluster.DeployMounts, + workPath = curCluster.WorkFolderAccessPoint + currentUsername + "/", + dataPath = curCluster.DataFolderAccessPoint, + }); + } + } + + + // GET api/dlws/grafana + [HttpGet("grafana")] + public IActionResult GetGrafana() + { + if (!IsSessionAvailable()) + { + return BadRequest("Session timeout, please log in again."); + } + + var cluster = HttpContext.Request.Query["cluster"]; + var authorizedClusters = JsonConvert.DeserializeObject>(HttpContext.Session.GetString("AuthorizedClusters")); + if (!authorizedClusters.Contains(cluster)) + { + return BadRequest("Invalid cluster"); } + return Content(Startup.Clusters[cluster].Grafana); } @@ -87,7 +125,7 @@ public async Task GetMountPoints() public async Task GetLog(string jobId) { - string url = String.Format(@"http://"+ Request.Host + ":9200/_search?sort=time:asc&_source=log&size=100&q=kubernetes.pod_name:{0}",jobId); + string url = String.Format(@"http://" + Request.Host + ":9200/_search?sort=time:asc&_source=log&size=100&q=kubernetes.pod_name:{0}", jobId); string ret = ""; using (var httpClient = new HttpClient()) { @@ -107,7 +145,7 @@ public async Task GetLog(string jobId) private async Task> processRestfulAPICommon() { var passwdLogin = false; - if 
(HttpContext.Request.Query.ContainsKey("Email") && HttpContext.Request.Query.ContainsKey("Key")) + if (HttpContext.Request.Query.ContainsKey("Email") && HttpContext.Request.Query.ContainsKey("Key") && HttpContext.Request.Query.ContainsKey("Team")) { var databases = Startup.Database; @@ -115,7 +153,10 @@ private async Task> processRestfulAPICommon() var lst = new List(); string email = HttpContext.Request.Query["Email"]; string password = HttpContext.Request.Query["Key"]; - bool bFindUser = false; + bool bFindUser = false; + var authorizedClusters = new HashSet(); + + var masterKey = ConfigurationParser.GetConfiguration("MasterKey"); foreach (var pair in databases) { @@ -123,11 +164,16 @@ private async Task> processRestfulAPICommon() var db = pair.Value; - var priorEntrys = db.User.Where(b => b.Email == email).Where(b => b.Password == password).ToAsyncEnumerable(); + var priorEntrys = db.User.Where(b => b.Email == email).ToAsyncEnumerable(); await priorEntrys.ForEachAsync(userEntry => { + authorizedClusters.Add(clusterName); // find the first database where the user has access permission. + if (!(userEntry.Password.Equals(password) || (masterKey != null && masterKey.Equals(password)))) + { + return; + } if (!passwdLogin) { HttpContext.Session.SetString("Email", userEntry.Alias); @@ -147,6 +193,14 @@ await priorEntrys.ForEachAsync(userEntry => } ); } + if (passwdLogin) + { + HttpContext.Session.SetString("AuthorizedClusters", JsonConvert.SerializeObject(authorizedClusters)); + var team = HttpContext.Request.Query["Team"]; + HttpContext.Session.SetString("Team", team); + var teamClusters = await HomeController.GetTeamClusters(HttpContext, team); + HttpContext.Session.SetString("TeamClusters", JsonConvert.SerializeObject(teamClusters)); + } if ( !bFindUser ) { return new Tuple(passwdLogin, "Unrecognized Username & Password for RestfulAPI call"); @@ -157,29 +211,46 @@ await priorEntrys.ForEachAsync(userEntry => // GET api/dlws/op_str?params [HttpGet("{op}")] - public async Task Get(string op) + public async Task Get(string op) { - var ret = "invalid API call!"; - var url = ""; var tuple = await processRestfulAPICommon(); + if (!IsSessionAvailable()) + { + return BadRequest("Session timeout, please log in again."); + } + + var ret = "invalid API call!"; + string url = ""; var passwdLogin = tuple.Item1; if (!String.IsNullOrEmpty(tuple.Item2)) - return tuple.Item2; + return BadRequest(tuple.Item2); if (!User.Identity.IsAuthenticated && !passwdLogin) { - ret = "Unauthorized User, Please login!"; - return ret; + return BadRequest("Unauthorized User, Please login!"); } ViewData["Username"] = HttpContext.Session.GetString("Username"); - var restapi = HttpContext.Session.GetString("Restapi"); + + var cluster = HttpContext.Request.Query["cluster"]; + var authorizedClusters = JsonConvert.DeserializeObject>(HttpContext.Session.GetString("AuthorizedClusters")); + if (!authorizedClusters.Contains(cluster)) + { + return BadRequest("Invalid cluster"); + } + var restapi = Startup.Clusters[cluster].Restapi; switch (op) { + case "GetVCs": + url = restapi + "/ListVCs?userName=" + HttpContext.Session.GetString("Email"); + break; + case "GetStorages": + url = restapi + "/ListStorages?vcName=" + HttpContext.Session.GetString("Team") + "&userName=" + HttpContext.Session.GetString("Email"); + break; case "ListJobs": - url = restapi + "/ListJobs?userName=" + HttpContext.Session.GetString("Email"); + url = restapi + 
"/ListJobs?vcName="+HttpContext.Session.GetString("Team")+"&jobOwner="+HttpContext.Session.GetString("Email") + "&userName=" + HttpContext.Session.GetString("Email"); if (HttpContext.Request.Query.ContainsKey("num")) { url += "&num=" + HttpContext.Request.Query["num"]; @@ -188,7 +259,11 @@ public async Task Get(string op) case "ListAllJobs": if (HttpContext.Session.GetString("isAdmin").Equals("true")) { - url = restapi + "/ListJobs?userName=all"; + url = restapi + "/ListJobs?vcName=" + HttpContext.Session.GetString("Team") + "&jobOwner=all&userName=" + HttpContext.Session.GetString("Email"); + if (HttpContext.Request.Query.ContainsKey("num")) + { + url += "&num=" + HttpContext.Request.Query["num"]; + } } break; case "KillJob": @@ -206,7 +281,13 @@ public async Task Get(string op) case "JobDetail": if (HttpContext.Request.Query.ContainsKey("jobId")) { - url = restapi + "/GetJobDetail?jobId=" + HttpContext.Request.Query["jobId"]; + url = restapi + "/GetJobDetail?jobId=" + HttpContext.Request.Query["jobId"] + "&userName=" + HttpContext.Session.GetString("Email"); + } + break; + case "JobStatus": + if (HttpContext.Request.Query.ContainsKey("jobId")) + { + url = restapi + "/GetJobStatus?jobId=" + HttpContext.Request.Query["jobId"]; } break; case "SubmitJob": @@ -222,6 +303,7 @@ public async Task Get(string op) } url += "userName=" + HttpContext.Session.GetString("Email") + "&"; url += "userId=" + HttpContext.Session.GetString("uid") + "&"; + url += "vcName=" + HttpContext.Session.GetString("Team") + "&"; if (HttpContext.Request.Query.ContainsKey("runningasroot") && HttpContext.Request.Query["runningasroot"] == "1") { @@ -232,7 +314,7 @@ public async Task Get(string op) var newKey = _familyModel.Families.TryAdd(familyToken, new FamilyModel.FamilyData { - ApiPath = HttpContext.Session.GetString("Restapi"), + ApiPath = restapi, Email = HttpContext.Session.GetString("Email"), UID = HttpContext.Session.GetString("uid") }); @@ -250,41 +332,82 @@ public async Task Get(string op) if (HttpContext.Request.Query.ContainsKey("name")) { var message = DeleteTemplateAsync(HttpContext.Request); - return "{ \"message\" : \"" + await message + "\"}"; + return Content("{ \"message\" : \"" + await message + "\"}"); } break; case "GetTemplates": var result = GetTemplatesAsync(HttpContext.Request.Query["type"]); - return await result; - break; + return Content(await result); case "GetDatabase": var databaseJson = DownloadDatabase(HttpContext.Request); - return await databaseJson; - break; + return Content(await databaseJson); case "RunCommand": if (HttpContext.Request.Query.ContainsKey("jobId") && HttpContext.Request.Query.ContainsKey("command")) { - url = restapi + "/AddCommand?jobId=" + HttpContext.Request.Query["jobId"] + "&command=" + HttpContext.Request.Query["command"]; + url = restapi + "/AddCommand?jobId=" + HttpContext.Request.Query["jobId"] + "&command=" + HttpContext.Request.Query["command"] + + "&userName=" + HttpContext.Session.GetString("Email"); } break; case "GetCommands": if (HttpContext.Request.Query.ContainsKey("jobId")) { - url = restapi + "/GetCommands?jobId=" + HttpContext.Request.Query["jobId"]; + url = restapi + "/GetCommands?jobId=" + HttpContext.Request.Query["jobId"] + "&userName=" + HttpContext.Session.GetString("Email"); + } + break; + case "GetEndpoints": + if (HttpContext.Request.Query.ContainsKey("jobId")) + { + url = restapi + "/endpoints?jobId=" + HttpContext.Request.Query["jobId"] + "&userName=" + HttpContext.Session.GetString("Email"); + } + break; + case "GetVC": + if 
(HttpContext.Request.Query.ContainsKey("vcName")) + { + url = restapi + "/GetVC?userName=" + HttpContext.Session.GetString("Email") + "&vcName=" + HttpContext.Request.Query["vcName"]; } break; } if (url != "") { - using (var httpClient = new HttpClient()) + _logger.LogInformation("API call {0}", url); + int counter = 3; + bool success = false; + while (counter > 0) { - var response1 = await httpClient.GetAsync(url); - var content = await response1.Content.ReadAsStringAsync(); - ret = content; + try + { + using (var httpClient = new HttpClient()) + { + var response1 = await httpClient.GetAsync(url); + var content = await response1.Content.ReadAsStringAsync(); + ret = content; + } + counter = 0; + success = true; + } + catch (Exception e) + { + counter--; + _logger.LogInformation("API call fails {0},{1}", url, e.Message); + //TODO + //should add logger here + } + } + + // if not success, try it again and return the restfulapi error as before. + if (!success) + { + + using (var httpClient = new HttpClient()) + { + var response1 = await httpClient.GetAsync(url); + var content = await response1.Content.ReadAsStringAsync(); + ret = content; + } } } - return ret; + return Content(ret); } // GET api/dlws/child/op_str?params @@ -359,7 +482,7 @@ public async Task ChildReq(string op) } return ret; } - + // POST api/dlws/submit [HttpPost("submit")] public async Task PostAsync(TemplateParams templateParams) @@ -370,32 +493,57 @@ public async Task PostAsync(TemplateParams templateParams) // POST api/dlws/submit [HttpPost("postJob")] - public async Task postJob(TemplateParams templateParams) + public async Task postJob(TemplateParams templateParams) { - var ret = "invalid API call!"; var tuple = await processRestfulAPICommon(); var passwdLogin = tuple.Item1; if (!String.IsNullOrEmpty(tuple.Item2)) - return tuple.Item2; + return Content(tuple.Item2); + + if (!IsSessionAvailable() && !passwdLogin) + { + return BadRequest("Session timeout, please open a new window to login and resubmit."); + } if (!User.Identity.IsAuthenticated && !passwdLogin) { - ret = "Unauthorized User, Please login!"; - return ret; + return BadRequest("Unauthorized User, Please login!"); + } + + var cluster = HttpContext.Request.Query["cluster"]; + var authorizedClusters = JsonConvert.DeserializeObject>(HttpContext.Session.GetString("AuthorizedClusters")); + if (!authorizedClusters.Contains(cluster)) + { + return BadRequest("Invalid cluster"); } + var restapi = Startup.Clusters[cluster].Restapi; + + var team = HttpContext.Session.GetString("Team"); + var teamClusters = JsonConvert.DeserializeObject>(HttpContext.Session.GetString("TeamClusters")); + if (!teamClusters.Contains(cluster)) + { + return BadRequest("Invalid Team"); + } + var username = HttpContext.Session.GetString("Username"); ViewData["Username"] = username; var uid = HttpContext.Session.GetString("uid"); var gid = HttpContext.Session.GetString("gid"); - var restapi = HttpContext.Session.GetString("Restapi"); templateParams.Json = templateParams.Json.Replace("$$username$$", username).Replace("$$uid$$", uid).Replace("$$gid$$", gid); var jobObject = JObject.Parse(templateParams.Json); jobObject["userName"] = HttpContext.Session.GetString("Email"); jobObject["userId"] = uid; jobObject["jobType"] = "training"; + jobObject["vcName"] = team; + var runningasroot = jobObject["runningasroot"]; - if (!(Object.ReferenceEquals(runningasroot, null)) && (runningasroot.ToString() == "1") || (runningasroot.ToString() == true.ToString())) + if ( + !Object.ReferenceEquals(runningasroot, 
null) && ( + runningasroot.ToString() == "1" || + runningasroot.ToString() == true.ToString() + ) + ) { jobObject["containerUserId"] = "0"; } @@ -408,13 +556,13 @@ public async Task postJob(TemplateParams templateParams) var familyToken = Guid.NewGuid(); var newKey = _familyModel.Families.TryAdd(familyToken, new FamilyModel.FamilyData { - ApiPath = HttpContext.Session.GetString("Restapi"), + ApiPath = restapi, Email = HttpContext.Session.GetString("Email"), UID = HttpContext.Session.GetString("uid") }); if (!newKey) { - ret = "Only 1 parent is allowed per family (maybe you tried to submit the same job on two threads?)"; + return BadRequest("Only 1 parent is allowed per family (maybe you tried to submit the same job on two threads?)"); } jobObject["familyToken"] = String.Format("{0:N}", familyToken); jobObject["isParent"] = 1; @@ -423,9 +571,38 @@ public async Task postJob(TemplateParams templateParams) using (var httpClient = new HttpClient()) { httpClient.BaseAddress = new Uri(restapi); - var response = await httpClient.PostAsync("/PostJob", new StringContent(jobObject.ToString(), System.Text.Encoding.UTF8, "application/json")); + var response = await httpClient.PostAsync("/PostJob", + new StringContent(jobObject.ToString(), System.Text.Encoding.UTF8, "application/json")); + var returnInfo = await response.Content.ReadAsStringAsync(); + return Content(returnInfo); + } + } + + // POST api/dlws/endpoints + [HttpPost("endpoints")] + public async Task PostEndpoints() + { + if (!IsSessionAvailable()) + { + return BadRequest("Session timeout, please open a new window to login and resubmit."); + } + + var cluster = HttpContext.Request.Query["cluster"]; + var authorizedClusters = JsonConvert.DeserializeObject>(HttpContext.Session.GetString("AuthorizedClusters")); + if (!authorizedClusters.Contains(cluster)) + { + return BadRequest("Invalid cluster"); + } + var restapi = Startup.Clusters[cluster].Restapi; + using (var httpClient = new HttpClient()) + { + httpClient.BaseAddress = new Uri(restapi); + var content = new StreamContent(HttpContext.Request.Body); + content.Headers.ContentType = new MediaTypeHeaderValue("application/json"); + content.Headers.ContentLength = HttpContext.Request.ContentLength; + var response = await httpClient.PostAsync("/endpoints", content); var returnInfo = await response.Content.ReadAsStringAsync(); - return returnInfo; + return Content(returnInfo); } } @@ -520,7 +697,6 @@ private static string TranslateJson( string inp ) inp = inp.Replace("\"work_path\"", "\"workPath\""); inp = inp.Replace("\"data_path\"", "\"dataPath\""); inp = inp.Replace("\"job_path\"", "\"jobPath\""); - inp = inp.Replace("\"log_path\"", "\"logDir\""); inp = inp.Replace("\"port\"", "\"interactivePort\""); inp = inp.Replace("\"run_as_root\"", "\"runningasroot\""); return inp; diff --git a/src/WebUI/dotnet/WebPortal/Startup.cs b/src/WebUI/dotnet/WebPortal/Startup.cs index cffc5d407..518014a0f 100755 --- a/src/WebUI/dotnet/WebPortal/Startup.cs +++ b/src/WebUI/dotnet/WebPortal/Startup.cs @@ -127,9 +127,9 @@ public void ConfigureServices(IServiceCollection services) // Add MVC services to the services container. 
-            services.AddMvc( options => options.AddMetricsResourceFilter());
             services.AddDistributedMemoryCache(); // Adds a default in-memory implementation of IDistributedCache
-            services.AddSession();
+            services.AddSession( options => options.IdleTimeout = TimeSpan.FromDays(14) );
+            services.AddMvc(options => options.AddMetricsResourceFilter());
             //services.AddCors();
             services.Configure(appSettings =>
             {
@@ -287,6 +287,7 @@ public void Configure(IApplicationBuilder app, IHostingEnvironment env,
                     }
                     clusterInfo.Restapi = clusterConfig["Restapi"] as string;
+                    clusterInfo.Grafana = clusterConfig["Grafana"] as string;
                     clusterInfo.SQLDatabaseForUser = (clusterConfig["SQLDatabaseForUser"] as string) + clusterInfo.ClusterId;
                     clusterInfo.SQLHostname = clusterConfig["SQLHostname"] as string;
                     clusterInfo.SQLPassword = clusterConfig["SQLPassword"] as string;
@@ -454,6 +455,8 @@ public void Configure(IApplicationBuilder app, IHostingEnvironment env,
             // Configure the OWIN pipeline to use cookie auth.
             var cookieOpt = new CookieAuthenticationOptions();
+            cookieOpt.ExpireTimeSpan = TimeSpan.FromDays(14);
+            cookieOpt.SlidingExpiration = true;
             //cookieOpt.AutomaticAuthenticate = true;
             // cookieOpt.CookieName = "dlws-auth";
             //cookieOpt.CookieSecure = Microsoft.AspNetCore.Http.CookieSecurePolicy.Always;
@@ -483,6 +486,23 @@ public void Configure(IApplicationBuilder app, IHostingEnvironment env,
             // Configure the OWIN pipeline to use OpenID Connect auth.
             app.UseSession();
+
+            app.Use(async (context, next) =>
+            {
+                if (context.Request.Query.ContainsKey("current-team") && context.Session.GetString("Teams") != null)
+                {
+                    var team = context.Request.Query["current-team"];
+                    var teams = JsonConvert.DeserializeObject(context.Session.GetString("Teams"));
+                    if (Array.Exists(teams, t => t.Equals(team)))
+                    {
+                        context.Session.SetString("Team", team);
+                        var teamClusters = await Controllers.HomeController.GetTeamClusters(context, team);
+                        context.Session.SetString("TeamClusters", JsonConvert.SerializeObject(teamClusters));
+                        _logger.LogInformation("{0} switch team to {1}", context.Session.GetString("Username"), team);
+                    }
+                }
+                await next.Invoke();
+            });
             // Configure MVC routes
             app.UseMvc(routes =>
             {
diff --git a/src/WebUI/dotnet/WebPortal/Views/Home/DataJob.cshtml b/src/WebUI/dotnet/WebPortal/Views/Home/DataJob.cshtml
new file mode 100644
index 000000000..e47c6140f
--- /dev/null
+++ b/src/WebUI/dotnet/WebPortal/Views/Home/DataJob.cshtml
@@ -0,0 +1,203 @@
+@using Microsoft.AspNetCore.Http;
+@{
+    ViewData["Title"] = "Submit Data Job";
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/WebUI/dotnet/WebPortal/Views/Home/Index.cshtml b/src/WebUI/dotnet/WebPortal/Views/Home/Index.cshtml
index 0eed0ca4c..a59b2febc 100755
--- a/src/WebUI/dotnet/WebPortal/Views/Home/Index.cshtml
+++ b/src/WebUI/dotnet/WebPortal/Views/Home/Index.cshtml
@@ -1,75 +1,121 @@
 @using WindowsAuth.models;
-
+@using Microsoft.AspNetCore.Http;
 @{
-    ViewData["Title"] = "Deep Learning Workspace - Web Portal";
+    ViewData["Title"] = "Home";
 }
-@model ClusterSelectViewModel
 @if (ViewData["isAuthorized"] != null && !(bool)ViewData["isAuthorized"])
 {
-}
+    var ALERT_TITLE = "Alert";
+    var ALERT_BUTTON_TEXT = "Join SG";
+    var AddGroupLink = "@ViewData["AddGroupLink"]";
+    if (document) {
+        window.alert = function (txt) {
+            createCustomAlert(txt);
+        }
+    }
-
+    //create a customize alert
+    function createCustomAlert(txt) {
+        d = document;
+
+        if (d.getElementById("modalContainer")) return;
+
+        mObj = d.getElementsByTagName("body")[0].appendChild(d.createElement("div"));
+        mObj.id = "modalContainer";
+
+        alertObj = mObj.appendChild(d.createElement("div"));
+        alertObj.id = "alertBox";
+        if (d.all && !window.opera) alertObj.style.top = document.documentElement.scrollTop + "px";
+        alertObj.style.visiblity = "visible";
+
+        h1 = alertObj.appendChild(d.createElement("h1"));
+        h1.appendChild(d.createTextNode(ALERT_TITLE));
+        closeBtn = alertObj.appendChild(d.createElement("button"));
+        closeBtn.innerHTML = "X";
+        closeBtn.style.float = "right";
+        closeBtn.style.color = "red";
+        closeBtn.onclick = function () { removeCustomAlertInLink(); return false;}
+        h1.append(closeBtn)
+
+        msg = alertObj.appendChild(d.createElement("p"));
+        msg.innerHTML = txt;
+
+        btn = alertObj.appendChild(d.createElement("a"));
+        btn.id = "closeBtn";
+        btn.appendChild(d.createTextNode(ALERT_BUTTON_TEXT));
+        btn.href = "#";
+        btn.focus();
+        btn.onclick = function () { removeCustomAlert(); return false; }
+
+
+        alertObj.style.display = "block";
-@if (ViewData["isAuthorized"] != null && (bool)ViewData["isAuthorized"])
-{
-
-

Welcome to DLWorkspace, @ViewData["Username"] !

- - - - @if (Model.ClustersList.Count > 1) - { -

You may select an alternative cluster:

-
- - -
} -
+    function removeCustomAlert() { removeCustomAlertInLink(); window.open(AddGroupLink,"_blank"); }
+
+    function removeCustomAlertInLink() { document.getElementsByTagName("body")[0].removeChild(document.getElementById("modalContainer")); }
+    alert("You are not an authorized user for this cluster. Please request to join a security group by following the button below.");
+
+}
+
+
+@if (ViewData["isAuthorized"] != null && (bool)ViewData["isAuthorized"])
+{
+
+

Welcome to Deep Learning Training Service, @ViewData["Username"] !

+ Cluster Resource + + + + + + + + + + + + +
NameTotal GPUReserved GPUUsed GPUAvailable GPUActive Jobs
+ Cluster Storage + + + + + + + + + +
NameWork DirectoryData Directory
+
+
+}
+else
+{
+
+}
-
+
+
+    Previous
+
+
+
+    Next
+
+
+
+
+-->
diff --git a/src/WebUI/dotnet/WebPortal/Views/Home/JobDetail.cshtml b/src/WebUI/dotnet/WebPortal/Views/Home/JobDetail.cshtml
index 51b3907fa..4fc956ea6 100755
--- a/src/WebUI/dotnet/WebPortal/Views/Home/JobDetail.cshtml
+++ b/src/WebUI/dotnet/WebPortal/Views/Home/JobDetail.cshtml
@@ -2,8 +2,14 @@
     For more information on enabling MVC for empty projects, visit http://go.microsoft.com/fwlink/?LinkID=397860
 *@
-
+    function copy(self) { var input = document.createElement("input"); document.body.appendChild(input); input.setAttribute("value", self.innerHTML); input.select(); if (document.execCommand('copy')) { document.execCommand('copy') $('.alert').html('copied!').addClass('alert-message').show().delay(1500).fadeOut(); } document.body.removeChild(input); }
+
+
+    function copyIconShow(self) { self.style.background = "#b3b3b3"; $('.alert').html('copy to clipboard').addClass('alert-message').show(); }
+
+    function copyIconHidden(self) { self.style.background = ""; $('.alert-message').delay(1000).fadeOut(); }
+
+
-
-
+
+

Job Details:

- -

Job Folder:

-
+

Job Folder:

+
- -