diff --git a/Kubernetes/container-azm-ms-agentconfig.yaml b/Kubernetes/container-azm-ms-agentconfig.yaml index 63377f55..3ec9a65e 100644 --- a/Kubernetes/container-azm-ms-agentconfig.yaml +++ b/Kubernetes/container-azm-ms-agentconfig.yaml @@ -8,7 +8,7 @@ data: #string.used by customer to keep track of this config file's version in their source control/repository (max allowed 10 chars, other chars will be truncated) ver1 log-data-collection-settings: |- - # Log data collection settings + # Log data collection settings container-azm-ms-agent settings [log_collection_settings] [log_collection_settings.stdout] # In the absense of this configmap, default value for enabled is true @@ -72,8 +72,11 @@ data: #fieldpass = ["metric_to_pass1", "metric_to_pass12"] #fielddrop = ["metric_to_drop"] - - + agent-settings: |- + # agent health model feature settings + [agent_settings.health_model] + # In the absence of this configmap, default value for enabled is false + enabled = false metadata: name: container-azm-ms-agentconfig namespace: kube-system diff --git a/Kubernetes/omsagent.yaml b/Kubernetes/omsagent.yaml index 20d0c6d9..e1b70875 100644 --- a/Kubernetes/omsagent.yaml +++ b/Kubernetes/omsagent.yaml @@ -12,6 +12,9 @@ rules: - apiGroups: [""] resources: ["pods", "events", "nodes", "namespaces", "services"] verbs: ["list", "get", "watch"] +- apiGroups: ["extensions"] + resources: ["deployments"] + verbs: ["list"] - nonResourceURLs: ["/metrics"] verbs: ["get"] --- @@ -33,6 +36,12 @@ apiVersion: v1 data: kube.conf: |- # Fluentd config file for OMS Docker - cluster components (kubeAPI) + #fluent forward plugin + + type forward + port 25235 + bind 0.0.0.0 + #Kubernetes pod inventory @@ -81,6 +90,14 @@ data: log_level debug + #Kubernetes health + + type kubehealth + tag oms.api.KubeHealth.ReplicaSet + run_interval 60s + log_level debug + + #cadvisor perf- Windows nodes type wincadvisorperf @@ -103,6 +120,11 @@ data: log_level info + #health model aggregation filter + + type filter_health_model_builder + + type out_oms log_level debug @@ -249,6 +271,18 @@ data: max_retry_wait 9m retry_mdm_post_wait_minutes 60 + + + type out_oms_api + log_level debug + buffer_chunk_limit 10m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_api_kubehealth*.buffer + buffer_queue_limit 10 + flush_interval 20s + retry_limit 10 + retry_wait 30s + metadata: name: omsagent-rs-config namespace: kube-system @@ -261,8 +295,8 @@ metadata: type: Opaque data: #BASE64 ENCODED (Both WSID & KEY) INSIDE DOUBLE QUOTE ("") - WSID: "WSID" - KEY: "KEY" + WSID: "VALUE_WSID" + KEY: "VALUE_KEY" --- apiVersion: extensions/v1beta1 kind: DaemonSet @@ -284,7 +318,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod07092019" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview07182019" imagePullPolicy: IfNotPresent resources: limits: @@ -294,13 +328,15 @@ spec: cpu: 75m memory: 225Mi env: - #- name: AKS_RESOURCE_ID - # value: "VALUE_AKS_RESOURCE_ID_VALUE" - #- name: AKS_REGION - # value: "VALUE_AKS_RESOURCE_REGION_VALUE" + - name: AKS_RESOURCE_ID + value: "VALUE_AKS_RESOURCE_ID_VALUE" + - name: AKS_REGION + value: "VALUE_AKS_REGION_VALUE" #Uncomment below two lines for ACS clusters and set the cluster names manually. 
Also comment out the above two lines for ACS clusters - - name: ACS_RESOURCE_NAME - value: "my_acs_cluster_name" + #- name: ACS_RESOURCE_NAME + # value: "my_acs_cluster_name" + - name: DISABLE_KUBE_SYSTEM_LOG_COLLECTION + value: "true" - name: CONTROLLER_TYPE value: "DaemonSet" - name: NODE_IP @@ -397,8 +433,8 @@ spec: spec: serviceAccountName: omsagent containers: - - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod07092019" + - name: omsagent + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview07182019" imagePullPolicy: IfNotPresent resources: limits: @@ -428,6 +464,9 @@ spec: protocol: TCP - containerPort: 25224 protocol: UDP + - containerPort: 25235 + protocol: TCP + name: in-rs-tcp volumeMounts: - mountPath: /var/run/host name: docker-sock @@ -445,6 +484,8 @@ spec: - mountPath: /etc/config/settings name: settings-vol-config readOnly: true + - mountPath: "/mnt/azure" + name: azurefile-pv livenessProbe: exec: command: @@ -482,4 +523,67 @@ spec: configMap: name: container-azm-ms-agentconfig optional: true - + - name: azurefile-pv + persistentVolumeClaim: + claimName: azurefile +--- +kind: Service +apiVersion: v1 +metadata: + name: replicaset-service + namespace: kube-system +spec: + selector: + rsName: "omsagent-rs" + ports: + - protocol: TCP + port: 25235 + targetPort: in-rs-tcp +--- +kind: StorageClass +apiVersion: storage.k8s.io/v1 +metadata: + name: azurefile +provisioner: kubernetes.io/azure-file +mountOptions: + - dir_mode=0777 + - file_mode=0777 + - uid=1000 + - gid=1000 +parameters: + skuName: Standard_LRS +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: system:azure-cloud-provider +rules: +- apiGroups: [''] + resources: ['secrets'] + verbs: ['get','create'] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: system:azure-cloud-provider +roleRef: + kind: ClusterRole + apiGroup: rbac.authorization.k8s.io + name: system:azure-cloud-provider +subjects: +- kind: ServiceAccount + name: persistent-volume-binder + namespace: kube-system +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: azurefile + namespace: kube-system +spec: + accessModes: + - ReadWriteMany + storageClassName: azurefile + resources: + requests: + storage: 10Mi \ No newline at end of file diff --git a/ci_feature/Dockerfile b/ci_feature/Dockerfile index 53c622ee..aa09e978 100644 --- a/ci_feature/Dockerfile +++ b/ci_feature/Dockerfile @@ -1,8 +1,8 @@ FROM ubuntu:16.04 MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ -com.microsoft.product="OMS Container Docker Provider" \ -com.microsoft.version="6.0.0-0" + com.microsoft.product="OMS Container Docker Provider" \ + com.microsoft.version="6.0.0-1" ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH OTQzNWI0M2YtOTdkNS00ZGVkLThkOTAtYjA0Nzk1OGU2ZTg3 ENV AGENT_VERSION ciprod07092019 diff --git a/ci_feature/setup.sh b/ci_feature/setup.sh index 76a3ee81..d8abc5d2 100644 --- a/ci_feature/setup.sh +++ b/ci_feature/setup.sh @@ -14,8 +14,7 @@ wget https://github.com/Microsoft/OMS-Agent-for-Linux/releases/download/OMSAgent #create file to disable omi service startup script touch /etc/.omi_disable_service_control -wget https://github.com/microsoft/Docker-Provider/releases/download/6.0.0.0/docker-cimprov-6.0.0-0.universal.x86_64.sh - +wget https://github.com/microsoft/Docker-Provider/releases/download/healthpreview06182019/docker-cimprov-6.0.0-1.universal.x86_64.sh chmod 775 $TMPDIR/*.sh #Extract omsbundle diff 
--git a/ci_feature_prod/Dockerfile b/ci_feature_prod/Dockerfile index 15a2b873..167bf7d4 100644 --- a/ci_feature_prod/Dockerfile +++ b/ci_feature_prod/Dockerfile @@ -1,11 +1,11 @@ FROM ubuntu:16.04 MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ -com.microsoft.product="OMS Container Docker Provider" \ -com.microsoft.version="6.0.0-0" + com.microsoft.product="OMS Container Docker Provider" \ + com.microsoft.version="6.0.0-1" ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi -ENV AGENT_VERSION ciprod07092019 +ENV AGENT_VERSION healthpreview07182019 ENV HOST_MOUNT_PREFIX /hostfs ENV HOST_PROC /hostfs/proc ENV HOST_SYS /hostfs/sys diff --git a/ci_feature_prod/main.sh b/ci_feature_prod/main.sh index 5813da93..b3399148 100644 --- a/ci_feature_prod/main.sh +++ b/ci_feature_prod/main.sh @@ -36,7 +36,7 @@ if [ -S ${DOCKER_SOCKET} ]; then groupadd -for -g ${DOCKER_GID} ${DOCKER_GROUP} echo "adding omsagent user to local docker group" usermod -aG ${DOCKER_GROUP} ${REGULAR_USER} -fi +fi #Run inotify as a daemon to track changes to the mounted configmap. inotifywait /etc/config/settings --daemon --recursive --outfile "/opt/inotifyoutput.txt" --event create,delete --format '%e : %T' --timefmt '+%s' @@ -48,11 +48,11 @@ else curl --unix-socket /var/run/host/docker.sock "http:/info" | python -c "import sys, json; print json.load(sys.stdin)['Name']" > /var/opt/microsoft/docker-cimprov/state/containerhostname fi #check if file was written successfully. -cat /var/opt/microsoft/docker-cimprov/state/containerhostname +cat /var/opt/microsoft/docker-cimprov/state/containerhostname #resourceid override for loganalytics data. if [ -z $AKS_RESOURCE_ID ]; then - echo "not setting customResourceId" + echo "not setting customResourceId" else export customResourceId=$AKS_RESOURCE_ID echo "export customResourceId=$AKS_RESOURCE_ID" >> ~/.bashrc @@ -63,7 +63,7 @@ fi #set agent config schema version if [ -e "/etc/config/settings/schema-version" ] && [ -s "/etc/config/settings/schema-version" ]; then #trim - config_schema_version="$(cat /etc/config/settings/schema-version | xargs)" + config_schema_version="$(cat /etc/config/settings/schema-version | xargs)" #remove all spaces config_schema_version="${config_schema_version//[[:space:]]/}" #take first 10 characters @@ -92,7 +92,7 @@ fi # Check for internet connectivity RET=`curl -s -o /dev/null -w "%{http_code}" http://www.microsoft.com/` -if [ $RET -eq 200 ]; then +if [ $RET -eq 200 ]; then # Check for workspace existence if [ -e "/etc/omsagent-secret/WSID" ]; then workspaceId=$(cat /etc/omsagent-secret/WSID) @@ -103,7 +103,7 @@ if [ $RET -eq 200 ]; then else echo "LA Onboarding:Workspace Id not mounted" fi -else +else echo "-e error Error resolving host during the onboarding request. 
Check the internet connectivity and/or network policy on the cluster" fi @@ -131,7 +131,7 @@ rm -f /etc/opt/microsoft/omsagent/conf/omsagent.d/omsconfig.consistencyinvoker.c if [ -z $INT ]; then if [ -a /etc/omsagent-secret/DOMAIN ]; then /opt/microsoft/omsagent/bin/omsadmin.sh -w `cat /etc/omsagent-secret/WSID` -s `cat /etc/omsagent-secret/KEY` -d `cat /etc/omsagent-secret/DOMAIN` - elif [ -a /etc/omsagent-secret/WSID ]; then + elif [ -a /etc/omsagent-secret/WSID ]; then /opt/microsoft/omsagent/bin/omsadmin.sh -w `cat /etc/omsagent-secret/WSID` -s `cat /etc/omsagent-secret/KEY` elif [ -a /run/secrets/DOMAIN ]; then /opt/microsoft/omsagent/bin/omsadmin.sh -w `cat /run/secrets/WSID` -s `cat /run/secrets/KEY` -d `cat /run/secrets/DOMAIN` @@ -159,7 +159,7 @@ service cron start #get omsagent and docker-provider versions dpkg -l | grep omsagent | awk '{print $2 " " $3}' -dpkg -l | grep docker-cimprov | awk '{print $2 " " $3}' +dpkg -l | grep docker-cimprov | awk '{print $2 " " $3}' #telegraf & fluentbit requirements if [ ! -e "/etc/config/kube.conf" ]; then @@ -272,7 +272,7 @@ fi /opt/telegraf --version dpkg -l | grep td-agent-bit | awk '{print $2 " " $3}' -#dpkg -l | grep telegraf | awk '{print $2 " " $3}' +#dpkg -l | grep telegraf | awk '{print $2 " " $3}' shutdown() { /opt/microsoft/omsagent/bin/service_control stop diff --git a/ci_feature_prod/setup.sh b/ci_feature_prod/setup.sh index 76a3ee81..914fc4c8 100644 --- a/ci_feature_prod/setup.sh +++ b/ci_feature_prod/setup.sh @@ -14,7 +14,7 @@ wget https://github.com/Microsoft/OMS-Agent-for-Linux/releases/download/OMSAgent #create file to disable omi service startup script touch /etc/.omi_disable_service_control -wget https://github.com/microsoft/Docker-Provider/releases/download/6.0.0.0/docker-cimprov-6.0.0-0.universal.x86_64.sh +wget https://github.com/microsoft/Docker-Provider/releases/download/healthpreview06182019/docker-cimprov-6.0.0-1.universal.x86_64.sh chmod 775 $TMPDIR/*.sh diff --git a/health/HealthAgentOnboarding.ps1 b/health/HealthAgentOnboarding.ps1 new file mode 100644 index 00000000..0b0500eb --- /dev/null +++ b/health/HealthAgentOnboarding.ps1 @@ -0,0 +1,438 @@ +<# + .DESCRIPTION + Upgrades the Kubernetes cluster that has been onboarded to monitoring to a version of the agent + that generates health monitor signals + 1. Installs necessary powershell modules + 2. Onboards Container Insights solution to the supplied LA workspace if not already onboarded + 3. 
Updates the cluster metadata to link the LA workspace ID to the cluster + .PARAMETER aksResourceId + Azure Resource ID of the AKS cluster to be onboarded + .PARAMETER logAnalyticsWorkspaceResourceId + Azure Resource ID of the Log Analytics workspace + .PARAMETER aksResourceLocation + Resource location of the AKS cluster resource +#> +param( + [Parameter(mandatory = $true)] + [string]$aksResourceId, + [Parameter(mandatory = $true)] + [string]$aksResourceLocation, + [Parameter(mandatory = $true)] + [string]$logAnalyticsWorkspaceResourceId +) + + +$OptOutLink = "https://docs.microsoft.com/en-us/azure/azure-monitor/insights/container-insights-optout" + +# check that the required PowerShell modules exist and, if not, request the user's permission to install them +$azAccountModule = Get-Module -ListAvailable -Name Az.Accounts +$azResourcesModule = Get-Module -ListAvailable -Name Az.Resources +$azOperationalInsights = Get-Module -ListAvailable -Name Az.OperationalInsights +$azAks = Get-Module -ListAvailable -Name Az.Aks + +if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null -eq $azOperationalInsights)) { + + $currentPrincipal = New-Object Security.Principal.WindowsPrincipal([Security.Principal.WindowsIdentity]::GetCurrent()) + + if ($currentPrincipal.IsInRole([Security.Principal.WindowsBuiltInRole]::Administrator)) { + Write-Host("Running script as an admin...") + Write-Host("") + } + else { + Write-Host("Please re-launch the script with elevated administrator privileges") -ForegroundColor Red + Stop-Transcript + exit + } + + $message = "This script will try to install the latest versions of the following Modules : ` + Az.Resources, Az.Accounts, Az.Aks and Az.OperationalInsights using the command` + `'Install-Module {Insert Module Name} -Repository PSGallery -Force -AllowClobber -ErrorAction Stop -WarningAction Stop' + `If you do not have the latest version of these Modules, this troubleshooting script may not run." + $question = "Do you want to install the modules and run the script, or just run the script?" + + $choices = New-Object Collections.ObjectModel.Collection[Management.Automation.Host.ChoiceDescription] + $choices.Add((New-Object Management.Automation.Host.ChoiceDescription -ArgumentList '&Yes, Install and run')) + $choices.Add((New-Object Management.Automation.Host.ChoiceDescription -ArgumentList '&Continue without installing the Module')) + $choices.Add((New-Object Management.Automation.Host.ChoiceDescription -ArgumentList '&Quit')) + + $decision = $Host.UI.PromptForChoice($message, $question, $choices, 0) + + switch ($decision) { + 0 { + + if ($null -eq $azResourcesModule) { + try { + Write-Host("Installing Az.Resources...") + Install-Module Az.Resources -Repository PSGallery -Force -AllowClobber -ErrorAction Stop + } + catch { + Write-Host("Close other powershell logins and try installing the latest modules for Az.Resources in a new powershell window: eg. 'Install-Module Az.Resources -Repository PSGallery -Force'") -ForegroundColor Red + exit + } + } + + if ($null -eq $azAccountModule) { + try { + Write-Host("Installing Az.Accounts...") + Install-Module Az.Accounts -Repository PSGallery -Force -AllowClobber -ErrorAction Stop + } + catch { + Write-Host("Close other powershell logins and try installing the latest modules for Az.Accounts in a new powershell window: eg.
'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red + exit + } + } + + if ($null -eq $azOperationalInsights) { + try { + + Write-Host("Installing Az.OperationalInsights...") + Install-Module Az.OperationalInsights -Repository PSGallery -Force -AllowClobber -ErrorAction Stop + } + catch { + Write-Host("Close other powershell logins and try installing the latest modules for Az.OperationalInsights in a new powershell window: eg. 'Install-Module Az.OperationalInsights -Repository PSGallery -Force'") -ForegroundColor Red + exit + } + } + if ($null -eq $azAks) { + try { + + Write-Host("Installing Az.Aks...") + Install-Module Az.Aks -Repository PSGallery -Force -AllowClobber -ErrorAction Stop + } + catch { + Write-Host("Close other powershell logins and try installing the latest modules for Az.Aks in a new powershell window: eg. 'Install-Module Az.Aks -Repository PSGallery -Force'") -ForegroundColor Red + exit + } + } + + + } + 1 { + + if ($null -eq $azResourcesModule) { + try { + Import-Module Az.Resources -ErrorAction Stop + } + catch { + Write-Host("Could not import Az.Resources...") -ForegroundColor Red + Write-Host("Close other powershell logins and try installing the latest modules for Az.Resources in a new powershell window: eg. 'Install-Module Az.Resources -Repository PSGallery -Force'") -ForegroundColor Red + Stop-Transcript + exit + } + } + if ($null -eq $azAccountModule) { + try { + Import-Module Az.Accounts -ErrorAction Stop + } + catch { + Write-Host("Could not import Az.Accounts...") -ForegroundColor Red + Write-Host("Close other powershell logins and try installing the latest modules for Az.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red + Stop-Transcript + exit + } + } + + if ($null -eq $azOperationalInsights) { + try { + Import-Module Az.OperationalInsights -ErrorAction Stop + } + catch { + Write-Host("Could not import Az.OperationalInsights...
Please reinstall this Module") -ForegroundColor Red + Stop-Transcript + exit + } + } + + } + 2 { + Write-Host("") + Stop-Transcript + exit + } + } +} + +if ([string]::IsNullOrEmpty($logAnalyticsWorkspaceResourceId)) { + Write-Host("logAnalyticsWorkspaceResourceId should not be NULL or empty") -ForegroundColor Red + exit +} + +if (($logAnalyticsWorkspaceResourceId -match "/providers/Microsoft.OperationalInsights/workspaces") -eq $false) { + Write-Host("logAnalyticsWorkspaceResourceId should be in valid Azure Resource Id format") -ForegroundColor Red + exit +} + +$workspaceResourceDetails = $logAnalyticsWorkspaceResourceId.Split("/") + +if ($workspaceResourceDetails.Length -ne 9) { + Write-Host("logAnalyticsWorkspaceResourceId should be in valid Azure Resource Id format") -ForegroundColor Red + exit +} + +$workspaceSubscriptionId = $workspaceResourceDetails[2] +$workspaceSubscriptionId = $workspaceSubscriptionId.Trim() +$workspaceResourceGroupName = $workspaceResourceDetails[4] +$workspaceResourceGroupName = $workspaceResourceGroupName.Trim() +$workspaceName = $workspaceResourceDetails[8] +$workspaceName = $workspaceName.Trim() + +$aksResourceDetails = $aksResourceId.Split("/") +$clusterResourceGroupName = $aksResourceDetails[4].Trim() +$clusterSubscriptionId = $aksResourceDetails[2].Trim() +$clusterName = $aksResourceDetails[8].Trim() + +Write-Host("LogAnalytics Workspace SubscriptionId : '" + $workspaceSubscriptionId + "' ") -ForegroundColor Green + +try { + Write-Host("") + Write-Host("Trying to get the current Az login context...") + $account = Get-AzContext -ErrorAction Stop + Write-Host("Successfully fetched the current Az context...") -ForegroundColor Green + Write-Host("") +} +catch { + Write-Host("") + Write-Host("Could not fetch AzContext..." ) -ForegroundColor Red + Write-Host("") +} + + +if ($null -eq $account.Account) { + try { + Write-Host("Please login...") + Connect-AzAccount -subscriptionid $clusterSubscriptionId + } + catch { + Write-Host("") + Write-Host("Could not select subscription with ID : " + $clusterSubscriptionId + ". Please make sure the ID you entered is correct and you have access to the cluster" ) -ForegroundColor Red + Write-Host("") + Stop-Transcript + exit + } +} + +Write-Host("Checking if cluster is onboarded to Container Monitoring") +if ($account.Subscription.Id -eq $clusterSubscriptionId) { + Write-Host("Subscription: $clusterSubscriptionId is already selected. Account details: ") + $account +} +else { + try { + Write-Host("Current Subscription:") + $account + Write-Host("Changing to cluster subscription: $clusterSubscriptionId") + Set-AzContext -SubscriptionId $clusterSubscriptionId + + } + catch { + Write-Host("") + Write-Host("Could not select subscription with ID : " + $clusterSubscriptionId + ". Please make sure the ID you entered is correct and you have access to the cluster" ) -ForegroundColor Red + Write-Host("") + Stop-Transcript + exit + } +} + +try { + $resources = Get-AzResource -ResourceGroupName $clusterResourceGroupName -Name $clusterName -ResourceType "Microsoft.ContainerService/managedClusters" -ExpandProperties -ErrorAction Stop -WarningAction Stop + $clusterResource = $resources[0] + + $props = ($clusterResource.Properties | ConvertTo-Json).toLower() | ConvertFrom-Json + + if ($true -eq $props.addonprofiles.omsagent.enabled -and $null -ne $props.addonprofiles.omsagent.config) { + Write-Host("Your cluster is already onboarded to Azure monitor for containers.
Please refer to the following documentation to opt out, then re-run this script:") -ForegroundColor Red; + Write-Host("") + Write-Host($OptOutLink) -ForegroundColor Red + Write-Host("") + throw + } + + Write-Host("Setting context to the current cluster") + Import-AzAksCredential -Id $aksResourceId -Force + $omsagentCount = kubectl get pods -n kube-system | Select-String omsagent + if ($null -eq $omsagentCount) { + Write-Host ("OmsAgent is not running. Proceeding to do custom onboarding for Health Agent") + } + else { + Write-Host ("Cluster is not enabled for Monitoring, but omsagent pods were detected. Please wait 30 minutes to ensure that the omsagent containers are completely stopped, then re-run this script") -ForegroundColor Red + Stop-Transcript + exit + } +} +catch { + Write-Host("Error while checking whether the cluster is already onboarded") + exit +} + + +if ($account.Subscription.Id -eq $workspaceSubscriptionId) { + Write-Host("Subscription: $workspaceSubscriptionId is already selected. Account details: ") + $account +} +else { + try { + Write-Host("Current Subscription:") + $account + Write-Host("Changing to workspace subscription: $workspaceSubscriptionId") + Set-AzContext -SubscriptionId $workspaceSubscriptionId + } + catch { + Write-Host("") + Write-Host("Could not select subscription with ID : " + $workspaceSubscriptionId + ". Please make sure the ID you entered is correct and you have access to the workspace" ) -ForegroundColor Red + Write-Host("") + Stop-Transcript + exit + } +} + +$WorkspaceInformation = Get-AzOperationalInsightsWorkspace -ResourceGroupName $workspaceResourceGroupName -Name $workspaceName -ErrorAction Stop +$key = (Get-AzOperationalInsightsWorkspaceSharedKeys -ResourceGroupName $workspaceResourceGroupName -Name $workspaceName).PrimarySharedKey +$wsid = $WorkspaceInformation.CustomerId +$base64EncodedKey = [System.Convert]::ToBase64String([System.Text.Encoding]::UTF8.GetBytes($key)) +$base64EncodedWsId = [System.Convert]::ToBase64String([System.Text.Encoding]::UTF8.GetBytes($wsid)) +Write-Host("Successfully verified that the specified logAnalyticsWorkspaceResourceId is valid and exists...") -ForegroundColor Green +$WorkspaceLocation = $WorkspaceInformation.Location + +if ($null -eq $WorkspaceLocation) { + Write-Host("") + Write-Host("Cannot fetch workspace location. Please try again...") -ForegroundColor Red + Write-Host("") + Stop-Transcript + exit +} + +try { + $WorkspaceIPDetails = Get-AzOperationalInsightsIntelligencePacks -ResourceGroupName $workspaceResourceGroupName -WorkspaceName $workspaceName -ErrorAction Stop + Write-Host("Successfully fetched workspace IP details...") -ForegroundColor Green + Write-Host("") +} +catch { + Write-Host("") + Write-Host("Failed to get the list of solutions onboarded to the workspace.
Please make sure that it hasn't been deleted and you have access to it.") -ForegroundColor Red + Write-Host("") + Stop-Transcript + exit +} + +try { + $ContainerInsightsIndex = $WorkspaceIPDetails.Name.IndexOf("ContainerInsights"); + Write-Host("Successfully located ContainerInsights solution") -ForegroundColor Green + Write-Host("") +} +catch { + Write-Host("Failed to get ContainerInsights solution details from the workspace") -ForegroundColor Red + Write-Host("") + Stop-Transcript + exit +} + +$isSolutionOnboarded = $WorkspaceIPDetails.Enabled[$ContainerInsightsIndex] + +if ($false -eq $isSolutionOnboarded) { + + $DeploymentName = "ContainerInsightsSolutionOnboarding-" + ((Get-Date).ToUniversalTime()).ToString('MMdd-HHmm') + $Parameters = @{ } + $Parameters.Add("workspaceResourceId", $logAnalyticsWorkspaceResourceID) + $Parameters.Add("workspaceRegion", $WorkspaceLocation) + $Parameters + + try { + New-AzResourceGroupDeployment -Name $DeploymentName ` + -ResourceGroupName $workspaceResourceGroupName ` + -TemplateUri https://raw.githubusercontent.com/Microsoft/OMS-docker/ci_feature/docs/templates/azuremonitor-containerSolution.json ` + -TemplateParameterObject $Parameters -ErrorAction Stop + + + Write-Host("Successfully added Container Insights Solution") -ForegroundColor Green + + } + catch { + Write-Host ("Template deployment failed with an error: '" + $Error[0] + "' ") -ForegroundColor Red + Write-Host("Please contact us by emailing askcoin@microsoft.com for help") -ForegroundColor Red + } + +} + +Write-Host("Container Insights Solution is now onboarded to workspace " + $workspaceName) -ForegroundColor Green + +try { + $Parameters = @{ } + $Parameters.Add("aksResourceId", $aksResourceId) + $Parameters.Add("aksResourceLocation", $aksResourceLocation) + $Parameters.Add("workspaceResourceId", $logAnalyticsWorkspaceResourceId) + $DeploymentName = "ClusterHealthOnboarding-" + ((Get-Date).ToUniversalTime()).ToString('MMdd-HHmm') + $Parameters + + Write-Host "Onboarding cluster to the provided LA workspace" + + if ($account.Subscription.Id -eq $clusterSubscriptionId) { + Write-Host("Subscription: $clusterSubscriptionId is already selected. Account details: ") + $account + } + else { + try { + Write-Host("Current Subscription:") + $account + Write-Host("Changing to subscription: $clusterSubscriptionId") + Set-AzContext -SubscriptionId $clusterSubscriptionId + } + catch { + Write-Host("") + Write-Host("Could not select subscription with ID : " + $clusterSubscriptionId + ".
Please make sure the ID you entered is correct and you have access to the cluster" ) -ForegroundColor Red + Write-Host("") + Stop-Transcript + exit + } + } + + Write-Host("Getting Tags for restoring later") + $tags = (Get-AzAks -Id $aksResourceId).Tags + + Write-Host("Enabling Custom Monitoring using template deployment") + New-AzResourceGroupDeployment -Name $DeploymentName ` + -ResourceGroupName $clusterResourceGroupName ` + -TemplateUri https://raw.githubusercontent.com/Microsoft/OMS-docker/dilipr/onboardHealth/health/customOnboarding.json ` + -TemplateParameterObject $Parameters -ErrorAction Stop + + Write-Host("") + + Write-Host("Successfully custom-onboarded cluster to Monitoring") -ForegroundColor Green + + Set-AzResource -ResourceId $aksResourceId -Tag $tags -Force + Write-Host("Successfully restored tags") + + Write-Host("") +} +catch { + Write-Host ("Template deployment failed with an error: '" + $Error[0] + "' ") -ForegroundColor Red + exit + #Write-Host("Please contact us by emailing askcoin@microsoft.com for help") -ForegroundColor Red +} + +$desktopPath = "~" +if (-not (test-path $desktopPath/deployments) ) { + Write-Host "$($desktopPath)/deployments doesn't exist, creating it" + mkdir $desktopPath/deployments | out-null +} +else { + Write-Host "$($desktopPath)/deployments exists, no need to create it" +} +try { + + $aksResourceDetails = $aksResourceId.Split("/") + if ($aksResourceDetails.Length -ne 9) { + Write-Host("aksResourceId should be in valid Azure Resource Id format") -ForegroundColor Red + exit + } + $clusterName = $aksResourceDetails[8].Trim() + $clusterResourceGroupName = $aksResourceDetails[4].Trim() + Import-AzAksCredential -Id $aksResourceId -Force + Invoke-WebRequest https://raw.githubusercontent.com/Microsoft/OMS-docker/dilipr/kubeHealth/health/omsagent-template.yaml -OutFile $desktopPath/omsagent-template.yaml + + (Get-Content -Path $desktopPath/omsagent-template.yaml -Raw) -replace 'VALUE_AKS_RESOURCE_ID', $aksResourceId -replace 'VALUE_AKS_REGION', $aksResourceLocation -replace 'VALUE_WSID', $base64EncodedWsId -replace 'VALUE_KEY', $base64EncodedKey -replace 'VALUE_ACS_RESOURCE_NAME', $acsResourceName | Set-Content $desktopPath/deployments/omsagent-$clusterName.yaml + kubectl apply -f $desktopPath/deployments/omsagent-$clusterName.yaml + Write-Host "Successfully onboarded to health model omsagent" -ForegroundColor Green +} +catch { + Write-Host ("Agent deployment failed with an error: '" + $Error[0] + "' ") -ForegroundColor Red +} diff --git a/health/HealthOnboarding.md b/health/HealthOnboarding.md new file mode 100644 index 00000000..87a70df2 --- /dev/null +++ b/health/HealthOnboarding.md @@ -0,0 +1,40 @@ +## Overview +The following documentation outlines the steps required to upgrade an existing cluster, already onboarded to a Log Analytics workspace and running the omsagent, to a version of the agent that generates health monitor signals into the same workspace. + +### Onboarding using a script (AKS) +We have a handy [script](https://github.com/Microsoft/OMS-docker/blob/dilipr/kubeHealth/health/HealthAgentOnboarding.ps1) which can onboard your AKS clusters to a version of the agent that can generate the health model. Read on to find out more; a sample invocation is shown below.
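+ +For example, a sketch of an invocation with placeholder values (substitute your own subscription, resource group, cluster, and workspace names; the resource ID shapes follow the validation rules in the script): + + .\HealthAgentOnboarding.ps1 -aksResourceId "/subscriptions/<subscriptionId>/resourceGroups/<resourceGroupName>/providers/Microsoft.ContainerService/managedClusters/<clusterName>" -aksResourceLocation "eastus" -logAnalyticsWorkspaceResourceId "/subscriptions/<subscriptionId>/resourceGroups/<resourceGroupName>/providers/Microsoft.OperationalInsights/workspaces/<workspaceName>"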
#### Script Prerequisites +* The script should be run from an elevated (administrator) command prompt +* kubectl should be installed and present in the PATH + +#### What does the script do: +* Installs necessary powershell modules +* Onboards Container Insights solution to the supplied LA workspace if not already onboarded +* Updates the cluster metadata to link the LA workspace ID to the cluster +* Installs the new agent that generates health monitor signals (using kubectl) + +#### Script Execution +* Download the script from [here](https://github.com/Microsoft/OMS-docker/blob/dilipr/kubeHealth/health/HealthAgentOnboarding.ps1) +* Run the script: + .\HealthAgentOnboarding.ps1 -aksResourceId <aksResourceId> -aksResourceLocation <aksResourceLocation> + -logAnalyticsWorkspaceResourceId <logAnalyticsWorkspaceResourceId> (e.g. /subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourceGroups/dilipr-health-preview/providers/Microsoft.OperationalInsights/workspaces/dilipr-health-preview) + * Please make sure the right location of the AKS cluster is passed to the script (without spaces, e.g. eastus, southcentralus) + +#### Viewing the health model +* Navigate to the Insights page of your cluster in the Azure portal +* There should be a new tab named "Health" in Cluster Insights +* Note: It might take about 15-20 min after the script runs for the data to show up in the Insights page of the cluster + + +### AKS Engine Onboarding +If your cluster is already onboarded to Monitoring, proceed directly to step 4 and continue from there. +1. Add the Container Insights solution to your workspace using the instructions [here](http://aka.ms/coinhelmdoc) +2. Tag your AKS-Engine cluster appropriately using the instructions [here](http://aka.ms/coin-acs-tag-doc) +3. Set the current k8s context to your AKS-Engine cluster (the kube-config should refer to your AKS-Engine cluster) +4. Download the [omsagent-template-aks-engine.yaml](https://github.com/microsoft/OMS-docker/blob/dilipr/kubeHealth/health/omsagent-template-aks-engine.yaml) file to your local machine +5. In the file downloaded in step 4, update the values of VALUE_ACS_RESOURCE_NAME, VALUE_WSID {base64-encoded workspace id} and VALUE_KEY {base64-encoded workspace key}. See [here](https://github.com/Azure/aks-engine/blob/master/examples/addons/container-monitoring/README.md) for instructions on getting the Workspace ID and Key +6. Run kubectl delete on the file {kubectl delete -f path_to_file_in_step_4} +7. Run kubectl apply on the file {kubectl apply -f path_to_file_in_step_4} + + diff --git a/health/customOnboarding.json b/health/customOnboarding.json new file mode 100644 index 00000000..ecccc2ea --- /dev/null +++ b/health/customOnboarding.json @@ -0,0 +1,44 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "aksResourceId": { + "type": "string", + "metadata": { + "description": "AKS Cluster resource id" + } + }, + "aksResourceLocation": { + "type": "string", + "metadata": { + "description": "Location of the AKS resource e.g.
\"East US\"" + } + }, + "workspaceResourceId": { + "type": "string", + "metadata": { + "description": "Azure Monitor Log Analytics Resource ID" + } + } + }, + "resources": [ + { + "name": "[split(parameters('aksResourceId'),'/')[8]]", + "type": "Microsoft.ContainerService/managedClusters", + "location": "[parameters('aksResourceLocation')]", + "apiVersion": "2018-03-31", + "properties": { + "mode": "Incremental", + "id": "[parameters('aksResourceId')]", + "addonProfiles": { + "omsagent": { + "enabled": false, + "config": { + "logAnalyticsWorkspaceResourceID": "[parameters('workspaceResourceId')]" + } + } + } + } + } + ] +} \ No newline at end of file diff --git a/health/omsagent-template-aks-engine.yaml b/health/omsagent-template-aks-engine.yaml new file mode 100644 index 00000000..e9683d32 --- /dev/null +++ b/health/omsagent-template-aks-engine.yaml @@ -0,0 +1,590 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: omsagent + namespace: kube-system +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1beta1 +metadata: + name: omsagent-reader +rules: +- apiGroups: [""] + resources: ["pods", "events", "nodes", "namespaces", "services"] + verbs: ["list", "get", "watch"] +- apiGroups: ["extensions"] + resources: ["deployments"] + verbs: ["list"] +- nonResourceURLs: ["/metrics"] + verbs: ["get"] +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1beta1 +metadata: + name: omsagentclusterrolebinding +subjects: + - kind: ServiceAccount + name: omsagent + namespace: kube-system +roleRef: + kind: ClusterRole + name: omsagent-reader + apiGroup: rbac.authorization.k8s.io +--- +kind: ConfigMap +apiVersion: v1 +data: + kube.conf: |- + # Fluentd config file for OMS Docker - cluster components (kubeAPI) + #fluent forward plugin + + type forward + port 25235 + bind 0.0.0.0 + + + #Kubernetes pod inventory + + type kubepodinventory + tag oms.containerinsights.KubePodInventory + run_interval 60s + log_level debug + + + #Kubernetes events + + type kubeevents + tag oms.containerinsights.KubeEvents + run_interval 60s + log_level debug + + + #Kubernetes logs + + type kubelogs + tag oms.api.KubeLogs + run_interval 60s + + + #Kubernetes services + + type kubeservices + tag oms.containerinsights.KubeServices + run_interval 60s + log_level debug + + + #Kubernetes Nodes + + type kubenodeinventory + tag oms.containerinsights.KubeNodeInventory + run_interval 60s + log_level debug + + + #Kubernetes perf + + type kubeperf + tag oms.api.KubePerf + run_interval 60s + log_level debug + + + #Kubernetes health + + type kubehealth + tag oms.api.KubeHealth.ReplicaSet + run_interval 60s + log_level debug + + + #cadvisor perf- Windows nodes + + type wincadvisorperf + tag oms.api.wincadvisorperf + run_interval 60s + log_level debug + + + + type filter_inventory2mdm + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope + log_level info + + + # custom_metrics_mdm filter plugin for perf data from windows nodes + + type filter_cadvisor2mdm + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes + log_level info + + #health model aggregation filter + + type filter_health_model_builder + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_kubepods*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + 
retry_limit 10 + retry_wait 30s + max_retry_wait 9m + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 5m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_kubeevents*.buffer + buffer_queue_limit 10 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + + + + type out_oms_api + log_level debug + buffer_chunk_limit 10m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_api_kubernetes_logs*.buffer + buffer_queue_limit 10 + flush_interval 20s + retry_limit 10 + retry_wait 30s + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_kubeservices*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/state/out_oms_kubenodes*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + + + + type out_oms + log_level debug + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_containernodeinventory*.buffer + buffer_queue_limit 20 + flush_interval 20s + retry_limit 10 + retry_wait 15s + max_retry_wait 9m + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_kubeperf*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + + + + type out_mdm + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_mdm_*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + retry_mdm_post_wait_minutes 60 + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_api_wincadvisorperf*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + + + + type out_mdm + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_mdm_cdvisorperf*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + retry_mdm_post_wait_minutes 60 + + + + type out_oms_api + log_level debug + buffer_chunk_limit 10m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_api_kubehealth*.buffer + buffer_queue_limit 10 + flush_interval 20s + retry_limit 10 + retry_wait 30s + +metadata: + name: omsagent-rs-config + namespace: kube-system +--- +apiVersion: v1 +kind: Secret +metadata: + name: omsagent-secret + namespace: kube-system +type: Opaque +data: + #BASE64 ENCODED (Both WSID & KEY) INSIDE DOUBLE QUOTE ("") + WSID: "VALUE_WSID" + KEY: "VALUE_KEY" +--- +kind: Service +apiVersion: v1 +metadata: + name: replicaset-service + namespace: kube-system +spec: + selector: + rsName: "omsagent-rs" + ports: + - protocol: TCP + port: 25235 + targetPort: in-rs-tcp +--- +kind: StorageClass +apiVersion: storage.k8s.io/v1 +metadata: + name: azurefile +provisioner: kubernetes.io/azure-file +mountOptions: + - 
dir_mode=0777 + - file_mode=0777 + - uid=1000 + - gid=1000 +parameters: + skuName: Standard_LRS +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: system:azure-cloud-provider +rules: +- apiGroups: [''] + resources: ['secrets'] + verbs: ['get','create'] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: system:azure-cloud-provider +roleRef: + kind: ClusterRole + apiGroup: rbac.authorization.k8s.io + name: system:azure-cloud-provider +subjects: +- kind: ServiceAccount + name: persistent-volume-binder + namespace: kube-system +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: azurefile + namespace: kube-system +spec: + accessModes: + - ReadWriteMany + storageClassName: azurefile + resources: + requests: + storage: 10Mi +--- +apiVersion: extensions/v1beta1 +kind: DaemonSet +metadata: + name: omsagent + namespace: kube-system +spec: + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + dsName: "omsagent-ds" + annotations: + agentVersion: "1.10.0.1" + dockerProviderVersion: "5.0.0-1" + schema-versions: "v1" + spec: + serviceAccountName: omsagent + containers: + - name: omsagent + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview07182019" + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: 150m + memory: 300Mi + requests: + cpu: 75m + memory: 225Mi + env: + # - name: AKS_RESOURCE_ID + # value: "VALUE_AKS_RESOURCE_ID" + # - name: AKS_REGION + # value: "VALUE_AKS_REGION" + # Uncomment below two lines for ACS clusters and set the cluster names manually. Also comment out the above two lines for ACS clusters + - name: ACS_RESOURCE_NAME + value: "VALUE_ACS_RESOURCE_NAME" + - name: DISABLE_KUBE_SYSTEM_LOG_COLLECTION + value: "true" + - name: CONTROLLER_TYPE + value: "DaemonSet" + - name: NODE_IP + valueFrom: + fieldRef: + fieldPath: status.hostIP + securityContext: + privileged: true + ports: + - containerPort: 25225 + protocol: TCP + - containerPort: 25224 + protocol: UDP + volumeMounts: + - mountPath: /hostfs + name: host-root + readOnly: true + - mountPath: /var/run/host + name: docker-sock + - mountPath: /var/log + name: host-log + - mountPath: /var/lib/docker/containers + name: containerlog-path + - mountPath: /etc/kubernetes/host + name: azure-json-path + - mountPath: /etc/omsagent-secret + name: omsagent-secret + - mountPath: /etc/config/settings + name: settings-vol-config + readOnly: true + livenessProbe: + exec: + command: + - /bin/bash + - -c + - /opt/livenessprobe.sh + initialDelaySeconds: 60 + periodSeconds: 60 + nodeSelector: + beta.kubernetes.io/os: linux + # Tolerate a NoSchedule taint on master that ACS Engine sets. 
+ tolerations: + - key: "node-role.kubernetes.io/master" + operator: "Equal" + value: "true" + effect: "NoSchedule" + volumes: + - name: host-root + hostPath: + path: / + - name: docker-sock + hostPath: + path: /var/run + - name: container-hostname + hostPath: + path: /etc/hostname + - name: host-log + hostPath: + path: /var/log + - name: containerlog-path + hostPath: + path: /var/lib/docker/containers + - name: azure-json-path + hostPath: + path: /etc/kubernetes + - name: omsagent-secret + secret: + secretName: omsagent-secret + - name: settings-vol-config + configMap: + name: container-azm-ms-agentconfig + optional: true +--- +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: omsagent-rs + namespace: kube-system +spec: + replicas: 1 + selector: + matchLabels: + rsName: "omsagent-rs" + strategy: + type: RollingUpdate + template: + metadata: + labels: + rsName: "omsagent-rs" + annotations: + agentVersion: "1.10.0.1" + dockerProviderVersion: "5.0.0-1" + schema-versions: "v1" + spec: + serviceAccountName: omsagent + containers: + - name: omsagent + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview07182019" + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: 150m + memory: 500Mi + requests: + cpu: 50m + memory: 175Mi + env: + # - name: AKS_RESOURCE_ID + # value: "VALUE_AKS_RESOURCE_ID" + # - name: AKS_REGION + # value: "VALUE_AKS_REGION" + # Uncomment below two lines for ACS clusters and set the cluster names manually. Also comment out the above two lines for ACS clusters + - name: ACS_RESOURCE_NAME + value: "aks-engine-health" + - name: DISABLE_KUBE_SYSTEM_LOG_COLLECTION + value: "true" + - name: CONTROLLER_TYPE + value: "ReplicaSet" + - name: NODE_IP + valueFrom: + fieldRef: + fieldPath: status.hostIP + securityContext: + privileged: true + ports: + - containerPort: 25225 + protocol: TCP + - containerPort: 25224 + protocol: UDP + - containerPort: 25235 + protocol: TCP + name: in-rs-tcp + volumeMounts: + - mountPath: /var/run/host + name: docker-sock + - mountPath: /var/log + name: host-log + - mountPath: /var/lib/docker/containers + name: containerlog-path + - mountPath: /etc/kubernetes/host + name: azure-json-path + - mountPath: /etc/omsagent-secret + name: omsagent-secret + readOnly: true + - mountPath : /etc/config + name: omsagent-rs-config + - mountPath: /etc/config/settings + name: settings-vol-config + readOnly: true + - mountPath: "/mnt/azure" + name: azurefile-pv + livenessProbe: + exec: + command: + - /bin/bash + - -c + - ps -ef | grep omsagent | grep -v "grep" + initialDelaySeconds: 60 + periodSeconds: 60 + nodeSelector: + beta.kubernetes.io/os: linux + kubernetes.io/role: agent + volumes: + - name: docker-sock + hostPath: + path: /var/run + - name: container-hostname + hostPath: + path: /etc/hostname + - name: host-log + hostPath: + path: /var/log + - name: containerlog-path + hostPath: + path: /var/lib/docker/containers + - name: azure-json-path + hostPath: + path: /etc/kubernetes + - name: omsagent-secret + secret: + secretName: omsagent-secret + - name: omsagent-rs-config + configMap: + name: omsagent-rs-config + - name: settings-vol-config + configMap: + name: container-azm-ms-agentconfig + optional: true + - name: azurefile-pv + persistentVolumeClaim: + claimName: azurefile diff --git a/health/omsagent-template.yaml b/health/omsagent-template.yaml new file mode 100644 index 00000000..845569e8 --- /dev/null +++ b/health/omsagent-template.yaml @@ -0,0 +1,590 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: 
omsagent + namespace: kube-system +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1beta1 +metadata: + name: omsagent-reader +rules: +- apiGroups: [""] + resources: ["pods", "events", "nodes", "namespaces", "services"] + verbs: ["list", "get", "watch"] +- apiGroups: ["extensions"] + resources: ["deployments"] + verbs: ["list"] +- nonResourceURLs: ["/metrics"] + verbs: ["get"] +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1beta1 +metadata: + name: omsagentclusterrolebinding +subjects: + - kind: ServiceAccount + name: omsagent + namespace: kube-system +roleRef: + kind: ClusterRole + name: omsagent-reader + apiGroup: rbac.authorization.k8s.io +--- +kind: ConfigMap +apiVersion: v1 +data: + kube.conf: |- + # Fluentd config file for OMS Docker - cluster components (kubeAPI) + #fluent forward plugin + + type forward + port 25235 + bind 0.0.0.0 + + + #Kubernetes pod inventory + + type kubepodinventory + tag oms.containerinsights.KubePodInventory + run_interval 60s + log_level debug + + + #Kubernetes events + + type kubeevents + tag oms.containerinsights.KubeEvents + run_interval 60s + log_level debug + + + #Kubernetes logs + + type kubelogs + tag oms.api.KubeLogs + run_interval 60s + + + #Kubernetes services + + type kubeservices + tag oms.containerinsights.KubeServices + run_interval 60s + log_level debug + + + #Kubernetes Nodes + + type kubenodeinventory + tag oms.containerinsights.KubeNodeInventory + run_interval 60s + log_level debug + + + #Kubernetes perf + + type kubeperf + tag oms.api.KubePerf + run_interval 60s + log_level debug + + + #Kubernetes health + + type kubehealth + tag oms.api.KubeHealth.ReplicaSet + run_interval 60s + log_level debug + + + #cadvisor perf- Windows nodes + + type wincadvisorperf + tag oms.api.wincadvisorperf + run_interval 60s + log_level debug + + + + type filter_inventory2mdm + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope + log_level info + + + # custom_metrics_mdm filter plugin for perf data from windows nodes + + type filter_cadvisor2mdm + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes + log_level info + + #health model aggregation filter + + type filter_health_model_builder + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_kubepods*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 5m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_kubeevents*.buffer + buffer_queue_limit 10 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + + + + type out_oms_api + log_level debug + buffer_chunk_limit 10m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_api_kubernetes_logs*.buffer + buffer_queue_limit 10 + flush_interval 20s + retry_limit 10 + retry_wait 30s + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_kubeservices*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + + + + type out_oms + log_level debug + num_threads 5 + 
buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/state/out_oms_kubenodes*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + + + + type out_oms + log_level debug + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_containernodeinventory*.buffer + buffer_queue_limit 20 + flush_interval 20s + retry_limit 10 + retry_wait 15s + max_retry_wait 9m + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_kubeperf*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + + + + type out_mdm + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_mdm_*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + retry_mdm_post_wait_minutes 60 + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_api_wincadvisorperf*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + + + + type out_mdm + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_mdm_cdvisorperf*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + retry_mdm_post_wait_minutes 60 + + + + type out_oms_api + log_level debug + buffer_chunk_limit 10m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_api_kubehealth*.buffer + buffer_queue_limit 10 + flush_interval 20s + retry_limit 10 + retry_wait 30s + +metadata: + name: omsagent-rs-config + namespace: kube-system +--- +apiVersion: v1 +kind: Secret +metadata: + name: omsagent-secret + namespace: kube-system +type: Opaque +data: + #BASE64 ENCODED (Both WSID & KEY) INSIDE DOUBLE QUOTE ("") + WSID: "VALUE_WSID" + KEY: "VALUE_KEY" +--- +kind: Service +apiVersion: v1 +metadata: + name: replicaset-service + namespace: kube-system +spec: + selector: + rsName: "omsagent-rs" + ports: + - protocol: TCP + port: 25235 + targetPort: in-rs-tcp +--- +kind: StorageClass +apiVersion: storage.k8s.io/v1 +metadata: + name: azurefile +provisioner: kubernetes.io/azure-file +mountOptions: + - dir_mode=0777 + - file_mode=0777 + - uid=1000 + - gid=1000 +parameters: + skuName: Standard_LRS +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: system:azure-cloud-provider +rules: +- apiGroups: [''] + resources: ['secrets'] + verbs: ['get','create'] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: system:azure-cloud-provider +roleRef: + kind: ClusterRole + apiGroup: rbac.authorization.k8s.io + name: system:azure-cloud-provider +subjects: +- kind: ServiceAccount + name: persistent-volume-binder + namespace: kube-system +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: azurefile + namespace: kube-system +spec: + accessModes: + - ReadWriteMany + storageClassName: azurefile + resources: + requests: + storage: 10Mi +--- +apiVersion: extensions/v1beta1 +kind: DaemonSet +metadata: + name: omsagent + namespace: kube-system +spec: + 
updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + dsName: "omsagent-ds" + annotations: + agentVersion: "1.10.0.1" + dockerProviderVersion: "5.0.0-1" + schema-versions: "v1" + spec: + serviceAccountName: omsagent + containers: + - name: omsagent + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview07182019" + imagePullPolicy: Always + resources: + limits: + cpu: 150m + memory: 300Mi + requests: + cpu: 75m + memory: 225Mi + env: + - name: AKS_RESOURCE_ID + value: "VALUE_AKS_RESOURCE_ID" + - name: AKS_REGION + value: "VALUE_AKS_REGION" + # Uncomment below two lines for ACS clusters and set the cluster names manually. Also comment out the above two lines for ACS clusters + # - name: ACS_RESOURCE_NAME + # value: "aks-engine-health" + - name: DISABLE_KUBE_SYSTEM_LOG_COLLECTION + value: "true" + - name: CONTROLLER_TYPE + value: "DaemonSet" + - name: NODE_IP + valueFrom: + fieldRef: + fieldPath: status.hostIP + securityContext: + privileged: true + ports: + - containerPort: 25225 + protocol: TCP + - containerPort: 25224 + protocol: UDP + volumeMounts: + - mountPath: /hostfs + name: host-root + readOnly: true + - mountPath: /var/run/host + name: docker-sock + - mountPath: /var/log + name: host-log + - mountPath: /var/lib/docker/containers + name: containerlog-path + - mountPath: /etc/kubernetes/host + name: azure-json-path + - mountPath: /etc/omsagent-secret + name: omsagent-secret + - mountPath: /etc/config/settings + name: settings-vol-config + readOnly: true + livenessProbe: + exec: + command: + - /bin/bash + - -c + - /opt/livenessprobe.sh + initialDelaySeconds: 60 + periodSeconds: 60 + nodeSelector: + beta.kubernetes.io/os: linux + # Tolerate a NoSchedule taint on master that ACS Engine sets. + tolerations: + - key: "node-role.kubernetes.io/master" + operator: "Equal" + value: "true" + effect: "NoSchedule" + volumes: + - name: host-root + hostPath: + path: / + - name: docker-sock + hostPath: + path: /var/run + - name: container-hostname + hostPath: + path: /etc/hostname + - name: host-log + hostPath: + path: /var/log + - name: containerlog-path + hostPath: + path: /var/lib/docker/containers + - name: azure-json-path + hostPath: + path: /etc/kubernetes + - name: omsagent-secret + secret: + secretName: omsagent-secret + - name: settings-vol-config + configMap: + name: container-azm-ms-agentconfig + optional: true +--- +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: omsagent-rs + namespace: kube-system +spec: + replicas: 1 + selector: + matchLabels: + rsName: "omsagent-rs" + strategy: + type: RollingUpdate + template: + metadata: + labels: + rsName: "omsagent-rs" + annotations: + agentVersion: "1.10.0.1" + dockerProviderVersion: "5.0.0-1" + schema-versions: "v1" + spec: + serviceAccountName: omsagent + containers: + - name: omsagent + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview07182019" + imagePullPolicy: Always + resources: + limits: + cpu: 150m + memory: 500Mi + requests: + cpu: 50m + memory: 175Mi + env: + - name: AKS_RESOURCE_ID + value: "VALUE_AKS_RESOURCE_ID" + - name: AKS_REGION + value: "VALUE_AKS_REGION" + #Uncomment below two lines for ACS clusters and set the cluster names manually. 
Also comment out the above two lines for ACS clusters + # - name: ACS_RESOURCE_NAME + # value: "aks-engine-health" + - name: DISABLE_KUBE_SYSTEM_LOG_COLLECTION + value: "true" + - name: CONTROLLER_TYPE + value: "ReplicaSet" + - name: NODE_IP + valueFrom: + fieldRef: + fieldPath: status.hostIP + securityContext: + privileged: true + ports: + - containerPort: 25225 + protocol: TCP + - containerPort: 25224 + protocol: UDP + - containerPort: 25235 + protocol: TCP + name: in-rs-tcp + volumeMounts: + - mountPath: /var/run/host + name: docker-sock + - mountPath: /var/log + name: host-log + - mountPath: /var/lib/docker/containers + name: containerlog-path + - mountPath: /etc/kubernetes/host + name: azure-json-path + - mountPath: /etc/omsagent-secret + name: omsagent-secret + readOnly: true + - mountPath : /etc/config + name: omsagent-rs-config + - mountPath: /etc/config/settings + name: settings-vol-config + readOnly: true + - mountPath: "/mnt/azure" + name: azurefile-pv + livenessProbe: + exec: + command: + - /bin/bash + - -c + - ps -ef | grep omsagent | grep -v "grep" + initialDelaySeconds: 60 + periodSeconds: 60 + nodeSelector: + beta.kubernetes.io/os: linux + kubernetes.io/role: agent + volumes: + - name: docker-sock + hostPath: + path: /var/run + - name: container-hostname + hostPath: + path: /etc/hostname + - name: host-log + hostPath: + path: /var/log + - name: containerlog-path + hostPath: + path: /var/lib/docker/containers + - name: azure-json-path + hostPath: + path: /etc/kubernetes + - name: omsagent-secret + secret: + secretName: omsagent-secret + - name: omsagent-rs-config + configMap: + name: omsagent-rs-config + - name: settings-vol-config + configMap: + name: container-azm-ms-agentconfig + optional: true + - name: azurefile-pv + persistentVolumeClaim: + claimName: azurefile diff --git a/health/optouttemplate.json b/health/optouttemplate.json new file mode 100644 index 00000000..b036aba2 --- /dev/null +++ b/health/optouttemplate.json @@ -0,0 +1,36 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "aksResourceId": { + "type": "string", + "metadata": { + "description": "AKS Cluster resource id" + } + }, + "aksResourceLocation": { + "type": "string", + "metadata": { + "description": "Location of the AKS resource e.g. \"East US\"" + } + } + }, + "resources": [ + { + "name": "[split(parameters('aksResourceId'),'/')[8]]", + "type": "Microsoft.ContainerService/managedClusters", + "location": "[parameters('aksResourceLocation')]", + "apiVersion": "2018-03-31", + "properties": { + "mode": "Incremental", + "id": "[parameters('aksResourceId')]", + "addonProfiles": { + "omsagent": { + "enabled": false, + "config": null + } + } + } + } + ] +}
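A note on the opt-out template: optouttemplate.json disables the omsagent addon profile on the cluster, and it can be deployed with the same New-AzResourceGroupDeployment pattern the onboarding script uses. A minimal sketch, assuming a local copy of the template (the deployment name, resource group, and placeholder IDs below are illustrative, not part of the source):

    # Placeholder values - substitute your own cluster resource ID, location, and resource group
    $Parameters = @{ }
    $Parameters.Add("aksResourceId", "<aksResourceId>")
    $Parameters.Add("aksResourceLocation", "<aksResourceLocation>")
    # Deploy the opt-out template against the cluster's resource group
    New-AzResourceGroupDeployment -Name "ContainerInsightsOptOut" -ResourceGroupName "<clusterResourceGroupName>" -TemplateFile ./health/optouttemplate.json -TemplateParameterObject $Parameters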