From 6e8832e6277e398d62fd1d175b1b2cd173b296f1 Mon Sep 17 00:00:00 2001 From: r-dilip Date: Thu, 11 Apr 2019 13:00:44 -0700 Subject: [PATCH 01/55] Pushing to remote for generating image --- Kubernetes/omsagent.yaml | 36 ++++++++++++++++++++++++++++-------- ci_feature/Dockerfile | 2 +- ci_feature/setup.sh | 5 +++-- ci_feature_prod/Dockerfile | 2 +- ci_feature_prod/setup.sh | 7 +++++-- 5 files changed, 38 insertions(+), 14 deletions(-) diff --git a/Kubernetes/omsagent.yaml b/Kubernetes/omsagent.yaml index c45145ce..0c09e034 100644 --- a/Kubernetes/omsagent.yaml +++ b/Kubernetes/omsagent.yaml @@ -79,6 +79,14 @@ data: log_level debug + #Kubernetes health + + type kubehealth + tag oms.api.KubeHealth.AgentCollectionTime + run_interval 60s + log_level debug + + type filter_inventory2mdm custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope @@ -200,6 +208,18 @@ data: max_retry_wait 9m retry_mdm_post_wait_minutes 60 + + + type out_oms_api + log_level debug + buffer_chunk_limit 10m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_api_kubehealth*.buffer + buffer_queue_limit 10 + flush_interval 20s + retry_limit 10 + retry_wait 30s + metadata: name: omsagent-rs-config namespace: kube-system @@ -212,8 +232,8 @@ metadata: type: Opaque data: #BASE64 ENCODED (Both WSID & KEY) INSIDE DOUBLE QUOTE ("") - WSID: "WSID" - KEY: "KEY" + WSID: "VALUE_WSID" + KEY: "VALUE_KEY" --- apiVersion: extensions/v1beta1 kind: DaemonSet @@ -244,13 +264,13 @@ spec: cpu: 50m memory: 150Mi env: - #- name: AKS_RESOURCE_ID - # value: "VALUE_AKS_RESOURCE_ID_VALUE" - #- name: AKS_REGION - # value: "VALUE_AKS_RESOURCE_REGION_VALUE" + - name: AKS_RESOURCE_ID + value: "VALUE_AKS_RESOURCE_ID_VALUE" + - name: AKS_REGION + value: "VALUE_AKS_REGION_VALUE" #Uncomment below two lines for ACS clusters and set the cluster names manually. Also comment out the above two lines for ACS clusters - - name: ACS_RESOURCE_NAME - value: "my_acs_cluster_name" + #- name: ACS_RESOURCE_NAME + #value: "my_acs_cluster_name" - name: DISABLE_KUBE_SYSTEM_LOG_COLLECTION value: "true" - name: CONTROLLER_TYPE diff --git a/ci_feature/Dockerfile b/ci_feature/Dockerfile index 9aa7bdce..576c54af 100644 --- a/ci_feature/Dockerfile +++ b/ci_feature/Dockerfile @@ -5,7 +5,7 @@ com.microsoft.product="OMS Container Docker Provider" \ com.microsoft.version="2.0.0-7" ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH OTQzNWI0M2YtOTdkNS00ZGVkLThkOTAtYjA0Nzk1OGU2ZTg3 -ENV AGENT_VERSION ciprod02212019 +ENV AGENT_VERSION healthpreview04112019 ENV AZMON_COLLECT_ENV False RUN /usr/bin/apt-get update && /usr/bin/apt-get install -y libc-bin wget openssl curl sudo python-ctypes sysv-rc net-tools rsyslog cron vim dmidecode apt-transport-https && rm -rf /var/lib/apt/lists/* COPY setup.sh main.sh $tmpdir/ diff --git a/ci_feature/setup.sh b/ci_feature/setup.sh index bf5822d0..dd196abe 100644 --- a/ci_feature/setup.sh +++ b/ci_feature/setup.sh @@ -16,7 +16,8 @@ touch /etc/.omi_disable_service_control #wget https://github.com/Microsoft/Docker-Provider/releases/download/hotfix-01292019/docker-cimprov-3.0.0-8.universal.x86_64.sh -wget "https://github.com/Microsoft/Docker-Provider/releases/download/3.0.0-4(2)/docker-cimprov-3.0.0-4.universal.x86_64.sh" +#wget "https://github.com/Microsoft/Docker-Provider/releases/download/3.0.0-4(2)/docker-cimprov-3.0.0-4.universal.x86_64.sh" +wget https://diliprstorage.blob.core.windows.net/mycontainer/docker-cimprov-3.0.0-5.universal.x86_64.sh chmod 775 $TMPDIR/*.sh @@ -47,7 +48,7 @@ sudo apt-get install acl wget -qO - https://packages.fluentbit.io/fluentbit.key | sudo apt-key add - sudo echo "deb https://packages.fluentbit.io/ubuntu/xenial xenial main" >> /etc/apt/sources.list sudo apt-get update -sudo apt-get install td-agent-bit=0.14.4 sqlite3=3.11.0-1ubuntu1 libsqlite3-dev=3.11.0-1ubuntu1 -y +sudo apt-get install td-agent-bit=0.14.4 sqlite3=3.11.0-1ubuntu1.1 libsqlite3-dev=3.11.0-1ubuntu1.1 -y rm -rf $TMPDIR/omsbundle rm -f $TMPDIR/omsagent*.sh diff --git a/ci_feature_prod/Dockerfile b/ci_feature_prod/Dockerfile index f7b439af..5f42f6a8 100644 --- a/ci_feature_prod/Dockerfile +++ b/ci_feature_prod/Dockerfile @@ -5,7 +5,7 @@ com.microsoft.product="OMS Container Docker Provider" \ com.microsoft.version="2.0.0-3" ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi -ENV AGENT_VERSION ciprod02212019 +ENV AGENT_VERSION healthpreview04112019 ENV AZMON_COLLECT_ENV False RUN /usr/bin/apt-get update && /usr/bin/apt-get install -y libc-bin wget openssl curl sudo python-ctypes sysv-rc net-tools rsyslog cron vim dmidecode apt-transport-https && rm -rf /var/lib/apt/lists/* COPY setup.sh main.sh $tmpdir/ diff --git a/ci_feature_prod/setup.sh b/ci_feature_prod/setup.sh index 733511a7..dd196abe 100644 --- a/ci_feature_prod/setup.sh +++ b/ci_feature_prod/setup.sh @@ -14,7 +14,10 @@ wget https://github.com/Microsoft/OMS-Agent-for-Linux/releases/download/OMSAgent #create file to disable omi service startup script touch /etc/.omi_disable_service_control -wget "https://github.com/Microsoft/Docker-Provider/releases/download/3.0.0-4(2)/docker-cimprov-3.0.0-4.universal.x86_64.sh" +#wget https://github.com/Microsoft/Docker-Provider/releases/download/hotfix-01292019/docker-cimprov-3.0.0-8.universal.x86_64.sh + +#wget "https://github.com/Microsoft/Docker-Provider/releases/download/3.0.0-4(2)/docker-cimprov-3.0.0-4.universal.x86_64.sh" +wget https://diliprstorage.blob.core.windows.net/mycontainer/docker-cimprov-3.0.0-5.universal.x86_64.sh chmod 775 $TMPDIR/*.sh @@ -45,7 +48,7 @@ sudo apt-get install acl wget -qO - https://packages.fluentbit.io/fluentbit.key | sudo apt-key add - sudo echo "deb https://packages.fluentbit.io/ubuntu/xenial xenial main" >> /etc/apt/sources.list sudo apt-get update -sudo apt-get install td-agent-bit=0.14.4 sqlite3=3.11.0-1ubuntu1 libsqlite3-dev=3.11.0-1ubuntu1 -y +sudo apt-get install td-agent-bit=0.14.4 sqlite3=3.11.0-1ubuntu1.1 libsqlite3-dev=3.11.0-1ubuntu1.1 -y rm -rf $TMPDIR/omsbundle rm -f $TMPDIR/omsagent*.sh From 7c8f4b3a01bd0a172b870e12815d959d138a5289 Mon Sep 17 00:00:00 2001 From: r-dilip Date: Fri, 12 Apr 2019 13:23:49 -0700 Subject: [PATCH 02/55] Building New Version of the agent --- Kubernetes/omsagent.yaml | 12 ++++++++++++ ci_feature/Dockerfile | 2 +- ci_feature/setup.sh | 5 +---- ci_feature_prod/Dockerfile | 2 +- ci_feature_prod/setup.sh | 5 +---- 5 files changed, 16 insertions(+), 10 deletions(-) diff --git a/Kubernetes/omsagent.yaml b/Kubernetes/omsagent.yaml index 4a027db3..1bb86015 100644 --- a/Kubernetes/omsagent.yaml +++ b/Kubernetes/omsagent.yaml @@ -255,6 +255,18 @@ data: max_retry_wait 9m retry_mdm_post_wait_minutes 60 + + + type out_oms_api + log_level debug + buffer_chunk_limit 10m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_api_kubehealth*.buffer + buffer_queue_limit 10 + flush_interval 20s + retry_limit 10 + retry_wait 30s + metadata: name: omsagent-rs-config namespace: kube-system diff --git a/ci_feature/Dockerfile b/ci_feature/Dockerfile index 3e7f47a7..c9aeb7dc 100644 --- a/ci_feature/Dockerfile +++ b/ci_feature/Dockerfile @@ -5,7 +5,7 @@ com.microsoft.product="OMS Container Docker Provider" \ com.microsoft.version="2.0.0-7" ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH OTQzNWI0M2YtOTdkNS00ZGVkLThkOTAtYjA0Nzk1OGU2ZTg3 -ENV AGENT_VERSION healthpreview04112019 +ENV AGENT_VERSION healthpreview04122019-1 ENV AZMON_COLLECT_ENV False RUN /usr/bin/apt-get update && /usr/bin/apt-get install -y libc-bin wget openssl curl sudo python-ctypes sysv-rc net-tools rsyslog cron vim dmidecode apt-transport-https && rm -rf /var/lib/apt/lists/* COPY setup.sh main.sh $tmpdir/ diff --git a/ci_feature/setup.sh b/ci_feature/setup.sh index 911960fe..7ca71601 100644 --- a/ci_feature/setup.sh +++ b/ci_feature/setup.sh @@ -9,14 +9,11 @@ sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \ dpkg-reconfigure --frontend=noninteractive locales && \ update-locale LANG=en_US.UTF-8 -wget https://dockerprovider.blob.core.windows.net/omsagent/omsagent-1.8.1-422.universal.x64.sh +wget https://github.com/Microsoft/OMS-Agent-for-Linux/releases/download/OMSAgent_v1.8.1.256/omsagent-1.8.1-256.universal.x64.sh #create file to disable omi service startup script touch /etc/.omi_disable_service_control -#wget https://github.com/Microsoft/Docker-Provider/releases/download/hotfix-01292019/docker-cimprov-3.0.0-8.universal.x86_64.sh - -#wget "https://github.com/Microsoft/Docker-Provider/releases/download/3.0.0-4(2)/docker-cimprov-3.0.0-4.universal.x86_64.sh" wget https://diliprstorage.blob.core.windows.net/mycontainer/docker-cimprov-3.0.0-5.universal.x86_64.sh chmod 775 $TMPDIR/*.sh diff --git a/ci_feature_prod/Dockerfile b/ci_feature_prod/Dockerfile index 5f42f6a8..1cbf2870 100644 --- a/ci_feature_prod/Dockerfile +++ b/ci_feature_prod/Dockerfile @@ -5,7 +5,7 @@ com.microsoft.product="OMS Container Docker Provider" \ com.microsoft.version="2.0.0-3" ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi -ENV AGENT_VERSION healthpreview04112019 +ENV AGENT_VERSION healthpreview04122019-1 ENV AZMON_COLLECT_ENV False RUN /usr/bin/apt-get update && /usr/bin/apt-get install -y libc-bin wget openssl curl sudo python-ctypes sysv-rc net-tools rsyslog cron vim dmidecode apt-transport-https && rm -rf /var/lib/apt/lists/* COPY setup.sh main.sh $tmpdir/ diff --git a/ci_feature_prod/setup.sh b/ci_feature_prod/setup.sh index 88615305..b876b6fa 100644 --- a/ci_feature_prod/setup.sh +++ b/ci_feature_prod/setup.sh @@ -9,14 +9,11 @@ sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \ dpkg-reconfigure --frontend=noninteractive locales && \ update-locale LANG=en_US.UTF-8 -wget https://dockerprovider.blob.core.windows.net/omsagent/omsagent-1.8.1-416.universal.x64.sh +wget https://github.com/Microsoft/OMS-Agent-for-Linux/releases/download/OMSAgent_v1.8.1.256/omsagent-1.8.1-256.universal.x64.sh #create file to disable omi service startup script touch /etc/.omi_disable_service_control -#wget https://github.com/Microsoft/Docker-Provider/releases/download/hotfix-01292019/docker-cimprov-3.0.0-8.universal.x86_64.sh - -#wget "https://github.com/Microsoft/Docker-Provider/releases/download/3.0.0-4(2)/docker-cimprov-3.0.0-4.universal.x86_64.sh" wget https://diliprstorage.blob.core.windows.net/mycontainer/docker-cimprov-3.0.0-5.universal.x86_64.sh chmod 775 $TMPDIR/*.sh From d4ae283f12e7f1d1e6ffa30a2d5665838d6e6c77 Mon Sep 17 00:00:00 2001 From: r-dilip Date: Mon, 15 Apr 2019 13:56:04 -0700 Subject: [PATCH 03/55] Publish docker provider as a release, Update image tag --- ci_feature/Dockerfile | 2 +- ci_feature/setup.sh | 2 +- ci_feature_prod/Dockerfile | 2 +- ci_feature_prod/setup.sh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ci_feature/Dockerfile b/ci_feature/Dockerfile index c9aeb7dc..c7f7fdc6 100644 --- a/ci_feature/Dockerfile +++ b/ci_feature/Dockerfile @@ -5,7 +5,7 @@ com.microsoft.product="OMS Container Docker Provider" \ com.microsoft.version="2.0.0-7" ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH OTQzNWI0M2YtOTdkNS00ZGVkLThkOTAtYjA0Nzk1OGU2ZTg3 -ENV AGENT_VERSION healthpreview04122019-1 +ENV AGENT_VERSION healthpreview04152019 ENV AZMON_COLLECT_ENV False RUN /usr/bin/apt-get update && /usr/bin/apt-get install -y libc-bin wget openssl curl sudo python-ctypes sysv-rc net-tools rsyslog cron vim dmidecode apt-transport-https && rm -rf /var/lib/apt/lists/* COPY setup.sh main.sh $tmpdir/ diff --git a/ci_feature/setup.sh b/ci_feature/setup.sh index 7ca71601..52635b0b 100644 --- a/ci_feature/setup.sh +++ b/ci_feature/setup.sh @@ -14,7 +14,7 @@ wget https://github.com/Microsoft/OMS-Agent-for-Linux/releases/download/OMSAgent #create file to disable omi service startup script touch /etc/.omi_disable_service_control -wget https://diliprstorage.blob.core.windows.net/mycontainer/docker-cimprov-3.0.0-5.universal.x86_64.sh +wget https://github.com/Microsoft/Docker-Provider/releases/download/healthpreview04152019/docker-cimprov-3.0.0-6.universal.x86_64.sh chmod 775 $TMPDIR/*.sh diff --git a/ci_feature_prod/Dockerfile b/ci_feature_prod/Dockerfile index 1cbf2870..558c4aed 100644 --- a/ci_feature_prod/Dockerfile +++ b/ci_feature_prod/Dockerfile @@ -5,7 +5,7 @@ com.microsoft.product="OMS Container Docker Provider" \ com.microsoft.version="2.0.0-3" ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi -ENV AGENT_VERSION healthpreview04122019-1 +ENV AGENT_VERSION healthpreview04152019 ENV AZMON_COLLECT_ENV False RUN /usr/bin/apt-get update && /usr/bin/apt-get install -y libc-bin wget openssl curl sudo python-ctypes sysv-rc net-tools rsyslog cron vim dmidecode apt-transport-https && rm -rf /var/lib/apt/lists/* COPY setup.sh main.sh $tmpdir/ diff --git a/ci_feature_prod/setup.sh b/ci_feature_prod/setup.sh index b876b6fa..ebaa1183 100644 --- a/ci_feature_prod/setup.sh +++ b/ci_feature_prod/setup.sh @@ -14,7 +14,7 @@ wget https://github.com/Microsoft/OMS-Agent-for-Linux/releases/download/OMSAgent #create file to disable omi service startup script touch /etc/.omi_disable_service_control -wget https://diliprstorage.blob.core.windows.net/mycontainer/docker-cimprov-3.0.0-5.universal.x86_64.sh +wget https://github.com/Microsoft/Docker-Provider/releases/download/healthpreview04152019/docker-cimprov-3.0.0-6.universal.x86_64.sh chmod 775 $TMPDIR/*.sh From c092910442bc4230d9bcd9a3dac31b485504d08b Mon Sep 17 00:00:00 2001 From: r-dilip Date: Wed, 17 Apr 2019 17:32:36 -0700 Subject: [PATCH 04/55] Onboard Script --- health/HealthAgentOnboarding.ps1 | 344 ++++++++++++++++++++++++ health/customOnboarding.json | 44 ++++ health/omsagent-template.yaml | 439 +++++++++++++++++++++++++++++++ 3 files changed, 827 insertions(+) create mode 100644 health/HealthAgentOnboarding.ps1 create mode 100644 health/customOnboarding.json create mode 100644 health/omsagent-template.yaml diff --git a/health/HealthAgentOnboarding.ps1 b/health/HealthAgentOnboarding.ps1 new file mode 100644 index 00000000..c015207e --- /dev/null +++ b/health/HealthAgentOnboarding.ps1 @@ -0,0 +1,344 @@ +<# + .DESCRIPTION + .PARAMETER aksResourceId + Name of the cluster configured on the OMSAgent + .PARAMETER loganalyticsWorkspaceResourceId + Azure ResourceId of the log analytics workspace Id +#> +param( + [Parameter(mandatory = $true)] + [string]$aksResourceId, + [Parameter(mandatory = $true)] + [string]$aksResourceLocation, + [Parameter(mandatory = $true)] + [string]$logAnalyticsWorkspaceResourceId +) + +# checks the required Powershell modules exist and if not exists, request the user permission to install +$azAccountModule = Get-Module -ListAvailable -Name Az.Accounts +$azResourcesModule = Get-Module -ListAvailable -Name Az.Resources +$azOperationalInsights = Get-Module -ListAvailable -Name Az.OperationalInsights + +if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null -eq $azOperationalInsights)) { + + $currentPrincipal = New-Object Security.Principal.WindowsPrincipal([Security.Principal.WindowsIdentity]::GetCurrent()) + + if ($currentPrincipal.IsInRole([Security.Principal.WindowsBuiltInRole]::Administrator)) { + Write-Host("Running script as an admin...") + Write-Host("") + } + else { + Write-Host("Please re-launch the script with elevated administrator") -ForegroundColor Red + Stop-Transcript + exit + } + + $message = "This script will try to install the latest versions of the following Modules : ` + Az.Resources, Az.Accounts and Az.OperationalInsights using the command` + `'Install-Module {Insert Module Name} -Repository PSGallery -Force -AllowClobber -ErrorAction Stop -WarningAction Stop' + `If you do not have the latest version of these Modules, this troubleshooting script may not run." + $question = "Do you want to Install the modules and run the script or just run the script?" + + $choices = New-Object Collections.ObjectModel.Collection[Management.Automation.Host.ChoiceDescription] + $choices.Add((New-Object Management.Automation.Host.ChoiceDescription -ArgumentList '&Yes, Install and run')) + $choices.Add((New-Object Management.Automation.Host.ChoiceDescription -ArgumentList '&Continue without installing the Module')) + $choices.Add((New-Object Management.Automation.Host.ChoiceDescription -ArgumentList '&Quit')) + + $decision = $Host.UI.PromptForChoice($message, $question, $choices, 0) + + switch ($decision) { + 0 { + + if ($null -eq $azResourcesModule) { + try { + Write-Host("Installing Az.Resources...") + Install-Module Az.Resources -Repository PSGallery -Force -AllowClobber -ErrorAction Stop + } + catch { + Write-Host("Close other powershell logins and try installing the latest modules forAz.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red + exit + } + } + + if ($null -eq $azAccountModule) { + try { + Write-Host("Installing Az.Accounts...") + Install-Module Az.Accounts -Repository PSGallery -Force -AllowClobber -ErrorAction Stop + } + catch { + Write-Host("Close other powershell logins and try installing the latest modules forAz.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red + exit + } + } + + if ($null -eq $azOperationalInsights) { + try { + + Write-Host("Installing AzureRM.OperationalInsights...") + Install-Module Az.OperationalInsights -Repository PSGallery -Force -AllowClobber -ErrorAction Stop + } + catch { + Write-Host("Close other powershell logins and try installing the latest modules for AzureRM.OperationalInsights in a new powershell window: eg. 'Install-Module AzureRM.OperationalInsights -Repository PSGallery -Force'") -ForegroundColor Red + exit + } + } + + } + 1 { + + if ($null -eq $azResourcesModule) { + try { + Import-Module Az.Resources -ErrorAction Stop + } + catch { + Write-Host("Could not import Az.Resources...") -ForegroundColor Red + Write-Host("Close other powershell logins and try installing the latest modules for Az.Resources in a new powershell window: eg. 'Install-Module Az.Resources -Repository PSGallery -Force'") -ForegroundColor Red + Stop-Transcript + exit + } + } + if ($null -eq $azAccountModule) { + try { + Import-Module Az.Accounts -ErrorAction Stop + } + catch { + Write-Host("Could not import Az.Accounts...") -ForegroundColor Red + Write-Host("Close other powershell logins and try installing the latest modules for Az.Accounts in a new powershell window: eg. 'Install-Module Az.Accounts -Repository PSGallery -Force'") -ForegroundColor Red + Stop-Transcript + exit + } + } + + if ($null -eq $azAccountModule) { + try { + Import-Module Az.OperationalInsights + } + catch { + Write-Host("Could not import Az.OperationalInsights... Please reinstall this Module") -ForegroundColor Red + Stop-Transcript + exit + } + } + + } + 2 { + Write-Host("") + Stop-Transcript + exit + } + } +} + +if ([string]::IsNullOrEmpty($logAnalyticsWorkspaceResourceId)) { + Write-Host("logAnalyticsWorkspaceResourceId should not be NULL or empty") -ForegroundColor Red + exit +} + +if (($logAnalyticsWorkspaceResourceId -match "/providers/Microsoft.OperationalInsights/workspaces") -eq $false) { + Write-Host("logAnalyticsWorkspaceResourceId should be valid Azure Resource Id format") -ForegroundColor Red + exit +} + +$workspaceResourceDetails = $logAnalyticsWorkspaceResourceId.Split("/") + +if ($workspaceResourceDetails.Length -ne 9) { + Write-Host("logAnalyticsWorkspaceResourceId should be valid Azure Resource Id format") -ForegroundColor Red + exit +} + +$workspaceSubscriptionId = $workspaceResourceDetails[2] +$workspaceSubscriptionId = $workspaceSubscriptionId.Trim() + +$workspaceResourceGroupName = $workspaceResourceDetails[4] +$workspaceResourceGroupName = $workspaceResourceGroupName.Trim() + +$workspaceName = $workspaceResourceDetails[8] +$workspaceResourceGroupName = $workspaceResourceGroupName.Trim() + +Write-Host("LogAnalytics Workspace SubscriptionId : '" + $workspaceSubscriptionId + "' ") -ForegroundColor Green + +try { + Write-Host("") + Write-Host("Trying to get the current Az login context...") + $account = Get-AzContext -ErrorAction Stop + Write-Host("Successfully fetched current AzContext context...") -ForegroundColor Green + Write-Host("") +} +catch { + Write-Host("") + Write-Host("Could not fetch AzContext..." ) -ForegroundColor Red + Write-Host("") +} + + +if ($null -eq $account.Account) { + try { + Write-Host("Please login...") + Connect-AzAccount -subscriptionid $workspaceSubscriptionId + } + catch { + Write-Host("") + Write-Host("Could not select subscription with ID : " + $workspaceSubscriptionId + ". Please make sure the ID you entered is correct and you have access to the cluster" ) -ForegroundColor Red + Write-Host("") + Stop-Transcript + exit + } +} +else { + if ($account.Subscription.Id -eq $workspaceSubscriptionId) { + Write-Host("Subscription: $SubscriptionId is already selected. Account details: ") + $account + } + else { + try { + Write-Host("Current Subscription:") + $account + Write-Host("Changing to subscription: $workspaceSubscriptionId") + Set-AzContext -SubscriptionId $workspaceSubscriptionId + } + catch { + Write-Host("") + Write-Host("Could not select subscription with ID : " + $workspaceSubscriptionId + ". Please make sure the ID you entered is correct and you have access to the cluster" ) -ForegroundColor Red + Write-Host("") + Stop-Transcript + exit + } + } +} + +# validate specified logAnalytics workspace exists and got access permissions +Write-Host("Checking specified logAnalyticsWorkspaceResourceId exists and got access...") + +try { + $WorkspaceInformation = Get-AzOperationalInsightsWorkspace -ResourceGroupName $workspaceResourceGroupName -Name $workspaceName -ErrorAction Stop +} +catch { + Write-Host("") + Write-Host("Could not fetch details for the workspace : '" + $workspaceName + "'. Please make sure that it hasn't been deleted and you have access to it.") -ForegroundColor Red + Stop-Transcript + exit +} + +Write-Host("Successfully verified specified logAnalyticsWorkspaceResourceId valid and exists...") -ForegroundColor Green + +$WorkspaceLocation = $WorkspaceInformation.Location + +if ($null -eq $WorkspaceLocation) { + Write-Host("") + Write-Host("Cannot fetch workspace location. Please try again...") -ForegroundColor Red + Write-Host("") + Stop-Transcript + exit +} + +try { + $WorkspaceIPDetails = Get-AzOperationalInsightsIntelligencePacks -ResourceGroupName $workspaceResourceGroupName -WorkspaceName $workspaceName -ErrorAction Stop + Write-Host("Successfully fetched workspace IP details...") -ForegroundColor Green + Write-Host("") +} +catch { + Write-Host("") + Write-Host("Failed to get the list of solutions onboarded to the workspace. Please make sure that it hasn't been deleted and you have access to it.") -ForegroundColor Red + Write-Host("") + Stop-Transcript + exit +} + +try { + $ContainerInsightsIndex = $WorkspaceIPDetails.Name.IndexOf("ContainerInsights"); + Write-Host("Successfully located ContainerInsights solution") -ForegroundColor Green + Write-Host("") +} +catch { + Write-Host("Failed to get ContainerInsights solution details from the workspace") -ForegroundColor Red + Write-Host("") + Stop-Transcript + exit +} + +$isSolutionOnboarded = $WorkspaceIPDetails.Enabled[$ContainerInsightsIndex] + +if ($false -eq $isSolutionOnboarded) { + + $DeploymentName = "ContainerInsightsSolutionOnboarding-" + ((Get-Date).ToUniversalTime()).ToString('MMdd-HHmm') + $Parameters = @{} + $Parameters.Add("workspaceResourceId", $logAnalyticsWorkspaceResourceID) + $Parameters.Add("workspaceRegion", $WorkspaceLocation) + $Parameters + + try { + New-AzResourceGroupDeployment -Name $DeploymentName ` + -ResourceGroupName $workspaceResourceGroupName ` + -TemplateUri https://raw.githubusercontent.com/Microsoft/OMS-docker/ci_feature/docs/templates/azuremonitor-containerSolution.json ` + -TemplateParameterObject $Parameters -ErrorAction Stop` + Write-Host("") + + Write-Host("Successfully added Container Insights Solution") -ForegroundColor Green + + Write-Host("") + } + catch { + Write-Host ("Template deployment failed with an error: '" + $Error[0] + "' ") -ForegroundColor Red + Write-Host("Please contact us by emailing askcoin@microsoft.com for help") -ForegroundColor Red + } + +} + +Write-Host("Successfully added Container Insights Solution to workspace" + $workspaceName) -ForegroundColor Green + +try { + $DeploymentName = "ClusterHealthOnboarding-" + ((Get-Date).ToUniversalTime()).ToString('MMdd-HHmm') + $Parameters = @{} + $Parameters.Add("aksResourceId", $aksResourceId) + $Parameters.Add("aksResourceLocation", $aksResourceLocation) + $Parameters.Add("workspaceResourceId", $logAnalyticsWorkspaceResourceId) + + + #New-AzResourceGroupDeployment -Name $DeploymentName ` + # -ResourceGroupName $workspaceResourceGroupName ` + # -TemplateUri C:\health\customOnboarding.json ` + # -TemplateParameterObject $Parameters -ErrorAction Stop` + Write-Host("") + + Write-Host("Successfully custom onboarded cluster to Monitoring") -ForegroundColor Green + + Write-Host("") +} +catch { + Write-Host ("Template deployment failed with an error: '" + $Error[0] + "' ") -ForegroundColor Red + #Write-Host("Please contact us by emailing askcoin@microsoft.com for help") -ForegroundColor Red +} + + +$desktopPath = [System.Environment]::GetFolderPath([System.Environment+SpecialFolder]::Desktop) + +try { + + $aksResourceDetails = $aksResourceId.Split("/") + + + if ($aksResourceDetails.Length -ne 9) { + Write-Host("aksResourceDetails should be valid Azure Resource Id format") -ForegroundColor Red + exit + } + + $clusterName = $aksResourceDetails[8].Trim() + $clusterResourceGroupName = $aksResourceDetails[4].Trim() + + az aks get-credentials -n $clusterName -g $clusterResourceGroupName + + $key = (Get-AzOperationalInsightsWorkspaceSharedKeys -ResourceGroupName $workspaceResourceGroupName -Name $workspaceName).PrimarySharedKey + $wsid = $WorkspaceInformation.CustomerId + Write-Host "key $($key) wsid $($wsid)" + $base64EncodedKey = [System.Convert]::ToBase64String([System.Text.Encoding]::UTF8.GetBytes($key)) + $base64EncodedWsId = [System.Convert]::ToBase64String([System.Text.Encoding]::UTF8.GetBytes($wsid)) + (Get-Content -Path "C:\Users\dilipr\Desktop\deployments\omsagent-template.yaml" -Raw) -replace 'VALUE_AKS_RESOURCE_ID', $aksResourceId -replace 'VALUE_AKS_REGION', $aksRegion -replace 'VALUE_WSID', $base64EncodedWsId -replace 'VALUE_KEY', $base64EncodedKey | Set-Content $desktopPath\deployments\omsagent-$clusterName.yaml + kubectl delete -f $desktopPath\deployments\omsagent-$clusterName.yaml + kubectl apply -f $desktopPath\deployments\omsagent-$clusterName.yaml +} +catch { + Write-Host ("Agent deployment failed with an error: '" + $Error[0] + "' ") -ForegroundColor Red +} + +Write-Host "Upgraded omsagent" + diff --git a/health/customOnboarding.json b/health/customOnboarding.json new file mode 100644 index 00000000..ecccc2ea --- /dev/null +++ b/health/customOnboarding.json @@ -0,0 +1,44 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "aksResourceId": { + "type": "string", + "metadata": { + "description": "AKS Cluster resource id" + } + }, + "aksResourceLocation": { + "type": "string", + "metadata": { + "description": "Location of the AKS resource e.g. \"East US\"" + } + }, + "workspaceResourceId": { + "type": "string", + "metadata": { + "description": "Azure Monitor Log Analytics Resource ID" + } + } + }, + "resources": [ + { + "name": "[split(parameters('aksResourceId'),'/')[8]]", + "type": "Microsoft.ContainerService/managedClusters", + "location": "[parameters('aksResourceLocation')]", + "apiVersion": "2018-03-31", + "properties": { + "mode": "Incremental", + "id": "[parameters('aksResourceId')]", + "addonProfiles": { + "omsagent": { + "enabled": false, + "config": { + "logAnalyticsWorkspaceResourceID": "[parameters('workspaceResourceId')]" + } + } + } + } + } + ] +} \ No newline at end of file diff --git a/health/omsagent-template.yaml b/health/omsagent-template.yaml new file mode 100644 index 00000000..dde02882 --- /dev/null +++ b/health/omsagent-template.yaml @@ -0,0 +1,439 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: omsagent + namespace: kube-system +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1beta1 +metadata: + name: omsagent-reader +rules: +- apiGroups: [""] + resources: ["pods", "events", "nodes", "namespaces", "services"] + verbs: ["list"] +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1beta1 +metadata: + name: omsagentclusterrolebinding +subjects: + - kind: ServiceAccount + name: omsagent + namespace: kube-system +roleRef: + kind: ClusterRole + name: omsagent-reader + apiGroup: rbac.authorization.k8s.io +--- +kind: ConfigMap +apiVersion: v1 +data: + kube.conf: |- + # Fluentd config file for OMS Docker - cluster components (kubeAPI) + + #Kubernetes pod inventory + + type kubepodinventory + tag oms.containerinsights.KubePodInventory + run_interval 60s + log_level debug + + + #Kubernetes events + + type kubeevents + tag oms.containerinsights.KubeEvents + run_interval 60s + log_level debug + + + #Kubernetes logs + + type kubelogs + tag oms.api.KubeLogs + run_interval 60s + + + #Kubernetes services + + type kubeservices + tag oms.containerinsights.KubeServices + run_interval 60s + log_level debug + + + #Kubernetes Nodes + + type kubenodeinventory + tag oms.containerinsights.KubeNodeInventory + run_interval 60s + log_level debug + + + #Kubernetes perf + + type kubeperf + tag oms.api.KubePerf + run_interval 60s + log_level debug + + + #Kubernetes health + + type kubehealth + tag oms.api.KubeHealth.AgentCollectionTime + run_interval 60s + log_level debug + + + + type filter_inventory2mdm + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope + log_level info + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_kubepods*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 5m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_kubeevents*.buffer + buffer_queue_limit 10 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + + + + type out_oms_api + log_level debug + buffer_chunk_limit 10m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_api_kubernetes_logs*.buffer + buffer_queue_limit 10 + flush_interval 20s + retry_limit 10 + retry_wait 30s + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_kubeservices*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/state/out_oms_kubenodes*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + + + + type out_oms + log_level debug + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_containernodeinventory*.buffer + buffer_queue_limit 20 + flush_interval 20s + retry_limit 10 + retry_wait 15s + max_retry_wait 9m + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_kubeperf*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + + + + type out_mdm + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_mdm_*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + retry_mdm_post_wait_minutes 60 + + + + type out_oms_api + log_level debug + buffer_chunk_limit 10m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_api_kubehealth*.buffer + buffer_queue_limit 10 + flush_interval 20s + retry_limit 10 + retry_wait 30s + +metadata: + name: omsagent-rs-config + namespace: kube-system +--- +apiVersion: v1 +kind: Secret +metadata: + name: omsagent-secret + namespace: kube-system +type: Opaque +data: + #BASE64 ENCODED (Both WSID & KEY) INSIDE DOUBLE QUOTE ("") + WSID: "VALUE_WSID" + KEY: "VALUE_KEY" +--- +apiVersion: extensions/v1beta1 +kind: DaemonSet +metadata: + name: omsagent + namespace: kube-system +spec: + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + dsName: "omsagent-ds" + annotations: + agentVersion: "1.8.1.256" + dockerProviderVersion: "3.0.0-4" + spec: + serviceAccountName: omsagent + containers: + - name: omsagent + image: "microsoft/oms:healthpreview04172019-1" + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: 150m + memory: 300Mi + requests: + cpu: 50m + memory: 150Mi + env: + - name: AKS_RESOURCE_ID + value: "VALUE_AKS_RESOURCE_ID" + - name: AKS_REGION + value: "VALUE_AKS_REGION" + # Uncomment below two lines for ACS clusters and set the cluster names manually. Also comment out the above two lines for ACS clusters + # - name: ACS_RESOURCE_NAME + # value: "aks-engine-health" + - name: DISABLE_KUBE_SYSTEM_LOG_COLLECTION + value: "true" + - name: CONTROLLER_TYPE + value: "DaemonSet" + - name: NODE_IP + valueFrom: + fieldRef: + fieldPath: status.hostIP + securityContext: + privileged: true + ports: + - containerPort: 25225 + protocol: TCP + - containerPort: 25224 + protocol: UDP + volumeMounts: + - mountPath: /var/run/host + name: docker-sock + - mountPath: /var/log + name: host-log + - mountPath: /var/lib/docker/containers + name: containerlog-path + - mountPath: /etc/kubernetes/host + name: azure-json-path + - mountPath: /etc/omsagent-secret + name: omsagent-secret + readOnly: true + livenessProbe: + exec: + command: + - /bin/bash + - -c + - ps -ef | grep main | grep -v "grep" + initialDelaySeconds: 60 + periodSeconds: 60 + nodeSelector: + beta.kubernetes.io/os: linux + # Tolerate a NoSchedule taint on master that ACS Engine sets. + tolerations: + - key: "node-role.kubernetes.io/master" + operator: "Equal" + value: "true" + effect: "NoSchedule" + volumes: + - name: docker-sock + hostPath: + path: /var/run + - name: container-hostname + hostPath: + path: /etc/hostname + - name: host-log + hostPath: + path: /var/log + - name: containerlog-path + hostPath: + path: /var/lib/docker/containers + - name: azure-json-path + hostPath: + path: /etc/kubernetes + - name: omsagent-secret + secret: + secretName: omsagent-secret +--- +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: omsagent-rs + namespace: kube-system +spec: + replicas: 1 + selector: + matchLabels: + rsName: "omsagent-rs" + strategy: + type: RollingUpdate + template: + metadata: + labels: + rsName: "omsagent-rs" + annotations: + agentVersion: "1.8.1.256" + dockerProviderVersion: "3.0.0-4" + spec: + serviceAccountName: omsagent + containers: + - name: omsagent + image: "microsoft/oms:healthpreview04172019-1" + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: 150m + memory: 500Mi + requests: + cpu: 50m + memory: 100Mi + env: + - name: AKS_RESOURCE_ID + value: "VALUE_AKS_RESOURCE_ID" + - name: AKS_REGION + value: "VALUE_AKS_REGION" + #Uncomment below two lines for ACS clusters and set the cluster names manually. Also comment out the above two lines for ACS clusters + # - name: ACS_RESOURCE_NAME + # value: "aks-engine-health" + - name: DISABLE_KUBE_SYSTEM_LOG_COLLECTION + value: "true" + - name: CONTROLLER_TYPE + value: "ReplicaSet" + - name: NODE_IP + valueFrom: + fieldRef: + fieldPath: status.hostIP + securityContext: + privileged: true + ports: + - containerPort: 25225 + protocol: TCP + - containerPort: 25224 + protocol: UDP + volumeMounts: + - mountPath: /var/run/host + name: docker-sock + - mountPath: /var/log + name: host-log + - mountPath: /var/lib/docker/containers + name: containerlog-path + - mountPath: /etc/kubernetes/host + name: azure-json-path + - mountPath: /etc/omsagent-secret + name: omsagent-secret + readOnly: true + - mountPath : /etc/config + name: omsagent-rs-config + livenessProbe: + exec: + command: + - /bin/bash + - -c + - ps -ef | grep main | grep -v "grep" + initialDelaySeconds: 60 + periodSeconds: 60 + nodeSelector: + beta.kubernetes.io/os: linux + kubernetes.io/role: agent + volumes: + - name: docker-sock + hostPath: + path: /var/run + - name: container-hostname + hostPath: + path: /etc/hostname + - name: host-log + hostPath: + path: /var/log + - name: containerlog-path + hostPath: + path: /var/lib/docker/containers + - name: azure-json-path + hostPath: + path: /etc/kubernetes + - name: omsagent-secret + secret: + secretName: omsagent-secret + - name: omsagent-rs-config + configMap: + name: omsagent-rs-config + From e251cc936db2e16b36fbb917d2087b682ea30b6d Mon Sep 17 00:00:00 2001 From: r-dilip Date: Wed, 17 Apr 2019 18:00:34 -0700 Subject: [PATCH 05/55] Update Agent Version, do actual ARM Deployment --- health/HealthAgentOnboarding.ps1 | 20 ++++++++++++++------ health/omsagent-template.yaml | 4 ++-- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/health/HealthAgentOnboarding.ps1 b/health/HealthAgentOnboarding.ps1 index c015207e..170d1458 100644 --- a/health/HealthAgentOnboarding.ps1 +++ b/health/HealthAgentOnboarding.ps1 @@ -294,10 +294,10 @@ try { $Parameters.Add("workspaceResourceId", $logAnalyticsWorkspaceResourceId) - #New-AzResourceGroupDeployment -Name $DeploymentName ` - # -ResourceGroupName $workspaceResourceGroupName ` - # -TemplateUri C:\health\customOnboarding.json ` - # -TemplateParameterObject $Parameters -ErrorAction Stop` + New-AzResourceGroupDeployment -Name $DeploymentName ` + -ResourceGroupName $workspaceResourceGroupName ` + -TemplateUri https://raw.githubusercontent.com/Microsoft/OMS-docker/dilipr/onboardHealth/health/customOnboarding.json ` + -TemplateParameterObject $Parameters -ErrorAction Stop` Write-Host("") Write-Host("Successfully custom onboarded cluster to Monitoring") -ForegroundColor Green @@ -312,6 +312,14 @@ catch { $desktopPath = [System.Environment]::GetFolderPath([System.Environment+SpecialFolder]::Desktop) +if (-not (test-path $desktopPath\deployments) ) { + Write-Host "$($desktopPath)\deployments doesn't exist, creating it" + mkdir $desktopPath\deployments|out-null +} else { + Write-Host "$($desktopPath)\deployments exists, no need to create it" +} + + try { $aksResourceDetails = $aksResourceId.Split("/") @@ -329,10 +337,10 @@ try { $key = (Get-AzOperationalInsightsWorkspaceSharedKeys -ResourceGroupName $workspaceResourceGroupName -Name $workspaceName).PrimarySharedKey $wsid = $WorkspaceInformation.CustomerId - Write-Host "key $($key) wsid $($wsid)" $base64EncodedKey = [System.Convert]::ToBase64String([System.Text.Encoding]::UTF8.GetBytes($key)) $base64EncodedWsId = [System.Convert]::ToBase64String([System.Text.Encoding]::UTF8.GetBytes($wsid)) - (Get-Content -Path "C:\Users\dilipr\Desktop\deployments\omsagent-template.yaml" -Raw) -replace 'VALUE_AKS_RESOURCE_ID', $aksResourceId -replace 'VALUE_AKS_REGION', $aksRegion -replace 'VALUE_WSID', $base64EncodedWsId -replace 'VALUE_KEY', $base64EncodedKey | Set-Content $desktopPath\deployments\omsagent-$clusterName.yaml + Invoke-WebRequest https://raw.githubusercontent.com/Microsoft/OMS-docker/dilipr/onboardHealth/health/omsagent-template.yaml -OutFile $desktopPath\omsagent-template.yaml + (Get-Content -Path $desktopPath\omsagent-template.yaml -Raw) -replace 'VALUE_AKS_RESOURCE_ID', $aksResourceId -replace 'VALUE_AKS_REGION', $aksRegion -replace 'VALUE_WSID', $base64EncodedWsId -replace 'VALUE_KEY', $base64EncodedKey | Set-Content $desktopPath\deployments\omsagent-$clusterName.yaml kubectl delete -f $desktopPath\deployments\omsagent-$clusterName.yaml kubectl apply -f $desktopPath\deployments\omsagent-$clusterName.yaml } diff --git a/health/omsagent-template.yaml b/health/omsagent-template.yaml index dde02882..fb4ef5ac 100644 --- a/health/omsagent-template.yaml +++ b/health/omsagent-template.yaml @@ -254,7 +254,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "microsoft/oms:healthpreview04172019-1" + image: "microsoft/oms:healthpreview04172019" imagePullPolicy: IfNotPresent resources: limits: @@ -357,7 +357,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "microsoft/oms:healthpreview04172019-1" + image: "microsoft/oms:healthpreview04172019" imagePullPolicy: IfNotPresent resources: limits: From 733be41a33d488af4fe46bedbb23447ac47e198b Mon Sep 17 00:00:00 2001 From: r-dilip Date: Fri, 19 Apr 2019 12:20:26 -0700 Subject: [PATCH 06/55] Use Import-AzAks instead of az cli --- health/HealthAgentOnboarding.ps1 | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/health/HealthAgentOnboarding.ps1 b/health/HealthAgentOnboarding.ps1 index 170d1458..3ba2b90c 100644 --- a/health/HealthAgentOnboarding.ps1 +++ b/health/HealthAgentOnboarding.ps1 @@ -4,6 +4,8 @@ Name of the cluster configured on the OMSAgent .PARAMETER loganalyticsWorkspaceResourceId Azure ResourceId of the log analytics workspace Id + .PARAMETER aksResourceLocation + Resource location of the AKS cluster resource #> param( [Parameter(mandatory = $true)] @@ -18,6 +20,7 @@ param( $azAccountModule = Get-Module -ListAvailable -Name Az.Accounts $azResourcesModule = Get-Module -ListAvailable -Name Az.Resources $azOperationalInsights = Get-Module -ListAvailable -Name Az.OperationalInsights +$azAks = Get-Module -ListAvailable -Name Az.Aks if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null -eq $azOperationalInsights)) { @@ -34,7 +37,7 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - } $message = "This script will try to install the latest versions of the following Modules : ` - Az.Resources, Az.Accounts and Az.OperationalInsights using the command` + Az.Resources, Az.Accounts, Az.Aks and Az.OperationalInsights using the command` `'Install-Module {Insert Module Name} -Repository PSGallery -Force -AllowClobber -ErrorAction Stop -WarningAction Stop' `If you do not have the latest version of these Modules, this troubleshooting script may not run." $question = "Do you want to Install the modules and run the script or just run the script?" @@ -81,7 +84,19 @@ if (($null -eq $azAccountModule) -or ($null -eq $azResourcesModule) -or ($null - Write-Host("Close other powershell logins and try installing the latest modules for AzureRM.OperationalInsights in a new powershell window: eg. 'Install-Module AzureRM.OperationalInsights -Repository PSGallery -Force'") -ForegroundColor Red exit } - } + } + if ($null -eq $azAks) { + try { + + Write-Host("Installing Az.Aks...") + Install-Module Az.Aks -Repository PSGallery -Force -AllowClobber -ErrorAction Stop + } + catch { + Write-Host("Close other powershell logins and try installing the latest modules for AzureRM.OperationalInsights in a new powershell window: eg. 'Install-Module AzureRM.OperationalInsights -Repository PSGallery -Force'") -ForegroundColor Red + exit + } + } + } 1 { @@ -333,7 +348,7 @@ try { $clusterName = $aksResourceDetails[8].Trim() $clusterResourceGroupName = $aksResourceDetails[4].Trim() - az aks get-credentials -n $clusterName -g $clusterResourceGroupName + Import-AzAksCredential -Id $aksResourceId -Force $key = (Get-AzOperationalInsightsWorkspaceSharedKeys -ResourceGroupName $workspaceResourceGroupName -Name $workspaceName).PrimarySharedKey $wsid = $WorkspaceInformation.CustomerId From ba99de4c307ad85ee97742a11718f659259bc1aa Mon Sep 17 00:00:00 2001 From: r-dilip Date: Tue, 23 Apr 2019 13:44:01 -0700 Subject: [PATCH 07/55] Pushing doc to OMS-docker repo --- health/HealthOnboarding.md | 594 +++++++++++++++++++++++++++++++++++++ 1 file changed, 594 insertions(+) create mode 100644 health/HealthOnboarding.md diff --git a/health/HealthOnboarding.md b/health/HealthOnboarding.md new file mode 100644 index 00000000..b548384d --- /dev/null +++ b/health/HealthOnboarding.md @@ -0,0 +1,594 @@ +## Overview +The following documentation outlines the script required to upgrade an existing cluster onboarded to a Log Analytics workspace, to an agent running the workflow that generates health monitor signals into the same workspace. + +* Do a custom off-boarding of the cluster from Monitoring +* Installing the new agent that generates health monitor signals +* Please refer to the powershell script + +## Prerequisites +* Cluster that has already been onboarded to Monitoring using a Log Analytics workspace +* kubectl +* Powershell with the following modules installed + * Az.Accounts + * Az.Resources + * Az.OperationalInsights + * Az.Aks + +## Steps +1. Copy and paste the following JSON into a file. + +```json +{ + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "aksResourceId": { + "type": "string", + "metadata": { + "description": "AKS Cluster Resource ID" + } + }, + "aksResourceLocation": { + "type": "string", + "metadata": { + "description": "Location of the AKS resource e.g. \"East US\"" + } + } +}, +"resources": [ + { + "name": "[split(parameters('aksResourceId'),'/')[8]]", + "type": "Microsoft.ContainerService/managedClusters", + "location": "[parameters('aksResourceLocation')]", + "apiVersion": "2018-03-31", + "properties": { + "mode": "Incremental", + "id": "[parameters('aksResourceId')]", + "addonProfiles": { + "omsagent": { + "enabled": false, + "config": { + "loganalyticsworkspaceresourceid": "[parameters('workspaceResourceId')]" + } + } + } + } + } + ] +} +``` + +2. Save this file as HealthPreviewOnboarding.json in your local folder + +3. Copy and paste the following JSON into a file + +```json +{ + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "aksResourceId": { + "value": "VALUE_AKS_RESOURCE_ID" + }, + "aksResourceLocation": { + "value": "eastus" + }, + "workspaceResourceId": { + "value": "VALUE_WORKSPACE_RESOURCE_ID" + } + } +} +``` + +4. Save this file as HealthPreviewOnboardingParams.json in your local folder + +5. Replace the contents of the VALUE_AKS_RESOURCE_ID and VALUE_WORKSPACE_RESOURCE_ID with the correct values in the HealthPreviewOnboardingParams file +The VALUE_AKS_RESOURCE_ID (resource id of the cluster) can be found in the Properties section of the AKS cluster. VALUE_WORKSPACE_RESOURCE_ID (get the value of this from the portal when the cluster is onboarded) is of the format /subscriptions//resourceGroups//providers/Microsoft.OperationalInsights/workspaces/ -- replace the subscriptionId, resourceGroupName and workspaceName values with the right ones. + +6. Run the following commands from a powershell window +* Connect-AzAccount +* Select-AzSubscription -SubscriptionName +* New-AzResourceGroupDeployment -Name opt-out -ResourceGroupName -TemplateFile .\HealthPreviewOnboarding.json -TemplateParameterFile .\HealthPreviewOnboardingParams.json + +7. Copy the following content into a yaml file: +```yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: omsagent + namespace: kube-system +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1beta1 +metadata: + name: omsagent-reader +rules: +- apiGroups: [""] + resources: ["pods", "events", "nodes", "namespaces", "services"] + verbs: ["list"] +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1beta1 +metadata: + name: omsagentclusterrolebinding +subjects: + - kind: ServiceAccount + name: omsagent + namespace: kube-system +roleRef: + kind: ClusterRole + name: omsagent-reader + apiGroup: rbac.authorization.k8s.io +--- +kind: ConfigMap +apiVersion: v1 +data: + kube.conf: |- + # Fluentd config file for OMS Docker - cluster components (kubeAPI) + + #Kubernetes pod inventory + + type kubepodinventory + tag oms.containerinsights.KubePodInventory + run_interval 60s + log_level debug + + + #Kubernetes events + + type kubeevents + tag oms.containerinsights.KubeEvents + run_interval 60s + log_level debug + + + #Kubernetes logs + + type kubelogs + tag oms.api.KubeLogs + run_interval 60s + + + #Kubernetes services + + type kubeservices + tag oms.containerinsights.KubeServices + run_interval 60s + log_level debug + + + #Kubernetes Nodes + + type kubenodeinventory + tag oms.containerinsights.KubeNodeInventory + run_interval 60s + log_level debug + + + #Kubernetes perf + + type kubeperf + tag oms.api.KubePerf + run_interval 60s + log_level debug + + + #Kubernetes health + + type kubehealth + tag oms.api.KubeHealth.AgentCollectionTime + run_interval 60s + log_level debug + + + #cadvisor perf- Windows nodes + + type wincadvisorperf + tag oms.api.wincadvisorperf + run_interval 60s + log_level debug + + + + type filter_inventory2mdm + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope + log_level info + + + # custom_metrics_mdm filter plugin for perf data from windows nodes + + type filter_cadvisor2mdm + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes + log_level info + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_kubepods*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 5m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_kubeevents*.buffer + buffer_queue_limit 10 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + + + + type out_oms_api + log_level debug + buffer_chunk_limit 10m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_api_kubernetes_logs*.buffer + buffer_queue_limit 10 + flush_interval 20s + retry_limit 10 + retry_wait 30s + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_kubeservices*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/state/out_oms_kubenodes*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + + + + type out_oms + log_level debug + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_containernodeinventory*.buffer + buffer_queue_limit 20 + flush_interval 20s + retry_limit 10 + retry_wait 15s + max_retry_wait 9m + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_kubeperf*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + + + + type out_mdm + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_mdm_*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + retry_mdm_post_wait_minutes 60 + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_api_wincadvisorperf*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + + + + type out_mdm + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_mdm_cdvisorperf*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + retry_mdm_post_wait_minutes 60 + + + + type out_oms_api + log_level debug + buffer_chunk_limit 10m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_api_kubehealth*.buffer + buffer_queue_limit 10 + flush_interval 20s + retry_limit 10 + retry_wait 30s + +metadata: + name: omsagent-rs-config + namespace: kube-system +--- +apiVersion: v1 +kind: Secret +metadata: + name: omsagent-secret + namespace: kube-system +type: Opaque +data: + #BASE64 ENCODED (Both WSID & KEY) INSIDE DOUBLE QUOTE ("") + WSID: "VALUE_WSID" + KEY: "VALUE_KEY" +--- +apiVersion: extensions/v1beta1 +kind: DaemonSet +metadata: + name: omsagent + namespace: kube-system +spec: + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + dsName: "omsagent-ds" + annotations: + agentVersion: "1.8.1.256" + dockerProviderVersion: "3.0.0-5" + spec: + serviceAccountName: omsagent + containers: + - name: omsagent + image: "microsoft/oms:healthpreview04152019" + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: 150m + memory: 300Mi + requests: + cpu: 50m + memory: 150Mi + env: + - name: AKS_RESOURCE_ID + value: "VALUE_AKS_RESOURCE_ID" + - name: AKS_REGION + value: "VALUE_AKS_REGION" + #Uncomment below two lines for ACS clusters and set the cluster names manually. Also comment out the above two lines for ACS clusters + #- name: ACS_RESOURCE_NAME + #value: "my_acs_cluster_name" + - name: DISABLE_KUBE_SYSTEM_LOG_COLLECTION + value: "true" + - name: CONTROLLER_TYPE + value: "DaemonSet" + - name: NODE_IP + valueFrom: + fieldRef: + fieldPath: status.hostIP + securityContext: + privileged: true + ports: + - containerPort: 25225 + protocol: TCP + - containerPort: 25224 + protocol: UDP + volumeMounts: + - mountPath: /var/run/host + name: docker-sock + - mountPath: /var/log + name: host-log + - mountPath: /var/lib/docker/containers + name: containerlog-path + - mountPath: /etc/kubernetes/host + name: azure-json-path + - mountPath: /etc/omsagent-secret + name: omsagent-secret + readOnly: true + livenessProbe: + exec: + command: + - /bin/bash + - -c + - ps -ef | grep omsagent | grep -v "grep" + initialDelaySeconds: 60 + periodSeconds: 60 + nodeSelector: + beta.kubernetes.io/os: linux + # Tolerate a NoSchedule taint on master that ACS Engine sets. + tolerations: + - key: "node-role.kubernetes.io/master" + operator: "Equal" + value: "true" + effect: "NoSchedule" + volumes: + - name: docker-sock + hostPath: + path: /var/run + - name: container-hostname + hostPath: + path: /etc/hostname + - name: host-log + hostPath: + path: /var/log + - name: containerlog-path + hostPath: + path: /var/lib/docker/containers + - name: azure-json-path + hostPath: + path: /etc/kubernetes + - name: omsagent-secret + secret: + secretName: omsagent-secret +--- +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: omsagent-rs + namespace: kube-system +spec: + replicas: 1 + selector: + matchLabels: + rsName: "omsagent-rs" + strategy: + type: RollingUpdate + template: + metadata: + labels: + rsName: "omsagent-rs" + annotations: + agentVersion: "1.8.1.256" + dockerProviderVersion: "3.0.0-5" + spec: + serviceAccountName: omsagent + containers: + - name: omsagent + image: "microsoft/oms:healthpreview04152019" + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: 150m + memory: 500Mi + requests: + cpu: 50m + memory: 100Mi + env: + - name: AKS_RESOURCE_ID + value: "VALUE_AKS_RESOURCE_ID" + - name: AKS_REGION + value: "VALUE_AKS_REGION" + #Uncomment below two lines for ACS clusters and set the cluster names manually. Also comment out the above two lines for ACS clusters + # - name: ACS_RESOURCE_NAME + # value: "my_acs_cluster_name" + # - name: DISABLE_KUBE_SYSTEM_LOG_COLLECTION + # value: "true" + - name: CONTROLLER_TYPE + value: "ReplicaSet" + - name: NODE_IP + valueFrom: + fieldRef: + fieldPath: status.hostIP + securityContext: + privileged: true + ports: + - containerPort: 25225 + protocol: TCP + - containerPort: 25224 + protocol: UDP + volumeMounts: + - mountPath: /var/run/host + name: docker-sock + - mountPath: /var/log + name: host-log + - mountPath: /var/lib/docker/containers + name: containerlog-path + - mountPath: /etc/kubernetes/host + name: azure-json-path + - mountPath: /etc/omsagent-secret + name: omsagent-secret + readOnly: true + - mountPath : /etc/config + name: omsagent-rs-config + livenessProbe: + exec: + command: + - /bin/bash + - -c + - ps -ef | grep omsagent | grep -v "grep" + initialDelaySeconds: 60 + periodSeconds: 60 + nodeSelector: + beta.kubernetes.io/os: linux + kubernetes.io/role: agent + volumes: + - name: docker-sock + hostPath: + path: /var/run + - name: container-hostname + hostPath: + path: /etc/hostname + - name: host-log + hostPath: + path: /var/log + - name: containerlog-path + hostPath: + path: /var/lib/docker/containers + - name: azure-json-path + hostPath: + path: /etc/kubernetes + - name: omsagent-secret + secret: + secretName: omsagent-secret + - name: omsagent-rs-config + configMap: + name: omsagent-rs-config +``` + +8. save this file as omsagent.yaml + +9. Replace the following values in the file + +* VALUE_AKS_RESOURCE_ID -- Resource Id of the cluster +* VALUE_AKS_REGION -- Region the cluster is in +* VALUE_WSID -- base 64 encoded Workspace Id. To get this, go to the portal -- log analytics workspace -- Advanced Settings. REMEMBER: PASTE the base 64 encoded value +* VALUE_KEY -- base 64 encoded Primary Shared Key of the workspace. To get this, go to the portal -- log analytics workspace -- Advanced Settings. REMEMBER: PASTE the base 64 encoded value of the key (which is base 64 encoded to start with) + +10. Set the context in your local machine to the AKS cluster +az aks get-credentials -n -g + +11. kubectl apply -f omsagent.yaml \ No newline at end of file From a266899d95dcc825c37bad9888864f4a9bf5b299 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Tue, 23 Apr 2019 13:48:13 -0700 Subject: [PATCH 08/55] Adding description --- health/HealthAgentOnboarding.ps1 | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/health/HealthAgentOnboarding.ps1 b/health/HealthAgentOnboarding.ps1 index 3ba2b90c..b081adf1 100644 --- a/health/HealthAgentOnboarding.ps1 +++ b/health/HealthAgentOnboarding.ps1 @@ -1,5 +1,10 @@ <# .DESCRIPTION + Upgrades the Kubernetes cluster that has been onboarded to monitoring to a version of the agent + that generates health monitor signals + 1. Installs necessary powershell modules + 2. Onboards Container Insights solution to the supplied LA workspace if not already onboarded + 3. Updates the cluster metadata to link the LA workspace ID to the cluster .PARAMETER aksResourceId Name of the cluster configured on the OMSAgent .PARAMETER loganalyticsWorkspaceResourceId From 5683b4a627f12cb93101da284e87756a4a6a89d0 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Tue, 23 Apr 2019 13:54:10 -0700 Subject: [PATCH 09/55] Update HealthAgentOnboarding.ps1 --- health/HealthAgentOnboarding.ps1 | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/health/HealthAgentOnboarding.ps1 b/health/HealthAgentOnboarding.ps1 index b081adf1..9d5991dc 100644 --- a/health/HealthAgentOnboarding.ps1 +++ b/health/HealthAgentOnboarding.ps1 @@ -363,10 +363,11 @@ try { (Get-Content -Path $desktopPath\omsagent-template.yaml -Raw) -replace 'VALUE_AKS_RESOURCE_ID', $aksResourceId -replace 'VALUE_AKS_REGION', $aksRegion -replace 'VALUE_WSID', $base64EncodedWsId -replace 'VALUE_KEY', $base64EncodedKey | Set-Content $desktopPath\deployments\omsagent-$clusterName.yaml kubectl delete -f $desktopPath\deployments\omsagent-$clusterName.yaml kubectl apply -f $desktopPath\deployments\omsagent-$clusterName.yaml + Write-Host "Upgraded omsagent" } catch { Write-Host ("Agent deployment failed with an error: '" + $Error[0] + "' ") -ForegroundColor Red } -Write-Host "Upgraded omsagent" + From 651d2714ca80729279fa441d05aea988568fa681 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Tue, 23 Apr 2019 13:57:51 -0700 Subject: [PATCH 10/55] Update HealthOnboarding.md --- health/HealthOnboarding.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/health/HealthOnboarding.md b/health/HealthOnboarding.md index b548384d..31a375d6 100644 --- a/health/HealthOnboarding.md +++ b/health/HealthOnboarding.md @@ -90,7 +90,7 @@ The VALUE_AKS_RESOURCE_ID (resource id of the cluster) can be found in the Prope * Select-AzSubscription -SubscriptionName * New-AzResourceGroupDeployment -Name opt-out -ResourceGroupName -TemplateFile .\HealthPreviewOnboarding.json -TemplateParameterFile .\HealthPreviewOnboardingParams.json -7. Copy the following content into a yaml file: +7. Copy the following content into a yaml file: (You will use this file to do a kubectl apply on the kubernetes cluster) ```yaml apiVersion: v1 kind: ServiceAccount @@ -589,6 +589,5 @@ spec: * VALUE_KEY -- base 64 encoded Primary Shared Key of the workspace. To get this, go to the portal -- log analytics workspace -- Advanced Settings. REMEMBER: PASTE the base 64 encoded value of the key (which is base 64 encoded to start with) 10. Set the context in your local machine to the AKS cluster -az aks get-credentials -n -g - -11. kubectl apply -f omsagent.yaml \ No newline at end of file +Import-AzAksCredential -ResourceGroupName -Name +11. kubectl apply -f omsagent.yaml From ef64af08069a70abafdbe31bee56c2be6a6f8107 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Tue, 23 Apr 2019 14:01:47 -0700 Subject: [PATCH 11/55] Update HealthOnboarding.md --- health/HealthOnboarding.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/health/HealthOnboarding.md b/health/HealthOnboarding.md index 31a375d6..1013ca66 100644 --- a/health/HealthOnboarding.md +++ b/health/HealthOnboarding.md @@ -1,9 +1,9 @@ ## Overview -The following documentation outlines the script required to upgrade an existing cluster onboarded to a Log Analytics workspace, to an agent running the workflow that generates health monitor signals into the same workspace. +The following documentation outlines the steps required to upgrade an existing cluster onboarded to a Log Analytics workspace running the omsagent, to an agent running the workflow that generates health monitor signals into the same workspace. * Do a custom off-boarding of the cluster from Monitoring * Installing the new agent that generates health monitor signals -* Please refer to the powershell script +* Please refer to the powershell [script](https://github.com/Microsoft/OMS-docker/blob/dilipr/onboardHealth/health/HealthAgentOnboarding.ps1) for another way to perform the same steps on your cluster ## Prerequisites * Cluster that has already been onboarded to Monitoring using a Log Analytics workspace From b15fe9206852677cbf4bd86b08d7a2d5c1e0027b Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Tue, 23 Apr 2019 14:03:00 -0700 Subject: [PATCH 12/55] Update HealthOnboarding.md --- health/HealthOnboarding.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/health/HealthOnboarding.md b/health/HealthOnboarding.md index 1013ca66..77d3d3b9 100644 --- a/health/HealthOnboarding.md +++ b/health/HealthOnboarding.md @@ -3,7 +3,11 @@ The following documentation outlines the steps required to upgrade an existing c * Do a custom off-boarding of the cluster from Monitoring * Installing the new agent that generates health monitor signals -* Please refer to the powershell [script](https://github.com/Microsoft/OMS-docker/blob/dilipr/onboardHealth/health/HealthAgentOnboarding.ps1) for another way to perform the same steps on your cluster +* Please refer to the powershell [script](https://github.com/Microsoft/OMS-docker/blob/dilipr/onboardHealth/health/HealthAgentOnboarding.ps1) for another way to perform the same steps on your cluster. The script basically does the following: + * Installs necessary powershell modules + * Onboards Container Insights solution to the supplied LA workspace if not already onboarded + * Updates the cluster metadata to link the LA workspace ID to the cluster + ## Prerequisites * Cluster that has already been onboarded to Monitoring using a Log Analytics workspace From 5332ff07a529d127c2614f9bb5f3abc86d935a67 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Tue, 23 Apr 2019 14:03:28 -0700 Subject: [PATCH 13/55] Update HealthOnboarding.md --- health/HealthOnboarding.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/health/HealthOnboarding.md b/health/HealthOnboarding.md index 77d3d3b9..938c53f5 100644 --- a/health/HealthOnboarding.md +++ b/health/HealthOnboarding.md @@ -4,9 +4,9 @@ The following documentation outlines the steps required to upgrade an existing c * Do a custom off-boarding of the cluster from Monitoring * Installing the new agent that generates health monitor signals * Please refer to the powershell [script](https://github.com/Microsoft/OMS-docker/blob/dilipr/onboardHealth/health/HealthAgentOnboarding.ps1) for another way to perform the same steps on your cluster. The script basically does the following: - * Installs necessary powershell modules - * Onboards Container Insights solution to the supplied LA workspace if not already onboarded - * Updates the cluster metadata to link the LA workspace ID to the cluster + * Installs necessary powershell modules + * Onboards Container Insights solution to the supplied LA workspace if not already onboarded + * Updates the cluster metadata to link the LA workspace ID to the cluster ## Prerequisites From 584d922ce0612dd6e17a0ce4964bb8e9ddbb7fd1 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Tue, 23 Apr 2019 14:05:06 -0700 Subject: [PATCH 14/55] Update HealthOnboarding.md --- health/HealthOnboarding.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/health/HealthOnboarding.md b/health/HealthOnboarding.md index 938c53f5..dd197641 100644 --- a/health/HealthOnboarding.md +++ b/health/HealthOnboarding.md @@ -595,3 +595,6 @@ spec: 10. Set the context in your local machine to the AKS cluster Import-AzAksCredential -ResourceGroupName -Name 11. kubectl apply -f omsagent.yaml + +Once the above steps are done, it can take upto 20 minutes for the health related data to show up which can be accessed using the following link: + From 09ca730d27d1b1ddde5b7121732711d0e0f69598 Mon Sep 17 00:00:00 2001 From: r-dilip Date: Fri, 26 Apr 2019 15:41:05 -0700 Subject: [PATCH 15/55] Integrate telegraf changes --- health/HealthOnboarding.md | 30 ++++++++++------- health/omsagent-template.yaml | 63 ++++++++++++++++++++++++++++++++--- 2 files changed, 76 insertions(+), 17 deletions(-) diff --git a/health/HealthOnboarding.md b/health/HealthOnboarding.md index dd197641..0038bc5b 100644 --- a/health/HealthOnboarding.md +++ b/health/HealthOnboarding.md @@ -394,12 +394,12 @@ spec: dsName: "omsagent-ds" annotations: agentVersion: "1.8.1.256" - dockerProviderVersion: "3.0.0-5" + dockerProviderVersion: "3.0.0-4" spec: serviceAccountName: omsagent containers: - name: omsagent - image: "microsoft/oms:healthpreview04152019" + image: "microsoft/oms:healthpreview04262019" imagePullPolicy: IfNotPresent resources: limits: @@ -407,15 +407,15 @@ spec: memory: 300Mi requests: cpu: 50m - memory: 150Mi + memory: 225Mi env: - name: AKS_RESOURCE_ID value: "VALUE_AKS_RESOURCE_ID" - name: AKS_REGION value: "VALUE_AKS_REGION" - #Uncomment below two lines for ACS clusters and set the cluster names manually. Also comment out the above two lines for ACS clusters - #- name: ACS_RESOURCE_NAME - #value: "my_acs_cluster_name" + # Uncomment below two lines for ACS clusters and set the cluster names manually. Also comment out the above two lines for ACS clusters + # - name: ACS_RESOURCE_NAME + # value: "aks-engine-health" - name: DISABLE_KUBE_SYSTEM_LOG_COLLECTION value: "true" - name: CONTROLLER_TYPE @@ -432,6 +432,9 @@ spec: - containerPort: 25224 protocol: UDP volumeMounts: + - mountPath: /hostfs + name: host-root + readOnly: true - mountPath: /var/run/host name: docker-sock - mountPath: /var/log @@ -448,7 +451,7 @@ spec: command: - /bin/bash - -c - - ps -ef | grep omsagent | grep -v "grep" + - (ps -ef | grep omsagent | grep -v "grep") && (ps -ef | grep td-agent-bit | grep -v "grep") initialDelaySeconds: 60 periodSeconds: 60 nodeSelector: @@ -460,6 +463,9 @@ spec: value: "true" effect: "NoSchedule" volumes: + - name: host-root + hostPath: + path: / - name: docker-sock hostPath: path: /var/run @@ -497,12 +503,12 @@ spec: rsName: "omsagent-rs" annotations: agentVersion: "1.8.1.256" - dockerProviderVersion: "3.0.0-5" + dockerProviderVersion: "3.0.0-4" spec: serviceAccountName: omsagent containers: - name: omsagent - image: "microsoft/oms:healthpreview04152019" + image: "microsoft/oms:healthpreview04262019" imagePullPolicy: IfNotPresent resources: limits: @@ -518,9 +524,9 @@ spec: value: "VALUE_AKS_REGION" #Uncomment below two lines for ACS clusters and set the cluster names manually. Also comment out the above two lines for ACS clusters # - name: ACS_RESOURCE_NAME - # value: "my_acs_cluster_name" - # - name: DISABLE_KUBE_SYSTEM_LOG_COLLECTION - # value: "true" + # value: "aks-engine-health" + - name: DISABLE_KUBE_SYSTEM_LOG_COLLECTION + value: "true" - name: CONTROLLER_TYPE value: "ReplicaSet" - name: NODE_IP diff --git a/health/omsagent-template.yaml b/health/omsagent-template.yaml index fb4ef5ac..dd53f67a 100644 --- a/health/omsagent-template.yaml +++ b/health/omsagent-template.yaml @@ -86,6 +86,14 @@ data: run_interval 60s log_level debug + + #cadvisor perf- Windows nodes + + type wincadvisorperf + tag oms.api.wincadvisorperf + run_interval 60s + log_level debug + type filter_inventory2mdm @@ -93,6 +101,14 @@ data: log_level info + # custom_metrics_mdm filter plugin for perf data from windows nodes + + type filter_cadvisor2mdm + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes + log_level info + + type out_oms log_level debug @@ -208,6 +224,37 @@ data: max_retry_wait 9m retry_mdm_post_wait_minutes 60 + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_api_wincadvisorperf*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + + + + type out_mdm + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_mdm_cdvisorperf*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + retry_mdm_post_wait_minutes 60 + type out_oms_api @@ -254,7 +301,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "microsoft/oms:healthpreview04172019" + image: "microsoft/oms:healthpreview04262019" imagePullPolicy: IfNotPresent resources: limits: @@ -262,7 +309,7 @@ spec: memory: 300Mi requests: cpu: 50m - memory: 150Mi + memory: 225Mi env: - name: AKS_RESOURCE_ID value: "VALUE_AKS_RESOURCE_ID" @@ -287,6 +334,9 @@ spec: - containerPort: 25224 protocol: UDP volumeMounts: + - mountPath: /hostfs + name: host-root + readOnly: true - mountPath: /var/run/host name: docker-sock - mountPath: /var/log @@ -303,7 +353,7 @@ spec: command: - /bin/bash - -c - - ps -ef | grep main | grep -v "grep" + - (ps -ef | grep omsagent | grep -v "grep") && (ps -ef | grep td-agent-bit | grep -v "grep") initialDelaySeconds: 60 periodSeconds: 60 nodeSelector: @@ -315,6 +365,9 @@ spec: value: "true" effect: "NoSchedule" volumes: + - name: host-root + hostPath: + path: / - name: docker-sock hostPath: path: /var/run @@ -357,7 +410,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "microsoft/oms:healthpreview04172019" + image: "microsoft/oms:healthpreview04262019" imagePullPolicy: IfNotPresent resources: limits: @@ -408,7 +461,7 @@ spec: command: - /bin/bash - -c - - ps -ef | grep main | grep -v "grep" + - ps -ef | grep omsagent | grep -v "grep" initialDelaySeconds: 60 periodSeconds: 60 nodeSelector: From 3f00b86a484248f2571134b3dcfdb8dd847578a6 Mon Sep 17 00:00:00 2001 From: r-dilip Date: Fri, 26 Apr 2019 15:58:15 -0700 Subject: [PATCH 16/55] Fix a bug in the script --- health/HealthAgentOnboarding.ps1 | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/health/HealthAgentOnboarding.ps1 b/health/HealthAgentOnboarding.ps1 index 9d5991dc..2d9f79e3 100644 --- a/health/HealthAgentOnboarding.ps1 +++ b/health/HealthAgentOnboarding.ps1 @@ -291,11 +291,10 @@ if ($false -eq $isSolutionOnboarded) { -ResourceGroupName $workspaceResourceGroupName ` -TemplateUri https://raw.githubusercontent.com/Microsoft/OMS-docker/ci_feature/docs/templates/azuremonitor-containerSolution.json ` -TemplateParameterObject $Parameters -ErrorAction Stop` - Write-Host("") + Write-Host("Successfully added Container Insights Solution") -ForegroundColor Green - Write-Host("") } catch { Write-Host ("Template deployment failed with an error: '" + $Error[0] + "' ") -ForegroundColor Red @@ -304,7 +303,7 @@ if ($false -eq $isSolutionOnboarded) { } -Write-Host("Successfully added Container Insights Solution to workspace" + $workspaceName) -ForegroundColor Green +Write-Host("Successfully added Container Insights Solution to workspace " + $workspaceName) -ForegroundColor Green try { $DeploymentName = "ClusterHealthOnboarding-" + ((Get-Date).ToUniversalTime()).ToString('MMdd-HHmm') @@ -313,11 +312,13 @@ try { $Parameters.Add("aksResourceLocation", $aksResourceLocation) $Parameters.Add("workspaceResourceId", $logAnalyticsWorkspaceResourceId) - + Write-Host " Onboarding cluster to provided LA workspace " + New-AzResourceGroupDeployment -Name $DeploymentName ` -ResourceGroupName $workspaceResourceGroupName ` -TemplateUri https://raw.githubusercontent.com/Microsoft/OMS-docker/dilipr/onboardHealth/health/customOnboarding.json ` -TemplateParameterObject $Parameters -ErrorAction Stop` + Write-Host("") Write-Host("Successfully custom onboarded cluster to Monitoring") -ForegroundColor Green From cfb0a6d3a3697688b7a2af38acad391f2defec61 Mon Sep 17 00:00:00 2001 From: r-dilip Date: Tue, 25 Jun 2019 12:59:50 -0700 Subject: [PATCH 17/55] Updating template --- health/omsagent-template.yaml | 120 ++++++++++++++++++++++++++++++---- 1 file changed, 109 insertions(+), 11 deletions(-) diff --git a/health/omsagent-template.yaml b/health/omsagent-template.yaml index dd53f67a..cbdbf8f5 100644 --- a/health/omsagent-template.yaml +++ b/health/omsagent-template.yaml @@ -11,7 +11,12 @@ metadata: rules: - apiGroups: [""] resources: ["pods", "events", "nodes", "namespaces", "services"] + verbs: ["list", "get", "watch"] +- apiGroups: ["extensions"] + resources: ["deployments"] verbs: ["list"] +- nonResourceURLs: ["/metrics"] + verbs: ["get"] --- kind: ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1beta1 @@ -31,6 +36,12 @@ apiVersion: v1 data: kube.conf: |- # Fluentd config file for OMS Docker - cluster components (kubeAPI) + #fluent forward plugin + + type forward + port 25235 + bind 0.0.0.0 + #Kubernetes pod inventory @@ -82,7 +93,7 @@ data: #Kubernetes health type kubehealth - tag oms.api.KubeHealth.AgentCollectionTime + tag oms.api.KubeHealth.ReplicaSet run_interval 60s log_level debug @@ -108,6 +119,10 @@ data: metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes log_level info + #health model aggregation filter + + type filter_health_model_builder + type out_oms @@ -282,6 +297,67 @@ data: WSID: "VALUE_WSID" KEY: "VALUE_KEY" --- +kind: Service +apiVersion: v1 +metadata: + name: replicaset-service + namespace: kube-system +spec: + selector: + rsName: "omsagent-rs" + ports: + - protocol: TCP + port: 25235 + targetPort: in-rs-tcp +--- +kind: StorageClass +apiVersion: storage.k8s.io/v1 +metadata: + name: azurefile +provisioner: kubernetes.io/azure-file +mountOptions: + - dir_mode=0777 + - file_mode=0777 + - uid=1000 + - gid=1000 +parameters: + skuName: Standard_LRS +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: system:azure-cloud-provider +rules: +- apiGroups: [''] + resources: ['secrets'] + verbs: ['get','create'] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: system:azure-cloud-provider +roleRef: + kind: ClusterRole + apiGroup: rbac.authorization.k8s.io + name: system:azure-cloud-provider +subjects: +- kind: ServiceAccount + name: persistent-volume-binder + namespace: kube-system +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: azurefile + namespace: kube-system +spec: + accessModes: + - ReadWriteMany + storageClassName: azurefile + resources: + requests: + storage: 10Mi +--- apiVersion: extensions/v1beta1 kind: DaemonSet metadata: @@ -295,20 +371,21 @@ spec: labels: dsName: "omsagent-ds" annotations: - agentVersion: "1.8.1.256" - dockerProviderVersion: "3.0.0-4" + agentVersion: "1.10.0.1" + dockerProviderVersion: "5.0.0-1" + schema-versions: "v1" spec: serviceAccountName: omsagent containers: - name: omsagent - image: "microsoft/oms:healthpreview04262019" + image: "rdilip83/healthpreview06252019-1" imagePullPolicy: IfNotPresent resources: limits: cpu: 150m memory: 300Mi requests: - cpu: 50m + cpu: 75m memory: 225Mi env: - name: AKS_RESOURCE_ID @@ -347,13 +424,15 @@ spec: name: azure-json-path - mountPath: /etc/omsagent-secret name: omsagent-secret + - mountPath: /etc/config/settings + name: settings-vol-config readOnly: true livenessProbe: exec: command: - /bin/bash - -c - - (ps -ef | grep omsagent | grep -v "grep") && (ps -ef | grep td-agent-bit | grep -v "grep") + - /opt/livenessprobe.sh initialDelaySeconds: 60 periodSeconds: 60 nodeSelector: @@ -386,6 +465,10 @@ spec: - name: omsagent-secret secret: secretName: omsagent-secret + - name: settings-vol-config + configMap: + name: container-azm-ms-agentconfig + optional: true --- apiVersion: extensions/v1beta1 kind: Deployment @@ -404,13 +487,14 @@ spec: labels: rsName: "omsagent-rs" annotations: - agentVersion: "1.8.1.256" - dockerProviderVersion: "3.0.0-4" + agentVersion: "1.10.0.1" + dockerProviderVersion: "5.0.0-1" + schema-versions: "v1" spec: serviceAccountName: omsagent containers: - name: omsagent - image: "microsoft/oms:healthpreview04262019" + image: "rdilip83/healthpreview06252019-1" imagePullPolicy: IfNotPresent resources: limits: @@ -418,7 +502,7 @@ spec: memory: 500Mi requests: cpu: 50m - memory: 100Mi + memory: 175Mi env: - name: AKS_RESOURCE_ID value: "VALUE_AKS_RESOURCE_ID" @@ -442,6 +526,9 @@ spec: protocol: TCP - containerPort: 25224 protocol: UDP + - containerPort: 25235 + protocol: TCP + name: in-rs-tcp volumeMounts: - mountPath: /var/run/host name: docker-sock @@ -456,6 +543,11 @@ spec: readOnly: true - mountPath : /etc/config name: omsagent-rs-config + - mountPath: /etc/config/settings + name: settings-vol-config + readOnly: true + - mountPath: "/mnt/azure" + name: azurefile-pv livenessProbe: exec: command: @@ -489,4 +581,10 @@ spec: - name: omsagent-rs-config configMap: name: omsagent-rs-config - + - name: settings-vol-config + configMap: + name: container-azm-ms-agentconfig + optional: true + - name: azurefile-pv + persistentVolumeClaim: + claimName: azurefile From 49c858acffc5d220785f762c296a2e5cd3ffd755 Mon Sep 17 00:00:00 2001 From: r-dilip Date: Tue, 25 Jun 2019 14:08:10 -0700 Subject: [PATCH 18/55] updating agent onboarding script --- health/HealthAgentOnboarding.ps1 | 42 ++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/health/HealthAgentOnboarding.ps1 b/health/HealthAgentOnboarding.ps1 index 2d9f79e3..5fce7211 100644 --- a/health/HealthAgentOnboarding.ps1 +++ b/health/HealthAgentOnboarding.ps1 @@ -11,6 +11,8 @@ Azure ResourceId of the log analytics workspace Id .PARAMETER aksResourceLocation Resource location of the AKS cluster resource + .PARAMETER isAksEngine + Whether this is an AKS-Engine Cluster #> param( [Parameter(mandatory = $true)] @@ -18,9 +20,23 @@ param( [Parameter(mandatory = $true)] [string]$aksResourceLocation, [Parameter(mandatory = $true)] - [string]$logAnalyticsWorkspaceResourceId + [string]$logAnalyticsWorkspaceResourceId, + [Parameter(mandatory = $true)] + [string]$isAksEngineCluster, + [Parameter(mandatory = $false)] + [string]$acsResourceName + ) + +if ($isAksEngineCluster) { + if ($null -eq $acsResourceName) { + Write-Host("acsResourceName cannot be null if aksEngineCluster is true") -ForegroundColor Red + Stop-Transcript + exit + } +} + # checks the required Powershell modules exist and if not exists, request the user permission to install $azAccountModule = Get-Module -ListAvailable -Name Az.Accounts $azResourcesModule = Get-Module -ListAvailable -Name Az.Resources @@ -281,7 +297,7 @@ $isSolutionOnboarded = $WorkspaceIPDetails.Enabled[$ContainerInsightsIndex] if ($false -eq $isSolutionOnboarded) { $DeploymentName = "ContainerInsightsSolutionOnboarding-" + ((Get-Date).ToUniversalTime()).ToString('MMdd-HHmm') - $Parameters = @{} + $Parameters = @{ } $Parameters.Add("workspaceResourceId", $logAnalyticsWorkspaceResourceID) $Parameters.Add("workspaceRegion", $WorkspaceLocation) $Parameters @@ -306,8 +322,10 @@ if ($false -eq $isSolutionOnboarded) { Write-Host("Successfully added Container Insights Solution to workspace " + $workspaceName) -ForegroundColor Green try { + $aksResourceDetails = $aksResourceId.Split("/") + $clusterResourceGroupName = $aksResourceDetails[4].Trim() $DeploymentName = "ClusterHealthOnboarding-" + ((Get-Date).ToUniversalTime()).ToString('MMdd-HHmm') - $Parameters = @{} + $Parameters = @{ } $Parameters.Add("aksResourceId", $aksResourceId) $Parameters.Add("aksResourceLocation", $aksResourceLocation) $Parameters.Add("workspaceResourceId", $logAnalyticsWorkspaceResourceId) @@ -315,7 +333,7 @@ try { Write-Host " Onboarding cluster to provided LA workspace " New-AzResourceGroupDeployment -Name $DeploymentName ` - -ResourceGroupName $workspaceResourceGroupName ` + -ResourceGroupName $clusterResourceGroupName ` -TemplateUri https://raw.githubusercontent.com/Microsoft/OMS-docker/dilipr/onboardHealth/health/customOnboarding.json ` -TemplateParameterObject $Parameters -ErrorAction Stop` @@ -335,8 +353,9 @@ $desktopPath = [System.Environment]::GetFolderPath([System.Environment+SpecialFo if (-not (test-path $desktopPath\deployments) ) { Write-Host "$($desktopPath)\deployments doesn't exist, creating it" - mkdir $desktopPath\deployments|out-null -} else { + mkdir $desktopPath\deployments | out-null +} +else { Write-Host "$($desktopPath)\deployments exists, no need to create it" } @@ -360,9 +379,16 @@ try { $wsid = $WorkspaceInformation.CustomerId $base64EncodedKey = [System.Convert]::ToBase64String([System.Text.Encoding]::UTF8.GetBytes($key)) $base64EncodedWsId = [System.Convert]::ToBase64String([System.Text.Encoding]::UTF8.GetBytes($wsid)) - Invoke-WebRequest https://raw.githubusercontent.com/Microsoft/OMS-docker/dilipr/onboardHealth/health/omsagent-template.yaml -OutFile $desktopPath\omsagent-template.yaml - (Get-Content -Path $desktopPath\omsagent-template.yaml -Raw) -replace 'VALUE_AKS_RESOURCE_ID', $aksResourceId -replace 'VALUE_AKS_REGION', $aksRegion -replace 'VALUE_WSID', $base64EncodedWsId -replace 'VALUE_KEY', $base64EncodedKey | Set-Content $desktopPath\deployments\omsagent-$clusterName.yaml + if ($isAksEngineCluster -eq $true) { + Invoke-WebRequest https://raw.githubusercontent.com/Microsoft/OMS-docker/dilipr/onboardHealth/health/omsagent-template-aks-engine.yaml -OutFile $desktopPath\omsagent-template.yaml + } + else { + Invoke-WebRequest https://raw.githubusercontent.com/Microsoft/OMS-docker/dilipr/kubeHealth/health/omsagent-template.yaml -OutFile $desktopPath\omsagent-template.yaml + } + + (Get-Content -Path $desktopPath\omsagent-template.yaml -Raw) -replace 'VALUE_AKS_RESOURCE_ID', $aksResourceId -replace 'VALUE_AKS_REGION', $aksRegion -replace 'VALUE_WSID', $base64EncodedWsId -replace 'VALUE_KEY', $base64EncodedKey -replace 'VALUE_ACS_RESOURCE_NAME', $acsResourceName | Set-Content $desktopPath\deployments\omsagent-$clusterName.yaml kubectl delete -f $desktopPath\deployments\omsagent-$clusterName.yaml + sleep 10 kubectl apply -f $desktopPath\deployments\omsagent-$clusterName.yaml Write-Host "Upgraded omsagent" } From 91a79f199b8bfe4cc788efaddc2f6652a112c258 Mon Sep 17 00:00:00 2001 From: r-dilip Date: Thu, 27 Jun 2019 08:32:44 -0700 Subject: [PATCH 19/55] Updating image name --- health/omsagent-template.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/health/omsagent-template.yaml b/health/omsagent-template.yaml index cbdbf8f5..080d497e 100644 --- a/health/omsagent-template.yaml +++ b/health/omsagent-template.yaml @@ -378,7 +378,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "rdilip83/healthpreview06252019-1" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview06272019" imagePullPolicy: IfNotPresent resources: limits: @@ -494,7 +494,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "rdilip83/healthpreview06252019-1" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview06272019" imagePullPolicy: IfNotPresent resources: limits: From 68b31ba06233bf485849827770a51d87d52ab8ee Mon Sep 17 00:00:00 2001 From: r-dilip Date: Thu, 27 Jun 2019 13:19:06 -0700 Subject: [PATCH 20/55] AKS Engine Template --- health/omsagent-template-aks-engine.yaml | 590 +++++++++++++++++++++++ 1 file changed, 590 insertions(+) create mode 100644 health/omsagent-template-aks-engine.yaml diff --git a/health/omsagent-template-aks-engine.yaml b/health/omsagent-template-aks-engine.yaml new file mode 100644 index 00000000..5c809e26 --- /dev/null +++ b/health/omsagent-template-aks-engine.yaml @@ -0,0 +1,590 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: omsagent + namespace: kube-system +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1beta1 +metadata: + name: omsagent-reader +rules: +- apiGroups: [""] + resources: ["pods", "events", "nodes", "namespaces", "services"] + verbs: ["list", "get", "watch"] +- apiGroups: ["extensions"] + resources: ["deployments"] + verbs: ["list"] +- nonResourceURLs: ["/metrics"] + verbs: ["get"] +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1beta1 +metadata: + name: omsagentclusterrolebinding +subjects: + - kind: ServiceAccount + name: omsagent + namespace: kube-system +roleRef: + kind: ClusterRole + name: omsagent-reader + apiGroup: rbac.authorization.k8s.io +--- +kind: ConfigMap +apiVersion: v1 +data: + kube.conf: |- + # Fluentd config file for OMS Docker - cluster components (kubeAPI) + #fluent forward plugin + + type forward + port 25235 + bind 0.0.0.0 + + + #Kubernetes pod inventory + + type kubepodinventory + tag oms.containerinsights.KubePodInventory + run_interval 60s + log_level debug + + + #Kubernetes events + + type kubeevents + tag oms.containerinsights.KubeEvents + run_interval 60s + log_level debug + + + #Kubernetes logs + + type kubelogs + tag oms.api.KubeLogs + run_interval 60s + + + #Kubernetes services + + type kubeservices + tag oms.containerinsights.KubeServices + run_interval 60s + log_level debug + + + #Kubernetes Nodes + + type kubenodeinventory + tag oms.containerinsights.KubeNodeInventory + run_interval 60s + log_level debug + + + #Kubernetes perf + + type kubeperf + tag oms.api.KubePerf + run_interval 60s + log_level debug + + + #Kubernetes health + + type kubehealth + tag oms.api.KubeHealth.ReplicaSet + run_interval 60s + log_level debug + + + #cadvisor perf- Windows nodes + + type wincadvisorperf + tag oms.api.wincadvisorperf + run_interval 60s + log_level debug + + + + type filter_inventory2mdm + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope + log_level info + + + # custom_metrics_mdm filter plugin for perf data from windows nodes + + type filter_cadvisor2mdm + custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes + log_level info + + #health model aggregation filter + + type filter_health_model_builder + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_kubepods*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 5m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_kubeevents*.buffer + buffer_queue_limit 10 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + + + + type out_oms_api + log_level debug + buffer_chunk_limit 10m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_api_kubernetes_logs*.buffer + buffer_queue_limit 10 + flush_interval 20s + retry_limit 10 + retry_wait 30s + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_kubeservices*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/state/out_oms_kubenodes*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + + + + type out_oms + log_level debug + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_containernodeinventory*.buffer + buffer_queue_limit 20 + flush_interval 20s + retry_limit 10 + retry_wait 15s + max_retry_wait 9m + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_kubeperf*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + + + + type out_mdm + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_mdm_*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + retry_mdm_post_wait_minutes 60 + + + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_api_wincadvisorperf*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + + + + type out_mdm + log_level debug + num_threads 5 + buffer_chunk_limit 20m + buffer_type file + buffer_path %STATE_DIR_WS%/out_mdm_cdvisorperf*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 30s + max_retry_wait 9m + retry_mdm_post_wait_minutes 60 + + + + type out_oms_api + log_level debug + buffer_chunk_limit 10m + buffer_type file + buffer_path %STATE_DIR_WS%/out_oms_api_kubehealth*.buffer + buffer_queue_limit 10 + flush_interval 20s + retry_limit 10 + retry_wait 30s + +metadata: + name: omsagent-rs-config + namespace: kube-system +--- +apiVersion: v1 +kind: Secret +metadata: + name: omsagent-secret + namespace: kube-system +type: Opaque +data: + #BASE64 ENCODED (Both WSID & KEY) INSIDE DOUBLE QUOTE ("") + WSID: "VALUE_WSID" + KEY: "VALUE_KEY" +--- +kind: Service +apiVersion: v1 +metadata: + name: replicaset-service + namespace: kube-system +spec: + selector: + rsName: "omsagent-rs" + ports: + - protocol: TCP + port: 25235 + targetPort: in-rs-tcp +--- +kind: StorageClass +apiVersion: storage.k8s.io/v1 +metadata: + name: azurefile +provisioner: kubernetes.io/azure-file +mountOptions: + - dir_mode=0777 + - file_mode=0777 + - uid=1000 + - gid=1000 +parameters: + skuName: Standard_LRS +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: system:azure-cloud-provider +rules: +- apiGroups: [''] + resources: ['secrets'] + verbs: ['get','create'] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: system:azure-cloud-provider +roleRef: + kind: ClusterRole + apiGroup: rbac.authorization.k8s.io + name: system:azure-cloud-provider +subjects: +- kind: ServiceAccount + name: persistent-volume-binder + namespace: kube-system +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: azurefile + namespace: kube-system +spec: + accessModes: + - ReadWriteMany + storageClassName: azurefile + resources: + requests: + storage: 10Mi +--- +apiVersion: extensions/v1beta1 +kind: DaemonSet +metadata: + name: omsagent + namespace: kube-system +spec: + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + dsName: "omsagent-ds" + annotations: + agentVersion: "1.10.0.1" + dockerProviderVersion: "5.0.0-1" + schema-versions: "v1" + spec: + serviceAccountName: omsagent + containers: + - name: omsagent + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview06272019" + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: 150m + memory: 300Mi + requests: + cpu: 75m + memory: 225Mi + env: + # - name: AKS_RESOURCE_ID + # value: "VALUE_AKS_RESOURCE_ID" + # - name: AKS_REGION + # value: "VALUE_AKS_REGION" + # Uncomment below two lines for ACS clusters and set the cluster names manually. Also comment out the above two lines for ACS clusters + - name: ACS_RESOURCE_NAME + value: "VALUE_ACS_RESOURCE_NAME" + - name: DISABLE_KUBE_SYSTEM_LOG_COLLECTION + value: "true" + - name: CONTROLLER_TYPE + value: "DaemonSet" + - name: NODE_IP + valueFrom: + fieldRef: + fieldPath: status.hostIP + securityContext: + privileged: true + ports: + - containerPort: 25225 + protocol: TCP + - containerPort: 25224 + protocol: UDP + volumeMounts: + - mountPath: /hostfs + name: host-root + readOnly: true + - mountPath: /var/run/host + name: docker-sock + - mountPath: /var/log + name: host-log + - mountPath: /var/lib/docker/containers + name: containerlog-path + - mountPath: /etc/kubernetes/host + name: azure-json-path + - mountPath: /etc/omsagent-secret + name: omsagent-secret + - mountPath: /etc/config/settings + name: settings-vol-config + readOnly: true + livenessProbe: + exec: + command: + - /bin/bash + - -c + - /opt/livenessprobe.sh + initialDelaySeconds: 60 + periodSeconds: 60 + nodeSelector: + beta.kubernetes.io/os: linux + # Tolerate a NoSchedule taint on master that ACS Engine sets. + tolerations: + - key: "node-role.kubernetes.io/master" + operator: "Equal" + value: "true" + effect: "NoSchedule" + volumes: + - name: host-root + hostPath: + path: / + - name: docker-sock + hostPath: + path: /var/run + - name: container-hostname + hostPath: + path: /etc/hostname + - name: host-log + hostPath: + path: /var/log + - name: containerlog-path + hostPath: + path: /var/lib/docker/containers + - name: azure-json-path + hostPath: + path: /etc/kubernetes + - name: omsagent-secret + secret: + secretName: omsagent-secret + - name: settings-vol-config + configMap: + name: container-azm-ms-agentconfig + optional: true +--- +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: omsagent-rs + namespace: kube-system +spec: + replicas: 1 + selector: + matchLabels: + rsName: "omsagent-rs" + strategy: + type: RollingUpdate + template: + metadata: + labels: + rsName: "omsagent-rs" + annotations: + agentVersion: "1.10.0.1" + dockerProviderVersion: "5.0.0-1" + schema-versions: "v1" + spec: + serviceAccountName: omsagent + containers: + - name: omsagent + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview06272019" + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: 150m + memory: 500Mi + requests: + cpu: 50m + memory: 175Mi + env: + # - name: AKS_RESOURCE_ID + # value: "VALUE_AKS_RESOURCE_ID" + # - name: AKS_REGION + # value: "VALUE_AKS_REGION" + # Uncomment below two lines for ACS clusters and set the cluster names manually. Also comment out the above two lines for ACS clusters + - name: ACS_RESOURCE_NAME + value: "aks-engine-health" + - name: DISABLE_KUBE_SYSTEM_LOG_COLLECTION + value: "true" + - name: CONTROLLER_TYPE + value: "ReplicaSet" + - name: NODE_IP + valueFrom: + fieldRef: + fieldPath: status.hostIP + securityContext: + privileged: true + ports: + - containerPort: 25225 + protocol: TCP + - containerPort: 25224 + protocol: UDP + - containerPort: 25235 + protocol: TCP + name: in-rs-tcp + volumeMounts: + - mountPath: /var/run/host + name: docker-sock + - mountPath: /var/log + name: host-log + - mountPath: /var/lib/docker/containers + name: containerlog-path + - mountPath: /etc/kubernetes/host + name: azure-json-path + - mountPath: /etc/omsagent-secret + name: omsagent-secret + readOnly: true + - mountPath : /etc/config + name: omsagent-rs-config + - mountPath: /etc/config/settings + name: settings-vol-config + readOnly: true + - mountPath: "/mnt/azure" + name: azurefile-pv + livenessProbe: + exec: + command: + - /bin/bash + - -c + - ps -ef | grep omsagent | grep -v "grep" + initialDelaySeconds: 60 + periodSeconds: 60 + nodeSelector: + beta.kubernetes.io/os: linux + kubernetes.io/role: agent + volumes: + - name: docker-sock + hostPath: + path: /var/run + - name: container-hostname + hostPath: + path: /etc/hostname + - name: host-log + hostPath: + path: /var/log + - name: containerlog-path + hostPath: + path: /var/lib/docker/containers + - name: azure-json-path + hostPath: + path: /etc/kubernetes + - name: omsagent-secret + secret: + secretName: omsagent-secret + - name: omsagent-rs-config + configMap: + name: omsagent-rs-config + - name: settings-vol-config + configMap: + name: container-azm-ms-agentconfig + optional: true + - name: azurefile-pv + persistentVolumeClaim: + claimName: azurefile From 7a80b1e8ef2b8b412de76a0edce328698c974879 Mon Sep 17 00:00:00 2001 From: r-dilip Date: Thu, 27 Jun 2019 13:30:06 -0700 Subject: [PATCH 21/55] Added AKS Engine Onboarding steps --- health/HealthOnboarding.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/health/HealthOnboarding.md b/health/HealthOnboarding.md index 0038bc5b..4216043a 100644 --- a/health/HealthOnboarding.md +++ b/health/HealthOnboarding.md @@ -604,3 +604,13 @@ Import-AzAksCredential -ResourceGroupName -Name + + + +### AKS Engine Onboarding +1. Add Container Insights Solution to your workspace using the instructions [here] (http://aka.ms/coinhelmdoc) +2. Tag your AKS-Engine cluster appropriately using the instructions [here] (http://aka.ms/coin-acs-tag-doc) +3. Set the current k8s context to be your AKS Engine cluster +4. Download the [omsagent-template-aks-engine.yaml] (https://github.com/microsoft/OMS-docker/blob/dilipr/kubeHealth/health/omsagent-template-aks-engine.yaml) file to your local machine +5. Update the Values of VALUE_ACS_RESOURCE_NAME, VALUE_WSID {base 64 encoded workspace id} and VALUE_KEY {base 64 encoded workspace key}. See [here] (https://github.com/Azure/aks-engine/blob/master/examples/addons/container-monitoring/README.md) on instructions to get the Workspace ID and Key of the file downloaded in Step 5 above +6. Run kubectl apply on the file {kubectl apply -f } From a462aefcc7f8a3b458eb8dc76f645c32d569d9ae Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Thu, 27 Jun 2019 13:31:33 -0700 Subject: [PATCH 22/55] Fix Links --- health/HealthOnboarding.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/health/HealthOnboarding.md b/health/HealthOnboarding.md index 4216043a..72cb313c 100644 --- a/health/HealthOnboarding.md +++ b/health/HealthOnboarding.md @@ -608,9 +608,9 @@ Once the above steps are done, it can take upto 20 minutes for the health relate ### AKS Engine Onboarding -1. Add Container Insights Solution to your workspace using the instructions [here] (http://aka.ms/coinhelmdoc) -2. Tag your AKS-Engine cluster appropriately using the instructions [here] (http://aka.ms/coin-acs-tag-doc) +1. Add Container Insights Solution to your workspace using the instructions [here](http://aka.ms/coinhelmdoc) +2. Tag your AKS-Engine cluster appropriately using the instructions [here](http://aka.ms/coin-acs-tag-doc) 3. Set the current k8s context to be your AKS Engine cluster -4. Download the [omsagent-template-aks-engine.yaml] (https://github.com/microsoft/OMS-docker/blob/dilipr/kubeHealth/health/omsagent-template-aks-engine.yaml) file to your local machine -5. Update the Values of VALUE_ACS_RESOURCE_NAME, VALUE_WSID {base 64 encoded workspace id} and VALUE_KEY {base 64 encoded workspace key}. See [here] (https://github.com/Azure/aks-engine/blob/master/examples/addons/container-monitoring/README.md) on instructions to get the Workspace ID and Key of the file downloaded in Step 5 above +4. Download the [omsagent-template-aks-engine.yaml](https://github.com/microsoft/OMS-docker/blob/dilipr/kubeHealth/health/omsagent-template-aks-engine.yaml) file to your local machine +5. Update the Values of VALUE_ACS_RESOURCE_NAME, VALUE_WSID {base 64 encoded workspace id} and VALUE_KEY {base 64 encoded workspace key}. See [here](https://github.com/Azure/aks-engine/blob/master/examples/addons/container-monitoring/README.md) on instructions to get the Workspace ID and Key of the file downloaded in Step 5 above 6. Run kubectl apply on the file {kubectl apply -f } From b553bce02ab6fdf32fac4b525ef451902edc2192 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Thu, 27 Jun 2019 13:32:54 -0700 Subject: [PATCH 23/55] Minor Update --- health/HealthOnboarding.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/health/HealthOnboarding.md b/health/HealthOnboarding.md index 72cb313c..6b2c50f3 100644 --- a/health/HealthOnboarding.md +++ b/health/HealthOnboarding.md @@ -613,4 +613,4 @@ Once the above steps are done, it can take upto 20 minutes for the health relate 3. Set the current k8s context to be your AKS Engine cluster 4. Download the [omsagent-template-aks-engine.yaml](https://github.com/microsoft/OMS-docker/blob/dilipr/kubeHealth/health/omsagent-template-aks-engine.yaml) file to your local machine 5. Update the Values of VALUE_ACS_RESOURCE_NAME, VALUE_WSID {base 64 encoded workspace id} and VALUE_KEY {base 64 encoded workspace key}. See [here](https://github.com/Azure/aks-engine/blob/master/examples/addons/container-monitoring/README.md) on instructions to get the Workspace ID and Key of the file downloaded in Step 5 above -6. Run kubectl apply on the file {kubectl apply -f } +6. Run kubectl apply on the file {kubectl apply -f path_to_file_in_step_4} From 059f23715b61ab7def69513acbaf09eff2c9e8c8 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Mon, 1 Jul 2019 11:05:33 -0700 Subject: [PATCH 24/55] Update HealthOnboarding.md --- health/HealthOnboarding.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/health/HealthOnboarding.md b/health/HealthOnboarding.md index 6b2c50f3..269e4ba7 100644 --- a/health/HealthOnboarding.md +++ b/health/HealthOnboarding.md @@ -7,16 +7,20 @@ The following documentation outlines the steps required to upgrade an existing c * Installs necessary powershell modules * Onboards Container Insights solution to the supplied LA workspace if not already onboarded * Updates the cluster metadata to link the LA workspace ID to the cluster + * Script Pre-reqs: + * kubectl should have been installed and be present in the path + * script should run in an elevated command prompt ## Prerequisites * Cluster that has already been onboarded to Monitoring using a Log Analytics workspace -* kubectl -* Powershell with the following modules installed +* kubectl should be intalled and should be available in the path +* Powershell with the following modules installed (Else the onboarding script will install those for you) * Az.Accounts * Az.Resources * Az.OperationalInsights * Az.Aks +* Run in an elevated powershell window ## Steps 1. Copy and paste the following JSON into a file. From 7b6c550fdb31a0825e808f4255bc87448aa45301 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Mon, 1 Jul 2019 11:29:36 -0700 Subject: [PATCH 25/55] Update HealthOnboarding.md --- health/HealthOnboarding.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/health/HealthOnboarding.md b/health/HealthOnboarding.md index 269e4ba7..e0ea7518 100644 --- a/health/HealthOnboarding.md +++ b/health/HealthOnboarding.md @@ -7,9 +7,9 @@ The following documentation outlines the steps required to upgrade an existing c * Installs necessary powershell modules * Onboards Container Insights solution to the supplied LA workspace if not already onboarded * Updates the cluster metadata to link the LA workspace ID to the cluster - * Script Pre-reqs: - * kubectl should have been installed and be present in the path - * script should run in an elevated command prompt + * __Script Pre-reqs:__ + * kubectl should have been installed and be present in the path + * script should run in an elevated command prompt ## Prerequisites From 2292298df7b21d60631dac7ef910cd6ccb66a381 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Mon, 1 Jul 2019 11:32:01 -0700 Subject: [PATCH 26/55] Create optouttemplate.json --- health/optouttemplate.json | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 health/optouttemplate.json diff --git a/health/optouttemplate.json b/health/optouttemplate.json new file mode 100644 index 00000000..6e75999c --- /dev/null +++ b/health/optouttemplate.json @@ -0,0 +1,36 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "aksResourceId": { + "type": "string", + "metadata": { + "description": "AKS Cluster resource id" + } + }, + "aksResourceLocation": { + "type": "string", + "metadata": { + "description": "Location of the AKS resource e.g. \"East US\"" + } + }, + }, + "resources": [ + { + "name": "[split(parameters('aksResourceId'),'/')[8]]", + "type": "Microsoft.ContainerService/managedClusters", + "location": "[parameters('aksResourceLocation')]", + "apiVersion": "2018-03-31", + "properties": { + "mode": "Incremental", + "id": "[parameters('aksResourceId')]", + "addonProfiles": { + "omsagent": { + "enabled": false, + "config": null + } + } + } + } + ] +} From e4b6189a9bbd0e75c35a1208cce247dd82f275c0 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Mon, 1 Jul 2019 11:37:53 -0700 Subject: [PATCH 27/55] Update optouttemplate.json From bacd58cebd615081f8ef88f35b51dccf183c1ea8 Mon Sep 17 00:00:00 2001 From: r-dilip Date: Mon, 1 Jul 2019 11:40:20 -0700 Subject: [PATCH 28/55] update optouttemplate --- health/optouttemplate.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/health/optouttemplate.json b/health/optouttemplate.json index 6e75999c..dbd34027 100644 --- a/health/optouttemplate.json +++ b/health/optouttemplate.json @@ -33,4 +33,4 @@ } } ] -} +} \ No newline at end of file From ee36ca34ad79959ec6d1282335452c18e2d9119b Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Mon, 1 Jul 2019 11:43:28 -0700 Subject: [PATCH 29/55] Update optouttemplate.json --- health/optouttemplate.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/health/optouttemplate.json b/health/optouttemplate.json index dbd34027..b036aba2 100644 --- a/health/optouttemplate.json +++ b/health/optouttemplate.json @@ -13,7 +13,7 @@ "metadata": { "description": "Location of the AKS resource e.g. \"East US\"" } - }, + } }, "resources": [ { @@ -33,4 +33,4 @@ } } ] -} \ No newline at end of file +} From e0a4fa676a4257c437e41a73df4973093cd5fec4 Mon Sep 17 00:00:00 2001 From: r-dilip Date: Mon, 1 Jul 2019 11:55:26 -0700 Subject: [PATCH 30/55] update optouttemplate --- health/optouttemplate.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/health/optouttemplate.json b/health/optouttemplate.json index dbd34027..c8ea7c2f 100644 --- a/health/optouttemplate.json +++ b/health/optouttemplate.json @@ -13,7 +13,7 @@ "metadata": { "description": "Location of the AKS resource e.g. \"East US\"" } - }, + } }, "resources": [ { From 3d2d456bb61ba49255845c9eb09e8b4b8c904664 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Mon, 1 Jul 2019 12:07:26 -0700 Subject: [PATCH 31/55] Update optouttemplate.json From e5b9ac8ab60dc7c09dc309751519e9119c0fe348 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Mon, 1 Jul 2019 12:13:21 -0700 Subject: [PATCH 32/55] Update optouttemplate.json --- health/optouttemplate.json | 2 ++ 1 file changed, 2 insertions(+) diff --git a/health/optouttemplate.json b/health/optouttemplate.json index b036aba2..fe602dfc 100644 --- a/health/optouttemplate.json +++ b/health/optouttemplate.json @@ -1,3 +1,5 @@ + + { "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", "contentVersion": "1.0.0.0", From d8dc4dabe6efd2bddbd978c3f2e50ad1896541f0 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Mon, 1 Jul 2019 12:13:46 -0700 Subject: [PATCH 33/55] Update optouttemplate.json --- health/optouttemplate.json | 2 -- 1 file changed, 2 deletions(-) diff --git a/health/optouttemplate.json b/health/optouttemplate.json index fe602dfc..b036aba2 100644 --- a/health/optouttemplate.json +++ b/health/optouttemplate.json @@ -1,5 +1,3 @@ - - { "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", "contentVersion": "1.0.0.0", From 34c36cd38ded872bdc8e679d193704c1a05fb5bf Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Mon, 1 Jul 2019 12:14:40 -0700 Subject: [PATCH 34/55] Updatr From 709f9935e5192b01e99fa128d9e8d71102130e1f Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Mon, 1 Jul 2019 12:55:17 -0700 Subject: [PATCH 35/55] Update HealthOnboarding.md --- health/HealthOnboarding.md | 179 +++++++++++++++++++++++++++++-------- 1 file changed, 140 insertions(+), 39 deletions(-) diff --git a/health/HealthOnboarding.md b/health/HealthOnboarding.md index e0ea7518..2c6654dc 100644 --- a/health/HealthOnboarding.md +++ b/health/HealthOnboarding.md @@ -1,18 +1,26 @@ ## Overview The following documentation outlines the steps required to upgrade an existing cluster onboarded to a Log Analytics workspace running the omsagent, to an agent running the workflow that generates health monitor signals into the same workspace. +## Script Prerequisites +* script should run in an elevated command prompt +* kubectl should have been installed and be present in the path + +## What does the script do: * Do a custom off-boarding of the cluster from Monitoring -* Installing the new agent that generates health monitor signals -* Please refer to the powershell [script](https://github.com/Microsoft/OMS-docker/blob/dilipr/onboardHealth/health/HealthAgentOnboarding.ps1) for another way to perform the same steps on your cluster. The script basically does the following: - * Installs necessary powershell modules - * Onboards Container Insights solution to the supplied LA workspace if not already onboarded - * Updates the cluster metadata to link the LA workspace ID to the cluster - * __Script Pre-reqs:__ - * kubectl should have been installed and be present in the path - * script should run in an elevated command prompt - - -## Prerequisites +* Installs necessary powershell modules +* Onboards Container Insights solution to the supplied LA workspace if not already onboarded +* Updates the cluster metadata to link the LA workspace ID to the cluster +* Installs the new agent that generates health monitor signals + +## Script Execution +* Download the script from [here](https://github.com/Microsoft/OMS-docker/blob/dilipr/onboardHealth/health/HealthAgentOnboarding.ps1) +* Run the script: + .\HealthAgentOnboarding.ps1 -aksResourceId -aksResourceLocation * New-AzResourceGroupDeployment -Name opt-out -ResourceGroupName -TemplateFile .\HealthPreviewOnboarding.json -TemplateParameterFile .\HealthPreviewOnboardingParams.json -7. Copy the following content into a yaml file: (You will use this file to do a kubectl apply on the kubernetes cluster) +7. Copy the following content into a yaml file: (You will use this file to do a kubectl apply on the kubernetes cluster). This file is also available [here](https://raw.githubusercontent.com/microsoft/OMS-docker/dilipr/kubeHealth/health/omsagent-template.yaml) ```yaml apiVersion: v1 kind: ServiceAccount @@ -113,7 +121,12 @@ metadata: rules: - apiGroups: [""] resources: ["pods", "events", "nodes", "namespaces", "services"] + verbs: ["list", "get", "watch"] +- apiGroups: ["extensions"] + resources: ["deployments"] verbs: ["list"] +- nonResourceURLs: ["/metrics"] + verbs: ["get"] --- kind: ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1beta1 @@ -133,6 +146,12 @@ apiVersion: v1 data: kube.conf: |- # Fluentd config file for OMS Docker - cluster components (kubeAPI) + #fluent forward plugin + + type forward + port 25235 + bind 0.0.0.0 + #Kubernetes pod inventory @@ -184,7 +203,7 @@ data: #Kubernetes health type kubehealth - tag oms.api.KubeHealth.AgentCollectionTime + tag oms.api.KubeHealth.ReplicaSet run_interval 60s log_level debug @@ -211,6 +230,11 @@ data: log_level info + #health model aggregation filter + + type filter_health_model_builder + + type out_oms log_level debug @@ -397,31 +421,30 @@ spec: labels: dsName: "omsagent-ds" annotations: - agentVersion: "1.8.1.256" - dockerProviderVersion: "3.0.0-4" + agentVersion: "1.10.0.1" + dockerProviderVersion: "5.0.0-0" + schema-versions: "v1" spec: serviceAccountName: omsagent containers: - name: omsagent - image: "microsoft/oms:healthpreview04262019" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview06272019" imagePullPolicy: IfNotPresent resources: limits: cpu: 150m memory: 300Mi requests: - cpu: 50m + cpu: 75m memory: 225Mi env: - name: AKS_RESOURCE_ID - value: "VALUE_AKS_RESOURCE_ID" + value: "VALUE_AKS_RESOURCE_ID_VALUE" - name: AKS_REGION - value: "VALUE_AKS_REGION" - # Uncomment below two lines for ACS clusters and set the cluster names manually. Also comment out the above two lines for ACS clusters - # - name: ACS_RESOURCE_NAME - # value: "aks-engine-health" - - name: DISABLE_KUBE_SYSTEM_LOG_COLLECTION - value: "true" + value: "VALUE_AKS_REGION_VALUE" + #Uncomment below two lines for ACS clusters and set the cluster names manually. Also comment out the above two lines for ACS clusters + #- name: ACS_RESOURCE_NAME + #value: "my_acs_cluster_name" - name: CONTROLLER_TYPE value: "DaemonSet" - name: NODE_IP @@ -449,13 +472,15 @@ spec: name: azure-json-path - mountPath: /etc/omsagent-secret name: omsagent-secret + - mountPath: /etc/config/settings + name: settings-vol-config readOnly: true livenessProbe: exec: command: - /bin/bash - -c - - (ps -ef | grep omsagent | grep -v "grep") && (ps -ef | grep td-agent-bit | grep -v "grep") + - /opt/livenessprobe.sh initialDelaySeconds: 60 periodSeconds: 60 nodeSelector: @@ -488,6 +513,10 @@ spec: - name: omsagent-secret secret: secretName: omsagent-secret + - name: settings-vol-config + configMap: + name: container-azm-ms-agentconfig + optional: true --- apiVersion: extensions/v1beta1 kind: Deployment @@ -506,13 +535,14 @@ spec: labels: rsName: "omsagent-rs" annotations: - agentVersion: "1.8.1.256" - dockerProviderVersion: "3.0.0-4" + agentVersion: "1.10.0.1" + dockerProviderVersion: "5.0.0-0" + schema-versions: "v1" spec: serviceAccountName: omsagent containers: - name: omsagent - image: "microsoft/oms:healthpreview04262019" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview06272019" imagePullPolicy: IfNotPresent resources: limits: @@ -520,17 +550,15 @@ spec: memory: 500Mi requests: cpu: 50m - memory: 100Mi + memory: 175Mi env: - - name: AKS_RESOURCE_ID - value: "VALUE_AKS_RESOURCE_ID" - - name: AKS_REGION - value: "VALUE_AKS_REGION" + #- name: AKS_RESOURCE_ID + # value: "VALUE_AKS_RESOURCE_ID_VALUE" + #- name: AKS_REGION + # value: "VALUE_AKS_RESOURCE_REGION_VALUE" #Uncomment below two lines for ACS clusters and set the cluster names manually. Also comment out the above two lines for ACS clusters - # - name: ACS_RESOURCE_NAME - # value: "aks-engine-health" - - name: DISABLE_KUBE_SYSTEM_LOG_COLLECTION - value: "true" + - name: ACS_RESOURCE_NAME + value: "my_acs_cluster_name" - name: CONTROLLER_TYPE value: "ReplicaSet" - name: NODE_IP @@ -558,6 +586,11 @@ spec: readOnly: true - mountPath : /etc/config name: omsagent-rs-config + - mountPath: /etc/config/settings + name: settings-vol-config + readOnly: true + - mountPath: "/mnt/azure" + name: azurefile-pv livenessProbe: exec: command: @@ -590,7 +623,75 @@ spec: secretName: omsagent-secret - name: omsagent-rs-config configMap: - name: omsagent-rs-config + name: omsagent-rs-config + - name: settings-vol-config + configMap: + name: container-azm-ms-agentconfig + optional: true + - name: azurefile-pv + persistentVolumeClaim: + claimName: azurefile +--- +kind: Service +apiVersion: v1 +metadata: + name: repliceset-service + namespace: kube-system +spec: + selector: + rsName: "omsagent-rs" + ports: + - protocol: TCP + port: 25235 + targetPort: in-rs-tcp + nodePort: 25235 +--- +kind: StorageClass +apiVersion: storage.k8s.io/v1 +metadata: + name: azurefile +provisioner: kubernetes.io/azure-file +mountOptions: + - dir_mode=0777 + - file_mode=0777 + - uid=1000 + - gid=1000 +parameters: + skuName: Standard_LRS +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: system:azure-cloud-provider +rules: +- apiGroups: [''] + resources: ['secrets'] + verbs: ['get','create'] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: system:azure-cloud-provider +roleRef: + kind: ClusterRole + apiGroup: rbac.authorization.k8s.io + name: system:azure-cloud-provider +subjects: +- kind: ServiceAccount + name: persistent-volume-binder + namespace: kube-system +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: azurefile +spec: + accessModes: + - ReadWriteMany + storageClassName: azurefile + resources: + requests: + storage: 10Mi ``` 8. save this file as omsagent.yaml From e98dcd092ccff8bd82373b50c47897eb8932eb43 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Mon, 1 Jul 2019 12:57:59 -0700 Subject: [PATCH 36/55] Update HealthOnboarding.md --- health/HealthOnboarding.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/health/HealthOnboarding.md b/health/HealthOnboarding.md index 2c6654dc..b3fb64ff 100644 --- a/health/HealthOnboarding.md +++ b/health/HealthOnboarding.md @@ -1,18 +1,18 @@ ## Overview The following documentation outlines the steps required to upgrade an existing cluster onboarded to a Log Analytics workspace running the omsagent, to an agent running the workflow that generates health monitor signals into the same workspace. -## Script Prerequisites +#### Script Prerequisites * script should run in an elevated command prompt * kubectl should have been installed and be present in the path -## What does the script do: +#### What does the script do: * Do a custom off-boarding of the cluster from Monitoring * Installs necessary powershell modules * Onboards Container Insights solution to the supplied LA workspace if not already onboarded * Updates the cluster metadata to link the LA workspace ID to the cluster -* Installs the new agent that generates health monitor signals +* Installs the new agent that generates health monitor signals (using kubectl) -## Script Execution +#### Script Execution * Download the script from [here](https://github.com/Microsoft/OMS-docker/blob/dilipr/onboardHealth/health/HealthAgentOnboarding.ps1) * Run the script: .\HealthAgentOnboarding.ps1 -aksResourceId -aksResourceLocation Date: Mon, 1 Jul 2019 12:59:26 -0700 Subject: [PATCH 37/55] Update HealthOnboarding.md --- health/HealthOnboarding.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/health/HealthOnboarding.md b/health/HealthOnboarding.md index b3fb64ff..bcd78632 100644 --- a/health/HealthOnboarding.md +++ b/health/HealthOnboarding.md @@ -1,6 +1,9 @@ ## Overview The following documentation outlines the steps required to upgrade an existing cluster onboarded to a Log Analytics workspace running the omsagent, to an agent running the workflow that generates health monitor signals into the same workspace. +## Script +We have a handy script which can onboards your AKS clusters to a version of the agent that can generate the health model. Read on to find out more + #### Script Prerequisites * script should run in an elevated command prompt * kubectl should have been installed and be present in the path From af603c66aa442e4f1e127ac165bf24f0e9217bc6 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Mon, 1 Jul 2019 13:29:43 -0700 Subject: [PATCH 38/55] Update HealthOnboarding.md --- health/HealthOnboarding.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/health/HealthOnboarding.md b/health/HealthOnboarding.md index bcd78632..459f87e4 100644 --- a/health/HealthOnboarding.md +++ b/health/HealthOnboarding.md @@ -20,6 +20,9 @@ We have a handy script which can onboards your AKS clusters to a version of the * Run the script: .\HealthAgentOnboarding.ps1 -aksResourceId -aksResourceLocation +* There should be a new tab named "Health" in Cluster Insights ## Manual Steps From ad8a5d59afc3c76bd97a01f7d6bed762e224e0d7 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Mon, 1 Jul 2019 13:38:41 -0700 Subject: [PATCH 39/55] Update HealthOnboarding.md --- health/HealthOnboarding.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/health/HealthOnboarding.md b/health/HealthOnboarding.md index 459f87e4..07ffd40f 100644 --- a/health/HealthOnboarding.md +++ b/health/HealthOnboarding.md @@ -16,13 +16,15 @@ We have a handy script which can onboards your AKS clusters to a version of the * Installs the new agent that generates health monitor signals (using kubectl) #### Script Execution -* Download the script from [here](https://github.com/Microsoft/OMS-docker/blob/dilipr/onboardHealth/health/HealthAgentOnboarding.ps1) +* Download the script from [here](https://github.com/Microsoft/OMS-docker/blob/dilipr/kubeHealth/health/HealthAgentOnboarding.ps1) * Run the script: .\HealthAgentOnboarding.ps1 -aksResourceId -aksResourceLocation * There should be a new tab named "Health" in Cluster Insights +* Note: It might take about 15-20 min after the script runs for the data to show up in the Insights Page of the Cluster ## Manual Steps From cf2d2babbe6e8862a343e29b97949dbb1c6f284f Mon Sep 17 00:00:00 2001 From: r-dilip Date: Mon, 1 Jul 2019 13:41:52 -0700 Subject: [PATCH 40/55] Fix issue where cluster and workspace are in different subscriptions --- health/HealthAgentOnboarding.ps1 | 94 ++++++++++++++++++-------------- 1 file changed, 53 insertions(+), 41 deletions(-) diff --git a/health/HealthAgentOnboarding.ps1 b/health/HealthAgentOnboarding.ps1 index 5fce7211..a0735d83 100644 --- a/health/HealthAgentOnboarding.ps1 +++ b/health/HealthAgentOnboarding.ps1 @@ -11,8 +11,6 @@ Azure ResourceId of the log analytics workspace Id .PARAMETER aksResourceLocation Resource location of the AKS cluster resource - .PARAMETER isAksEngine - Whether this is an AKS-Engine Cluster #> param( [Parameter(mandatory = $true)] @@ -20,23 +18,10 @@ param( [Parameter(mandatory = $true)] [string]$aksResourceLocation, [Parameter(mandatory = $true)] - [string]$logAnalyticsWorkspaceResourceId, - [Parameter(mandatory = $true)] - [string]$isAksEngineCluster, - [Parameter(mandatory = $false)] - [string]$acsResourceName - + [string]$logAnalyticsWorkspaceResourceId ) -if ($isAksEngineCluster) { - if ($null -eq $acsResourceName) { - Write-Host("acsResourceName cannot be null if aksEngineCluster is true") -ForegroundColor Red - Stop-Transcript - exit - } -} - # checks the required Powershell modules exist and if not exists, request the user permission to install $azAccountModule = Get-Module -ListAvailable -Name Az.Accounts $azResourcesModule = Get-Module -ListAvailable -Name Az.Resources @@ -229,7 +214,7 @@ else { try { Write-Host("Current Subscription:") $account - Write-Host("Changing to subscription: $workspaceSubscriptionId") + Write-Host("Changing to workspace subscription: $workspaceSubscriptionId") Set-AzContext -SubscriptionId $workspaceSubscriptionId } catch { @@ -240,20 +225,26 @@ else { exit } } + + $WorkspaceInformation = Get-AzOperationalInsightsWorkspace -ResourceGroupName $workspaceResourceGroupName -Name $workspaceName -ErrorAction Stop + $key = (Get-AzOperationalInsightsWorkspaceSharedKeys -ResourceGroupName $workspaceResourceGroupName -Name $workspaceName).PrimarySharedKey + $wsid = $WorkspaceInformation.CustomerId + $base64EncodedKey = [System.Convert]::ToBase64String([System.Text.Encoding]::UTF8.GetBytes($key)) + $base64EncodedWsId = [System.Convert]::ToBase64String([System.Text.Encoding]::UTF8.GetBytes($wsid)) } # validate specified logAnalytics workspace exists and got access permissions -Write-Host("Checking specified logAnalyticsWorkspaceResourceId exists and got access...") +#Write-Host("Checking specified logAnalyticsWorkspaceResourceId exists and got access...") -try { - $WorkspaceInformation = Get-AzOperationalInsightsWorkspace -ResourceGroupName $workspaceResourceGroupName -Name $workspaceName -ErrorAction Stop -} -catch { - Write-Host("") - Write-Host("Could not fetch details for the workspace : '" + $workspaceName + "'. Please make sure that it hasn't been deleted and you have access to it.") -ForegroundColor Red - Stop-Transcript - exit -} +# try { + +# } +# catch { +# Write-Host("") +# Write-Host("Could not fetch details for the workspace : '" + $workspaceName + "'. Please make sure that it hasn't been deleted and you have access to it.") -ForegroundColor Red +# Stop-Transcript +# exit +# } Write-Host("Successfully verified specified logAnalyticsWorkspaceResourceId valid and exists...") -ForegroundColor Green @@ -324,15 +315,44 @@ Write-Host("Successfully added Container Insights Solution to workspace " + $wor try { $aksResourceDetails = $aksResourceId.Split("/") $clusterResourceGroupName = $aksResourceDetails[4].Trim() - $DeploymentName = "ClusterHealthOnboarding-" + ((Get-Date).ToUniversalTime()).ToString('MMdd-HHmm') + $clusterSubscriptionId = $aksResourceDetails[2].Trim() + $clusterName = $aksResourceDetails[8].Trim() $Parameters = @{ } $Parameters.Add("aksResourceId", $aksResourceId) $Parameters.Add("aksResourceLocation", $aksResourceLocation) - $Parameters.Add("workspaceResourceId", $logAnalyticsWorkspaceResourceId) + $Parameters Write-Host " Onboarding cluster to provided LA workspace " - New-AzResourceGroupDeployment -Name $DeploymentName ` + if ($account.Subscription.Id -eq $clusterSubscriptionId) { + Write-Host("Subscription: $clusterSubscriptionId is already selected. Account details: ") + $account + } + else { + try { + Write-Host("Current Subscription:") + $account + Write-Host("Changing to subscription: $clusterSubscriptionId") + Set-AzContext -SubscriptionId $clusterSubscriptionId + } + catch { + Write-Host("") + Write-Host("Could not select subscription with ID : " + $workspaceSubscriptionId + ". Please make sure the ID you entered is correct and you have access to the cluster" ) -ForegroundColor Red + Write-Host("") + Stop-Transcript + exit + } + } + + Write-Host("Disabling Monitoring using template deployment") + + $DeploymentName = "OptOutMonitoring-" + ((Get-Date).ToUniversalTime()).ToString('MMdd-HHmm') + New-AzResourceGroupDeployment -Name $DeploymentName -ResourceGroupName $clusterResourceGroupName -TemplateUri https://raw.githubusercontent.com/microsoft/OMS-docker/dilipr/kubeHealth/health/optouttemplate.json -TemplateParameterObject $Parameters -ErrorAction Stop + + Write-Host("Enabling Custom Monitoring using template deployment") + $DeploymentName = "ClusterHealthOnboarding-" + ((Get-Date).ToUniversalTime()).ToString('MMdd-HHmm') + $Parameters.Add("workspaceResourceId", $logAnalyticsWorkspaceResourceId) + New-AzResourceGroupDeployment -Name $DeploymentName ` -ResourceGroupName $clusterResourceGroupName ` -TemplateUri https://raw.githubusercontent.com/Microsoft/OMS-docker/dilipr/onboardHealth/health/customOnboarding.json ` -TemplateParameterObject $Parameters -ErrorAction Stop` @@ -345,6 +365,7 @@ try { } catch { Write-Host ("Template deployment failed with an error: '" + $Error[0] + "' ") -ForegroundColor Red + exit #Write-Host("Please contact us by emailing askcoin@microsoft.com for help") -ForegroundColor Red } @@ -375,17 +396,8 @@ try { Import-AzAksCredential -Id $aksResourceId -Force - $key = (Get-AzOperationalInsightsWorkspaceSharedKeys -ResourceGroupName $workspaceResourceGroupName -Name $workspaceName).PrimarySharedKey - $wsid = $WorkspaceInformation.CustomerId - $base64EncodedKey = [System.Convert]::ToBase64String([System.Text.Encoding]::UTF8.GetBytes($key)) - $base64EncodedWsId = [System.Convert]::ToBase64String([System.Text.Encoding]::UTF8.GetBytes($wsid)) - if ($isAksEngineCluster -eq $true) { - Invoke-WebRequest https://raw.githubusercontent.com/Microsoft/OMS-docker/dilipr/onboardHealth/health/omsagent-template-aks-engine.yaml -OutFile $desktopPath\omsagent-template.yaml - } - else { - Invoke-WebRequest https://raw.githubusercontent.com/Microsoft/OMS-docker/dilipr/kubeHealth/health/omsagent-template.yaml -OutFile $desktopPath\omsagent-template.yaml - } - + Invoke-WebRequest https://raw.githubusercontent.com/Microsoft/OMS-docker/dilipr/kubeHealth/health/omsagent-template.yaml -OutFile $desktopPath\omsagent-template.yaml + (Get-Content -Path $desktopPath\omsagent-template.yaml -Raw) -replace 'VALUE_AKS_RESOURCE_ID', $aksResourceId -replace 'VALUE_AKS_REGION', $aksRegion -replace 'VALUE_WSID', $base64EncodedWsId -replace 'VALUE_KEY', $base64EncodedKey -replace 'VALUE_ACS_RESOURCE_NAME', $acsResourceName | Set-Content $desktopPath\deployments\omsagent-$clusterName.yaml kubectl delete -f $desktopPath\deployments\omsagent-$clusterName.yaml sleep 10 From 8c3939f510ec96072a6c8b365c887986ddcdc47f Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Mon, 1 Jul 2019 13:46:25 -0700 Subject: [PATCH 41/55] Update HealthOnboarding.md --- health/HealthOnboarding.md | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/health/HealthOnboarding.md b/health/HealthOnboarding.md index 07ffd40f..c43084f1 100644 --- a/health/HealthOnboarding.md +++ b/health/HealthOnboarding.md @@ -2,7 +2,7 @@ The following documentation outlines the steps required to upgrade an existing cluster onboarded to a Log Analytics workspace running the omsagent, to an agent running the workflow that generates health monitor signals into the same workspace. ## Script -We have a handy script which can onboards your AKS clusters to a version of the agent that can generate the health model. Read on to find out more +We have a handy [script](https://github.com/Microsoft/OMS-docker/blob/dilipr/kubeHealth/health/HealthAgentOnboarding.ps1) which can onboard your AKS clusters to a version of the agent that can generate the health model. Read on to find out more #### Script Prerequisites * script should run in an elevated command prompt @@ -26,7 +26,17 @@ We have a handy script which can onboards your AKS clusters to a version of the * There should be a new tab named "Health" in Cluster Insights * Note: It might take about 15-20 min after the script runs for the data to show up in the Insights Page of the Cluster -## Manual Steps + +### AKS Engine Onboarding +1. Add Container Insights Solution to your workspace using the instructions [here](http://aka.ms/coinhelmdoc) +2. Tag your AKS-Engine cluster appropriately using the instructions [here](http://aka.ms/coin-acs-tag-doc) +3. Set the current k8s context to be your AKS Engine cluster (the kube-config should refer to your AKS-Engine cluster) +4. Download the [omsagent-template-aks-engine.yaml](https://github.com/microsoft/OMS-docker/blob/dilipr/kubeHealth/health/omsagent-template-aks-engine.yaml) file to your local machine +5. Update the Values of VALUE_ACS_RESOURCE_NAME, VALUE_WSID {base 64 encoded workspace id} and VALUE_KEY {base 64 encoded workspace key}. See [here](https://github.com/Azure/aks-engine/blob/master/examples/addons/container-monitoring/README.md) on instructions to get the Workspace ID and Key of the file downloaded in Step 5 above +6. Run kubectl apply on the file {kubectl apply -f path_to_file_in_step_4} + + +## Manual Steps (AKS cluster) #### Prerequisites * Cluster that has already been onboarded to Monitoring using a Log Analytics workspace @@ -718,12 +728,3 @@ Import-AzAksCredential -ResourceGroupName -Name - - -### AKS Engine Onboarding -1. Add Container Insights Solution to your workspace using the instructions [here](http://aka.ms/coinhelmdoc) -2. Tag your AKS-Engine cluster appropriately using the instructions [here](http://aka.ms/coin-acs-tag-doc) -3. Set the current k8s context to be your AKS Engine cluster (the kube-config should refer to your AKS-Engine cluster) -4. Download the [omsagent-template-aks-engine.yaml](https://github.com/microsoft/OMS-docker/blob/dilipr/kubeHealth/health/omsagent-template-aks-engine.yaml) file to your local machine -5. Update the Values of VALUE_ACS_RESOURCE_NAME, VALUE_WSID {base 64 encoded workspace id} and VALUE_KEY {base 64 encoded workspace key}. See [here](https://github.com/Azure/aks-engine/blob/master/examples/addons/container-monitoring/README.md) on instructions to get the Workspace ID and Key of the file downloaded in Step 5 above -6. Run kubectl apply on the file {kubectl apply -f path_to_file_in_step_4} From 71bcd4c54d621baf20c3660a36c7b9b5b64a424d Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Mon, 1 Jul 2019 13:47:16 -0700 Subject: [PATCH 42/55] Update HealthOnboarding.md --- health/HealthOnboarding.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/health/HealthOnboarding.md b/health/HealthOnboarding.md index c43084f1..03ce006c 100644 --- a/health/HealthOnboarding.md +++ b/health/HealthOnboarding.md @@ -1,7 +1,7 @@ ## Overview The following documentation outlines the steps required to upgrade an existing cluster onboarded to a Log Analytics workspace running the omsagent, to an agent running the workflow that generates health monitor signals into the same workspace. -## Script +### Onboarding using a script (AKS Engine) We have a handy [script](https://github.com/Microsoft/OMS-docker/blob/dilipr/kubeHealth/health/HealthAgentOnboarding.ps1) which can onboard your AKS clusters to a version of the agent that can generate the health model. Read on to find out more #### Script Prerequisites From 78cdbdad605a3aaf95f1a6a3ba5f053ef35bb5b0 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Mon, 1 Jul 2019 14:36:04 -0700 Subject: [PATCH 43/55] Update HealthOnboarding.md --- health/HealthOnboarding.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/health/HealthOnboarding.md b/health/HealthOnboarding.md index 03ce006c..9b5de779 100644 --- a/health/HealthOnboarding.md +++ b/health/HealthOnboarding.md @@ -18,8 +18,12 @@ We have a handy [script](https://github.com/Microsoft/OMS-docker/blob/dilipr/kub #### Script Execution * Download the script from [here](https://github.com/Microsoft/OMS-docker/blob/dilipr/kubeHealth/health/HealthAgentOnboarding.ps1) * Run the script: - .\HealthAgentOnboarding.ps1 -aksResourceId -aksResourceLocation -aksResourceLocation + -logAnalyticsWorkspaceResourceId (e.g./subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourceGroups/dilipr-health-preview/providers/Microsoft.OperationalInsights/workspaces/dilipr-health-preview) + * Please make sure the right location of the AKS cluster is passed in to the script (without spaces e.g. eastus, southcentralus) + +#### Notes +* After running the script, if there is more than one version of the omsagent DaemonSet running on a node (you can figure this out by running __kubecetl get pods -n kube-system -o wide__), [disable monitoring](https://docs.microsoft.com/en-us/azure/azure-monitor/insights/container-insights-optout) and re-run the onboarding script #### Viewing the health model * Navigate to From 95f8caaee006f1c97edf82ae398eac2bfe57ec6a Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Mon, 1 Jul 2019 14:41:18 -0700 Subject: [PATCH 44/55] Update HealthOnboarding.md --- health/HealthOnboarding.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/health/HealthOnboarding.md b/health/HealthOnboarding.md index 9b5de779..e48eaf9e 100644 --- a/health/HealthOnboarding.md +++ b/health/HealthOnboarding.md @@ -1,7 +1,7 @@ ## Overview The following documentation outlines the steps required to upgrade an existing cluster onboarded to a Log Analytics workspace running the omsagent, to an agent running the workflow that generates health monitor signals into the same workspace. -### Onboarding using a script (AKS Engine) +### Onboarding using a script (AKS) We have a handy [script](https://github.com/Microsoft/OMS-docker/blob/dilipr/kubeHealth/health/HealthAgentOnboarding.ps1) which can onboard your AKS clusters to a version of the agent that can generate the health model. Read on to find out more #### Script Prerequisites From 4d867fea8f8a7b8fe89a728a9376220f27413b85 Mon Sep 17 00:00:00 2001 From: r-dilip Date: Wed, 3 Jul 2019 10:36:30 -0700 Subject: [PATCH 45/55] A few changes based on Feedback 1. exit if cluster is already onboarded 2. exit if cluster is already opted out of monitoring, but omsagent still runs --- health/HealthAgentOnboarding.ps1 | 150 +++++++++++++++++-------------- 1 file changed, 85 insertions(+), 65 deletions(-) diff --git a/health/HealthAgentOnboarding.ps1 b/health/HealthAgentOnboarding.ps1 index a0735d83..20af19ea 100644 --- a/health/HealthAgentOnboarding.ps1 +++ b/health/HealthAgentOnboarding.ps1 @@ -22,6 +22,8 @@ param( ) +$OptOutLink = "https://docs.microsoft.com/en-us/azure/azure-monitor/insights/container-insights-optout" + # checks the required Powershell modules exist and if not exists, request the user permission to install $azAccountModule = Get-Module -ListAvailable -Name Az.Accounts $azResourcesModule = Get-Module -ListAvailable -Name Az.Resources @@ -169,13 +171,16 @@ if ($workspaceResourceDetails.Length -ne 9) { $workspaceSubscriptionId = $workspaceResourceDetails[2] $workspaceSubscriptionId = $workspaceSubscriptionId.Trim() - $workspaceResourceGroupName = $workspaceResourceDetails[4] $workspaceResourceGroupName = $workspaceResourceGroupName.Trim() - $workspaceName = $workspaceResourceDetails[8] $workspaceResourceGroupName = $workspaceResourceGroupName.Trim() +$aksResourceDetails = $aksResourceId.Split("/") +$clusterResourceGroupName = $aksResourceDetails[4].Trim() +$clusterSubscriptionId = $aksResourceDetails[2].Trim() +$clusterName = $aksResourceDetails[8].Trim() + Write-Host("LogAnalytics Workspace SubscriptionId : '" + $workspaceSubscriptionId + "' ") -ForegroundColor Green try { @@ -195,59 +200,97 @@ catch { if ($null -eq $account.Account) { try { Write-Host("Please login...") - Connect-AzAccount -subscriptionid $workspaceSubscriptionId + Connect-AzAccount -subscriptionid $clusterSubscriptionId } catch { Write-Host("") - Write-Host("Could not select subscription with ID : " + $workspaceSubscriptionId + ". Please make sure the ID you entered is correct and you have access to the cluster" ) -ForegroundColor Red + Write-Host("Could not select subscription with ID : " + $clusterSubscriptionId + ". Please make sure the ID you entered is correct and you have access to the cluster" ) -ForegroundColor Red Write-Host("") Stop-Transcript exit } } + +Write-Host("Checking if cluster is onboarded to Container Monitoring") +if ($account.Subscription.Id -eq $clusterSubscriptionId) { + Write-Host("Subscription: $clusterSubscriptionId is already selected. Account details: ") + $account +} else { - if ($account.Subscription.Id -eq $workspaceSubscriptionId) { - Write-Host("Subscription: $SubscriptionId is already selected. Account details: ") + try { + Write-Host("Current Subscription:") $account + Write-Host("Changing to workspace subscription: $clusterSubscriptionId") + Set-AzContext -SubscriptionId $clusterSubscriptionId + } - else { - try { - Write-Host("Current Subscription:") - $account - Write-Host("Changing to workspace subscription: $workspaceSubscriptionId") - Set-AzContext -SubscriptionId $workspaceSubscriptionId - } - catch { - Write-Host("") - Write-Host("Could not select subscription with ID : " + $workspaceSubscriptionId + ". Please make sure the ID you entered is correct and you have access to the cluster" ) -ForegroundColor Red - Write-Host("") - Stop-Transcript - exit - } + catch { + Write-Host("") + Write-Host("Could not select subscription with ID : " + $workspaceSubscriptionId + ". Please make sure the ID you entered is correct and you have access to the cluster" ) -ForegroundColor Red + Write-Host("") + Stop-Transcript + exit + } +} + +try { + $resources = Get-AzResource -ResourceGroupName $clusterResourceGroupName -Name $clusterName -ResourceType "Microsoft.ContainerService/managedClusters" -ExpandProperties -ErrorAction Stop -WarningAction Stop + $clusterResource = $resources[0] + + $props = ($clusterResource.Properties | ConvertTo-Json).toLower() | ConvertFrom-Json + + if ($true -eq $props.addonprofiles.omsagent.enabled -and $null -ne $props.addonprofiles.omsagent.config) { + Write-Host("Your cluster is already onboarded to Azure monitor for containers. Please refer to the following documentation to opt-out and re-run this script again:") -ForegroundColor Red; + Write-Host("") + Write-Host($OptOutLink) -ForegroundColor Red + Write-Host("") + throw } - $WorkspaceInformation = Get-AzOperationalInsightsWorkspace -ResourceGroupName $workspaceResourceGroupName -Name $workspaceName -ErrorAction Stop - $key = (Get-AzOperationalInsightsWorkspaceSharedKeys -ResourceGroupName $workspaceResourceGroupName -Name $workspaceName).PrimarySharedKey - $wsid = $WorkspaceInformation.CustomerId - $base64EncodedKey = [System.Convert]::ToBase64String([System.Text.Encoding]::UTF8.GetBytes($key)) - $base64EncodedWsId = [System.Convert]::ToBase64String([System.Text.Encoding]::UTF8.GetBytes($wsid)) + Write-Host("Setting context to the current cluster") + Import-AzAksCredential -Id $aksResourceId -Force + $omsagentCount = kubectl get pods -n kube-system | Select-String omsagent + if ($null -eq $omsagentCount) { + Write-Host ("OmsAgent is not running. Proceeding to do custom onboarding for Health Agent") + } + else { + Write-Host ("Cluster is not enabled for Monitoring. But detected omsagent pods. Please wait for 30 min to ensure that omsagent containers are completely stopped and re-run this script") -ForegroundColor Red + Stop-Transcript + exit + } +} +catch { + Write-Host("Error when checking if cluster is already onboarded") + exit } -# validate specified logAnalytics workspace exists and got access permissions -#Write-Host("Checking specified logAnalyticsWorkspaceResourceId exists and got access...") -# try { - -# } -# catch { -# Write-Host("") -# Write-Host("Could not fetch details for the workspace : '" + $workspaceName + "'. Please make sure that it hasn't been deleted and you have access to it.") -ForegroundColor Red -# Stop-Transcript -# exit -# } +if ($account.Subscription.Id -eq $workspaceSubscriptionId) { + Write-Host("Subscription: $workspaceSubscriptionId is already selected. Account details: ") + $account +} +else { + try { + Write-Host("Current Subscription:") + $account + Write-Host("Changing to workspace subscription: $workspaceSubscriptionId") + Set-AzContext -SubscriptionId $workspaceSubscriptionId + } + catch { + Write-Host("") + Write-Host("Could not select subscription with ID : " + $workspaceSubscriptionId + ". Please make sure the ID you entered is correct and you have access to the cluster" ) -ForegroundColor Red + Write-Host("") + Stop-Transcript + exit + } +} +$WorkspaceInformation = Get-AzOperationalInsightsWorkspace -ResourceGroupName $workspaceResourceGroupName -Name $workspaceName -ErrorAction Stop +$key = (Get-AzOperationalInsightsWorkspaceSharedKeys -ResourceGroupName $workspaceResourceGroupName -Name $workspaceName).PrimarySharedKey +$wsid = $WorkspaceInformation.CustomerId +$base64EncodedKey = [System.Convert]::ToBase64String([System.Text.Encoding]::UTF8.GetBytes($key)) +$base64EncodedWsId = [System.Convert]::ToBase64String([System.Text.Encoding]::UTF8.GetBytes($wsid)) Write-Host("Successfully verified specified logAnalyticsWorkspaceResourceId valid and exists...") -ForegroundColor Green - $WorkspaceLocation = $WorkspaceInformation.Location if ($null -eq $WorkspaceLocation) { @@ -313,13 +356,11 @@ if ($false -eq $isSolutionOnboarded) { Write-Host("Successfully added Container Insights Solution to workspace " + $workspaceName) -ForegroundColor Green try { - $aksResourceDetails = $aksResourceId.Split("/") - $clusterResourceGroupName = $aksResourceDetails[4].Trim() - $clusterSubscriptionId = $aksResourceDetails[2].Trim() - $clusterName = $aksResourceDetails[8].Trim() $Parameters = @{ } $Parameters.Add("aksResourceId", $aksResourceId) $Parameters.Add("aksResourceLocation", $aksResourceLocation) + $Parameters.Add("workspaceResourceId", $logAnalyticsWorkspaceResourceId) + $DeploymentName = "ClusterHealthOnboarding-" + ((Get-Date).ToUniversalTime()).ToString('MMdd-HHmm') $Parameters Write-Host " Onboarding cluster to provided LA workspace " @@ -344,14 +385,7 @@ try { } } - Write-Host("Disabling Monitoring using template deployment") - - $DeploymentName = "OptOutMonitoring-" + ((Get-Date).ToUniversalTime()).ToString('MMdd-HHmm') - New-AzResourceGroupDeployment -Name $DeploymentName -ResourceGroupName $clusterResourceGroupName -TemplateUri https://raw.githubusercontent.com/microsoft/OMS-docker/dilipr/kubeHealth/health/optouttemplate.json -TemplateParameterObject $Parameters -ErrorAction Stop - Write-Host("Enabling Custom Monitoring using template deployment") - $DeploymentName = "ClusterHealthOnboarding-" + ((Get-Date).ToUniversalTime()).ToString('MMdd-HHmm') - $Parameters.Add("workspaceResourceId", $logAnalyticsWorkspaceResourceId) New-AzResourceGroupDeployment -Name $DeploymentName ` -ResourceGroupName $clusterResourceGroupName ` -TemplateUri https://raw.githubusercontent.com/Microsoft/OMS-docker/dilipr/onboardHealth/health/customOnboarding.json ` @@ -369,9 +403,7 @@ catch { #Write-Host("Please contact us by emailing askcoin@microsoft.com for help") -ForegroundColor Red } - $desktopPath = [System.Environment]::GetFolderPath([System.Environment+SpecialFolder]::Desktop) - if (-not (test-path $desktopPath\deployments) ) { Write-Host "$($desktopPath)\deployments doesn't exist, creating it" mkdir $desktopPath\deployments | out-null @@ -379,34 +411,22 @@ if (-not (test-path $desktopPath\deployments) ) { else { Write-Host "$($desktopPath)\deployments exists, no need to create it" } - - try { $aksResourceDetails = $aksResourceId.Split("/") - - if ($aksResourceDetails.Length -ne 9) { Write-Host("aksResourceDetails should be valid Azure Resource Id format") -ForegroundColor Red exit } - $clusterName = $aksResourceDetails[8].Trim() $clusterResourceGroupName = $aksResourceDetails[4].Trim() - Import-AzAksCredential -Id $aksResourceId -Force - Invoke-WebRequest https://raw.githubusercontent.com/Microsoft/OMS-docker/dilipr/kubeHealth/health/omsagent-template.yaml -OutFile $desktopPath\omsagent-template.yaml - (Get-Content -Path $desktopPath\omsagent-template.yaml -Raw) -replace 'VALUE_AKS_RESOURCE_ID', $aksResourceId -replace 'VALUE_AKS_REGION', $aksRegion -replace 'VALUE_WSID', $base64EncodedWsId -replace 'VALUE_KEY', $base64EncodedKey -replace 'VALUE_ACS_RESOURCE_NAME', $acsResourceName | Set-Content $desktopPath\deployments\omsagent-$clusterName.yaml - kubectl delete -f $desktopPath\deployments\omsagent-$clusterName.yaml - sleep 10 + (Get-Content -Path $desktopPath\omsagent-template.yaml -Raw) -replace 'VALUE_AKS_RESOURCE_ID', $aksResourceId -replace 'VALUE_AKS_REGION', $aksResourceLocation -replace 'VALUE_WSID', $base64EncodedWsId -replace 'VALUE_KEY', $base64EncodedKey -replace 'VALUE_ACS_RESOURCE_NAME', $acsResourceName | Set-Content $desktopPath\deployments\omsagent-$clusterName.yaml kubectl apply -f $desktopPath\deployments\omsagent-$clusterName.yaml - Write-Host "Upgraded omsagent" + Write-Host "Successfully onboarded to health model omsagent" -ForegroundColor Green } catch { Write-Host ("Agent deployment failed with an error: '" + $Error[0] + "' ") -ForegroundColor Red -} - - - +} \ No newline at end of file From c0fec9280beb50177e0f4cf48353160e422eb400 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Wed, 3 Jul 2019 15:07:55 -0700 Subject: [PATCH 46/55] Update HealthOnboarding.md --- health/HealthOnboarding.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/health/HealthOnboarding.md b/health/HealthOnboarding.md index e48eaf9e..0d3bab7c 100644 --- a/health/HealthOnboarding.md +++ b/health/HealthOnboarding.md @@ -9,7 +9,6 @@ We have a handy [script](https://github.com/Microsoft/OMS-docker/blob/dilipr/kub * kubectl should have been installed and be present in the path #### What does the script do: -* Do a custom off-boarding of the cluster from Monitoring * Installs necessary powershell modules * Onboards Container Insights solution to the supplied LA workspace if not already onboarded * Updates the cluster metadata to link the LA workspace ID to the cluster @@ -22,9 +21,6 @@ We have a handy [script](https://github.com/Microsoft/OMS-docker/blob/dilipr/kub -logAnalyticsWorkspaceResourceId (e.g./subscriptions/72c8e8ca-dc16-47dc-b65c-6b5875eb600a/resourceGroups/dilipr-health-preview/providers/Microsoft.OperationalInsights/workspaces/dilipr-health-preview) * Please make sure the right location of the AKS cluster is passed in to the script (without spaces e.g. eastus, southcentralus) -#### Notes -* After running the script, if there is more than one version of the omsagent DaemonSet running on a node (you can figure this out by running __kubecetl get pods -n kube-system -o wide__), [disable monitoring](https://docs.microsoft.com/en-us/azure/azure-monitor/insights/container-insights-optout) and re-run the onboarding script - #### Viewing the health model * Navigate to * There should be a new tab named "Health" in Cluster Insights @@ -32,6 +28,8 @@ We have a handy [script](https://github.com/Microsoft/OMS-docker/blob/dilipr/kub ### AKS Engine Onboarding +Before proceeding with the onboarding steps, opt out of monitoring using the steps outlined [here] + 1. Add Container Insights Solution to your workspace using the instructions [here](http://aka.ms/coinhelmdoc) 2. Tag your AKS-Engine cluster appropriately using the instructions [here](http://aka.ms/coin-acs-tag-doc) 3. Set the current k8s context to be your AKS Engine cluster (the kube-config should refer to your AKS-Engine cluster) From e584bf6059671f04fe4b1d537d03a46a7a4bb874 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Wed, 3 Jul 2019 17:06:11 -0700 Subject: [PATCH 47/55] Update HealthOnboarding.md --- health/HealthOnboarding.md | 699 +------------------------------------ 1 file changed, 4 insertions(+), 695 deletions(-) diff --git a/health/HealthOnboarding.md b/health/HealthOnboarding.md index 0d3bab7c..9cf6531f 100644 --- a/health/HealthOnboarding.md +++ b/health/HealthOnboarding.md @@ -22,7 +22,7 @@ We have a handy [script](https://github.com/Microsoft/OMS-docker/blob/dilipr/kub * Please make sure the right location of the AKS cluster is passed in to the script (without spaces e.g. eastus, southcentralus) #### Viewing the health model -* Navigate to +* Navigate to * There should be a new tab named "Health" in Cluster Insights * Note: It might take about 15-20 min after the script runs for the data to show up in the Insights Page of the Cluster @@ -34,699 +34,8 @@ Before proceeding with the onboarding steps, opt out of monitoring using the ste 2. Tag your AKS-Engine cluster appropriately using the instructions [here](http://aka.ms/coin-acs-tag-doc) 3. Set the current k8s context to be your AKS Engine cluster (the kube-config should refer to your AKS-Engine cluster) 4. Download the [omsagent-template-aks-engine.yaml](https://github.com/microsoft/OMS-docker/blob/dilipr/kubeHealth/health/omsagent-template-aks-engine.yaml) file to your local machine -5. Update the Values of VALUE_ACS_RESOURCE_NAME, VALUE_WSID {base 64 encoded workspace id} and VALUE_KEY {base 64 encoded workspace key}. See [here](https://github.com/Azure/aks-engine/blob/master/examples/addons/container-monitoring/README.md) on instructions to get the Workspace ID and Key of the file downloaded in Step 5 above -6. Run kubectl apply on the file {kubectl apply -f path_to_file_in_step_4} +5. Update the Values of VALUE_ACS_RESOURCE_NAME, VALUE_WSID {base 64 encoded workspace id} and VALUE_KEY {base 64 encoded workspace key}. See [here](https://github.com/Azure/aks-engine/blob/master/examples/addons/container-monitoring/README.md) on instructions to get the Workspace ID and Key of the file downloaded in Step 4 above +6. Run kubectl delete on the file {kubectl delete -f path_to_file_in_step_4} +7. Run kubectl apply on the file {kubectl apply -f path_to_file_in_step_4} -## Manual Steps (AKS cluster) - -#### Prerequisites -* Cluster that has already been onboarded to Monitoring using a Log Analytics workspace -* kubectl should be intalled and should be available in the path -* Powershell with the following modules installed (Else the onboarding script will install those for you) - * Az.Accounts - * Az.Resources - * Az.OperationalInsights - * Az.Aks -* Run in an elevated powershell window - -#### Steps -1. Copy and paste the following JSON into a file. - -```json -{ - "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", - "contentVersion": "1.0.0.0", - "parameters": { - "aksResourceId": { - "type": "string", - "metadata": { - "description": "AKS Cluster Resource ID" - } - }, - "aksResourceLocation": { - "type": "string", - "metadata": { - "description": "Location of the AKS resource e.g. \"East US\"" - } - } -}, -"resources": [ - { - "name": "[split(parameters('aksResourceId'),'/')[8]]", - "type": "Microsoft.ContainerService/managedClusters", - "location": "[parameters('aksResourceLocation')]", - "apiVersion": "2018-03-31", - "properties": { - "mode": "Incremental", - "id": "[parameters('aksResourceId')]", - "addonProfiles": { - "omsagent": { - "enabled": false, - "config": { - "loganalyticsworkspaceresourceid": "[parameters('workspaceResourceId')]" - } - } - } - } - } - ] -} -``` - -2. Save this file as HealthPreviewOnboarding.json in your local folder - -3. Copy and paste the following JSON into a file - -```json -{ - "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#", - "contentVersion": "1.0.0.0", - "parameters": { - "aksResourceId": { - "value": "VALUE_AKS_RESOURCE_ID" - }, - "aksResourceLocation": { - "value": "eastus" - }, - "workspaceResourceId": { - "value": "VALUE_WORKSPACE_RESOURCE_ID" - } - } -} -``` - -4. Save this file as HealthPreviewOnboardingParams.json in your local folder - -5. Replace the contents of the VALUE_AKS_RESOURCE_ID and VALUE_WORKSPACE_RESOURCE_ID with the correct values in the HealthPreviewOnboardingParams file -The VALUE_AKS_RESOURCE_ID (resource id of the cluster) can be found in the Properties section of the AKS cluster. VALUE_WORKSPACE_RESOURCE_ID (get the value of this from the portal when the cluster is onboarded) is of the format /subscriptions//resourceGroups//providers/Microsoft.OperationalInsights/workspaces/ -- replace the subscriptionId, resourceGroupName and workspaceName values with the right ones. - -6. Run the following commands from a powershell window -* Connect-AzAccount -* Select-AzSubscription -SubscriptionName -* New-AzResourceGroupDeployment -Name opt-out -ResourceGroupName -TemplateFile .\HealthPreviewOnboarding.json -TemplateParameterFile .\HealthPreviewOnboardingParams.json - -7. Copy the following content into a yaml file: (You will use this file to do a kubectl apply on the kubernetes cluster). This file is also available [here](https://raw.githubusercontent.com/microsoft/OMS-docker/dilipr/kubeHealth/health/omsagent-template.yaml) -```yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: omsagent - namespace: kube-system ---- -kind: ClusterRole -apiVersion: rbac.authorization.k8s.io/v1beta1 -metadata: - name: omsagent-reader -rules: -- apiGroups: [""] - resources: ["pods", "events", "nodes", "namespaces", "services"] - verbs: ["list", "get", "watch"] -- apiGroups: ["extensions"] - resources: ["deployments"] - verbs: ["list"] -- nonResourceURLs: ["/metrics"] - verbs: ["get"] ---- -kind: ClusterRoleBinding -apiVersion: rbac.authorization.k8s.io/v1beta1 -metadata: - name: omsagentclusterrolebinding -subjects: - - kind: ServiceAccount - name: omsagent - namespace: kube-system -roleRef: - kind: ClusterRole - name: omsagent-reader - apiGroup: rbac.authorization.k8s.io ---- -kind: ConfigMap -apiVersion: v1 -data: - kube.conf: |- - # Fluentd config file for OMS Docker - cluster components (kubeAPI) - #fluent forward plugin - - type forward - port 25235 - bind 0.0.0.0 - - - #Kubernetes pod inventory - - type kubepodinventory - tag oms.containerinsights.KubePodInventory - run_interval 60s - log_level debug - - - #Kubernetes events - - type kubeevents - tag oms.containerinsights.KubeEvents - run_interval 60s - log_level debug - - - #Kubernetes logs - - type kubelogs - tag oms.api.KubeLogs - run_interval 60s - - - #Kubernetes services - - type kubeservices - tag oms.containerinsights.KubeServices - run_interval 60s - log_level debug - - - #Kubernetes Nodes - - type kubenodeinventory - tag oms.containerinsights.KubeNodeInventory - run_interval 60s - log_level debug - - - #Kubernetes perf - - type kubeperf - tag oms.api.KubePerf - run_interval 60s - log_level debug - - - #Kubernetes health - - type kubehealth - tag oms.api.KubeHealth.ReplicaSet - run_interval 60s - log_level debug - - - #cadvisor perf- Windows nodes - - type wincadvisorperf - tag oms.api.wincadvisorperf - run_interval 60s - log_level debug - - - - type filter_inventory2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope - log_level info - - - # custom_metrics_mdm filter plugin for perf data from windows nodes - - type filter_cadvisor2mdm - custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westEurope - metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes - log_level info - - - #health model aggregation filter - - type filter_health_model_builder - - - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 20m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_kubepods*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 30s - max_retry_wait 9m - - - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 5m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_kubeevents*.buffer - buffer_queue_limit 10 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 30s - max_retry_wait 9m - - - - type out_oms_api - log_level debug - buffer_chunk_limit 10m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_api_kubernetes_logs*.buffer - buffer_queue_limit 10 - flush_interval 20s - retry_limit 10 - retry_wait 30s - - - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 20m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_kubeservices*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 30s - max_retry_wait 9m - - - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 20m - buffer_type file - buffer_path %STATE_DIR_WS%/state/out_oms_kubenodes*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 30s - max_retry_wait 9m - - - - type out_oms - log_level debug - buffer_chunk_limit 20m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_containernodeinventory*.buffer - buffer_queue_limit 20 - flush_interval 20s - retry_limit 10 - retry_wait 15s - max_retry_wait 9m - - - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 20m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_kubeperf*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 30s - max_retry_wait 9m - - - - type out_mdm - log_level debug - num_threads 5 - buffer_chunk_limit 20m - buffer_type file - buffer_path %STATE_DIR_WS%/out_mdm_*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 30s - max_retry_wait 9m - retry_mdm_post_wait_minutes 60 - - - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 20m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_api_wincadvisorperf*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 30s - max_retry_wait 9m - - - - type out_mdm - log_level debug - num_threads 5 - buffer_chunk_limit 20m - buffer_type file - buffer_path %STATE_DIR_WS%/out_mdm_cdvisorperf*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 30s - max_retry_wait 9m - retry_mdm_post_wait_minutes 60 - - - - type out_oms_api - log_level debug - buffer_chunk_limit 10m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_api_kubehealth*.buffer - buffer_queue_limit 10 - flush_interval 20s - retry_limit 10 - retry_wait 30s - -metadata: - name: omsagent-rs-config - namespace: kube-system ---- -apiVersion: v1 -kind: Secret -metadata: - name: omsagent-secret - namespace: kube-system -type: Opaque -data: - #BASE64 ENCODED (Both WSID & KEY) INSIDE DOUBLE QUOTE ("") - WSID: "VALUE_WSID" - KEY: "VALUE_KEY" ---- -apiVersion: extensions/v1beta1 -kind: DaemonSet -metadata: - name: omsagent - namespace: kube-system -spec: - updateStrategy: - type: RollingUpdate - template: - metadata: - labels: - dsName: "omsagent-ds" - annotations: - agentVersion: "1.10.0.1" - dockerProviderVersion: "5.0.0-0" - schema-versions: "v1" - spec: - serviceAccountName: omsagent - containers: - - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview06272019" - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: 150m - memory: 300Mi - requests: - cpu: 75m - memory: 225Mi - env: - - name: AKS_RESOURCE_ID - value: "VALUE_AKS_RESOURCE_ID_VALUE" - - name: AKS_REGION - value: "VALUE_AKS_REGION_VALUE" - #Uncomment below two lines for ACS clusters and set the cluster names manually. Also comment out the above two lines for ACS clusters - #- name: ACS_RESOURCE_NAME - #value: "my_acs_cluster_name" - - name: CONTROLLER_TYPE - value: "DaemonSet" - - name: NODE_IP - valueFrom: - fieldRef: - fieldPath: status.hostIP - securityContext: - privileged: true - ports: - - containerPort: 25225 - protocol: TCP - - containerPort: 25224 - protocol: UDP - volumeMounts: - - mountPath: /hostfs - name: host-root - readOnly: true - - mountPath: /var/run/host - name: docker-sock - - mountPath: /var/log - name: host-log - - mountPath: /var/lib/docker/containers - name: containerlog-path - - mountPath: /etc/kubernetes/host - name: azure-json-path - - mountPath: /etc/omsagent-secret - name: omsagent-secret - - mountPath: /etc/config/settings - name: settings-vol-config - readOnly: true - livenessProbe: - exec: - command: - - /bin/bash - - -c - - /opt/livenessprobe.sh - initialDelaySeconds: 60 - periodSeconds: 60 - nodeSelector: - beta.kubernetes.io/os: linux - # Tolerate a NoSchedule taint on master that ACS Engine sets. - tolerations: - - key: "node-role.kubernetes.io/master" - operator: "Equal" - value: "true" - effect: "NoSchedule" - volumes: - - name: host-root - hostPath: - path: / - - name: docker-sock - hostPath: - path: /var/run - - name: container-hostname - hostPath: - path: /etc/hostname - - name: host-log - hostPath: - path: /var/log - - name: containerlog-path - hostPath: - path: /var/lib/docker/containers - - name: azure-json-path - hostPath: - path: /etc/kubernetes - - name: omsagent-secret - secret: - secretName: omsagent-secret - - name: settings-vol-config - configMap: - name: container-azm-ms-agentconfig - optional: true ---- -apiVersion: extensions/v1beta1 -kind: Deployment -metadata: - name: omsagent-rs - namespace: kube-system -spec: - replicas: 1 - selector: - matchLabels: - rsName: "omsagent-rs" - strategy: - type: RollingUpdate - template: - metadata: - labels: - rsName: "omsagent-rs" - annotations: - agentVersion: "1.10.0.1" - dockerProviderVersion: "5.0.0-0" - schema-versions: "v1" - spec: - serviceAccountName: omsagent - containers: - - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview06272019" - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: 150m - memory: 500Mi - requests: - cpu: 50m - memory: 175Mi - env: - #- name: AKS_RESOURCE_ID - # value: "VALUE_AKS_RESOURCE_ID_VALUE" - #- name: AKS_REGION - # value: "VALUE_AKS_RESOURCE_REGION_VALUE" - #Uncomment below two lines for ACS clusters and set the cluster names manually. Also comment out the above two lines for ACS clusters - - name: ACS_RESOURCE_NAME - value: "my_acs_cluster_name" - - name: CONTROLLER_TYPE - value: "ReplicaSet" - - name: NODE_IP - valueFrom: - fieldRef: - fieldPath: status.hostIP - securityContext: - privileged: true - ports: - - containerPort: 25225 - protocol: TCP - - containerPort: 25224 - protocol: UDP - volumeMounts: - - mountPath: /var/run/host - name: docker-sock - - mountPath: /var/log - name: host-log - - mountPath: /var/lib/docker/containers - name: containerlog-path - - mountPath: /etc/kubernetes/host - name: azure-json-path - - mountPath: /etc/omsagent-secret - name: omsagent-secret - readOnly: true - - mountPath : /etc/config - name: omsagent-rs-config - - mountPath: /etc/config/settings - name: settings-vol-config - readOnly: true - - mountPath: "/mnt/azure" - name: azurefile-pv - livenessProbe: - exec: - command: - - /bin/bash - - -c - - ps -ef | grep omsagent | grep -v "grep" - initialDelaySeconds: 60 - periodSeconds: 60 - nodeSelector: - beta.kubernetes.io/os: linux - kubernetes.io/role: agent - volumes: - - name: docker-sock - hostPath: - path: /var/run - - name: container-hostname - hostPath: - path: /etc/hostname - - name: host-log - hostPath: - path: /var/log - - name: containerlog-path - hostPath: - path: /var/lib/docker/containers - - name: azure-json-path - hostPath: - path: /etc/kubernetes - - name: omsagent-secret - secret: - secretName: omsagent-secret - - name: omsagent-rs-config - configMap: - name: omsagent-rs-config - - name: settings-vol-config - configMap: - name: container-azm-ms-agentconfig - optional: true - - name: azurefile-pv - persistentVolumeClaim: - claimName: azurefile ---- -kind: Service -apiVersion: v1 -metadata: - name: repliceset-service - namespace: kube-system -spec: - selector: - rsName: "omsagent-rs" - ports: - - protocol: TCP - port: 25235 - targetPort: in-rs-tcp - nodePort: 25235 ---- -kind: StorageClass -apiVersion: storage.k8s.io/v1 -metadata: - name: azurefile -provisioner: kubernetes.io/azure-file -mountOptions: - - dir_mode=0777 - - file_mode=0777 - - uid=1000 - - gid=1000 -parameters: - skuName: Standard_LRS ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: system:azure-cloud-provider -rules: -- apiGroups: [''] - resources: ['secrets'] - verbs: ['get','create'] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: system:azure-cloud-provider -roleRef: - kind: ClusterRole - apiGroup: rbac.authorization.k8s.io - name: system:azure-cloud-provider -subjects: -- kind: ServiceAccount - name: persistent-volume-binder - namespace: kube-system ---- -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: azurefile -spec: - accessModes: - - ReadWriteMany - storageClassName: azurefile - resources: - requests: - storage: 10Mi -``` - -8. save this file as omsagent.yaml - -9. Replace the following values in the file - -* VALUE_AKS_RESOURCE_ID -- Resource Id of the cluster -* VALUE_AKS_REGION -- Region the cluster is in -* VALUE_WSID -- base 64 encoded Workspace Id. To get this, go to the portal -- log analytics workspace -- Advanced Settings. REMEMBER: PASTE the base 64 encoded value -* VALUE_KEY -- base 64 encoded Primary Shared Key of the workspace. To get this, go to the portal -- log analytics workspace -- Advanced Settings. REMEMBER: PASTE the base 64 encoded value of the key (which is base 64 encoded to start with) - -10. Set the context in your local machine to the AKS cluster -Import-AzAksCredential -ResourceGroupName -Name -11. kubectl apply -f omsagent.yaml - -Once the above steps are done, it can take upto 20 minutes for the health related data to show up which can be accessed using the following link: - - From 1b78f9d565fe3bd8e073a9cc67d791d165e74ecb Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Fri, 5 Jul 2019 10:58:48 -0700 Subject: [PATCH 48/55] Update HealthOnboarding.md --- health/HealthOnboarding.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/health/HealthOnboarding.md b/health/HealthOnboarding.md index 9cf6531f..87a70df2 100644 --- a/health/HealthOnboarding.md +++ b/health/HealthOnboarding.md @@ -28,8 +28,7 @@ We have a handy [script](https://github.com/Microsoft/OMS-docker/blob/dilipr/kub ### AKS Engine Onboarding -Before proceeding with the onboarding steps, opt out of monitoring using the steps outlined [here] - +If your cluster is already onboarded to Monitoring, proceed directly to step 4 and continue from there on. 1. Add Container Insights Solution to your workspace using the instructions [here](http://aka.ms/coinhelmdoc) 2. Tag your AKS-Engine cluster appropriately using the instructions [here](http://aka.ms/coin-acs-tag-doc) 3. Set the current k8s context to be your AKS Engine cluster (the kube-config should refer to your AKS-Engine cluster) From e78c7cdef48483cd687c8db04b3a79f658a7234d Mon Sep 17 00:00:00 2001 From: r-dilip Date: Thu, 18 Jul 2019 13:17:01 -0700 Subject: [PATCH 49/55] updating agent version with bug fix --- health/HealthOnboarding.md | 4 ++-- health/omsagent-template-aks-engine.yaml | 4 ++-- health/omsagent-template.yaml | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/health/HealthOnboarding.md b/health/HealthOnboarding.md index e48eaf9e..bf74511a 100644 --- a/health/HealthOnboarding.md +++ b/health/HealthOnboarding.md @@ -450,7 +450,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview06272019" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview07182019" imagePullPolicy: IfNotPresent resources: limits: @@ -564,7 +564,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview06272019" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview07182019" imagePullPolicy: IfNotPresent resources: limits: diff --git a/health/omsagent-template-aks-engine.yaml b/health/omsagent-template-aks-engine.yaml index 5c809e26..e9683d32 100644 --- a/health/omsagent-template-aks-engine.yaml +++ b/health/omsagent-template-aks-engine.yaml @@ -378,7 +378,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview06272019" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview07182019" imagePullPolicy: IfNotPresent resources: limits: @@ -494,7 +494,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview06272019" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview07182019" imagePullPolicy: IfNotPresent resources: limits: diff --git a/health/omsagent-template.yaml b/health/omsagent-template.yaml index 080d497e..91ff456b 100644 --- a/health/omsagent-template.yaml +++ b/health/omsagent-template.yaml @@ -378,7 +378,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview06272019" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview07182019" imagePullPolicy: IfNotPresent resources: limits: @@ -494,7 +494,7 @@ spec: serviceAccountName: omsagent containers: - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview06272019" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview07182019" imagePullPolicy: IfNotPresent resources: limits: From 389bd0271babbba51eb5edad0f929ebd22886050 Mon Sep 17 00:00:00 2001 From: r-dilip Date: Thu, 25 Jul 2019 15:25:11 -0700 Subject: [PATCH 50/55] Making changes to work in Azure CloudShell PowerShell --- health/HealthAgentOnboarding.ps1 | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/health/HealthAgentOnboarding.ps1 b/health/HealthAgentOnboarding.ps1 index 20af19ea..f9d45652 100644 --- a/health/HealthAgentOnboarding.ps1 +++ b/health/HealthAgentOnboarding.ps1 @@ -403,13 +403,13 @@ catch { #Write-Host("Please contact us by emailing askcoin@microsoft.com for help") -ForegroundColor Red } -$desktopPath = [System.Environment]::GetFolderPath([System.Environment+SpecialFolder]::Desktop) -if (-not (test-path $desktopPath\deployments) ) { - Write-Host "$($desktopPath)\deployments doesn't exist, creating it" - mkdir $desktopPath\deployments | out-null +$desktopPath = "~" +if (-not (test-path $desktopPath/deployments) ) { + Write-Host "$($desktopPath)/deployments doesn't exist, creating it" + mkdir $desktopPath/deployments | out-null } else { - Write-Host "$($desktopPath)\deployments exists, no need to create it" + Write-Host "$($desktopPath)/deployments exists, no need to create it" } try { @@ -421,12 +421,12 @@ try { $clusterName = $aksResourceDetails[8].Trim() $clusterResourceGroupName = $aksResourceDetails[4].Trim() Import-AzAksCredential -Id $aksResourceId -Force - Invoke-WebRequest https://raw.githubusercontent.com/Microsoft/OMS-docker/dilipr/kubeHealth/health/omsagent-template.yaml -OutFile $desktopPath\omsagent-template.yaml + Invoke-WebRequest https://raw.githubusercontent.com/Microsoft/OMS-docker/dilipr/kubeHealth/health/omsagent-template.yaml -OutFile $desktopPath/omsagent-template.yaml - (Get-Content -Path $desktopPath\omsagent-template.yaml -Raw) -replace 'VALUE_AKS_RESOURCE_ID', $aksResourceId -replace 'VALUE_AKS_REGION', $aksResourceLocation -replace 'VALUE_WSID', $base64EncodedWsId -replace 'VALUE_KEY', $base64EncodedKey -replace 'VALUE_ACS_RESOURCE_NAME', $acsResourceName | Set-Content $desktopPath\deployments\omsagent-$clusterName.yaml - kubectl apply -f $desktopPath\deployments\omsagent-$clusterName.yaml + (Get-Content -Path $desktopPath/omsagent-template.yaml -Raw) -replace 'VALUE_AKS_RESOURCE_ID', $aksResourceId -replace 'VALUE_AKS_REGION', $aksResourceLocation -replace 'VALUE_WSID', $base64EncodedWsId -replace 'VALUE_KEY', $base64EncodedKey -replace 'VALUE_ACS_RESOURCE_NAME', $acsResourceName | Set-Content $desktopPath/deployments/omsagent-$clusterName.yaml + kubectl apply -f $desktopPath/deployments/omsagent-$clusterName.yaml Write-Host "Successfully onboarded to health model omsagent" -ForegroundColor Green } catch { Write-Host ("Agent deployment failed with an error: '" + $Error[0] + "' ") -ForegroundColor Red -} \ No newline at end of file +} From 5f39989270669bd7d39f69a05f247996f3cf70b7 Mon Sep 17 00:00:00 2001 From: r-dilip Date: Wed, 31 Jul 2019 17:33:53 -0700 Subject: [PATCH 51/55] Update path, and finish merge from ci_feature --- ci_feature_prod/Dockerfile | 2 +- ci_feature_prod/main.sh | 18 +++++++++--------- ci_feature_prod/setup.sh | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/ci_feature_prod/Dockerfile b/ci_feature_prod/Dockerfile index 80d47aba..2866f1de 100644 --- a/ci_feature_prod/Dockerfile +++ b/ci_feature_prod/Dockerfile @@ -13,7 +13,7 @@ ENV HOST_ETC /hostfs/etc ENV HOST_VAR /hostfs/var ENV AZMON_COLLECT_ENV False RUN /usr/bin/apt-get update && /usr/bin/apt-get install -y libc-bin wget openssl curl sudo python-ctypes sysv-rc net-tools rsyslog cron vim dmidecode apt-transport-https && rm -rf /var/lib/apt/lists/* -COPY setup.sh main.sh $tmpdir/ +COPY setup.sh main.sh defaultpromenvvariables defaultpromenvvariables-rs $tmpdir/ WORKDIR ${tmpdir} RUN chmod 775 $tmpdir/*.sh; sync; $tmpdir/setup.sh CMD [ "/opt/main.sh" ] diff --git a/ci_feature_prod/main.sh b/ci_feature_prod/main.sh index 5813da93..b3399148 100644 --- a/ci_feature_prod/main.sh +++ b/ci_feature_prod/main.sh @@ -36,7 +36,7 @@ if [ -S ${DOCKER_SOCKET} ]; then groupadd -for -g ${DOCKER_GID} ${DOCKER_GROUP} echo "adding omsagent user to local docker group" usermod -aG ${DOCKER_GROUP} ${REGULAR_USER} -fi +fi #Run inotify as a daemon to track changes to the mounted configmap. inotifywait /etc/config/settings --daemon --recursive --outfile "/opt/inotifyoutput.txt" --event create,delete --format '%e : %T' --timefmt '+%s' @@ -48,11 +48,11 @@ else curl --unix-socket /var/run/host/docker.sock "http:/info" | python -c "import sys, json; print json.load(sys.stdin)['Name']" > /var/opt/microsoft/docker-cimprov/state/containerhostname fi #check if file was written successfully. -cat /var/opt/microsoft/docker-cimprov/state/containerhostname +cat /var/opt/microsoft/docker-cimprov/state/containerhostname #resourceid override for loganalytics data. if [ -z $AKS_RESOURCE_ID ]; then - echo "not setting customResourceId" + echo "not setting customResourceId" else export customResourceId=$AKS_RESOURCE_ID echo "export customResourceId=$AKS_RESOURCE_ID" >> ~/.bashrc @@ -63,7 +63,7 @@ fi #set agent config schema version if [ -e "/etc/config/settings/schema-version" ] && [ -s "/etc/config/settings/schema-version" ]; then #trim - config_schema_version="$(cat /etc/config/settings/schema-version | xargs)" + config_schema_version="$(cat /etc/config/settings/schema-version | xargs)" #remove all spaces config_schema_version="${config_schema_version//[[:space:]]/}" #take first 10 characters @@ -92,7 +92,7 @@ fi # Check for internet connectivity RET=`curl -s -o /dev/null -w "%{http_code}" http://www.microsoft.com/` -if [ $RET -eq 200 ]; then +if [ $RET -eq 200 ]; then # Check for workspace existence if [ -e "/etc/omsagent-secret/WSID" ]; then workspaceId=$(cat /etc/omsagent-secret/WSID) @@ -103,7 +103,7 @@ if [ $RET -eq 200 ]; then else echo "LA Onboarding:Workspace Id not mounted" fi -else +else echo "-e error Error resolving host during the onboarding request. Check the internet connectivity and/or network policy on the cluster" fi @@ -131,7 +131,7 @@ rm -f /etc/opt/microsoft/omsagent/conf/omsagent.d/omsconfig.consistencyinvoker.c if [ -z $INT ]; then if [ -a /etc/omsagent-secret/DOMAIN ]; then /opt/microsoft/omsagent/bin/omsadmin.sh -w `cat /etc/omsagent-secret/WSID` -s `cat /etc/omsagent-secret/KEY` -d `cat /etc/omsagent-secret/DOMAIN` - elif [ -a /etc/omsagent-secret/WSID ]; then + elif [ -a /etc/omsagent-secret/WSID ]; then /opt/microsoft/omsagent/bin/omsadmin.sh -w `cat /etc/omsagent-secret/WSID` -s `cat /etc/omsagent-secret/KEY` elif [ -a /run/secrets/DOMAIN ]; then /opt/microsoft/omsagent/bin/omsadmin.sh -w `cat /run/secrets/WSID` -s `cat /run/secrets/KEY` -d `cat /run/secrets/DOMAIN` @@ -159,7 +159,7 @@ service cron start #get omsagent and docker-provider versions dpkg -l | grep omsagent | awk '{print $2 " " $3}' -dpkg -l | grep docker-cimprov | awk '{print $2 " " $3}' +dpkg -l | grep docker-cimprov | awk '{print $2 " " $3}' #telegraf & fluentbit requirements if [ ! -e "/etc/config/kube.conf" ]; then @@ -272,7 +272,7 @@ fi /opt/telegraf --version dpkg -l | grep td-agent-bit | awk '{print $2 " " $3}' -#dpkg -l | grep telegraf | awk '{print $2 " " $3}' +#dpkg -l | grep telegraf | awk '{print $2 " " $3}' shutdown() { /opt/microsoft/omsagent/bin/service_control stop diff --git a/ci_feature_prod/setup.sh b/ci_feature_prod/setup.sh index c10e20d0..914fc4c8 100644 --- a/ci_feature_prod/setup.sh +++ b/ci_feature_prod/setup.sh @@ -14,7 +14,7 @@ wget https://github.com/Microsoft/OMS-Agent-for-Linux/releases/download/OMSAgent #create file to disable omi service startup script touch /etc/.omi_disable_service_control -wget https://github.com/microsoft/Docker-Provider/releases/download/healthpreview06182019/docker-cimprov-5.0.0-1.universal.x86_64.sh +wget https://github.com/microsoft/Docker-Provider/releases/download/healthpreview06182019/docker-cimprov-6.0.0-1.universal.x86_64.sh chmod 775 $TMPDIR/*.sh From 09e01fd897ad943dbf346fe73f8160af8f65a025 Mon Sep 17 00:00:00 2001 From: r-dilip Date: Thu, 1 Aug 2019 12:01:11 -0700 Subject: [PATCH 52/55] Fixing agent version and docker version --- ci_feature/Dockerfile | 4 ++-- ci_feature/setup.sh | 2 +- ci_feature_prod/Dockerfile | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ci_feature/Dockerfile b/ci_feature/Dockerfile index e57ea1f4..aa09e978 100644 --- a/ci_feature/Dockerfile +++ b/ci_feature/Dockerfile @@ -2,10 +2,10 @@ FROM ubuntu:16.04 MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="OMS Container Docker Provider" \ - com.microsoft.version="5.0.0-1" + com.microsoft.version="6.0.0-1" ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH OTQzNWI0M2YtOTdkNS00ZGVkLThkOTAtYjA0Nzk1OGU2ZTg3 -ENV AGENT_VERSION healthpreview07302019 +ENV AGENT_VERSION ciprod07092019 ENV HOST_MOUNT_PREFIX /hostfs ENV HOST_PROC /hostfs/proc ENV HOST_SYS /hostfs/sys diff --git a/ci_feature/setup.sh b/ci_feature/setup.sh index 6782e1fc..d8abc5d2 100644 --- a/ci_feature/setup.sh +++ b/ci_feature/setup.sh @@ -14,7 +14,7 @@ wget https://github.com/Microsoft/OMS-Agent-for-Linux/releases/download/OMSAgent #create file to disable omi service startup script touch /etc/.omi_disable_service_control -wget https://github.com/microsoft/Docker-Provider/releases/download/healthpreview06182019/docker-cimprov-5.0.0-1.universal.x86_64.sh +wget https://github.com/microsoft/Docker-Provider/releases/download/healthpreview06182019/docker-cimprov-6.0.0-1.universal.x86_64.sh chmod 775 $TMPDIR/*.sh #Extract omsbundle diff --git a/ci_feature_prod/Dockerfile b/ci_feature_prod/Dockerfile index 2866f1de..1ba97dc3 100644 --- a/ci_feature_prod/Dockerfile +++ b/ci_feature_prod/Dockerfile @@ -2,10 +2,10 @@ FROM ubuntu:16.04 MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="OMS Container Docker Provider" \ - com.microsoft.version="5.0.0-0" + com.microsoft.version="6.0.0-1" ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi -ENV AGENT_VERSION healthpreview07182019 +ENV AGENT_VERSION ciprod07092019 ENV HOST_MOUNT_PREFIX /hostfs ENV HOST_PROC /hostfs/proc ENV HOST_SYS /hostfs/sys From 3d9d24b5464d650a8511170a8e5f6a15626e8079 Mon Sep 17 00:00:00 2001 From: r-dilip Date: Wed, 7 Aug 2019 12:43:38 -0700 Subject: [PATCH 53/55] Change pull policy to always for private preview --- ci_feature_prod/Dockerfile | 2 +- health/omsagent-template.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ci_feature_prod/Dockerfile b/ci_feature_prod/Dockerfile index 1ba97dc3..167bf7d4 100644 --- a/ci_feature_prod/Dockerfile +++ b/ci_feature_prod/Dockerfile @@ -5,7 +5,7 @@ LABEL vendor=Microsoft\ Corp \ com.microsoft.version="6.0.0-1" ENV tmpdir /opt ENV APPLICATIONINSIGHTS_AUTH NzAwZGM5OGYtYTdhZC00NThkLWI5NWMtMjA3ZjM3NmM3YmRi -ENV AGENT_VERSION ciprod07092019 +ENV AGENT_VERSION healthpreview07182019 ENV HOST_MOUNT_PREFIX /hostfs ENV HOST_PROC /hostfs/proc ENV HOST_SYS /hostfs/sys diff --git a/health/omsagent-template.yaml b/health/omsagent-template.yaml index 91ff456b..845569e8 100644 --- a/health/omsagent-template.yaml +++ b/health/omsagent-template.yaml @@ -379,7 +379,7 @@ spec: containers: - name: omsagent image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview07182019" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always resources: limits: cpu: 150m @@ -495,7 +495,7 @@ spec: containers: - name: omsagent image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:healthpreview07182019" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always resources: limits: cpu: 150m From b56a1cbbdda2c46b5b43b96426bb7faa2fed746c Mon Sep 17 00:00:00 2001 From: r-dilip Date: Wed, 7 Aug 2019 13:44:41 -0700 Subject: [PATCH 54/55] Add sleep before kubectl apply --- health/HealthAgentOnboarding.ps1 | 1 + 1 file changed, 1 insertion(+) diff --git a/health/HealthAgentOnboarding.ps1 b/health/HealthAgentOnboarding.ps1 index f9d45652..9e9d8181 100644 --- a/health/HealthAgentOnboarding.ps1 +++ b/health/HealthAgentOnboarding.ps1 @@ -424,6 +424,7 @@ try { Invoke-WebRequest https://raw.githubusercontent.com/Microsoft/OMS-docker/dilipr/kubeHealth/health/omsagent-template.yaml -OutFile $desktopPath/omsagent-template.yaml (Get-Content -Path $desktopPath/omsagent-template.yaml -Raw) -replace 'VALUE_AKS_RESOURCE_ID', $aksResourceId -replace 'VALUE_AKS_REGION', $aksResourceLocation -replace 'VALUE_WSID', $base64EncodedWsId -replace 'VALUE_KEY', $base64EncodedKey -replace 'VALUE_ACS_RESOURCE_NAME', $acsResourceName | Set-Content $desktopPath/deployments/omsagent-$clusterName.yaml + Start-Sleep 30 kubectl apply -f $desktopPath/deployments/omsagent-$clusterName.yaml Write-Host "Successfully onboarded to health model omsagent" -ForegroundColor Green } From 67caad85048e9d6766a892260c9e39a18b085690 Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Fri, 16 Aug 2019 16:56:55 -0700 Subject: [PATCH 55/55] Update HealthAgentOnboarding.ps1 --- health/HealthAgentOnboarding.ps1 | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/health/HealthAgentOnboarding.ps1 b/health/HealthAgentOnboarding.ps1 index 9e9d8181..0b0500eb 100644 --- a/health/HealthAgentOnboarding.ps1 +++ b/health/HealthAgentOnboarding.ps1 @@ -385,6 +385,9 @@ try { } } + Write-Host("Getting Tags for restoring later") + $tags = (Get-AzAks -Id $aksResourceId).Tags + Write-Host("Enabling Custom Monitoring using template deployment") New-AzResourceGroupDeployment -Name $DeploymentName ` -ResourceGroupName $clusterResourceGroupName ` @@ -395,6 +398,9 @@ try { Write-Host("Successfully custom onboarded cluster to Monitoring") -ForegroundColor Green + Set-AzResource -ResourceId $aksResourceId -Tag $tags -Force + Write-Host("Successfully restored tags") + Write-Host("") } catch { @@ -424,7 +430,6 @@ try { Invoke-WebRequest https://raw.githubusercontent.com/Microsoft/OMS-docker/dilipr/kubeHealth/health/omsagent-template.yaml -OutFile $desktopPath/omsagent-template.yaml (Get-Content -Path $desktopPath/omsagent-template.yaml -Raw) -replace 'VALUE_AKS_RESOURCE_ID', $aksResourceId -replace 'VALUE_AKS_REGION', $aksResourceLocation -replace 'VALUE_WSID', $base64EncodedWsId -replace 'VALUE_KEY', $base64EncodedKey -replace 'VALUE_ACS_RESOURCE_NAME', $acsResourceName | Set-Content $desktopPath/deployments/omsagent-$clusterName.yaml - Start-Sleep 30 kubectl apply -f $desktopPath/deployments/omsagent-$clusterName.yaml Write-Host "Successfully onboarded to health model omsagent" -ForegroundColor Green }