From 55b750340b9701909b2f2aac3a9fc3c2b941997c Mon Sep 17 00:00:00 2001 From: jayanthaMicrosoft Date: Fri, 21 Jul 2023 16:03:44 +0530 Subject: [PATCH 01/20] Create conditionalHnsRestarter.p1 Script to be used to restart HNS when a certain condition is met, to help mitigate issues in customer AKS nodes. Script currently supports checking for missing rules on pod-endpoints. Example command: conditionalHnsRestarter.ps1 -PodPortRulesToCheck @{"ruleRegex" = "LB_DSR_[0-9A-Z]+_[\d\.]+_[\d\.]+_801_801_17"; "layerName" = "LB_DSR"; "groupName" = "LB_DSR_IPv4_OUT"} --- .../conditionalHnsRestarter.p1 | 325 ++++++++++++++++++ 1 file changed, 325 insertions(+) create mode 100644 scripts/conditionalHnsRestarter/conditionalHnsRestarter.p1 diff --git a/scripts/conditionalHnsRestarter/conditionalHnsRestarter.p1 b/scripts/conditionalHnsRestarter/conditionalHnsRestarter.p1 new file mode 100644 index 0000000..27445da --- /dev/null +++ b/scripts/conditionalHnsRestarter/conditionalHnsRestarter.p1 @@ -0,0 +1,325 @@ +param ( + [Parameter(Mandatory=$true)] + [array] $PodPortRulesToCheck, + # array of @{ruleRegex = ""; layerName = ""; groupName = ""} + + [Parameter(Mandatory=$false)] + [int] $SleepIntervalMins = 5, + # time to sleep before checking whether HNS restart is required. 
+ + [Parameter(Mandatory=$false)] + [int] $RuleCheckIntervalMins = 3, + # if a rule is missing on an endpoint for more than RuleCheckIntervalMins minutes, we restart HNS + + [Parameter(Mandatory=$false)] + [int] $MaxHnsRestarts = 50, + + [Parameter(Mandatory=$false)] + [int] $MinRestartInterval = 5, + + [Parameter(Mandatory=$false)] + [string] $WindowsLogsPath = "C:\k\debug\ConditionalHnsRestart_data\" +) + +class RuleCheckInfo { + [string]$ruleRegex + [string]$layerName + [string]$groupName + RuleCheckInfo([string] $in_ruleRegex, [string] $in_layerName, [string] $in_groupName) + { + $this.ruleRegex = $in_ruleRegex + $this.layerName = $in_layerName + $this.groupName = $in_groupName + } +} + +class EndpointInfo { + [string]$id + [System.DateTime]$notedTime + [bool] $rulesVerified + + EndpointInfo([string] $inId, [System.DateTime] $inTime) + { + $this.id = $inId + $this.notedTime = $inTime + $this.rulesVerified = $false + } +} + +$g_scriptStartTime = get-date +$g_endpointInfoMap = @{} # key = id, value = EndpointInfo +$g_podRuleCheckList = [System.Collections.Generic.List[RuleCheckInfo]]::new() # array of RuleCheckInfo objects +$g_hnsRestartCount = 0 +$g_lastHnsRestartTime = $g_scriptStartTime +$g_nonPodPortRegex = "Container NIC|Host Vnic|ExternalPort" + + +function RulePresentInVfpPortGroup( + [PSCustomObject] $portGroup, + [RuleCheckInfo] $ruleToCheck +) +{ + # find rule + $ruleFound = $false + $ruleIndex = -1 + foreach ($rule in $portGroup.rules) { + $ruleIndex += 1 + if ($rule.Id -match $ruleToCheck.ruleRegex) { + $ruleFound = $true + + $msg = "rule {0} matches regex {1}." 
-f $rule.Id, $ruleToCheck.ruleRegex + write-host $msg + + break + } + } + + return $ruleFound +} + + +function RulePresentInVfpPortLayer( + [PSCustomObject] $layer, + [RuleCheckInfo] $ruleToCheck +) +{ + # first find layer + $groupFound = $false + $groupIndex = -1 + foreach ($portGroup in $layer.groups) { + $groupIndex += 1 + if ($portGroup.name -eq $ruleToCheck.groupName) { + $groupFound = $true + break + } + } + if ($groupFound -eq $false) { + return $false + } + + $msg = "group {0} found in layer {1}." -f $ruleToCheck.groupName, $ruleToCheck.layerName + write-host $msg + + return RulePresentInVfpPortGroup -portGroup $layer.groups[$groupIndex] -ruleToCheck $ruleToCheck +} + +function RulesPresentOnVfpPort( + [string] $portId, + [System.Collections.Generic.List[RuleCheckInfo]] $rulesToCheck +) +{ + $layers = (vfpctrl /list-rule /port $portId /format 1 | convertfrom-json).Layers + + foreach ($ruleToCheck in $rulesToCheck) { + write-host "" + + # first find layer + $layerFound = $false + $layerIndex = -1 + + foreach ($layer in $layers) { + $layerIndex += 1 + if ($layer.name -eq $ruleToCheck.layerName) { + $layerFound = $true + break + } + } + if ($layerFound -eq $false) { + return $false + } + + $msg = "Layer {0} found on port {1}: {2}." -f $ruleToCheck.layerName, $portId, $layers[$layerIndex] + write-host $msg + + $rulePresentInLayer = RulePresentInVfpPortLayer -layer $layers[$layerIndex] -ruleToCheck $ruleToCheck + if ($rulePresentInLayer -eq $false) { + return $false + } + } + + return $true +} + +function PortIsPodPort( + [PSCustomObject] $vfpPortInfo +) +{ + if ($vfpPortInfo.id -match $g_nonPodPortRegex) + { + return $false + } + + return $true +} + +function RulesAreMissing() { + $vfpPortList = ((vfpctrl /list-vmswitch-port /format 1 | convertfrom-json).Ports) + $current_time = get-date + $vfpPortMap = @{} + + $msg = "There are {0} ports in VFP." -f $vfpPortList.count + write-host $msg + + ## Note new endpoint IDs. 
+ $msg = "$g_endpointInfoMap size before adding new ports {0}" -f $g_endpointInfoMap.count + write-host $msg + foreach ($vfpPort in $vfpPortList) + { + $vfpPortMap.Add($vfpPort.Id, $vfpPort) + + if ($g_endpointInfoMap.ContainsKey($vfpPort.Id) -eq $false) + { + $notedTime = get-date + $endpointInfo = [EndpointInfo]::New($vfpPort.Id, $notedTime) + $g_endpointInfoMap.Add($vfpPort.Id, $endpointInfo) + } + } + $msg = "$g_endpointInfoMap size after adding new ports {0}" -f $g_endpointInfoMap.count + write-host $msg + ## + + ## Delete stale endpoint IDs, so that g_endpointInfoMap's size does not keep increasing forever. + $stalePortIdList = @() + $msg = "$g_endpointInfoMap size before deleting stale ports {0}" -f $g_endpointInfoMap.count + write-host $msg + foreach ($portId in $g_endpointInfoMap.Keys) { + $portIdPresent = $false + foreach ($vfpPort in $vfpPortList) { + if ($vfpPort.Id -eq $portId) { + $portIdPresent = $true + break + } + } + + if ($portIdPresent -eq $false) { + $stalePortIdList += @($portId) + } + } + foreach ($portId in $stalePortIdList) { + # TODO: log this + $g_endpointInfoMap.Remove($portId) + } + $msg = "$g_endpointInfoMap size after deleting stale ports {0}" -f $g_endpointInfoMap.count + write-host $msg + ## + + + foreach ($portId in $g_endpointInfoMap.Keys) + { + if ($g_endpointInfoMap[$portId].rulesVerified -eq $true) + { + continue + } + + $isPodPort = PortIsPodPort -vfpPortInfo $vfpPortMap[$portId] + if ($isPodPort -eq $false) { + # this could be external port or host vNIC + continue + } + + #$msg = "checking for rules on port name:{0} id:{1}" -f $vfpPortMap[$portId].name,$vfpPortMap[$portId].id + #write-host $msg + + $rulesPresent = RulesPresentOnVfpPort -portId $portId -rulesToCheck $g_podRuleCheckList + + #$msg = "rules present: {0}" -f $rulesPresent + #write-host $msg + + if (($current_time - $g_endpointInfoMap[$portId].notedTime).TotalMinutes -gt $RuleCheckIntervalMins) + { + # hns must be restarted + return $true + } + } + + return $false 
+} + +function ScriptSetup() +{ + foreach ($rule in $PodPortRulesToCheck) { + $ruleCheckInfo = [RuleCheckInfo]::New($rule.ruleRegex, $rule.layerName, $rule.groupName) + $g_podRuleCheckList.Add($ruleCheckInfo) + } + $msg = "Size of g_podRuleCheckList: {0}" -f $g_podRuleCheckList.count + write-host $msg +$msg = "rulesToCheck[0].layerName: {0}" -f $g_podRuleCheckList[0].layerName +write-host $msg + +} + +function CheckIfRestartRequired() +{ + $rulesMissing = RulesAreMissing + return $false +} + +function collectLogs( + [string]$LogsPath +) +{ + # create log path if not yet created. + mkdir -Force $LogsPath + $originalPath = pwd + Set-Location $LogsPath + C:\k\debug\collect-windows-logs.ps1 + Set-Location $originalPath +} + +function restartHnsService() +{ + restart-service -f hns +} + +function myMain() +{ + ScriptSetup + while ($true) + { + $restartRequired = CheckIfRestartRequired + + if ($restartRequired -eq $false) { + sleep ($SleepIntervalMins * 60) + continue + } + + $current_time = get-date + $timeSinceLastRestart = $current_time - $g_lastHnsRestartTime + $scriptAge = $current_time - $g_scriptStartTime + + #### + # Check conditions to not restart HNS. + if ($scriptAge.TotalMinutes -lt $SleepIntervalMins) + { + # current_time could be just after a reboot, or just after a HNS/kube-proxy restart. + # Let's not restart yet. + sleep ($SleepIntervalMins * 60) + continue + } + elseif ($g_hnsRestartCount -ge $MaxHnsRestarts) + { + # TODO: log("max HNS restarts already done. Shouldn't restart anymore.") + sleep ($SleepIntervalMins * 60) + continue + } + elseif ($timeSinceLastRestart.TotalMinutes -lt $MinRestartInterval) + { + # TODO: log("HNS restarted recently. Let's wait more.") + sleep ($timeSinceLastRestart.TotalSeconds) + continue + } + # All negative cases (i.e., conditions to not restart HNS end here.) 
+ #### + + # TODO: log("") + collectLogs -LogsPath $WindowsLogsPath + + restartHnsService + + $g_lastHnsRestartTime = get-date + $g_hnsRestartCount += 1 + + } +} + +myMain From c4f6d8eaf02a8c2d4223efb4c0213a71d6fcf976 Mon Sep 17 00:00:00 2001 From: jayanthaMicrosoft Date: Fri, 21 Jul 2023 16:06:18 +0530 Subject: [PATCH 02/20] Rename conditionalHnsRestarter.p1 to conditionalHnsRestarter.ps1 --- .../{conditionalHnsRestarter.p1 => conditionalHnsRestarter.ps1} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename scripts/conditionalHnsRestarter/{conditionalHnsRestarter.p1 => conditionalHnsRestarter.ps1} (100%) diff --git a/scripts/conditionalHnsRestarter/conditionalHnsRestarter.p1 b/scripts/conditionalHnsRestarter/conditionalHnsRestarter.ps1 similarity index 100% rename from scripts/conditionalHnsRestarter/conditionalHnsRestarter.p1 rename to scripts/conditionalHnsRestarter/conditionalHnsRestarter.ps1 From eaca849c287936f04f62ddbdf717875b1fd39d4d Mon Sep 17 00:00:00 2001 From: jayanthaMicrosoft Date: Sat, 22 Jul 2023 00:34:25 +0530 Subject: [PATCH 03/20] Update and rename conditionalHnsRestarter.ps1 to conditionalMitigator.ps1 This is a generic mitigator script. Currently it checks pod port rules and restarts HNS when rules are not found. It can be extended for other mitigation steps as well, since the mitigation logic can be similar. 
--- .../conditionalMitigator.ps1} | 196 +++++++++++------- 1 file changed, 123 insertions(+), 73 deletions(-) rename scripts/{conditionalHnsRestarter/conditionalHnsRestarter.ps1 => conditionalMitigator/conditionalMitigator.ps1} (52%) diff --git a/scripts/conditionalHnsRestarter/conditionalHnsRestarter.ps1 b/scripts/conditionalMitigator/conditionalMitigator.ps1 similarity index 52% rename from scripts/conditionalHnsRestarter/conditionalHnsRestarter.ps1 rename to scripts/conditionalMitigator/conditionalMitigator.ps1 index 27445da..ba8d2e0 100644 --- a/scripts/conditionalHnsRestarter/conditionalHnsRestarter.ps1 +++ b/scripts/conditionalMitigator/conditionalMitigator.ps1 @@ -7,15 +7,26 @@ param ( [int] $SleepIntervalMins = 5, # time to sleep before checking whether HNS restart is required. + [Parameter(Mandatory=$false)] + [int] $MinSleepIntervalMins = 1, + # time to sleep before checking whether HNS restart is required. + [Parameter(Mandatory=$false)] [int] $RuleCheckIntervalMins = 3, # if a rule is missing on an endpoint for more than RuleCheckIntervalMins minutes, we restart HNS [Parameter(Mandatory=$false)] - [int] $MaxHnsRestarts = 50, + [int] $MaxMitigationCount = 50, + + [Parameter(Mandatory=$false)] + [int] $MinMitigationIntervalMins = 5, + + [Parameter(Mandatory=$false)] + [int] $MitigationActionVal = 0, + # An enum indicating what mitigation action to take. Example, 0 indicates "restart HNS". 
[Parameter(Mandatory=$false)] - [int] $MinRestartInterval = 5, + [bool] $CollectWindowsLogs = $true, [Parameter(Mandatory=$false)] [string] $WindowsLogsPath = "C:\k\debug\ConditionalHnsRestart_data\" @@ -36,23 +47,31 @@ class RuleCheckInfo { class EndpointInfo { [string]$id [System.DateTime]$notedTime - [bool] $rulesVerified + [int] $ruleCheckCount + [System.DateTime]$lastRuleCheckTime EndpointInfo([string] $inId, [System.DateTime] $inTime) { $this.id = $inId $this.notedTime = $inTime - $this.rulesVerified = $false + $this.ruleCheckCount = 0 } } +enum MitigationActionEnum { + E_RestartHns = 0 + E_RestartKubeProxy +} + $g_scriptStartTime = get-date $g_endpointInfoMap = @{} # key = id, value = EndpointInfo $g_podRuleCheckList = [System.Collections.Generic.List[RuleCheckInfo]]::new() # array of RuleCheckInfo objects -$g_hnsRestartCount = 0 -$g_lastHnsRestartTime = $g_scriptStartTime +$g_mitigationActionCount = 0 +$g_lastMitigationTime = $g_scriptStartTime $g_nonPodPortRegex = "Container NIC|Host Vnic|ExternalPort" - +$RuleCheckIntervalSecs = $RuleCheckIntervalMins * 60 +$SleepIntervalSecs = $SleepIntervalMins * 60 +$MinMitigationIntervalSecs = $MinMitigationIntervalMins * 60 function RulePresentInVfpPortGroup( [PSCustomObject] $portGroup, @@ -66,14 +85,16 @@ function RulePresentInVfpPortGroup( $ruleIndex += 1 if ($rule.Id -match $ruleToCheck.ruleRegex) { $ruleFound = $true - - $msg = "rule {0} matches regex {1}." -f $rule.Id, $ruleToCheck.ruleRegex - write-host $msg - + #$msg = "rule {0} matches regex {1}." 
-f $rule.Id, $ruleToCheck.ruleRegex + #write-host $msg break } } + if ($ruleFound -eq $false) { + $msg = "rule with regex {0} not found on group {1}" -f $ruleToCheck.ruleRegex, $portGroup.name + write-host $msg + } return $ruleFound } @@ -94,11 +115,13 @@ function RulePresentInVfpPortLayer( } } if ($groupFound -eq $false) { + $msg = "No group on layer {0} matches name {1}" -f ('"' + $ruleToCheck.layerName + '"'),$ruleToCheck.groupName + write-host $msg return $false } - $msg = "group {0} found in layer {1}." -f $ruleToCheck.groupName, $ruleToCheck.layerName - write-host $msg + #$msg = "group {0} found in layer {1}." -f $ruleToCheck.groupName, $ruleToCheck.layerName + #write-host $msg return RulePresentInVfpPortGroup -portGroup $layer.groups[$groupIndex] -ruleToCheck $ruleToCheck } @@ -108,11 +131,10 @@ function RulesPresentOnVfpPort( [System.Collections.Generic.List[RuleCheckInfo]] $rulesToCheck ) { + #write-host "RulesPresentOnVfpPort called" $layers = (vfpctrl /list-rule /port $portId /format 1 | convertfrom-json).Layers foreach ($ruleToCheck in $rulesToCheck) { - write-host "" - # first find layer $layerFound = $false $layerIndex = -1 @@ -125,14 +147,18 @@ function RulesPresentOnVfpPort( } } if ($layerFound -eq $false) { + $msg = "No layer on port {0} matches name {1}" -f $portId, ('"' + $ruleToCheck.layerName + '"') + write-host $msg return $false } - $msg = "Layer {0} found on port {1}: {2}." -f $ruleToCheck.layerName, $portId, $layers[$layerIndex] - write-host $msg + #$msg = "Layer {0} found on port {1}: {2}." -f $ruleToCheck.layerName, $portId, $layers[$layerIndex] + #write-host $msg $rulePresentInLayer = RulePresentInVfpPortLayer -layer $layers[$layerIndex] -ruleToCheck $ruleToCheck if ($rulePresentInLayer -eq $false) { + $msg = "No rule on port {0} matches regex {1}." 
-f $portId, $ruleToCheck.ruleRegex + write-host $msg return $false } } @@ -144,8 +170,7 @@ function PortIsPodPort( [PSCustomObject] $vfpPortInfo ) { - if ($vfpPortInfo.id -match $g_nonPodPortRegex) - { + if ($vfpPortInfo.id -match $g_nonPodPortRegex) { return $false } @@ -154,15 +179,13 @@ function PortIsPodPort( function RulesAreMissing() { $vfpPortList = ((vfpctrl /list-vmswitch-port /format 1 | convertfrom-json).Ports) - $current_time = get-date $vfpPortMap = @{} - $msg = "There are {0} ports in VFP." -f $vfpPortList.count - write-host $msg + #$msg = "There are {0} ports in VFP." -f $vfpPortList.count + #write-host $msg ## Note new endpoint IDs. - $msg = "$g_endpointInfoMap size before adding new ports {0}" -f $g_endpointInfoMap.count - write-host $msg + $priorSize = $g_endpointInfoMap.count foreach ($vfpPort in $vfpPortList) { $vfpPortMap.Add($vfpPort.Id, $vfpPort) @@ -174,14 +197,14 @@ function RulesAreMissing() { $g_endpointInfoMap.Add($vfpPort.Id, $endpointInfo) } } - $msg = "$g_endpointInfoMap size after adding new ports {0}" -f $g_endpointInfoMap.count + $endpointsAdded = $g_endpointInfoMap.count - $priorSize + $msg = "new endpoints added to g_endpointInfoMap: {0}" -f $endpointsAdded write-host $msg ## + ## Delete stale endpoint IDs, so that g_endpointInfoMap's size does not keep increasing forever. 
$stalePortIdList = @() - $msg = "$g_endpointInfoMap size before deleting stale ports {0}" -f $g_endpointInfoMap.count - write-host $msg foreach ($portId in $g_endpointInfoMap.Keys) { $portIdPresent = $false foreach ($vfpPort in $vfpPortList) { @@ -195,42 +218,55 @@ function RulesAreMissing() { $stalePortIdList += @($portId) } } + $priorSize = $g_endpointInfoMap.count foreach ($portId in $stalePortIdList) { - # TODO: log this + $msg = "deleting stale endpoint ID {0}" -f $portId + write-host $msg $g_endpointInfoMap.Remove($portId) } - $msg = "$g_endpointInfoMap size after deleting stale ports {0}" -f $g_endpointInfoMap.count + + $endpointsDeleted = $g_endpointInfoMap.count - $priorSize + $msg = "old endpoints deleted from g_endpointInfoMap: {0}" -f $endpointsDeleted write-host $msg ## + ## Check pod port rules. foreach ($portId in $g_endpointInfoMap.Keys) { - if ($g_endpointInfoMap[$portId].rulesVerified -eq $true) - { - continue - } - $isPodPort = PortIsPodPort -vfpPortInfo $vfpPortMap[$portId] if ($isPodPort -eq $false) { - # this could be external port or host vNIC + # this could be external port or host vNIC. Ignore. 
continue } - #$msg = "checking for rules on port name:{0} id:{1}" -f $vfpPortMap[$portId].name,$vfpPortMap[$portId].id - #write-host $msg + $current_time = get-date + $timeSinceLastCheck = $current_time - $g_endpointInfoMap[$portId].lastRuleCheckTime + if ($g_endpointInfoMap.ruleCheckCount -gt 0) { + if ($timeSinceLastCheck.TotalSeconds -lt $RuleCheckIntervalSecs) { + # check again later + continue + } + } - $rulesPresent = RulesPresentOnVfpPort -portId $portId -rulesToCheck $g_podRuleCheckList - - #$msg = "rules present: {0}" -f $rulesPresent + #$msg = "Checking for rules on port name:{0} id:{1}" -f $vfpPortMap[$portId].name,$vfpPortMap[$portId].id #write-host $msg - if (($current_time - $g_endpointInfoMap[$portId].notedTime).TotalMinutes -gt $RuleCheckIntervalMins) - { - # hns must be restarted - return $true + $rulesPresent = RulesPresentOnVfpPort -portId $portId -rulesToCheck $g_podRuleCheckList + if ($rulesPresent -eq $true) { + # This port has the necessary rules. + $g_endpointInfoMap[$portId].ruleCheckCount += 1 + $g_endpointInfoMap[$portId].lastRuleCheckTime = $current_time + continue } + + # We reach here when a port does not have the necessary rules for more than RuleCheckIntervalMins. + # Mitigation action must be taken. + $msg = "Rules missing on VFP port with ID {0} since last {1} minutes" -f $portId,$timeSinceLastCheck.TotalMinutes + write-host $msg + return $true } + ## Pod port rule check done. 
return $false } @@ -241,50 +277,56 @@ function ScriptSetup() $ruleCheckInfo = [RuleCheckInfo]::New($rule.ruleRegex, $rule.layerName, $rule.groupName) $g_podRuleCheckList.Add($ruleCheckInfo) } - $msg = "Size of g_podRuleCheckList: {0}" -f $g_podRuleCheckList.count + $msg = "Number of pod port rules to check: {0}" -f $g_podRuleCheckList.count write-host $msg -$msg = "rulesToCheck[0].layerName: {0}" -f $g_podRuleCheckList[0].layerName -write-host $msg - } -function CheckIfRestartRequired() +function CheckIfMitigationRequired() { $rulesMissing = RulesAreMissing - return $false + return $rulesMissing } -function collectLogs( +function collectLogsBeforeMitigation( [string]$LogsPath ) { # create log path if not yet created. mkdir -Force $LogsPath - $originalPath = pwd - Set-Location $LogsPath - C:\k\debug\collect-windows-logs.ps1 - Set-Location $originalPath + + if ($CollectWindowsLogs -eq $true) { + write-host "collecting windows logs" + $originalPath = pwd + Set-Location $LogsPath + C:\k\debug\collect-windows-logs.ps1 + Set-Location $originalPath + } } -function restartHnsService() +function ExecuteMitigationAction() { - restart-service -f hns + if ($MitigationActionEnum -eq [MitigationActionEnum]::E_RestartHns) { + write-host "restarting HNS" + restart-service -f hns + } } + function myMain() { ScriptSetup while ($true) { - $restartRequired = CheckIfRestartRequired + write-host "" + $restartRequired = CheckIfMitigationRequired if ($restartRequired -eq $false) { - sleep ($SleepIntervalMins * 60) + sleep ($SleepIntervalSecs) continue } $current_time = get-date - $timeSinceLastRestart = $current_time - $g_lastHnsRestartTime + $timeSinceLastMitigation = $current_time - $g_lastMitigationTime $scriptAge = $current_time - $g_scriptStartTime #### @@ -293,32 +335,40 @@ function myMain() { # current_time could be just after a reboot, or just after a HNS/kube-proxy restart. # Let's not restart yet. 
- sleep ($SleepIntervalMins * 60) + $msg = "Not taking mitigation-action since current time could be just after reboot/HNS/kube-proxy restart." + write-host $msg + sleep ($SleepIntervalSecs) continue } - elseif ($g_hnsRestartCount -ge $MaxHnsRestarts) + elseif ($g_mitigationActionCount -ge $MaxMitigationCount) { - # TODO: log("max HNS restarts already done. Shouldn't restart anymore.") - sleep ($SleepIntervalMins * 60) + $msg = "Not taking mitigation-action since MaxMitigationCount has been crossed. Shouldn't take action anymore." + write-host $msg + sleep ($SleepIntervalSecs) continue } - elseif ($timeSinceLastRestart.TotalMinutes -lt $MinRestartInterval) + elseif ($timeSinceLastMitigation.TotalSeconds -lt $MinMitigationIntervalSecs) { - # TODO: log("HNS restarted recently. Let's wait more.") - sleep ($timeSinceLastRestart.TotalSeconds) + $timeToSleepSecs = $MinMitigationIntervalSecs - $timeSinceLastMitigation.TotalSeconds + $timeToSleepMins = $timeToSleepSecs / 60 + $msg = "Not taking mitigation-action since it was taken just {0} minutes ago. Checking again after {1} minutes" -f $timeSinceLastMitigation.TotalMinutes, $timeToSleepMins + write-host $msg + sleep ($timeToSleepSecs) continue } # All negative cases (i.e., conditions to not restart HNS end here.) #### - # TODO: log("") - collectLogs -LogsPath $WindowsLogsPath - - restartHnsService + $msg = "Collecting logs before mitigation" + write-host $msg + collectLogsBeforeMitigation -LogsPath $WindowsLogsPath - $g_lastHnsRestartTime = get-date - $g_hnsRestartCount += 1 + $msg = "Taking mitigation action..." 
+ write-host $msg + ExecuteMitigationAction + $g_lastMitigationTime = get-date + $g_mitigationActionCount += 1 } } From e3cbe4cc8f99971c58f6219140f64fac7fdd3f92 Mon Sep 17 00:00:00 2001 From: jayanthaMicrosoft Date: Sun, 23 Jul 2023 10:47:38 +0530 Subject: [PATCH 04/20] Update conditionalMitigator.ps1 2 --- scripts/conditionalMitigator/conditionalMitigator.ps1 | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/conditionalMitigator/conditionalMitigator.ps1 b/scripts/conditionalMitigator/conditionalMitigator.ps1 index ba8d2e0..1b1038a 100644 --- a/scripts/conditionalMitigator/conditionalMitigator.ps1 +++ b/scripts/conditionalMitigator/conditionalMitigator.ps1 @@ -318,9 +318,9 @@ function myMain() while ($true) { write-host "" - $restartRequired = CheckIfMitigationRequired + $mitigationRequired = CheckIfMitigationRequired - if ($restartRequired -eq $false) { + if ($mitigationRequired -eq $false) { sleep ($SleepIntervalSecs) continue } @@ -330,11 +330,11 @@ function myMain() $scriptAge = $current_time - $g_scriptStartTime #### - # Check conditions to not restart HNS. + # Conditions for not mitigating. if ($scriptAge.TotalMinutes -lt $SleepIntervalMins) { # current_time could be just after a reboot, or just after a HNS/kube-proxy restart. - # Let's not restart yet. + # We don't hurry yet. We check again after $SleepIntervalSecs. $msg = "Not taking mitigation-action since current time could be just after reboot/HNS/kube-proxy restart." write-host $msg sleep ($SleepIntervalSecs) @@ -356,7 +356,7 @@ function myMain() sleep ($timeToSleepSecs) continue } - # All negative cases (i.e., conditions to not restart HNS end here.) + # All negative cases (i.e., conditions to not mitigate end here.) 
#### $msg = "Collecting logs before mitigation" From 51c5547e625c8b371c1400cbbb7be7a9bf138bf4 Mon Sep 17 00:00:00 2001 From: jayanthaMicrosoft Date: Tue, 25 Jul 2023 00:41:53 +0530 Subject: [PATCH 05/20] Update conditionalMitigator.ps1 3 --- .../conditionalMitigator.ps1 | 92 +++++++++++++------ 1 file changed, 63 insertions(+), 29 deletions(-) diff --git a/scripts/conditionalMitigator/conditionalMitigator.ps1 b/scripts/conditionalMitigator/conditionalMitigator.ps1 index 1b1038a..f80202b 100644 --- a/scripts/conditionalMitigator/conditionalMitigator.ps1 +++ b/scripts/conditionalMitigator/conditionalMitigator.ps1 @@ -12,7 +12,7 @@ param ( # time to sleep before checking whether HNS restart is required. [Parameter(Mandatory=$false)] - [int] $RuleCheckIntervalMins = 3, + [int] $RuleCheckIntervalMins = 15, # if a rule is missing on an endpoint for more than RuleCheckIntervalMins minutes, we restart HNS [Parameter(Mandatory=$false)] @@ -32,10 +32,12 @@ param ( [string] $WindowsLogsPath = "C:\k\debug\ConditionalHnsRestart_data\" ) + class RuleCheckInfo { [string]$ruleRegex [string]$layerName [string]$groupName + RuleCheckInfo([string] $in_ruleRegex, [string] $in_layerName, [string] $in_groupName) { $this.ruleRegex = $in_ruleRegex @@ -65,6 +67,7 @@ enum MitigationActionEnum { $g_scriptStartTime = get-date $g_endpointInfoMap = @{} # key = id, value = EndpointInfo +$g_currentVfpPortMap = @{} $g_podRuleCheckList = [System.Collections.Generic.List[RuleCheckInfo]]::new() # array of RuleCheckInfo objects $g_mitigationActionCount = 0 $g_lastMitigationTime = $g_scriptStartTime @@ -73,6 +76,17 @@ $RuleCheckIntervalSecs = $RuleCheckIntervalMins * 60 $SleepIntervalSecs = $SleepIntervalMins * 60 $MinMitigationIntervalSecs = $MinMitigationIntervalMins * 60 +function LogWithTimeStamp( + [string] $msgStr +) +{ + $currentTime = (get-date).ToUniversalTime() + $timestamp = $currentTime.ToShortDateString() + " " + $currentTime.ToLongTimeString() + $msg = $timestamp + " | " + $msgStr + 
write-host $msg +} + + function RulePresentInVfpPortGroup( [PSCustomObject] $portGroup, [RuleCheckInfo] $ruleToCheck @@ -86,14 +100,14 @@ function RulePresentInVfpPortGroup( if ($rule.Id -match $ruleToCheck.ruleRegex) { $ruleFound = $true #$msg = "rule {0} matches regex {1}." -f $rule.Id, $ruleToCheck.ruleRegex - #write-host $msg + #LogWithTimeStamp -msgStr $msg break } } if ($ruleFound -eq $false) { $msg = "rule with regex {0} not found on group {1}" -f $ruleToCheck.ruleRegex, $portGroup.name - write-host $msg + LogWithTimeStamp -msgStr $msg } return $ruleFound } @@ -116,16 +130,17 @@ function RulePresentInVfpPortLayer( } if ($groupFound -eq $false) { $msg = "No group on layer {0} matches name {1}" -f ('"' + $ruleToCheck.layerName + '"'),$ruleToCheck.groupName - write-host $msg + LogWithTimeStamp -msgStr $msg return $false } #$msg = "group {0} found in layer {1}." -f $ruleToCheck.groupName, $ruleToCheck.layerName - #write-host $msg + #LogWithTimeStamp -msgStr $msg return RulePresentInVfpPortGroup -portGroup $layer.groups[$groupIndex] -ruleToCheck $ruleToCheck } + function RulesPresentOnVfpPort( [string] $portId, [System.Collections.Generic.List[RuleCheckInfo]] $rulesToCheck @@ -148,17 +163,17 @@ function RulesPresentOnVfpPort( } if ($layerFound -eq $false) { $msg = "No layer on port {0} matches name {1}" -f $portId, ('"' + $ruleToCheck.layerName + '"') - write-host $msg + LogWithTimeStamp -msgStr $msg return $false } #$msg = "Layer {0} found on port {1}: {2}." -f $ruleToCheck.layerName, $portId, $layers[$layerIndex] - #write-host $msg + #LogWithTimeStamp -msgStr $msg $rulePresentInLayer = RulePresentInVfpPortLayer -layer $layers[$layerIndex] -ruleToCheck $ruleToCheck if ($rulePresentInLayer -eq $false) { $msg = "No rule on port {0} matches regex {1}." 
-f $portId, $ruleToCheck.ruleRegex - write-host $msg + LogWithTimeStamp -msgStr $msg return $false } } @@ -166,6 +181,7 @@ function RulesPresentOnVfpPort( return $true } + function PortIsPodPort( [PSCustomObject] $vfpPortInfo ) @@ -173,22 +189,24 @@ function PortIsPodPort( if ($vfpPortInfo.id -match $g_nonPodPortRegex) { return $false } - return $true } -function RulesAreMissing() { + +function NoteCurrentVfpPorts() +{ $vfpPortList = ((vfpctrl /list-vmswitch-port /format 1 | convertfrom-json).Ports) - $vfpPortMap = @{} + # reset g_currentVfpPortMap to empty map + $g_currentVfpPortMap.Clear() #$msg = "There are {0} ports in VFP." -f $vfpPortList.count - #write-host $msg + #LogWithTimeStamp -msgStr $msg ## Note new endpoint IDs. $priorSize = $g_endpointInfoMap.count foreach ($vfpPort in $vfpPortList) { - $vfpPortMap.Add($vfpPort.Id, $vfpPort) + $g_currentVfpPortMap.Add($vfpPort.Id, $vfpPort) if ($g_endpointInfoMap.ContainsKey($vfpPort.Id) -eq $false) { @@ -199,7 +217,11 @@ function RulesAreMissing() { } $endpointsAdded = $g_endpointInfoMap.count - $priorSize $msg = "new endpoints added to g_endpointInfoMap: {0}" -f $endpointsAdded - write-host $msg + LogWithTimeStamp -msgStr $msg + + $msg = "size of g_currentVfpPortMap: {0}" -f $g_currentVfpPortMap.count + LogWithTimeStamp -msgStr $msg + ## @@ -221,20 +243,21 @@ function RulesAreMissing() { $priorSize = $g_endpointInfoMap.count foreach ($portId in $stalePortIdList) { $msg = "deleting stale endpoint ID {0}" -f $portId - write-host $msg + LogWithTimeStamp -msgStr $msg $g_endpointInfoMap.Remove($portId) } $endpointsDeleted = $g_endpointInfoMap.count - $priorSize $msg = "old endpoints deleted from g_endpointInfoMap: {0}" -f $endpointsDeleted - write-host $msg + LogWithTimeStamp -msgStr $msg ## +} - +function RulesAreMissing() { ## Check pod port rules. 
foreach ($portId in $g_endpointInfoMap.Keys) { - $isPodPort = PortIsPodPort -vfpPortInfo $vfpPortMap[$portId] + $isPodPort = PortIsPodPort -vfpPortInfo $g_currentVfpPortMap[$portId] if ($isPodPort -eq $false) { # this could be external port or host vNIC. Ignore. continue @@ -249,21 +272,22 @@ function RulesAreMissing() { } } - #$msg = "Checking for rules on port name:{0} id:{1}" -f $vfpPortMap[$portId].name,$vfpPortMap[$portId].id - #write-host $msg + #$msg = "Checking for rules on port name:{0} id:{1}" -f $g_currentVfpPortMap[$portId].name,$g_currentVfpPortMap[$portId].id + #LogWithTimeStamp -msgStr $msg $rulesPresent = RulesPresentOnVfpPort -portId $portId -rulesToCheck $g_podRuleCheckList + $g_endpointInfoMap[$portId].ruleCheckCount += 1 + $g_endpointInfoMap[$portId].lastRuleCheckTime = $current_time + if ($rulesPresent -eq $true) { # This port has the necessary rules. - $g_endpointInfoMap[$portId].ruleCheckCount += 1 - $g_endpointInfoMap[$portId].lastRuleCheckTime = $current_time continue } # We reach here when a port does not have the necessary rules for more than RuleCheckIntervalMins. # Mitigation action must be taken. $msg = "Rules missing on VFP port with ID {0} since last {1} minutes" -f $portId,$timeSinceLastCheck.TotalMinutes - write-host $msg + LogWithTimeStamp -msgStr $msg return $true } ## Pod port rule check done. 
@@ -271,6 +295,7 @@ function RulesAreMissing() { return $false } + function ScriptSetup() { foreach ($rule in $PodPortRulesToCheck) { @@ -278,15 +303,19 @@ function ScriptSetup() $g_podRuleCheckList.Add($ruleCheckInfo) } $msg = "Number of pod port rules to check: {0}" -f $g_podRuleCheckList.count - write-host $msg + LogWithTimeStamp -msgStr $msg } + function CheckIfMitigationRequired() { + NoteCurrentVfpPorts + $rulesMissing = RulesAreMissing return $rulesMissing } + function collectLogsBeforeMitigation( [string]$LogsPath ) @@ -303,6 +332,7 @@ function collectLogsBeforeMitigation( } } + function ExecuteMitigationAction() { if ($MitigationActionEnum -eq [MitigationActionEnum]::E_RestartHns) { @@ -336,14 +366,14 @@ function myMain() # current_time could be just after a reboot, or just after a HNS/kube-proxy restart. # We don't hurry yet. We check again after $SleepIntervalSecs. $msg = "Not taking mitigation-action since current time could be just after reboot/HNS/kube-proxy restart." - write-host $msg + LogWithTimeStamp -msgStr $msg sleep ($SleepIntervalSecs) continue } elseif ($g_mitigationActionCount -ge $MaxMitigationCount) { $msg = "Not taking mitigation-action since MaxMitigationCount has been crossed. Shouldn't take action anymore." - write-host $msg + LogWithTimeStamp -msgStr $msg sleep ($SleepIntervalSecs) continue } @@ -351,8 +381,12 @@ function myMain() { $timeToSleepSecs = $MinMitigationIntervalSecs - $timeSinceLastMitigation.TotalSeconds $timeToSleepMins = $timeToSleepSecs / 60 + if ($MinSleepIntervalMins > $timeToSleepMins) { + $timeToSleepSecs = $MinSleepIntervalMins * 60 + $timeToSleepMins = $timeToSleepSecs / 60 + } $msg = "Not taking mitigation-action since it was taken just {0} minutes ago. 
Checking again after {1} minutes" -f $timeSinceLastMitigation.TotalMinutes, $timeToSleepMins - write-host $msg + LogWithTimeStamp -msgStr $msg sleep ($timeToSleepSecs) continue } @@ -360,11 +394,11 @@ function myMain() #### $msg = "Collecting logs before mitigation" - write-host $msg + LogWithTimeStamp -msgStr $msg collectLogsBeforeMitigation -LogsPath $WindowsLogsPath $msg = "Taking mitigation action..." - write-host $msg + LogWithTimeStamp -msgStr $msg ExecuteMitigationAction $g_lastMitigationTime = get-date From 31b10c4e7153048fb870e88b03e9b81e0d3de2ae Mon Sep 17 00:00:00 2001 From: jayanthaMicrosoft Date: Tue, 25 Jul 2023 22:01:11 +0530 Subject: [PATCH 06/20] Update conditionalMitigator.ps1 4 --- .../conditionalMitigator.ps1 | 112 +++++++++--------- 1 file changed, 53 insertions(+), 59 deletions(-) diff --git a/scripts/conditionalMitigator/conditionalMitigator.ps1 b/scripts/conditionalMitigator/conditionalMitigator.ps1 index f80202b..072f384 100644 --- a/scripts/conditionalMitigator/conditionalMitigator.ps1 +++ b/scripts/conditionalMitigator/conditionalMitigator.ps1 @@ -32,7 +32,6 @@ param ( [string] $WindowsLogsPath = "C:\k\debug\ConditionalHnsRestart_data\" ) - class RuleCheckInfo { [string]$ruleRegex [string]$layerName @@ -57,6 +56,7 @@ class EndpointInfo { $this.id = $inId $this.notedTime = $inTime $this.ruleCheckCount = 0 + $this.lastRuleCheckTime = get-date # Initializing with current time because otherwise it would have a garbage value. } } @@ -99,8 +99,6 @@ function RulePresentInVfpPortGroup( $ruleIndex += 1 if ($rule.Id -match $ruleToCheck.ruleRegex) { $ruleFound = $true - #$msg = "rule {0} matches regex {1}." 
-f $rule.Id, $ruleToCheck.ruleRegex - #LogWithTimeStamp -msgStr $msg break } } @@ -113,12 +111,12 @@ function RulePresentInVfpPortGroup( } -function RulePresentInVfpPortLayer( +function IsRulePresentInVfpPortLayer( [PSCustomObject] $layer, [RuleCheckInfo] $ruleToCheck ) { - # first find layer + # find group $groupFound = $false $groupIndex = -1 foreach ($portGroup in $layer.groups) { @@ -134,19 +132,16 @@ function RulePresentInVfpPortLayer( return $false } - #$msg = "group {0} found in layer {1}." -f $ruleToCheck.groupName, $ruleToCheck.layerName - #LogWithTimeStamp -msgStr $msg - return RulePresentInVfpPortGroup -portGroup $layer.groups[$groupIndex] -ruleToCheck $ruleToCheck } -function RulesPresentOnVfpPort( +function CheckForRulesOnVfpPort( [string] $portId, [System.Collections.Generic.List[RuleCheckInfo]] $rulesToCheck ) { - #write-host "RulesPresentOnVfpPort called" + #write-host "CheckForRulesOnVfpPort called" $layers = (vfpctrl /list-rule /port $portId /format 1 | convertfrom-json).Layers foreach ($ruleToCheck in $rulesToCheck) { @@ -167,10 +162,7 @@ function RulesPresentOnVfpPort( return $false } - #$msg = "Layer {0} found on port {1}: {2}." -f $ruleToCheck.layerName, $portId, $layers[$layerIndex] - #LogWithTimeStamp -msgStr $msg - - $rulePresentInLayer = RulePresentInVfpPortLayer -layer $layers[$layerIndex] -ruleToCheck $ruleToCheck + $rulePresentInLayer = IsRulePresentInVfpPortLayer -layer $layers[$layerIndex] -ruleToCheck $ruleToCheck if ($rulePresentInLayer -eq $false) { $msg = "No rule on port {0} matches regex {1}." -f $portId, $ruleToCheck.ruleRegex LogWithTimeStamp -msgStr $msg @@ -199,10 +191,7 @@ function NoteCurrentVfpPorts() # reset g_currentVfpPortMap to empty map $g_currentVfpPortMap.Clear() - #$msg = "There are {0} ports in VFP." -f $vfpPortList.count - #LogWithTimeStamp -msgStr $msg - - ## Note new endpoint IDs. 
+ LogWithTimeStamp -msgStr "Adding new endpoints to g_endpointInfoMap" $priorSize = $g_endpointInfoMap.count foreach ($vfpPort in $vfpPortList) { @@ -215,17 +204,14 @@ function NoteCurrentVfpPorts() $g_endpointInfoMap.Add($vfpPort.Id, $endpointInfo) } } - $endpointsAdded = $g_endpointInfoMap.count - $priorSize - $msg = "new endpoints added to g_endpointInfoMap: {0}" -f $endpointsAdded - LogWithTimeStamp -msgStr $msg - - $msg = "size of g_currentVfpPortMap: {0}" -f $g_currentVfpPortMap.count - LogWithTimeStamp -msgStr $msg - ## + $endpointsAdded = $g_endpointInfoMap.count - $priorSize + LogWithTimeStamp -msgStr ("new endpoints added to g_endpointInfoMap: {0}" -f $endpointsAdded) + LogWithTimeStamp -msgStr ("size of g_currentVfpPortMap: {0}" -f $g_currentVfpPortMap.count) ## Delete stale endpoint IDs, so that g_endpointInfoMap's size does not keep increasing forever. + LogWithTimeStamp -msgStr "Removing deleted endpoints from $g_endpointInfoMap" $stalePortIdList = @() foreach ($portId in $g_endpointInfoMap.Keys) { $portIdPresent = $false @@ -246,7 +232,7 @@ function NoteCurrentVfpPorts() LogWithTimeStamp -msgStr $msg $g_endpointInfoMap.Remove($portId) } - + $endpointsDeleted = $g_endpointInfoMap.count - $priorSize $msg = "old endpoints deleted from g_endpointInfoMap: {0}" -f $endpointsDeleted LogWithTimeStamp -msgStr $msg @@ -264,31 +250,29 @@ function RulesAreMissing() { } $current_time = get-date - $timeSinceLastCheck = $current_time - $g_endpointInfoMap[$portId].lastRuleCheckTime if ($g_endpointInfoMap.ruleCheckCount -gt 0) { + $timeSinceLastCheck = $current_time - $g_endpointInfoMap[$portId].lastRuleCheckTime if ($timeSinceLastCheck.TotalSeconds -lt $RuleCheckIntervalSecs) { # check again later continue } + } else { + $timeSinceLastCheck = $current_time - $g_scriptStartTime } - #$msg = "Checking for rules on port name:{0} id:{1}" -f $g_currentVfpPortMap[$portId].name,$g_currentVfpPortMap[$portId].id - #LogWithTimeStamp -msgStr $msg - - $rulesPresent = 
RulesPresentOnVfpPort -portId $portId -rulesToCheck $g_podRuleCheckList + $rulesPresent = CheckForRulesOnVfpPort -portId $portId -rulesToCheck $g_podRuleCheckList $g_endpointInfoMap[$portId].ruleCheckCount += 1 - $g_endpointInfoMap[$portId].lastRuleCheckTime = $current_time - if ($rulesPresent -eq $true) { - # This port has the necessary rules. - continue + if ($rulesPresent -eq $false) { + # We reach here when a port does not have the necessary rules for more than RuleCheckIntervalMins. + # Mitigation action must be taken. + $msg = "Rules missing on VFP port with ID {0} since atleast last {1} minutes" -f $portId,$timeSinceLastCheck.TotalMinutes + LogWithTimeStamp -msgStr $msg + return $true } - # We reach here when a port does not have the necessary rules for more than RuleCheckIntervalMins. - # Mitigation action must be taken. - $msg = "Rules missing on VFP port with ID {0} since last {1} minutes" -f $portId,$timeSinceLastCheck.TotalMinutes - LogWithTimeStamp -msgStr $msg - return $true + $g_endpointInfoMap[$portId].lastRuleCheckTime = $current_time + # This port has the necessary rules. } ## Pod port rule check done. @@ -320,31 +304,51 @@ function collectLogsBeforeMitigation( [string]$LogsPath ) { - # create log path if not yet created. - mkdir -Force $LogsPath - if ($CollectWindowsLogs -eq $true) { - write-host "collecting windows logs" + # create log path if not yet created. 
+ mkdir -Force $LogsPath + + LogWithTimeStamp -msgStr "collecting windows logs" $originalPath = pwd Set-Location $LogsPath C:\k\debug\collect-windows-logs.ps1 Set-Location $originalPath + + $currentPath = (pwd).Path + LogWithTimeStamp -msgStr ("current location: {0}" -f $currentPath) } } function ExecuteMitigationAction() { - if ($MitigationActionEnum -eq [MitigationActionEnum]::E_RestartHns) { - write-host "restarting HNS" + LogWithTimeStamp -msgStr ("MitigationActionVal is {0}" -f $MitigationActionVal) + + if ($MitigationActionVal -eq [MitigationActionEnum]::E_RestartHns) { + LogWithTimeStamp -msgStr "restarting HNS" restart-service -f hns + } elseif ($MitigationActionVal -eq [MitigationActionEnum]::E_RestartKubeProxy) { + LogWithTimeStamp -msgStr "restarting kubeproxy" + restart-service -f kubeproxy } } +function SleepInfinitely() { + while(1) { + sleep($SleepIntervalSecs) + } +} function myMain() { ScriptSetup + + if ($PauseAtBeginning -eq $true) { + $msg = "Script started. Current time could be just after reboot/HNS/kube-proxy restart. Sleeping for few mins before starting mitigation-checks." + LogWithTimeStamp -msgStr $msg + sleep ($SleepIntervalSecs) + } + while ($true) { write-host "" @@ -361,21 +365,11 @@ function myMain() #### # Conditions for not mitigating. - if ($scriptAge.TotalMinutes -lt $SleepIntervalMins) - { - # current_time could be just after a reboot, or just after a HNS/kube-proxy restart. - # We don't hurry yet. We check again after $SleepIntervalSecs. - $msg = "Not taking mitigation-action since current time could be just after reboot/HNS/kube-proxy restart." - LogWithTimeStamp -msgStr $msg - sleep ($SleepIntervalSecs) - continue - } - elseif ($g_mitigationActionCount -ge $MaxMitigationCount) + if ($g_mitigationActionCount -ge $MaxMitigationCount) { - $msg = "Not taking mitigation-action since MaxMitigationCount has been crossed. Shouldn't take action anymore." + $msg = "Not taking mitigation-action since MaxMitigationCount has been crossed. 
Going to infinite sleep." LogWithTimeStamp -msgStr $msg - sleep ($SleepIntervalSecs) - continue + SleepInfinitely } elseif ($timeSinceLastMitigation.TotalSeconds -lt $MinMitigationIntervalSecs) { From a287f1c34f38bb1f3753f4875aa645d7357226d6 Mon Sep 17 00:00:00 2001 From: jayanthaMicrosoft Date: Tue, 25 Jul 2023 22:11:22 +0530 Subject: [PATCH 07/20] Update conditionalMitigator.ps1 5 --- scripts/conditionalMitigator/conditionalMitigator.ps1 | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/conditionalMitigator/conditionalMitigator.ps1 b/scripts/conditionalMitigator/conditionalMitigator.ps1 index 072f384..ca75384 100644 --- a/scripts/conditionalMitigator/conditionalMitigator.ps1 +++ b/scripts/conditionalMitigator/conditionalMitigator.ps1 @@ -28,6 +28,9 @@ param ( [Parameter(Mandatory=$false)] [bool] $CollectWindowsLogs = $true, + [Parameter(Mandatory=$false)] + [bool] $PauseAtBeginning = $true, + [Parameter(Mandatory=$false)] [string] $WindowsLogsPath = "C:\k\debug\ConditionalHnsRestart_data\" ) From 1dae926e98fa5301d3c958f03382e1e50b313f83 Mon Sep 17 00:00:00 2001 From: jayanthaMicrosoft Date: Tue, 25 Jul 2023 22:24:21 +0530 Subject: [PATCH 08/20] Update conditionalMitigator.ps1 6 --- scripts/conditionalMitigator/conditionalMitigator.ps1 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/conditionalMitigator/conditionalMitigator.ps1 b/scripts/conditionalMitigator/conditionalMitigator.ps1 index ca75384..5530b0d 100644 --- a/scripts/conditionalMitigator/conditionalMitigator.ps1 +++ b/scripts/conditionalMitigator/conditionalMitigator.ps1 @@ -90,7 +90,7 @@ function LogWithTimeStamp( } -function RulePresentInVfpPortGroup( +function IsRulePresentInVfpPortGroup( [PSCustomObject] $portGroup, [RuleCheckInfo] $ruleToCheck ) @@ -135,7 +135,7 @@ function IsRulePresentInVfpPortLayer( return $false } - return RulePresentInVfpPortGroup -portGroup $layer.groups[$groupIndex] -ruleToCheck $ruleToCheck + return IsRulePresentInVfpPortGroup 
-portGroup $layer.groups[$groupIndex] -ruleToCheck $ruleToCheck } From 6b625c14be26cb2e9bb7333e65a1b4b38b542d1a Mon Sep 17 00:00:00 2001 From: jayanthaMicrosoft Date: Wed, 26 Jul 2023 09:17:44 +0530 Subject: [PATCH 09/20] Update conditionalMitigator.ps1 6 --- .../conditionalMitigator.ps1 | 45 +++++++------------ 1 file changed, 16 insertions(+), 29 deletions(-) diff --git a/scripts/conditionalMitigator/conditionalMitigator.ps1 b/scripts/conditionalMitigator/conditionalMitigator.ps1 index 5530b0d..54e5827 100644 --- a/scripts/conditionalMitigator/conditionalMitigator.ps1 +++ b/scripts/conditionalMitigator/conditionalMitigator.ps1 @@ -90,7 +90,7 @@ function LogWithTimeStamp( } -function IsRulePresentInVfpPortGroup( +function RulePresentInVfpPortGroup( [PSCustomObject] $portGroup, [RuleCheckInfo] $ruleToCheck ) @@ -107,8 +107,7 @@ function IsRulePresentInVfpPortGroup( } if ($ruleFound -eq $false) { - $msg = "rule with regex {0} not found on group {1}" -f $ruleToCheck.ruleRegex, $portGroup.name - LogWithTimeStamp -msgStr $msg + LogWithTimeStamp -msgStr ("rule with regex {0} not found on group {1}" -f $ruleToCheck.ruleRegex, $portGroup.name) } return $ruleFound } @@ -130,12 +129,11 @@ function IsRulePresentInVfpPortLayer( } } if ($groupFound -eq $false) { - $msg = "No group on layer {0} matches name {1}" -f ('"' + $ruleToCheck.layerName + '"'),$ruleToCheck.groupName - LogWithTimeStamp -msgStr $msg + LogWithTimeStamp -msgStr ("No group on layer {0} matches name {1}" -f ('"' + $ruleToCheck.layerName + '"'),$ruleToCheck.groupName) return $false } - return IsRulePresentInVfpPortGroup -portGroup $layer.groups[$groupIndex] -ruleToCheck $ruleToCheck + return RulePresentInVfpPortGroup -portGroup $layer.groups[$groupIndex] -ruleToCheck $ruleToCheck } @@ -160,15 +158,13 @@ function CheckForRulesOnVfpPort( } } if ($layerFound -eq $false) { - $msg = "No layer on port {0} matches name {1}" -f $portId, ('"' + $ruleToCheck.layerName + '"') - LogWithTimeStamp -msgStr $msg + 
LogWithTimeStamp -msgStr ("No layer on port {0} matches name {1}" -f $portId, ('"' + $ruleToCheck.layerName + '"')) return $false } $rulePresentInLayer = IsRulePresentInVfpPortLayer -layer $layers[$layerIndex] -ruleToCheck $ruleToCheck if ($rulePresentInLayer -eq $false) { - $msg = "No rule on port {0} matches regex {1}." -f $portId, $ruleToCheck.ruleRegex - LogWithTimeStamp -msgStr $msg + LogWithTimeStamp -msgStr ("No rule on port {0} matches regex {1}." -f $portId, $ruleToCheck.ruleRegex) return $false } } @@ -214,7 +210,7 @@ function NoteCurrentVfpPorts() LogWithTimeStamp -msgStr ("size of g_currentVfpPortMap: {0}" -f $g_currentVfpPortMap.count) ## Delete stale endpoint IDs, so that g_endpointInfoMap's size does not keep increasing forever. - LogWithTimeStamp -msgStr "Removing deleted endpoints from $g_endpointInfoMap" + LogWithTimeStamp -msgStr "Removing deleted endpoints from g_endpointInfoMap" $stalePortIdList = @() foreach ($portId in $g_endpointInfoMap.Keys) { $portIdPresent = $false @@ -231,14 +227,12 @@ function NoteCurrentVfpPorts() } $priorSize = $g_endpointInfoMap.count foreach ($portId in $stalePortIdList) { - $msg = "deleting stale endpoint ID {0}" -f $portId - LogWithTimeStamp -msgStr $msg + LogWithTimeStamp -msgStr ("deleting stale endpoint ID {0}" -f $portId) $g_endpointInfoMap.Remove($portId) } $endpointsDeleted = $g_endpointInfoMap.count - $priorSize - $msg = "old endpoints deleted from g_endpointInfoMap: {0}" -f $endpointsDeleted - LogWithTimeStamp -msgStr $msg + LogWithTimeStamp -msgStr ("old endpoints deleted from g_endpointInfoMap: {0}" -f $endpointsDeleted) ## } @@ -269,8 +263,7 @@ function RulesAreMissing() { if ($rulesPresent -eq $false) { # We reach here when a port does not have the necessary rules for more than RuleCheckIntervalMins. # Mitigation action must be taken. 
- $msg = "Rules missing on VFP port with ID {0} since atleast last {1} minutes" -f $portId,$timeSinceLastCheck.TotalMinutes - LogWithTimeStamp -msgStr $msg + LogWithTimeStamp -msgStr ("Rules missing on VFP port with ID {0} since atleast last {1} minutes" -f $portId,$timeSinceLastCheck.TotalMinutes) return $true } @@ -289,8 +282,7 @@ function ScriptSetup() $ruleCheckInfo = [RuleCheckInfo]::New($rule.ruleRegex, $rule.layerName, $rule.groupName) $g_podRuleCheckList.Add($ruleCheckInfo) } - $msg = "Number of pod port rules to check: {0}" -f $g_podRuleCheckList.count - LogWithTimeStamp -msgStr $msg + LogWithTimeStamp -msgStr ("Number of pod port rules to check: {0}" -f $g_podRuleCheckList.count) } @@ -347,8 +339,7 @@ function myMain() ScriptSetup if ($PauseAtBeginning -eq $true) { - $msg = "Script started. Current time could be just after reboot/HNS/kube-proxy restart. Sleeping for few mins before starting mitigation-checks." - LogWithTimeStamp -msgStr $msg + LogWithTimeStamp -msgStr ("Script started. Current time could be just after reboot/HNS/kube-proxy restart. Sleeping for few mins before starting mitigation-checks.") sleep ($SleepIntervalSecs) } @@ -370,8 +361,7 @@ function myMain() # Conditions for not mitigating. if ($g_mitigationActionCount -ge $MaxMitigationCount) { - $msg = "Not taking mitigation-action since MaxMitigationCount has been crossed. Going to infinite sleep." - LogWithTimeStamp -msgStr $msg + LogWithTimeStamp -msgStr ("Not taking mitigation-action since MaxMitigationCount has been crossed. Going to infinite sleep.") SleepInfinitely } elseif ($timeSinceLastMitigation.TotalSeconds -lt $MinMitigationIntervalSecs) @@ -382,20 +372,17 @@ function myMain() $timeToSleepSecs = $MinSleepIntervalMins * 60 $timeToSleepMins = $timeToSleepSecs / 60 } - $msg = "Not taking mitigation-action since it was taken just {0} minutes ago. 
Checking again after {1} minutes" -f $timeSinceLastMitigation.TotalMinutes, $timeToSleepMins - LogWithTimeStamp -msgStr $msg + LogWithTimeStamp -msgStr ("Not taking mitigation-action since it was taken just {0} minutes ago. Checking again after {1} minutes" -f $timeSinceLastMitigation.TotalMinutes, $timeToSleepMins) sleep ($timeToSleepSecs) continue } # All negative cases (i.e., conditions to not mitigate end here.) #### - $msg = "Collecting logs before mitigation" - LogWithTimeStamp -msgStr $msg + LogWithTimeStamp -msgStr ("Collecting logs before mitigation") collectLogsBeforeMitigation -LogsPath $WindowsLogsPath - $msg = "Taking mitigation action..." - LogWithTimeStamp -msgStr $msg + LogWithTimeStamp -msgStr ("Taking mitigation action...") ExecuteMitigationAction $g_lastMitigationTime = get-date From c3379750d14e281a568ba912485cd7de3c44618d Mon Sep 17 00:00:00 2001 From: jayanthaMicrosoft Date: Wed, 26 Jul 2023 10:14:49 +0530 Subject: [PATCH 10/20] Update conditionalMitigator.ps1 8 --- .../conditionalMitigator.ps1 | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/scripts/conditionalMitigator/conditionalMitigator.ps1 b/scripts/conditionalMitigator/conditionalMitigator.ps1 index 54e5827..df5a591 100644 --- a/scripts/conditionalMitigator/conditionalMitigator.ps1 +++ b/scripts/conditionalMitigator/conditionalMitigator.ps1 @@ -85,12 +85,12 @@ function LogWithTimeStamp( { $currentTime = (get-date).ToUniversalTime() $timestamp = $currentTime.ToShortDateString() + " " + $currentTime.ToLongTimeString() - $msg = $timestamp + " | " + $msgStr + $msg = (hostname) + " " + $timestamp + " | " + $msgStr write-host $msg } -function RulePresentInVfpPortGroup( +function IsRulePresentInVfpPortGroup( [PSCustomObject] $portGroup, [RuleCheckInfo] $ruleToCheck ) @@ -133,7 +133,7 @@ function IsRulePresentInVfpPortLayer( return $false } - return RulePresentInVfpPortGroup -portGroup $layer.groups[$groupIndex] -ruleToCheck $ruleToCheck + return 
IsRulePresentInVfpPortGroup -portGroup $layer.groups[$groupIndex] -ruleToCheck $ruleToCheck } @@ -263,7 +263,7 @@ function RulesAreMissing() { if ($rulesPresent -eq $false) { # We reach here when a port does not have the necessary rules for more than RuleCheckIntervalMins. # Mitigation action must be taken. - LogWithTimeStamp -msgStr ("Rules missing on VFP port with ID {0} since atleast last {1} minutes" -f $portId,$timeSinceLastCheck.TotalMinutes) + LogWithTimeStamp -msgStr ("Rules missing on VFP port with ID {0} since atleast last {1:N2} minutes" -f $portId,$timeSinceLastCheck.TotalMinutes) return $true } @@ -317,8 +317,6 @@ function collectLogsBeforeMitigation( function ExecuteMitigationAction() { - LogWithTimeStamp -msgStr ("MitigationActionVal is {0}" -f $MitigationActionVal) - if ($MitigationActionVal -eq [MitigationActionEnum]::E_RestartHns) { LogWithTimeStamp -msgStr "restarting HNS" restart-service -f hns @@ -340,7 +338,7 @@ function myMain() if ($PauseAtBeginning -eq $true) { LogWithTimeStamp -msgStr ("Script started. Current time could be just after reboot/HNS/kube-proxy restart. Sleeping for few mins before starting mitigation-checks.") - sleep ($SleepIntervalSecs) + sleep($SleepIntervalSecs) } while ($true) @@ -349,7 +347,7 @@ function myMain() $mitigationRequired = CheckIfMitigationRequired if ($mitigationRequired -eq $false) { - sleep ($SleepIntervalSecs) + sleep($SleepIntervalSecs) continue } @@ -372,8 +370,8 @@ function myMain() $timeToSleepSecs = $MinSleepIntervalMins * 60 $timeToSleepMins = $timeToSleepSecs / 60 } - LogWithTimeStamp -msgStr ("Not taking mitigation-action since it was taken just {0} minutes ago. Checking again after {1} minutes" -f $timeSinceLastMitigation.TotalMinutes, $timeToSleepMins) - sleep ($timeToSleepSecs) + LogWithTimeStamp -msgStr ("Not taking mitigation-action since it was taken just {0:N2} minutes ago. 
Checking again after {1:N2} minutes" -f $timeSinceLastMitigation.TotalMinutes, $timeToSleepMins) + sleep($timeToSleepSecs) continue } # All negative cases (i.e., conditions to not mitigate end here.) @@ -387,6 +385,7 @@ function myMain() $g_lastMitigationTime = get-date $g_mitigationActionCount += 1 + sleep($MinMitigationIntervalSecs) } } From 48dcb040005a526566fd634cc9aaee3001504d85 Mon Sep 17 00:00:00 2001 From: jayanthaMicrosoft Date: Wed, 26 Jul 2023 10:49:06 +0530 Subject: [PATCH 11/20] Update conditionalMitigator.ps1 9 --- scripts/conditionalMitigator/conditionalMitigator.ps1 | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/scripts/conditionalMitigator/conditionalMitigator.ps1 b/scripts/conditionalMitigator/conditionalMitigator.ps1 index df5a591..8a571b5 100644 --- a/scripts/conditionalMitigator/conditionalMitigator.ps1 +++ b/scripts/conditionalMitigator/conditionalMitigator.ps1 @@ -190,7 +190,7 @@ function NoteCurrentVfpPorts() # reset g_currentVfpPortMap to empty map $g_currentVfpPortMap.Clear() - LogWithTimeStamp -msgStr "Adding new endpoints to g_endpointInfoMap" + LogWithTimeStamp -msgStr "Checking if new endpoints have been added" $priorSize = $g_endpointInfoMap.count foreach ($vfpPort in $vfpPortList) { @@ -210,7 +210,7 @@ function NoteCurrentVfpPorts() LogWithTimeStamp -msgStr ("size of g_currentVfpPortMap: {0}" -f $g_currentVfpPortMap.count) ## Delete stale endpoint IDs, so that g_endpointInfoMap's size does not keep increasing forever. 
- LogWithTimeStamp -msgStr "Removing deleted endpoints from g_endpointInfoMap" + LogWithTimeStamp -msgStr "Checking if any endpoints have been deleted" $stalePortIdList = @() foreach ($portId in $g_endpointInfoMap.Keys) { $portIdPresent = $false @@ -227,11 +227,10 @@ function NoteCurrentVfpPorts() } $priorSize = $g_endpointInfoMap.count foreach ($portId in $stalePortIdList) { - LogWithTimeStamp -msgStr ("deleting stale endpoint ID {0}" -f $portId) $g_endpointInfoMap.Remove($portId) } - $endpointsDeleted = $g_endpointInfoMap.count - $priorSize + $endpointsDeleted = $priorSize - $g_endpointInfoMap.count LogWithTimeStamp -msgStr ("old endpoints deleted from g_endpointInfoMap: {0}" -f $endpointsDeleted) ## } From f70a62cdff6e1aa5d0448522f3c975a6ce0bd32a Mon Sep 17 00:00:00 2001 From: jayanthaMicrosoft Date: Wed, 26 Jul 2023 16:16:09 +0530 Subject: [PATCH 12/20] Update conditionalMitigator.ps1 10 --- scripts/conditionalMitigator/conditionalMitigator.ps1 | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/conditionalMitigator/conditionalMitigator.ps1 b/scripts/conditionalMitigator/conditionalMitigator.ps1 index 8a571b5..6bf8b3c 100644 --- a/scripts/conditionalMitigator/conditionalMitigator.ps1 +++ b/scripts/conditionalMitigator/conditionalMitigator.ps1 @@ -376,7 +376,6 @@ function myMain() # All negative cases (i.e., conditions to not mitigate end here.) 
#### - LogWithTimeStamp -msgStr ("Collecting logs before mitigation") collectLogsBeforeMitigation -LogsPath $WindowsLogsPath LogWithTimeStamp -msgStr ("Taking mitigation action...") From 97ba42ce0c210907cee2d9c0d8242b0856f7b4ee Mon Sep 17 00:00:00 2001 From: jayanthaMicrosoft Date: Wed, 26 Jul 2023 18:56:06 +0530 Subject: [PATCH 13/20] Update conditionalMitigator.ps1 11 --- .../conditionalMitigator.ps1 | 32 ++++++++++++------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/scripts/conditionalMitigator/conditionalMitigator.ps1 b/scripts/conditionalMitigator/conditionalMitigator.ps1 index 6bf8b3c..41ec137 100644 --- a/scripts/conditionalMitigator/conditionalMitigator.ps1 +++ b/scripts/conditionalMitigator/conditionalMitigator.ps1 @@ -54,10 +54,10 @@ class EndpointInfo { [int] $ruleCheckCount [System.DateTime]$lastRuleCheckTime - EndpointInfo([string] $inId, [System.DateTime] $inTime) + EndpointInfo([string] $inId) { $this.id = $inId - $this.notedTime = $inTime + $this.notedTime = get-date $this.ruleCheckCount = 0 $this.lastRuleCheckTime = get-date # Initializing with current time because otherwise it would have a garbage value. 
} @@ -198,8 +198,7 @@ function NoteCurrentVfpPorts() if ($g_endpointInfoMap.ContainsKey($vfpPort.Id) -eq $false) { - $notedTime = get-date - $endpointInfo = [EndpointInfo]::New($vfpPort.Id, $notedTime) + $endpointInfo = [EndpointInfo]::New($vfpPort.Id) $g_endpointInfoMap.Add($vfpPort.Id, $endpointInfo) } } @@ -246,14 +245,13 @@ function RulesAreMissing() { } $current_time = get-date + $timeSinceLastCheck = $current_time - $g_endpointInfoMap[$portId].lastRuleCheckTime + if ($g_endpointInfoMap.ruleCheckCount -gt 0) { - $timeSinceLastCheck = $current_time - $g_endpointInfoMap[$portId].lastRuleCheckTime if ($timeSinceLastCheck.TotalSeconds -lt $RuleCheckIntervalSecs) { # check again later continue } - } else { - $timeSinceLastCheck = $current_time - $g_scriptStartTime } $rulesPresent = CheckForRulesOnVfpPort -portId $portId -rulesToCheck $g_podRuleCheckList @@ -314,14 +312,25 @@ function collectLogsBeforeMitigation( } +function RestartWinService( + [string]$serviceName +) { + $oldPid = (Get-WmiObject -Class Win32_Service -Filter "Name LIKE '$serviceName'" | Select-Object -ExpandProperty ProcessId).ToString() + LogWithTimeStamp -msgStr ("Current {0} pid: {1}. 
Restarting {0}" -f $serviceName,$oldPid) + + restart-service -f $serviceName + + $newPid = (Get-WmiObject -Class Win32_Service -Filter "Name LIKE '$serviceName'" | Select-Object -ExpandProperty ProcessId).ToString() + LogWithTimeStamp -msgStr ("{0} pid after restart: {1}" -f $serviceName,$newPid) +} + + function ExecuteMitigationAction() { if ($MitigationActionVal -eq [MitigationActionEnum]::E_RestartHns) { - LogWithTimeStamp -msgStr "restarting HNS" - restart-service -f hns + RestartWinService -serviceName "Hns" } elseif ($MitigationActionVal -eq [MitigationActionEnum]::E_RestartKubeProxy) { - LogWithTimeStamp -msgStr "restarting kubeproxy" - restart-service -f kubeproxy + RestartWinService -serviceName "kubeproxy" } } @@ -383,6 +392,7 @@ function myMain() $g_lastMitigationTime = get-date $g_mitigationActionCount += 1 + LogWithTimeStamp -msgStr ("Mitigation done {0} times." -f $g_mitigationActionCount) sleep($MinMitigationIntervalSecs) } } From 276f4fc715bf9de3fe5ef223a45970f118bf2e04 Mon Sep 17 00:00:00 2001 From: jayanthaMicrosoft Date: Wed, 26 Jul 2023 19:09:38 +0530 Subject: [PATCH 14/20] Update conditionalMitigator.ps1 12 --- scripts/conditionalMitigator/conditionalMitigator.ps1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/conditionalMitigator/conditionalMitigator.ps1 b/scripts/conditionalMitigator/conditionalMitigator.ps1 index 41ec137..34427f4 100644 --- a/scripts/conditionalMitigator/conditionalMitigator.ps1 +++ b/scripts/conditionalMitigator/conditionalMitigator.ps1 @@ -370,7 +370,7 @@ function myMain() LogWithTimeStamp -msgStr ("Not taking mitigation-action since MaxMitigationCount has been crossed. 
Going to infinite sleep.") SleepInfinitely } - elseif ($timeSinceLastMitigation.TotalSeconds -lt $MinMitigationIntervalSecs) + elseif (($g_mitigationActionCount -gt 0) -And ($timeSinceLastMitigation.TotalSeconds -lt $MinMitigationIntervalSecs)) { $timeToSleepSecs = $MinMitigationIntervalSecs - $timeSinceLastMitigation.TotalSeconds $timeToSleepMins = $timeToSleepSecs / 60 From e143e4bb9233d3adada9dc29bef2a573a856c0fc Mon Sep 17 00:00:00 2001 From: jayanthaMicrosoft Date: Wed, 2 Aug 2023 11:05:08 +0530 Subject: [PATCH 15/20] Create crashEventChecker.yaml hnsCrashEventChecker.yaml is a daemonset that looks for HNS crash win-event logs. Once applied on a Kubernetes cluster, the user can execute the command below to see the timestamps at which HNS crashes occurred on the respective Windows nodes: kubectl logs -l name=hns-crash-event-checker --all-containers=true --- .../hnsCrashEventChecker.yaml | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 scripts/crashEventChecker/hnsCrashEventChecker.yaml diff --git a/scripts/crashEventChecker/hnsCrashEventChecker.yaml b/scripts/crashEventChecker/hnsCrashEventChecker.yaml new file mode 100644 index 0000000..2e892cb --- /dev/null +++ b/scripts/crashEventChecker/hnsCrashEventChecker.yaml @@ -0,0 +1,54 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: hns-crash-event-checker + labels: + app: hns-crash-event-checker +spec: + selector: + matchLabels: + name: hns-crash-event-checker + template: + metadata: + labels: + name: hns-crash-event-checker + spec: + securityContext: + windowsOptions: + hostProcess: true + runAsUserName: "NT AUTHORITY\\SYSTEM" + hostNetwork: true + containers: + - name: hns-crash-event-checker + image: mcr.microsoft.com/windows/nanoserver:ltsc2022 + args: + - powershell.exe + - -Command + - | + $crashEventList = @() + (Get-WinEvent -FilterHashtable @{logname = 'System'; ProviderName = 'Service Control Manager' } | Select-Object -Property TimeCreated, Id, LevelDisplayName, Message | 
Where-Object Message -like "*The Host Network Service service terminated unexpectedly*") ` + | ForEach-Object -Begin {} -Process {$crashEventList += $_} -End {} + + + $CrashInstancesStr = "" + + $i = 0 + foreach ($crashEvent in $crashEventList) + { + if ($i -eq 0) { + $CrashInstancesStr += $crashEvent.TimeCreated.ToString() + } else { + $CrashInstancesStr += ", " + $crashEvent.TimeCreated.ToString() + } + $i += 1 + } + + $(Hostname) + " : " + $CrashInstancesStr + + While ($true) { + Start-Sleep -Seconds 600 + } + + imagePullPolicy: IfNotPresent + nodeSelector: + kubernetes.azure.com/os-sku: Windows2022 From fa2d66cce27a6a8087ef91a379030f6968e92c57 Mon Sep 17 00:00:00 2001 From: jayanthaMicrosoft Date: Wed, 2 Aug 2023 11:15:21 +0530 Subject: [PATCH 16/20] Rename hnsCrashEventChecker.yaml to crashEventChecker.yaml Making the daemonset generic such that it can be used for any service (not just HNS). The $ServiceName variable needs to be edited for this. --- .../crashEventChecker/crashEventChecker.yaml | 25 +++++++++ .../hnsCrashEventChecker.yaml | 54 ------------------- 2 files changed, 25 insertions(+), 54 deletions(-) create mode 100644 scripts/crashEventChecker/crashEventChecker.yaml delete mode 100644 scripts/crashEventChecker/hnsCrashEventChecker.yaml diff --git a/scripts/crashEventChecker/crashEventChecker.yaml b/scripts/crashEventChecker/crashEventChecker.yaml new file mode 100644 index 0000000..c29613d --- /dev/null +++ b/scripts/crashEventChecker/crashEventChecker.yaml @@ -0,0 +1,25 @@ +$ServiceName = "Host Network Service" +$CrashEventList = @() +$MsgMatchString = "*The {0} service terminated unexpectedly*" -f $ServiceName +(Get-WinEvent -FilterHashtable @{logname = 'System'; ProviderName = 'Service Control Manager' } | Select-Object -Property TimeCreated, Id, LevelDisplayName, Message | Where-Object Message -like $MsgMatchString) ` + | ForEach-Object -Begin {} -Process {$CrashEventList += $_} -End {} + + +$CrashInstancesStr = "" + +$i = 0 +foreach 
($crashEvent in $CrashEventList) +{ + if ($i -eq 0) { + $CrashInstancesStr += $crashEvent.TimeCreated.ToString() + } else { + $CrashInstancesStr += ", " + $crashEvent.TimeCreated.ToString() + } + $i += 1 +} + +$(Hostname) + " : " + $CrashInstancesStr + +While ($true) { + Start-Sleep -Seconds 600 +} diff --git a/scripts/crashEventChecker/hnsCrashEventChecker.yaml b/scripts/crashEventChecker/hnsCrashEventChecker.yaml deleted file mode 100644 index 2e892cb..0000000 --- a/scripts/crashEventChecker/hnsCrashEventChecker.yaml +++ /dev/null @@ -1,54 +0,0 @@ -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: hns-crash-event-checker - labels: - app: hns-crash-event-checker -spec: - selector: - matchLabels: - name: hns-crash-event-checker - template: - metadata: - labels: - name: hns-crash-event-checker - spec: - securityContext: - windowsOptions: - hostProcess: true - runAsUserName: "NT AUTHORITY\\SYSTEM" - hostNetwork: true - containers: - - name: hns-crash-event-checker - image: mcr.microsoft.com/windows/nanoserver:ltsc2022 - args: - - powershell.exe - - -Command - - | - $crashEventList = @() - (Get-WinEvent -FilterHashtable @{logname = 'System'; ProviderName = 'Service Control Manager' } | Select-Object -Property TimeCreated, Id, LevelDisplayName, Message | Where-Object Message -like "*The Host Network Service service terminated unexpectedly*") ` - | ForEach-Object -Begin {} -Process {$crashEventList += $_} -End {} - - - $CrashInstancesStr = "" - - $i = 0 - foreach ($crashEvent in $crashEventList) - { - if ($i -eq 0) { - $CrashInstancesStr += $crashEvent.TimeCreated.ToString() - } else { - $CrashInstancesStr += ", " + $crashEvent.TimeCreated.ToString() - } - $i += 1 - } - - $(Hostname) + " : " + $CrashInstancesStr - - While ($true) { - Start-Sleep -Seconds 600 - } - - imagePullPolicy: IfNotPresent - nodeSelector: - kubernetes.azure.com/os-sku: Windows2022 From 57890b4800582e1f013bccfaa6169a2c1f7461e8 Mon Sep 17 00:00:00 2001 From: jayanthaMicrosoft Date: Wed, 2 Aug 
2023 11:17:34 +0530 Subject: [PATCH 17/20] Correct crashEventChecker.yaml Last edit missed the daemonset code. --- .../crashEventChecker/crashEventChecker.yaml | 71 +++++++++++++------ 1 file changed, 51 insertions(+), 20 deletions(-) diff --git a/scripts/crashEventChecker/crashEventChecker.yaml b/scripts/crashEventChecker/crashEventChecker.yaml index c29613d..be409d7 100644 --- a/scripts/crashEventChecker/crashEventChecker.yaml +++ b/scripts/crashEventChecker/crashEventChecker.yaml @@ -1,25 +1,56 @@ -$ServiceName = "Host Network Service" -$CrashEventList = @() -$MsgMatchString = "*The {0} service terminated unexpectedly*" -f $ServiceName -(Get-WinEvent -FilterHashtable @{logname = 'System'; ProviderName = 'Service Control Manager' } | Select-Object -Property TimeCreated, Id, LevelDisplayName, Message | Where-Object Message -like $MsgMatchString) ` - | ForEach-Object -Begin {} -Process {$CrashEventList += $_} -End {} +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: hns-crash-event-checker + labels: + app: hns-crash-event-checker +spec: + selector: + matchLabels: + name: hns-crash-event-checker + template: + metadata: + labels: + name: hns-crash-event-checker + spec: + securityContext: + windowsOptions: + hostProcess: true + runAsUserName: "NT AUTHORITY\\SYSTEM" + hostNetwork: true + containers: + - name: hns-crash-event-checker + image: mcr.microsoft.com/windows/nanoserver:ltsc2022 + args: + - powershell.exe + - -Command + - | + $ServiceName = "Host Network Service" + $CrashEventList = @() + $MsgMatchString = "*The {0} service terminated unexpectedly*" -f $ServiceName + (Get-WinEvent -FilterHashtable @{logname = 'System'; ProviderName = 'Service Control Manager' } | Select-Object -Property TimeCreated, Id, LevelDisplayName, Message | Where-Object Message -like $MsgMatchString) ` + | ForEach-Object -Begin {} -Process {$CrashEventList += $_} -End {} -$CrashInstancesStr = "" + $CrashInstancesStr = "" -$i = 0 -foreach ($crashEvent in $CrashEventList) -{ - if 
($i -eq 0) { - $CrashInstancesStr += $crashEvent.TimeCreated.ToString() - } else { - $CrashInstancesStr += ", " + $crashEvent.TimeCreated.ToString() - } - $i += 1 -} + $i = 0 + foreach ($crashEvent in $CrashEventList) + { + if ($i -eq 0) { + $CrashInstancesStr += $crashEvent.TimeCreated.ToString() + } else { + $CrashInstancesStr += ", " + $crashEvent.TimeCreated.ToString() + } + $i += 1 + } -$(Hostname) + " : " + $CrashInstancesStr + $(Hostname) + " : " + $CrashInstancesStr -While ($true) { - Start-Sleep -Seconds 600 -} + While ($true) { + Start-Sleep -Seconds 600 + } + + imagePullPolicy: IfNotPresent + nodeSelector: + kubernetes.azure.com/os-sku: Windows2022 From a66dcc4613288f4541a6c389c48c2e60b04866d4 Mon Sep 17 00:00:00 2001 From: jayanthaMicrosoft Date: Wed, 2 Aug 2023 11:18:07 +0530 Subject: [PATCH 18/20] Update crashEventChecker.yaml --- scripts/crashEventChecker/crashEventChecker.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/crashEventChecker/crashEventChecker.yaml b/scripts/crashEventChecker/crashEventChecker.yaml index be409d7..4376dbf 100644 --- a/scripts/crashEventChecker/crashEventChecker.yaml +++ b/scripts/crashEventChecker/crashEventChecker.yaml @@ -1,17 +1,17 @@ apiVersion: apps/v1 kind: DaemonSet metadata: - name: hns-crash-event-checker + name: crash-event-checker labels: - app: hns-crash-event-checker + app: crash-event-checker spec: selector: matchLabels: - name: hns-crash-event-checker + name: crash-event-checker template: metadata: labels: - name: hns-crash-event-checker + name: crash-event-checker spec: securityContext: windowsOptions: @@ -19,7 +19,7 @@ spec: runAsUserName: "NT AUTHORITY\\SYSTEM" hostNetwork: true containers: - - name: hns-crash-event-checker + - name: crash-event-checker image: mcr.microsoft.com/windows/nanoserver:ltsc2022 args: - powershell.exe From e32cbd214c3c4a46e21d7597bc2e8240ad49c857 Mon Sep 17 00:00:00 2001 From: jayanthaMicrosoft Date: Wed, 2 Aug 2023 20:03:03 +0530 
Subject: [PATCH 19/20] Create CrashDumpEnabler This daemonset enables collection of crash-dumps for user-space processes on AKS windows nodes. --- scripts/CrashDumpEnabler | 59 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 scripts/CrashDumpEnabler diff --git a/scripts/CrashDumpEnabler b/scripts/CrashDumpEnabler new file mode 100644 index 0000000..f644860 --- /dev/null +++ b/scripts/CrashDumpEnabler @@ -0,0 +1,59 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: crash-dump-enabler + labels: + app: crash-dump-enabler +spec: + selector: + matchLabels: + name: crash-dump-enabler + template: + metadata: + labels: + name: crash-dump-enabler + spec: + securityContext: + windowsOptions: + hostProcess: true + runAsUserName: "NT AUTHORITY\\SYSTEM" + hostNetwork: true + containers: + - name: crash-dump-enabler + image: mcr.microsoft.com/windows/nanoserver:ltsc2022 + args: + - powershell.exe + - -Command + - | + $crashDumpsPath = "C:\k\debug\LocalDumps" + mkdir $crashDumpsPath -Force + + # Prepare ACL properties + $FileSystemRights = "Read" + $AccessControlType = "Allow" + $IdentityReference = "BUILTIN\Users" + + # Add ACL properties to a FileSystemAccessRule object + $fileSystemAccessRuleArgumentList = $IdentityReference, $FileSystemRights, $AccessControlType + $fileSystemAccessRule = New-Object -TypeName System.Security.AccessControl.FileSystemAccessRule -ArgumentList $fileSystemAccessRuleArgumentList + + # We modify current ACL on crashDumpsPath folder to allow AKS-Periscope to copy it to remote storage account. 
+ $newAcl = Get-Acl -Path $crashDumpsPath + # Apply modified rule to crashDumpsPath + $newAcl.AddAccessRule($fileSystemAccessRule) + Set-Acl -Path $crashDumpsPath -AclObject $newAcl + + Reg add "HKLM\Software\Microsoft\Windows\Windows Error Reporting\LocalDumps" /V DumpCount /t REG_DWORD /d 50 /f + Reg add "HKLM\Software\Microsoft\Windows\Windows Error Reporting\LocalDumps" /V DumpType /t REG_DWORD /d 2 /f + Reg add "HKLM\Software\Microsoft\Windows\Windows Error Reporting\LocalDumps" /V DumpFolder /t REG_EXPAND_SZ /d $crashDumpsPath /f + + $(Hostname) + " :" + "Dump collection Enabled, registry output below:" + reg query "HKLM\Software\Microsoft\Windows\Windows Error Reporting\LocalDumps" + + While ($true) { + Start-Sleep -Seconds 600 + } + + imagePullPolicy: IfNotPresent + nodeSelector: + kubernetes.azure.com/os-sku: Windows2022 From f42518106f6185951f37b0065bb4689c569cfc6c Mon Sep 17 00:00:00 2001 From: jayanthaMicrosoft Date: Wed, 2 Aug 2023 20:03:34 +0530 Subject: [PATCH 20/20] Rename scripts/CrashDumpEnabler to scripts/crashDumpEnabler/CrashDumpEnabler.yaml --- .../{CrashDumpEnabler => crashDumpEnabler/CrashDumpEnabler.yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename scripts/{CrashDumpEnabler => crashDumpEnabler/CrashDumpEnabler.yaml} (100%) diff --git a/scripts/CrashDumpEnabler b/scripts/crashDumpEnabler/CrashDumpEnabler.yaml similarity index 100% rename from scripts/CrashDumpEnabler rename to scripts/crashDumpEnabler/CrashDumpEnabler.yaml