From 059ffa99260ec0b4b0c81b7e724f226da9bcec68 Mon Sep 17 00:00:00 2001 From: Prince Pereira Date: Thu, 11 Dec 2025 11:43:46 +0530 Subject: [PATCH] Fix for delete lb and stale lb dsr vfp rules. --- .../stale-lb-dsr-rules/README.md | 132 ++++++++++++++ .../cleanup-stale-lb-rules.ps1 | 102 +++++++++++ .../cleanup-stale-lb-rules.yaml | 169 ++++++++++++++++++ .../stale-lb-dsr-rules/fix-del-lb-issue.yaml | 81 +++++++++ 4 files changed, 484 insertions(+) create mode 100644 scripts/mitigation-scripts/stale-lb-dsr-rules/README.md create mode 100644 scripts/mitigation-scripts/stale-lb-dsr-rules/cleanup-stale-lb-rules.ps1 create mode 100644 scripts/mitigation-scripts/stale-lb-dsr-rules/cleanup-stale-lb-rules.yaml create mode 100644 scripts/mitigation-scripts/stale-lb-dsr-rules/fix-del-lb-issue.yaml diff --git a/scripts/mitigation-scripts/stale-lb-dsr-rules/README.md b/scripts/mitigation-scripts/stale-lb-dsr-rules/README.md new file mode 100644 index 0000000..81e59f7 --- /dev/null +++ b/scripts/mitigation-scripts/stale-lb-dsr-rules/README.md @@ -0,0 +1,132 @@ +# Stale LB DSR Rules Cleanup + +## Overview + +This mitigation script automatically detects and removes stale Load Balancer Direct Server Return (LB DSR) rules from VFP (Virtual Filtering Platform) that reference non-existent backend endpoints. It runs continuously to maintain network health by cleaning up orphaned rules that can cause connectivity issues. + +## Problem Statement + +When backend endpoints are removed or become unavailable, the corresponding LB DSR rules in VFP may not be cleaned up properly. These stale rules can: +- Cause packet routing failures +- Lead to connection timeouts +- Create unnecessary overhead in the networking stack +- Result in traffic being sent to non-existent endpoints + +## Solution + +The `cleanup-stale-lb-rules.ps1` script: +1. Checks and sets the required registry configuration for LB DSR feature management +2. Continuously monitors VFP LB DSR rules (both IPv4 and IPv6) +3. 
Compares rule destination IPs (DIPs) against active HNS endpoints +4. Automatically removes rules that reference non-existent endpoints + +## Prerequisites + +- Windows Server with HNS (Host Network Service) enabled +- VFP control utilities (`vfpctrl.exe`) available +- PowerShell with administrator privileges +- HNS PowerShell module + +## Usage + +### Running the Script on a Single Node + +```powershell +.\cleanup-stale-lb-rules.ps1 +``` + +The script will: +1. Check the registry value `140377743` under the key `HKLM:\SYSTEM\CurrentControlSet\Policies\Microsoft\FeatureManagement\Overrides` +2. If the value is 1, set it to 0 and restart the node (this disables PR 13179278, which causes delete LB RPC calls from KubeProxy to fail with Invalid IP Error - ICM: 719903780) +3. Start a continuous monitoring loop with 10-second intervals +4. Clean up any stale LB DSR rules found + +**Note:** This approach fixes issues on a single node. If the issue is widespread across the cluster, deploy the solution using a DaemonSet: + +```powershell +kubectl create -f cleanup-stale-lb-rules.yaml +``` + +This will run the mitigation script in HostProcess container (HPC) pods on every Windows node of the cluster. + +### Configuration + +You can modify these parameters at the top of the script: + +- **`$groups`**: VFP groups to monitor (default: `LB_DSR_IPv4_OUT`, `LB_DSR_IPv6_OUT`) +- **`$refreshIntervalSeconds`**: Time between cleanup iterations (default: 10 seconds) + +## How It Works + +### 1. Registry Check +The script first reads the feature flag registry value (140377743). If the value is 1, the script sets it to 0 and restarts the node; otherwise it continues. + +### 2. Endpoint Collection +- Retrieves all HNS policies +- Extracts endpoint references +- Builds a dictionary of valid endpoint IP addresses + +### 3. Rule Validation +For each VFP port and LB DSR group: +- Lists all rules in the `LB_DSR` layer +- Extracts DIP (Destination IP) ranges from each rule +- Compares DIPs against the valid endpoint dictionary + +### 4. 
Cleanup +- A rule is flagged as stale when at least one of its DIPs is not found among the active endpoints +- Stale rules are automatically deleted using `vfpctrl /remove-rule` + +## Output Examples + +### Healthy State +``` +All DIP ranges are present in the dictionary. +``` + +### Stale Rules Detected +``` +Missing DIP ranges: + - 10.244.0.25 + - fdf5:5d67:b9ce:b28f::13f +Deleting rule : ruleId: ABC123, port: Port1, group: LB_DSR_IPv4_OUT +``` + +## Monitoring + +The script provides color-coded output: +- **Green**: Healthy state, all rules valid +- **Yellow**: Configuration changes or rule deletion in progress +- **Red**: Stale rules detected +- **Cyan**: Status updates and iteration markers + +## Important Notes + +- The script runs indefinitely until manually stopped (Ctrl+C) +- The node is restarted on the first run if the registry value 140377743 is 1 +- Ensure no legitimate endpoint updates are in progress during cleanup to avoid false positives +- The script requires elevated privileges to modify VFP rules and registry settings + +## Troubleshooting + +### Script doesn't detect stale rules +- Verify VFP and HNS are functioning correctly +- Check that `vfpctrl.exe` is accessible in the system PATH +- Ensure HNS endpoints are properly registered + +### Node restarts unexpectedly +- This is expected behavior if the registry value 140377743 is set to 1 +- After the restart, start the script again (DaemonSet pods are restarted automatically); it then proceeds to the monitoring loop + +### Permission errors +- Run PowerShell as Administrator +- Verify that the account has rights to modify VFP rules and the registry + +## Related Documentation + +- [VFP Documentation](../../helper/VFP.psm1) +- [HNS Module](../HNS/) +- [Network Health Monitoring](../../networkhealth/) + +## Support + +For issues or questions, please refer to the main repository documentation or open an issue. 
diff --git a/scripts/mitigation-scripts/stale-lb-dsr-rules/cleanup-stale-lb-rules.ps1 b/scripts/mitigation-scripts/stale-lb-dsr-rules/cleanup-stale-lb-rules.ps1
new file mode 100644
index 0000000..c507f67
--- /dev/null
+++ b/scripts/mitigation-scripts/stale-lb-dsr-rules/cleanup-stale-lb-rules.ps1
@@ -0,0 +1,120 @@
+# Removes stale LB DSR rules from VFP: a rule is deleted when one of its DIPs does
+# not belong to any HNS endpoint that is referenced by an HNS policy.
+
+# VFP groups of the LB_DSR layer that are scanned.
+$groups = @("LB_DSR_IPv4_OUT", "LB_DSR_IPv6_OUT")
+# Seconds to wait between two cleanup iterations.
+$refreshIntervalSeconds = 10
+
+# Returns the IP part of every "{ <ip> : <port> }" entry that follows the
+# "DIP Range" marker in the given 'vfpctrl /get-rule-info' output lines.
+# Parsing stops at the first "FlagsEx" line after the marker.
+function Get-DipRangesFromRuleText {
+    param([string[]]$RuleText)
+
+    $collect = $false
+    $dips = @()
+
+    foreach ($line in $RuleText) {
+
+        # Detect beginning of DIP Range block
+        if ($line -match "DIP Range") {
+            $collect = $true
+            continue
+        }
+
+        # Stop when FlagsEx or another header appears
+        if ($collect -and $line -match "FlagsEx") {
+            break
+        }
+
+        # Process lines like:
+        # { 10.244.0.25 : 53 }
+        # { fdf5:5d67:b9ce:b28f::13f : 4445 }
+        if ($collect -and $line.Trim().StartsWith("{")) {
+
+            # Remove surrounding { } then trim
+            $clean = $line.Trim().Trim('{','}').Trim()
+            # Use regex to extract IP before last " : "
+            if ($clean -match '(.+)\s*:\s*\d+$') {
+                $ip = $matches[1].Trim()
+                $dips += $ip
+            }
+        }
+    }
+
+    return $dips
+}
+
+# Feature override 140377743: when it is 1, reset it to 0 and restart the node.
+$regKeyVal = (Get-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Policies\Microsoft\FeatureManagement\Overrides" -Name 140377743).140377743
+if ($regKeyVal -eq 1) {
+    Write-Host "Registry keys are not zero. Setting reg key to 0 and restarting the node." -ForegroundColor Yellow
+    Set-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Policies\Microsoft\FeatureManagement\Overrides" -Name 140377743 -Value 0 -Type DWORD
+    Restart-Computer -Force
+    Start-Sleep -Seconds 30
+} else {
+    Write-Host "Registry keys are zero. Continuing the script." -ForegroundColor Green
+}
+
+While($true) {
+    Write-Host "Waiting for $refreshIntervalSeconds seconds for the next iteration..." -ForegroundColor Cyan
+    Start-Sleep -Seconds $refreshIntervalSeconds
+    Write-Host "Starting new iteration to check for stale LB DSR rules..." -ForegroundColor Cyan
+    $dictDstIPs = @{}
+
+    # A failed policy query must not be treated as "no endpoints exist": every rule
+    # would then look stale and be deleted. Skip this iteration and retry later.
+    try {
+        $policies = Get-HnsPolicyList -ErrorAction Stop
+    } catch {
+        Write-Host "Get-HnsPolicyList failed, skipping this iteration: $_" -ForegroundColor Red
+        continue
+    }
+
+    $endpointIds = $policies.References |
+        Where-Object { $_ -like "/endpoints/*" } |
+        ForEach-Object { ($_ -split "/")[-1] } |
+        Sort-Object -Unique
+
+    foreach ($endpointId in $endpointIds) {
+        # Query each endpoint once and read both addresses from the same object.
+        $endpoint = Get-HnsEndpoint -Id $endpointId
+        if ($null -ne $endpoint.IPAddress) {
+            $dictDstIPs[$endpoint.IPAddress] = $true
+        }
+        if ($null -ne $endpoint.IPv6Address) {
+            $dictDstIPs[$endpoint.IPv6Address] = $true
+        }
+    }
+
+    $ports = (vfpctrl.exe /list-vmswitch-port /format 1 | ConvertFrom-Json).Ports.Name
+    foreach ($port in $ports) {
+        foreach ($group in $groups) {
+            # Reset first so that a failed listing cannot reuse the rules of the previous port/group.
+            $rules = $null
+            $rules = (vfpctrl /port $port /layer LB_DSR /group $group /list-rule /format 1 | ConvertFrom-Json).Rules
+            foreach ($rule in $rules) {
+                $ruleId = $rule.Id
+                $ruleText = vfpctrl /get-rule-info /port $port /layer LB_DSR /group $group /rule $ruleId 2>&1
+                if (-not $ruleText) {
+                    Write-Host "No output from vfpctrl"
+                    continue
+                }
+
+                $dips = Get-DipRangesFromRuleText -RuleText $ruleText
+                # Check which DIPs are missing in the dictionary; @() keeps .Count reliable for 0 or 1 results
+                $missingDIPs = @($dips | Where-Object { -not $dictDstIPs.ContainsKey($_) })
+
+                if ($missingDIPs.Count -eq 0) {
+                    Write-Host "All DIP ranges are present in the dictionary." -ForegroundColor Green
+                } else {
+                    Write-Host "Missing DIP ranges:" -ForegroundColor Red
+                    $missingDIPs | ForEach-Object { Write-Host " - $_" }
+                    Write-Host "Deleting rule : ruleId: $ruleId, port: $port, group: $group" -ForegroundColor Yellow
+                    vfpctrl /remove-rule /port $port /layer LB_DSR /group $group /rule $ruleId
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/scripts/mitigation-scripts/stale-lb-dsr-rules/cleanup-stale-lb-rules.yaml b/scripts/mitigation-scripts/stale-lb-dsr-rules/cleanup-stale-lb-rules.yaml
new file mode 100644
index 0000000..33cd887
--- /dev/null
+++ b/scripts/mitigation-scripts/stale-lb-dsr-rules/cleanup-stale-lb-rules.yaml
@@ -0,0 +1,187 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: demo
+  namespace: demo
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: check-hns-issue
+  namespace: demo
+data:
+  mitigate-del-lb-issue.ps1: |
+
+    # Removes stale LB DSR rules from VFP: a rule is deleted when one of its DIPs does
+    # not belong to any HNS endpoint that is referenced by an HNS policy.
+
+    # VFP groups of the LB_DSR layer that are scanned.
+    $groups = @("LB_DSR_IPv4_OUT", "LB_DSR_IPv6_OUT")
+    # Seconds to wait between two cleanup iterations.
+    $refreshIntervalSeconds = 10
+
+    # Returns the IP part of every "{ <ip> : <port> }" entry that follows the
+    # "DIP Range" marker in the given 'vfpctrl /get-rule-info' output lines.
+    # Parsing stops at the first "FlagsEx" line after the marker.
+    function Get-DipRangesFromRuleText {
+        param([string[]]$RuleText)
+
+        $collect = $false
+        $dips = @()
+
+        foreach ($line in $RuleText) {
+
+            # Detect beginning of DIP Range block
+            if ($line -match "DIP Range") {
+                $collect = $true
+                continue
+            }
+
+            # Stop when FlagsEx or another header appears
+            if ($collect -and $line -match "FlagsEx") {
+                break
+            }
+
+            # Process lines like:
+            # { 10.244.0.25 : 53 }
+            # { fdf5:5d67:b9ce:b28f::13f : 4445 }
+            if ($collect -and $line.Trim().StartsWith("{")) {
+
+                # Remove surrounding { } then trim
+                $clean = $line.Trim().Trim('{','}').Trim()
+                # Use regex to extract IP before last " : "
+                if ($clean -match '(.+)\s*:\s*\d+$') {
+                    $ip = $matches[1].Trim()
+                    $dips += $ip
+                }
+            }
+        }
+
+        return $dips
+    }
+
+    # Feature override 140377743: when it is 1, reset it to 0 and restart the node.
+    $regKeyVal = (Get-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Policies\Microsoft\FeatureManagement\Overrides" -Name 140377743).140377743
+    if ($regKeyVal -eq 1) {
+        Write-Host "Registry keys are not zero. Setting reg key to 0 and restarting the node." -ForegroundColor Yellow
+        Set-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Policies\Microsoft\FeatureManagement\Overrides" -Name 140377743 -Value 0 -Type DWORD
+        Restart-Computer -Force
+        Start-Sleep -Seconds 30
+    } else {
+        Write-Host "Registry keys are zero. Continuing the script." -ForegroundColor Green
+    }
+
+    While($true) {
+        Write-Host "Waiting for $refreshIntervalSeconds seconds for the next iteration..." -ForegroundColor Cyan
+        Start-Sleep -Seconds $refreshIntervalSeconds
+        Write-Host "Starting new iteration to check for stale LB DSR rules..." -ForegroundColor Cyan
+        $dictDstIPs = @{}
+
+        # A failed policy query must not be treated as "no endpoints exist": every rule
+        # would then look stale and be deleted. Skip this iteration and retry later.
+        try {
+            $policies = Get-HnsPolicyList -ErrorAction Stop
+        } catch {
+            Write-Host "Get-HnsPolicyList failed, skipping this iteration: $_" -ForegroundColor Red
+            continue
+        }
+
+        $endpointIds = $policies.References |
+            Where-Object { $_ -like "/endpoints/*" } |
+            ForEach-Object { ($_ -split "/")[-1] } |
+            Sort-Object -Unique
+
+        foreach ($endpointId in $endpointIds) {
+            # Query each endpoint once and read both addresses from the same object.
+            $endpoint = Get-HnsEndpoint -Id $endpointId
+            if ($null -ne $endpoint.IPAddress) {
+                $dictDstIPs[$endpoint.IPAddress] = $true
+            }
+            if ($null -ne $endpoint.IPv6Address) {
+                $dictDstIPs[$endpoint.IPv6Address] = $true
+            }
+        }
+
+        $ports = (vfpctrl.exe /list-vmswitch-port /format 1 | ConvertFrom-Json).Ports.Name
+        foreach ($port in $ports) {
+            foreach ($group in $groups) {
+                # Reset first so that a failed listing cannot reuse the rules of the previous port/group.
+                $rules = $null
+                $rules = (vfpctrl /port $port /layer LB_DSR /group $group /list-rule /format 1 | ConvertFrom-Json).Rules
+                foreach ($rule in $rules) {
+                    $ruleId = $rule.Id
+                    $ruleText = vfpctrl /get-rule-info /port $port /layer LB_DSR /group $group /rule $ruleId 2>&1
+                    if (-not $ruleText) {
+                        Write-Host "No output from vfpctrl"
+                        continue
+                    }
+
+                    $dips = Get-DipRangesFromRuleText -RuleText $ruleText
+                    # Check which DIPs are missing in the dictionary; @() keeps .Count reliable for 0 or 1 results
+                    $missingDIPs = @($dips | Where-Object { -not $dictDstIPs.ContainsKey($_) })
+
+                    if ($missingDIPs.Count -eq 0) {
+                        Write-Host "All DIP ranges are present in the dictionary." -ForegroundColor Green
+                    } else {
+                        Write-Host "Missing DIP ranges:" -ForegroundColor Red
+                        $missingDIPs | ForEach-Object { Write-Host " - $_" }
+                        Write-Host "Deleting rule : ruleId: $ruleId, port: $port, group: $group" -ForegroundColor Yellow
+                        vfpctrl /remove-rule /port $port /layer LB_DSR /group $group /rule $ruleId
+                    }
+                }
+            }
+        }
+    }
+---
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: stale-lb-rules-mitigator
+  namespace: demo
+  labels:
+    app: stale-lb-rules-mitigator
+spec:
+  selector:
+    matchLabels:
+      app: stale-lb-rules-mitigator
+  template:
+    metadata:
+      labels:
+        app: stale-lb-rules-mitigator
+    spec:
+      securityContext:
+        windowsOptions:
+          hostProcess: true
+          runAsUserName: 'NT AUTHORITY\SYSTEM'
+      hostNetwork: true
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: kubernetes.io/os
+                operator: In
+                values:
+                - windows
+      containers:
+      - name: stale-lb-rules-mitigator
+        image: mcr.microsoft.com/dotnet/framework/samples:aspnetapp
+        imagePullPolicy: IfNotPresent
+        command:
+        - powershell.exe
+        - -File
+        - C:\scripts\mitigate-del-lb-issue.ps1
+        volumeMounts:
+        - name: script
+          mountPath: C:\scripts
+        - name: kube-path
+          mountPath: C:\k
+      terminationGracePeriodSeconds: 60
+      volumes:
+      - name: script
+        configMap:
+          name: check-hns-issue
+      - name: kube-path
+        hostPath:
+          path: C:\k
+          type: DirectoryOrCreate
\ No newline at end of file
diff --git a/scripts/mitigation-scripts/stale-lb-dsr-rules/fix-del-lb-issue.yaml b/scripts/mitigation-scripts/stale-lb-dsr-rules/fix-del-lb-issue.yaml
new file mode 100644
index 0000000..57c2ffe
--- /dev/null
+++ b/scripts/mitigation-scripts/stale-lb-dsr-rules/fix-del-lb-issue.yaml
@@ -0,0 +1,81 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: demo
+  namespace: demo
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: check-hns-issue
+  namespace: demo
+data:
+  mitigate-del-lb-issue.ps1: |
+
+    $regKeyVal = (Get-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Policies\Microsoft\FeatureManagement\Overrides" -Name 140377743).140377743
+    if ($regKeyVal -eq 1) {
+        Write-Host "Registry keys are not zero. Setting reg key to 0 and restarting the node." -ForegroundColor Yellow
+        Set-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Policies\Microsoft\FeatureManagement\Overrides" -Name 140377743 -Value 0 -Type DWORD
+        Restart-Computer -Force
+        Start-Sleep -Seconds 30
+    } else {
+        Write-Host "Registry keys are zero. Continuing the script." -ForegroundColor Green
+    }
+
+    While($true) {
+        Write-Host "Registry keys are disabled...." -ForegroundColor Cyan
+        Start-Sleep -Seconds 36000
+    }
+---
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: del-lb-fail-mitigator
+  namespace: demo
+  labels:
+    app: del-lb-fail-mitigator
+spec:
+  selector:
+    matchLabels:
+      app: del-lb-fail-mitigator
+  template:
+    metadata:
+      labels:
+        app: del-lb-fail-mitigator
+    spec:
+      securityContext:
+        windowsOptions:
+          hostProcess: true
+          runAsUserName: 'NT AUTHORITY\SYSTEM'
+      hostNetwork: true
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: kubernetes.io/os
+                operator: In
+                values:
+                - windows
+      containers:
+      - name: del-lb-fail-mitigator
+        image: mcr.microsoft.com/dotnet/framework/samples:aspnetapp
+        imagePullPolicy: IfNotPresent
+        command:
+        - powershell.exe
+        - -File
+        - C:\scripts\mitigate-del-lb-issue.ps1
+        volumeMounts:
+        - name: script
+          mountPath: C:\scripts
+        - name: kube-path
+          mountPath: C:\k
+      terminationGracePeriodSeconds: 60
+      volumes:
+      - name: script
+        configMap:
+          name: check-hns-issue
+      - name: kube-path
+        hostPath:
+          path: C:\k
+          type: DirectoryOrCreate
\ No newline at end of file