Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
142 commits
Select commit Hold shift + click to select a range
cdc0c92
provision Azure Managed Grafana workspace
Oct 8, 2025
fb7997d
put variables in the right position
Oct 8, 2025
e13fb60
remove conditional statement
Oct 9, 2025
03b5d4f
update windows to use 1es-windows-2022
Oct 9, 2025
4a5ed6d
remove bicep installation task
Oct 9, 2025
ec7b956
remove parameters that are not needed
Oct 9, 2025
28e0eda
changed parameters file format for bicep
Oct 9, 2025
a5805d6
changed parameters file format for bicep
Oct 9, 2025
5424b0c
remove role assignment from bicep
Oct 9, 2025
e0004fc
remove role assignment from bicep
Oct 9, 2025
0ef9267
changed parameters file format for bicep
Oct 9, 2025
0382325
changed parameters file format for bicep
Oct 9, 2025
e8a9e30
add task to install amg extension
Oct 9, 2025
aa6018d
remove allow-preview-versions flag
Oct 9, 2025
ac429bd
assign grafana admin role to .net eng services
Oct 9, 2025
5683982
assign grafana admin role to .net eng services
Oct 9, 2025
340831d
assign grafana admin role to .net eng services
Oct 9, 2025
c76bcc4
remove grafana admin role assignment
Oct 9, 2025
eed1aed
add release job type
Oct 9, 2025
90cdefc
remove release job type
Oct 9, 2025
f533797
provision azure managed grafana workspace
Oct 13, 2025
a074e78
fix bicep file path
Oct 13, 2025
7fc2c34
add provsion grafana stage to the deployment
Oct 13, 2025
2273352
Merge branch 'main' into haruna/managed-grafana-new
haruna99 Oct 13, 2025
b5ea9ca
add deploy azure managed grafana script
Oct 14, 2025
ccc335f
Merge updates for Azure Managed Grafana provisioning pipeline
Oct 14, 2025
6ff2dba
remove test pipeline
Oct 14, 2025
8106d3b
delete deploy-grafana.ps1 file
Oct 15, 2025
7e71347
add grafana bicep validation task to the pr jobs
Oct 15, 2025
2c39d25
add grafana bicep validation to the build stage
Oct 15, 2025
4d2041c
remove unused GrafanaKeyVault parameter
Oct 15, 2025
758a85c
remove unused GrafanaVariableGroup parameter
Oct 15, 2025
71c3d73
add skuName variable
Oct 15, 2025
cf44564
remove unused SkipGrafanaProvisioning parameter
Oct 15, 2025
0e9664d
reduce maximum attempts to 5 for grafana deployment
Oct 15, 2025
4d387fe
remove output variables since there is no downstream usage
Oct 15, 2025
ed34f93
make the dotnet eng services group the grafana admin
Oct 15, 2025
85b7227
test grafana
Oct 15, 2025
669da0d
test grafana
Oct 15, 2025
9b2a2e5
test grafana
Oct 15, 2025
bfa2882
remove group grafana admin assignment
Oct 16, 2025
bc29c7b
add user assigned managed identity
Oct 16, 2025
0351d84
add user assigned managed identity
Oct 16, 2025
363a9af
add key vault for grafana
Oct 16, 2025
f3e8036
change resource group name
Oct 16, 2025
6fae23b
change resource group validation script
Oct 16, 2025
170a6c2
change service connection
Oct 16, 2025
8a6a3a4
change service connection
Oct 16, 2025
a13a12d
change service connection to use nethelix sc
Oct 16, 2025
330b56f
grant the managed identity permissions to the keyvault
Oct 16, 2025
b5306a4
give the .net eng services group grafana admin permissions
Oct 17, 2025
d99edad
change the service connection to Dotnet Engineering services
Oct 17, 2025
48cdf59
change grafana keyvault name
Oct 17, 2025
c10a8d8
add application gateway
Nov 4, 2025
a216a49
add application gateway
Nov 4, 2025
9ac0129
rectify image used for app gateway
Nov 4, 2025
2b36a31
rectify image used for app gateway
Nov 4, 2025
b988c49
remove unused service connection id and rectify subscription
Nov 4, 2025
ac504aa
Enable HTTPS on Application Gateway
Nov 6, 2025
5445e58
Enable HTTPS on Application Gateway
Nov 6, 2025
b106927
remove incorrect operator
Nov 6, 2025
17c1ba7
remove incorrect operator
Nov 6, 2025
acc362b
remove incorrect operator
Nov 6, 2025
11618a7
fix script path
Nov 6, 2025
916faf5
grant the grafana MI Key Vault Certificates Officer role
Nov 6, 2025
c8eec92
grant pipeline service principal Key Vault Certificates Officer role
Nov 6, 2025
a71a2cc
Grant Application Gateway Access to Key Vault
Nov 6, 2025
6d98a29
Fix 502 error: Accept 401 status from Grafana health probe
Nov 6, 2025
c1f94c3
publish grafana dashboard
Nov 7, 2025
59540e6
add token creation to publish grafana stage
Nov 9, 2025
0a79b46
grant service principal grafana admin role
Nov 9, 2025
7d95e70
grant service principal key vault officer role
Nov 9, 2025
0e2b751
Grant pipeline SP Key Vault Secrets Officer role in Grafana provision…
Nov 9, 2025
25be23e
Add Key Vault permission verification and retry logic for RBAC propag…
Nov 10, 2025
ab8601e
Pass Azure Pipelines credentials to MSBuild SDK for Key Vault authent…
Nov 10, 2025
d40d857
add service connection ID
Nov 11, 2025
ca72eb5
add service connection client ID
Nov 11, 2025
4b97270
remove undefined ServiceConnectionId
Nov 11, 2025
7f3fda8
import secrets from dotnet-grafana-secrets.yaml
Nov 11, 2025
9f6d58d
remove dnceng-amg-int-kv.yaml file
Nov 11, 2025
81f4ed5
remove notification alerts and synchronize secrets
Nov 11, 2025
381b936
fix key vault access propagation
Nov 11, 2025
c617b3a
fix keyvault secret access
Nov 11, 2025
6f68e64
grant MI monitoring reader permission to subscriptions
Nov 12, 2025
5ce1be5
grant MI monitoring reader permission to subscriptions in the pipeline
Nov 12, 2025
5d05dbe
fix grafana notification contact points
Nov 13, 2025
b30cf22
azure managed grafana alert rules
Nov 18, 2025
69a2709
add quota alerts
Nov 19, 2025
d09e4e6
import secret to staging and prod KV
Nov 19, 2025
7b74036
remove unused output variables
Nov 19, 2025
0fdc086
remove unused output variables in azure managed grafana bicep
Nov 19, 2025
e5048af
remove EnableCustomDomain variable from deploy-managed-grafana.yml
Nov 19, 2025
edab753
remove unnecessary comments
Nov 20, 2025
b73f5f6
remove unnecessary comments
Nov 20, 2025
e24c9df
remove unnecessary comments
Nov 20, 2025
a0fd967
change token lifespan to 30 days
Nov 20, 2025
3dd87e9
remove unnecessary comments
Nov 20, 2025
2b65635
remove unused contact point
Nov 20, 2025
d5582a7
fix duplication of contact points
Nov 20, 2025
070fbeb
include dashboard for homepage
Nov 20, 2025
3f10420
remove plugin version
Nov 21, 2025
0517896
remove app gateway logic
Nov 21, 2025
98f709a
Migrate alert rules to Azure Managed Grafana unified alerting format
Nov 22, 2025
f091797
grant grafa MI access to engineeringdata
Nov 23, 2025
e4a35f5
grant grafa MI access to engineeringdata
Nov 23, 2025
9ad1f73
grant grafa MI access to engineeringdata
Nov 23, 2025
ddcce3e
grant grafana MI access to engineeringdata
Nov 23, 2025
6e3def1
remove grafana MI access to engineeringdata
Nov 23, 2025
e9d06c9
fix data source for dashboard
Nov 24, 2025
88c34a1
update grafana dashboards
Nov 25, 2025
5606316
set homepage preference
Nov 25, 2025
c223f45
fix dashboard alert annotations
Nov 27, 2025
af36144
add grafana keyvault manifest file
Nov 27, 2025
83d0b66
add grafana annotation settings for infinity datasource
Nov 27, 2025
2ca58ac
show inactive alerts
Dec 1, 2025
7d0a9cd
set alert rule timeframe
Dec 3, 2025
fd13b74
fix alerting rules folder directory
Jan 6, 2026
f25346e
refactor grafana publishing to use only one stage
Jan 6, 2026
3e02792
refactor grafana publishing to use only one stage
Jan 6, 2026
1b3005e
add managed grafana to the dotnet-dnceng-ci pipeline
Jan 7, 2026
44bc88a
add managed grafana to the dotnet-dnceng-ci pipeline
Jan 7, 2026
98aa83b
remove self hosted grafana dashboard publishing
Jan 7, 2026
c087a93
fix grafana dashboard publishing error
Jan 7, 2026
dd901b2
fix error when adding ANG to dotnet-dnceng-ci pipeline
Jan 7, 2026
6eaa430
remove test pipeline
Jan 7, 2026
89c3cf8
fix deploy-managed-grafana.ml filepath
Jan 7, 2026
382350c
include serviceConnectionName variable
Jan 7, 2026
5e64cdb
Allow anonymous access to alert webhook endpoint for Grafana
Jan 9, 2026
840b1e3
remove AllowAnonymous from alertHookController
Jan 13, 2026
2064f8a
Merge remote-tracking branch 'origin/main' into haruna/managed-grafan…
Jan 13, 2026
75f2091
delete azure-pipelines-managed-grafana pipeline
Jan 13, 2026
3c86e61
Disable EventHub connection strings test (moving to MI) (#6392)
meghnave Jan 13, 2026
f406766
remove unnecessary comment
Jan 13, 2026
eff6b13
Merge branch 'main' into haruna/managed-grafana-new
haruna99 Jan 13, 2026
93896b0
fix service connection naming
Jan 14, 2026
cc08ecf
Merge branch 'haruna/managed-grafana-new' of https://github.com/dotne…
Jan 14, 2026
554a3e2
remove Validation of the Grafana Bicep Template from the PR stage
Jan 14, 2026
b42bc65
add azure managed grafana api key to secret manager
Jan 15, 2026
0500a64
operations triage agent one-pager
chcosta Jan 15, 2026
cd39caa
add metrics section (#6446)
chcosta Jan 16, 2026
708dd92
remove unused files
Jan 23, 2026
9a0ad17
Migrate self-hosted grafana to azure managed grafana (#6304)
haruna99 Jan 27, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions .vault-config/dnceng-amg-int-kv.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
storageLocation:
type: azure-key-vault
parameters:
subscription: a4fc5514-21a9-4296-bfaf-5c7ee7fa35d1
name: dnceng-amg-int-kv

secrets:
# API token for DotNet Status website
dotnet-build-bot-dotnet-eng-status-token:
type: text
parameters:
description: API token from https://dotneteng-status-staging.azurewebsites.net/ - Generated using dotnet-build-bot account

# Authorization header for Deployment Annotations datasource
dotneteng-status-auth-header:
type: text
parameters:
description: "Bearer token for status API - Format: Bearer <dotnet-build-bot-dotnet-eng-status-token>"

# Teams webhook URL for alert notifications
fr-bot-notifications-teams-notification-url:
type: text
parameters:
description: Teams Incoming Webhook URL - Do not rotate
24 changes: 24 additions & 0 deletions .vault-config/dnceng-amg-prod-kv.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
storageLocation:
type: azure-key-vault
parameters:
subscription: a4fc5514-21a9-4296-bfaf-5c7ee7fa35d1
name: dnceng-amg-prod-kv

secrets:
# API token for DotNet Status website
dotnet-build-bot-dotnet-eng-status-token:
type: text
parameters:
description: API token from https://dotneteng-status.azurewebsites.net/ - Generated using dotnet-build-bot account

# Authorization header for Deployment Annotations datasource
dotneteng-status-auth-header:
type: text
parameters:
description: "Bearer token for status API - Format: Bearer <dotnet-build-bot-dotnet-eng-status-token>"

# Teams webhook URL for alert notifications
fr-bot-notifications-teams-notification-url:
type: text
parameters:
description: Teams Incoming Webhook URL - Do not rotate
174 changes: 174 additions & 0 deletions Documentation/OnePagers/ops-triage-agent.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
# DNCEng Operational Triage Agent — One Pager

## Description
Automated triage system for DNCEng Tasks under
`internal\.NET Engineering Services\Operations`, ensuring transparent and consistent priority assignment.

The architecture is intentionally split into two layers with a **strict separation of responsibilities**:

1. **MCP-Based Triage Tooling (Logic Layer — Pure, Side‑Effect‑Free)**
- Implemented as one or more **local MCP servers**, distributed via **NuGet packages**
- Encapsulates triage logic, rule evaluation, and reasoning generation
- **Performs no Azure DevOps I/O**
- **Requires no authentication, tokens, or permissions**
- Deterministic, testable, and reusable across any environment (Azure job, CLI, Foundry)

2. **Lightweight Agent Layer (Execution / Orchestration Layer — Side‑Effecting)**
- Responsible for **all authentication and authorization**
- Owns all Azure DevOps interactions (queries, writes, comments, updates)
- Invokes MCP logic and applies its results
- Initially an **Azure-scheduled job**, but architected to support **CLI** or **Foundry-based agents** using the same MCP tooling

By design, **all security scopes remain in the agent**, eliminating the need to manage authentication or ADO permissions inside MCP tooling. With this model, the agent is treated with the same security posture as a user would be.

## Deliverable at End of Sprint
A functioning system where:
- MCP servers generate pure triage decisions and structured reasoning
- The agent:
- Authenticates to Azure DevOps
- Queries for candidate items
- Invokes MCP tooling
- Writes updates back to Azure DevOps
- Unprioritized DNCEng Tasks are automatically triaged
- Items assigned to the agent’s service principal are re-triaged and unassigned
- A read-only “Triage agent” section is created on each triaged work item and appended to on every subsequent triage
- Triage decisions follow a versioned wiki-based triage bar

### Metrics

As a metric for evaluating the effectiveness of the triage agent, we should review all work items that are triaged by the agent. If a user, several users, or the triage team have examined an issue and adjusted the assigned priority, then we can quickly generate a measure of how many issues the agent triaged correctly / incorrectly on an ongoing basis. There will be scenarios where the agent actually triaged appropriately and the work item lacked sufficient context for a different priority assignment, but a user has additional information and decides to re-triage based on external knowledge; hopefully those occurrences are rare. If it turns out that external knowledge is frequently not captured in the work item, then we should re-examine our processes to encourage more value to be added to the work item itself.

## Motivation
Azure DevOps Tags/Labels must remain strictly **filtering/query metadata**, not programmatic switches.

The team adheres to the principle that **Azure DevOps Tags/Labels must not be used as operational signals**. Using Tags / Labels for programmatic decision making / operations leads to non-intuitive experiences and requires area expertise to understand what the automation will do.

Tags should remain strictly for **querying and filtering**, not automation triggers.

Separating concerns between:
- **Pure logic (MCP tooling)**
- **Execution agent (Azure job, CLI, Foundry)**

ensures a predictable, inspectable system that avoids “hidden behaviors” and reduces dependency on tribal knowledge.

The system’s architecture reinforces this by:
- Making MCP tooling **pure business logic with zero dependencies on authentication or external APIs**
- Keeping **all operational and security responsibilities in the agent layer**
- Preventing accidental coupling between triage rules and ADO access patterns

This separation:
- Removes the need for security handling in MCP tooling
- Promotes deterministic, portable logic
- Preserves a predictable and auditable automation model

## Architectural Overview

### **MCP Tooling (Logic Layer)**
- Distributed as NuGet-based local MCP servers
- Purely logical: no network calls, no credentials
- Takes structured inputs:
- Work item fields supplied by the agent
- Triage bar rules
- Contextual metadata
- Outputs structured triage decisions:
- Priority recommendation
- Reasoning and decision explanation
- Alternate outcomes if additional data were provided
- Triage bar version used

**Because MCP tooling is completely side‑effect‑free, it does not require ADO credentials, eliminating the need to manage authentication/authorization at the logic level.**

### **Agent Layer (Execution / Orchestration Layer)**
The agent is the **only** component that:

- Authenticates to Azure DevOps
- Holds authorization scopes
- Performs all ADO reads (queries)
- Performs all ADO writes (priority updates, comments, section updates)
- Handles throttling, retries, and permission boundaries

The agent simply orchestrates:
1. Collecting ADO data
2. Passing structured inputs to MCP servers
3. Applying MCP outputs back to ADO

### **Why ADO I/O Lives Exclusively in the Agent Layer**
Keeping all authentication and authorization in the agent:

- Ensures MCP servers remain universal and environment-agnostic
- Prevents credential sprawl (no tokens in libraries/packages)
- Eliminates the need for MCP security review or privileged access
- Makes MCP tooling safe to run in local tools, pipelines, and Foundry agents
- Reduces operational blast radius

**This security simplification is a core architectural advantage.**

## Approach

### 1. Automatic Triage of Unprioritized Tasks
Agent queries ADO → sends data to MCP logic → applies MCP decision.

### 2. Re‑Triage of Tasks Assigned to the Agent
Agent re-evaluates tasks assigned to the service principal, unassigns them, and updates ADO.

### 3. “Triage Agent” Read‑Only Section
Agent writes an append-only, structured section including:
- MCP reasoning
- Priority determination
- Data gaps
- Bar version used
- Timestamp
- Wiki link

### 3a. **Brief Agent Comment on the Work Item**
Whenever the agent triages or re‑triages a work item, it posts a **short comment** indicating that triage occurred,
and includes a **direct link to the “Triage agent” section** where the full historical record and detailed reasoning are stored.

This ensures:
- Comments stay concise
- Users immediately know triage occurred
- Full fidelity details remain centralized and well‑structured

### 4. Wiki‑Published, Versioned Triage Bar
- Triage bar lives in DNCEng wiki
- MCP tooling interprets it
- Updating the bar allows all items to be re‑triaged cleanly

## Security / Telemetry / Test Coverage / Safe Deployment

### **Security**
- **All authentication and authorization handled by the agent**
- MCP tooling requires **no credentials and has no permissions**
- No external calls from MCP servers

This reduces risk and simplifies compliance.

### **Telemetry**
- MCP evaluations (inputs/outputs)
- Agent‑applied operations
- Re-triage deltas
- User overrides

### **Test Coverage**
- MCP: deterministic unit + integration tests
- Agent: orchestration tests, mocked MCP calls

### **Safe Deployment**
- Wiki-based rule versioning
- Dry-run modes
- Controlled rollout via agent configuration


## Metrics

See the **Metrics** subsection under *Deliverable at End of Sprint* above.

## Task Breakdown

| Testable Chunk | Tasks | Cost |
|----------------------------------|-------------------------------------------------------------|------|
| MCP triage engine | Pure logic + triage bar interpreter | 2–3 days |
| MCP server infrastructure | Local MCP server + NuGet packaging | 2–3 days |
| Agent orchestration logic | Azure job + ADO I/O + MCP invocation | 2–3 days |
| Triage agent section writer | Append-only structured section | 2 days |
| Commenting + assignment handling | Minimal comments + unassign logic | 1–2 days |
| Telemetry + monitoring | Metrics, dashboards | 1–2 days |
| Testing & rollout | Unit, integration, and dry-run validation | 3 days |
2 changes: 1 addition & 1 deletion azure-pipelines-pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -105,4 +105,4 @@ stages:
dotnet run --project src/SecretManager/Microsoft.DncEng.SecretManager -- validate-all -b src @manifestArgs
displayName: Verify Secret Usages

- template: /eng/test.yaml
- template: /eng/test.yaml
40 changes: 33 additions & 7 deletions azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ variables:
- name: _DotNetArtifactsCategory
value: .NETCore
- group: SDL_Settings
- name: ServiceConnectionName
value: 'Dotnet Engineering services'

trigger:
batch: true
Expand Down Expand Up @@ -195,6 +197,24 @@ extends:
contents: '*'
targetFolder: $(Build.ArtifactStagingDirectory)\eng

- task: AzureCLI@2
displayName: 'Validate Grafana Bicep Template'
inputs:
azureSubscription: '$(ServiceConnectionName)'
scriptType: 'ps'
scriptLocation: 'inlineScript'
inlineScript: |
Write-Host "Validating Grafana Bicep template..."
if (!(Test-Path "eng/deployment/azure-managed-grafana.bicep")) {
throw "Bicep template not found: azure-managed-grafana.bicep"
}

az bicep build --file eng/deployment/azure-managed-grafana.bicep
if ($LASTEXITCODE -ne 0) {
throw "Bicep template validation failed"
}
Write-Host "SUCCESS: Bicep template validation successful"

- template: /eng/common/templates-official/post-build/post-build.yml@self
parameters:
enableSymbolValidation: false
Expand Down Expand Up @@ -225,20 +245,26 @@ extends:
PublishProfile: Int
ServiceConnectionName: NetHelixStaging
StatusVariableGroup: DotNetStatus Staging
GrafanaHost: https://dotnet-eng-grafana-staging.westus2.cloudapp.azure.com
GrafanaKeyVault: dotnet-grafana-staging
GrafanaVariableGroup: Dotnet-Grafana-Staging
ServiceConnectionClientId: 57f299da-15de-4117-b8f6-7c10451926f0
ServiceConnectionId: 7829de7e-fb4e-4118-8370-475d6bc61905
AMGServiceConnectionName: 'Dotnet Engineering services'
AMGServiceConnectionId: dd8c2cfc-b9c9-452c-a168-ccd4240ada55
AMGServiceConnectionClientId: fc1eb341-aea4-4a11-8f80-d14b8775b2ba
AMGDeploymentEnvironment: Staging
AMGGrafanaWorkspaceName: dnceng-grafana-staging
AMGGrafanaKeyVault: dnceng-amg-int-kv
${{ else }}:
DeploymentEnvironment: Production
DotNetStatusAppName: dotneteng-status
DotNetStatusEndpoint: .NET Engineering Deployment Notification - Production
PublishProfile: Prod
ServiceConnectionName: NetHelix
StatusVariableGroup: DotNetStatus Production
GrafanaHost: https://dotnet-eng-grafana.westus2.cloudapp.azure.com
GrafanaKeyVault: dotnet-grafana
GrafanaVariableGroup: Dotnet-Grafana-Production
ServiceConnectionClientId: fc1eb341-aea4-4a11-8f80-d14b8775b2ba
ServiceConnectionId: 4a511f6f-b538-48e6-a389-207e430634d1
ServiceConnectionId: 4a511f6f-b538-48e6-a389-207e430634d1
AMGServiceConnectionName: 'Dotnet Engineering services'
AMGServiceConnectionId: dd8c2cfc-b9c9-452c-a168-ccd4240ada55
AMGServiceConnectionClientId: fc1eb341-aea4-4a11-8f80-d14b8775b2ba
AMGDeploymentEnvironment: Production
AMGGrafanaWorkspaceName: dnceng-grafana
AMGGrafanaKeyVault: dnceng-amg-prod-kv
Loading