Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions cmd/controller/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@ import (
"os/signal"
"syscall"

"github.com/prometheus/client_golang/prometheus"
internalclient "github.com/skupperproject/skupper/internal/kube/client"
"github.com/skupperproject/skupper/internal/kube/controller"
"github.com/skupperproject/skupper/internal/kube/metrics"
"github.com/skupperproject/skupper/internal/version"
)

Expand Down Expand Up @@ -73,6 +75,15 @@ func main() {
log.Fatal("Error getting new site controller ", err.Error())
}

if !config.MetricsConfig.Disabled {
reg := prometheus.NewRegistry()
metrics.MustRegisterClientGoMetrics(reg)
srv := metrics.NewServer(config.MetricsConfig, reg)
if err := srv.Start(stopCh); err != nil {
log.Fatalf("Error starting metrics server: %s", err)
}
}

if err = controller.Run(stopCh); err != nil {
log.Fatal("Error running site controller: ", err.Error())
}
Expand Down
16 changes: 16 additions & 0 deletions cmd/kube-adaptor/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@ import (
"time"

"github.com/cenkalti/backoff/v4"
"github.com/prometheus/client_golang/prometheus"
iflag "github.com/skupperproject/skupper/internal/flag"
"github.com/skupperproject/skupper/internal/kube/adaptor"
internalclient "github.com/skupperproject/skupper/internal/kube/client"
"github.com/skupperproject/skupper/internal/kube/metrics"
"github.com/skupperproject/skupper/internal/qdr"
"github.com/skupperproject/skupper/internal/version"
)
Expand Down Expand Up @@ -53,6 +55,11 @@ func main() {
// if -version used, report and exit
isVersion := flags.Bool("version", false, "Report the version of Config Sync")
isInit := flags.Bool("init", false, "Downloads configuration and ssl profile artefacts")

metricsConfig, err := metrics.BoundConfig(flags)
if err != nil {
log.Fatalf("Error reading metrics configuration: %s", err)
}
flags.Parse(os.Args[1:])
if *isVersion {
fmt.Println(version.Version)
Expand All @@ -77,6 +84,15 @@ func main() {
os.Exit(0)
}

if !metricsConfig.Disabled {
reg := prometheus.NewRegistry()
metrics.MustRegisterClientGoMetrics(reg)
srv := metrics.NewServer(metricsConfig, reg)
if err := srv.Start(stopCh); err != nil {
log.Fatalf("Error starting metrics server: %s", err)
}
}

log.Println("Waiting for Skupper router to be ready")
if err := waitForAMQPConnection("amqp://localhost:5672", time.Second*180, time.Second*5); err != nil {
log.Fatal("Error waiting for router ", err.Error())
Expand Down
7 changes: 7 additions & 0 deletions internal/kube/controller/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@ import (

iflag "github.com/skupperproject/skupper/internal/flag"
"github.com/skupperproject/skupper/internal/kube/grants"
"github.com/skupperproject/skupper/internal/kube/metrics"
"github.com/skupperproject/skupper/internal/kube/securedaccess"
)

type Config struct {
GrantConfig *grants.GrantConfig
SecuredAccessConfig *securedaccess.Config
MetricsConfig *metrics.Config
Namespace string
Kubeconfig string
WatchNamespace string
Expand All @@ -39,9 +41,14 @@ func BoundConfig(flags *flag.FlagSet) (*Config, error) {
} else if err := securedAccessConfig.Verify(); err != nil {
return nil, err
}
metricsConfig, err := metrics.BoundConfig(flags)
if err != nil {
return nil, err
}
c := &Config{
GrantConfig: grantConfig,
SecuredAccessConfig: securedAccessConfig,
MetricsConfig: metricsConfig,
}
iflag.StringVar(flags, &c.Namespace, "namespace", "NAMESPACE", "", "The Kubernetes namespace scope for the controller")
iflag.StringVar(flags, &c.Kubeconfig, "kubeconfig", "KUBECONFIG", "", "A path to the kubeconfig file to use")
Expand Down
77 changes: 77 additions & 0 deletions internal/kube/metrics/client.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
package metrics

import (
"context"
"net/url"
"time"

"github.com/prometheus/client_golang/prometheus"
"k8s.io/client-go/tools/metrics"
)

// MustRegisterClientGoMetrics registers a set of metrics exposed from the
// k8s.io/client-go/tools/metrics package with the prometheus registry.
func MustRegisterClientGoMetrics(registry *prometheus.Registry) {
httpMetrics := &clientGoHttpMetrics{
latency: prometheus.NewHistogramVec(prometheus.HistogramOpts{
Namespace: "skupper",
Subsystem: "kubernetes_client",
Name: "http_request_duration_seconds",
Help: "Latency of kubernetes client requests in seconds by endpoint.",
}, []string{"method", "endpoint"}),
results: prometheus.NewCounterVec(prometheus.CounterOpts{
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be nice to have a results counter for "path" and "method" as well to help identifying operations happening during the reconciliation of a given resource ytpe.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That makes sense! I did consider that.

The client-go metrics API doesn't give us all of that context in the ResultMetric interface, and I think that's intentional. It gives us the LatencyMetric (http_request_duration_seconds) that does have method and URL path. I think the best we can do is something along these lines:

  • Add method to http_request_duration_seconds (and maybe rate_limiter_duration_seconds.)
  • Monitor "sum by (method, endpoint) (rate(http_request_duration_seconds_count[d])) " for request rate by (normalized) url path and method.
  • Monitor "sum by (status_code) (rate(http_requests_total{status_code!=200}[d])) " for http error rate in general.

I suspect the intent of the ResultMetric is mostly for clients working with multiple cluster API servers as a lower cardinality indicator of api server trouble. For our single-cluster context that is less interesting, but I included it anyways since the latency metric is absent of the status code field.

Namespace: "skupper",
Subsystem: "kubernetes_client",
Name: "http_requests_total",
Help: "Total number of kubernetes client requests by status code.",
}, []string{"method", "status_code"}),
retries: prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: "skupper",
Subsystem: "kubernetes_client",
Name: "http_retries_total",
Help: "Total number of kubernetes client requests retried by status code.",
}, []string{"method", "status_code"}),
}
rateLimiterMetrics := &clientGoRateLimiterMetrics{
latency: prometheus.NewHistogramVec(prometheus.HistogramOpts{
Namespace: "skupper",
Subsystem: "kubernetes_client",
Name: "rate_limiter_duration_seconds",
Help: "Latency of kubernetes client side rate limiting in seconds by endpoint.",
}, []string{"method", "endpoint"}),
}

registry.MustRegister(httpMetrics.latency, httpMetrics.results, httpMetrics.retries, rateLimiterMetrics.latency)
metrics.Register(metrics.RegisterOpts{
RequestLatency: httpMetrics,
RequestResult: httpMetrics,
RequestRetry: httpMetrics,

RateLimiterLatency: rateLimiterMetrics,
})
}

type clientGoHttpMetrics struct {
latency *prometheus.HistogramVec
results *prometheus.CounterVec
retries *prometheus.CounterVec
}

func (m *clientGoHttpMetrics) Observe(ctx context.Context, verb string, url url.URL, latency time.Duration) {
m.latency.WithLabelValues(verb, url.EscapedPath()).Observe(latency.Seconds())
}

func (m *clientGoHttpMetrics) Increment(ctx context.Context, code string, method string, host string) {
m.results.WithLabelValues(method, code).Inc()
}
func (m *clientGoHttpMetrics) IncrementRetry(ctx context.Context, code string, method string, _ string) {
m.retries.WithLabelValues(method, code).Inc()
}

type clientGoRateLimiterMetrics struct {
latency *prometheus.HistogramVec
}

func (m *clientGoRateLimiterMetrics) Observe(ctx context.Context, verb string, url url.URL, latency time.Duration) {
m.latency.WithLabelValues(verb, url.EscapedPath()).Observe(latency.Seconds())
}
74 changes: 74 additions & 0 deletions internal/kube/metrics/server.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
package metrics

import (
"context"
"flag"
"fmt"
"log"
"net"
"net/http"
"time"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
iflag "github.com/skupperproject/skupper/internal/flag"
)

const metricsPath = "/metrics"

type Config struct {
Disabled bool
Address string
}

func BoundConfig(flags *flag.FlagSet) (*Config, error) {
cfg := &Config{}
err := iflag.BoolVar(flags, &cfg.Disabled, "disable-metrics", "SKUPPER_METRICS_DISABLE", false, "Set to disable metrics.")
iflag.StringVar(flags, &cfg.Address, "metrics-address", "SKUPPER_METRICS_ADDRESS", ":9000", "The address for the metrics http server to listen on.")
return cfg, err
}

func NewServer(cfg *Config, registry *prometheus.Registry) *Server {
mux := http.NewServeMux()
mux.Handle(metricsPath, promhttp.HandlerFor(registry, promhttp.HandlerOpts{}))
srv := &http.Server{
ReadTimeout: 5 * time.Second,
WriteTimeout: 5 * time.Second,
IdleTimeout: 120 * time.Second,
Handler: mux,
}
return &Server{
config: *cfg,
server: srv,
}
}

type Server struct {
config Config
server *http.Server
}

func (s *Server) Start(stopCh <-chan struct{}) error {
listenCtx, cancel := context.WithCancel(context.Background())
go func() {
select {
case <-stopCh:
cancel()
case <-listenCtx.Done():
}
}()
defer cancel()

var lc net.ListenConfig
ln, err := lc.Listen(listenCtx, "tcp", s.config.Address)
if err != nil {
return fmt.Errorf("failed to start listener: %s", err)
}
go func() {
if err := s.server.Serve(ln); err != nil {
log.Printf("metrics server error: %s", err)
}
}()
log.Printf("Started metrics server at: %s", s.config.Address)
return nil
}
4 changes: 4 additions & 0 deletions internal/kube/site/resources/skupper-router-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,10 @@ spec:
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
ports:
- containerPort: 9000
name: metrics
protocol: TCP
volumeMounts:
- mountPath: /etc/skupper-router-certs
name: skupper-router-certs
Expand Down
6 changes: 6 additions & 0 deletions scripts/skupper-deployment-generator.sh
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,9 @@ spec:
imagePullPolicy: Always
command: ["/app/controller"]
args: ["-enable-grants", "-grant-server-autoconfigure"]
ports:
- name: metrics
containerPort: 9000
env:
- name: SKUPPER_KUBE_ADAPTOR_IMAGE
value: ${SKUPPER_KUBE_ADAPTOR_IMAGE}
Expand Down Expand Up @@ -181,6 +184,9 @@ spec:
imagePullPolicy: Always
command: ["/app/controller"]
args: ["-enable-grants", "-grant-server-autoconfigure"]
ports:
- name: metrics
containerPort: 9000
env:
- name: WATCH_NAMESPACE
valueFrom:
Expand Down