Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 146 additions & 0 deletions docs/using/etcd-reconfiguration.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
# Etcd Mode Reconfiguration


This guide explains how to change a Control Plane host's etcd mode after cluster initialization.


## Overview


The Control Plane supports two etcd modes:


- **Server mode**: Runs an embedded etcd server and participates as a voting member
- **Client mode**: Connects to the etcd cluster as a client only


**Recommended topology:**

- 1-3 hosts: All should be etcd servers
- 4-7 hosts: 3 etcd servers, rest as clients
- 8+ hosts: 5 etcd servers, rest as clients


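!!! warning "Maintain Odd Numbers"
    Keep an **odd number** of etcd servers (3 or 5) in a multi-server cluster. An even-sized cluster needs a larger quorum yet tolerates no more failures than the next smaller odd size.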


## How It Works


Changing a host's etcd mode requires only a configuration change and a restart; the cluster operations that follow are **fully automatic**:


1. Stop the container
2. Update `PGEDGE_ETCD_MODE` environment variable
3. Restart the container
4. The system automatically handles all cluster operations


**What happens automatically:**

- **Client→Server**: Discovers cluster, obtains credentials, joins as voting member
- **Server→Client**: Removes itself from membership, transitions to client mode


No manual API calls or configuration needed!


## Procedures


### Promoting a Client to Server (Example - host-4)


```bash
# 1. Stop the container
docker stop control-plane-host-4


# 2. In docker-compose.yaml, set this host's environment entry to:
#      PGEDGE_ETCD_MODE: server   # was: client


# 3. Restart
docker-compose up -d host-4


# 4. Verify (check logs)
docker logs control-plane-host-4
```


### Demoting a Server to Client (Example - host-4)


!!! warning "Quorum Check"
    Ensure that at least 2 other healthy etcd servers remain before demoting this host.


```bash
# 1. Stop the container
docker stop control-plane-host-4


# 2. In docker-compose.yaml, set this host's environment entry to:
#      PGEDGE_ETCD_MODE: client   # was: server


# 3. Restart
docker-compose up -d host-4


# 4. Verify (check logs)
docker logs control-plane-host-4
```


## Troubleshooting


### Promotion Issues


**Problem**: Host fails to join cluster
**Solution**: Check logs for connection errors. Verify network connectivity and that other hosts are healthy.


**Problem**: "Permission denied" errors
**Solution**: The system automatically obtains new credentials. If the issue persists, check that RBAC is enabled on the cluster.


### Demotion Issues


**Problem**: Host fails to remove itself from membership
**Solution**: Check that the remaining servers have quorum. The system continues the transition even if removal fails.


**Problem**: Old data directory persists
**Solution**: The system automatically cleans up the etcd data directory. If the directory persists, remove it manually after verifying that the host has transitioned to client mode.


### General Troubleshooting


Check cluster health:


```bash
docker exec control-plane-host-1 etcdctl member list
```


Every member listed should report a status of `started`.


## Best Practices


- **Change one host at a time** - Wait for completion before reconfiguring another
- **Monitor cluster health** - Verify all servers healthy before/after changes
- **Maintain odd numbers** - Always keep 3 or 5 etcd servers, never 2 or 4

## Summary


Etcd mode reconfiguration is fully automatic - just update the environment variable and restart. The Control Plane handles all cluster operations including credential provisioning, membership changes, and configuration updates without manual intervention.

54 changes: 8 additions & 46 deletions server/internal/api/apiv1/pre_init_handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,14 @@ import (
"context"
"crypto/tls"
"crypto/x509"
"encoding/base64"
"fmt"
"net/http"
"net/url"
"os"

"github.com/google/uuid"
goahttp "goa.design/goa/v3/http"

api "github.com/pgEdge/control-plane/api/apiv1/gen/control_plane"
"github.com/pgEdge/control-plane/api/apiv1/gen/http/control_plane/client"
"github.com/pgEdge/control-plane/server/internal/cluster"
"github.com/pgEdge/control-plane/server/internal/config"
"github.com/pgEdge/control-plane/server/internal/etcd"
Expand Down Expand Up @@ -93,20 +90,15 @@ func (s *PreInitHandlers) JoinCluster(ctx context.Context, token *api.ClusterJoi
return ErrInvalidServerURL
}

http_client, err := s.GetClient()

httpClient, err := s.GetClient()
if err != nil {
return err
}

enc := goahttp.RequestEncoder
dec := goahttp.ResponseDecoder //make our own
c := client.NewClient(serverURL.Scheme, serverURL.Host, http_client, enc, dec, false)
cli := &api.Client{
GetJoinOptionsEndpoint: c.GetJoinOptions(),
}
// Use shared API client creation utility
apiClient := etcd.CreateAPIClient(serverURL, httpClient)

opts, err := cli.GetJoinOptions(ctx, &api.ClusterJoinRequest{
opts, err := apiClient.GetJoinOptions(ctx, &api.ClusterJoinRequest{
HostID: api.Identifier(s.cfg.HostID),
Hostname: s.cfg.Hostname,
Ipv4Address: s.cfg.IPv4Address,
Expand All @@ -117,43 +109,13 @@ func (s *PreInitHandlers) JoinCluster(ctx context.Context, token *api.ClusterJoi
return apiErr(err)
}

caCert, err := base64.StdEncoding.DecodeString(opts.Credentials.CaCert)
if err != nil {
return apiErr(fmt.Errorf("failed to decode CA certificate: %w", err))
}
clientCert, err := base64.StdEncoding.DecodeString(opts.Credentials.ClientCert)
if err != nil {
return apiErr(fmt.Errorf("failed to decode client certificate: %w", err))
}
clientKey, err := base64.StdEncoding.DecodeString(opts.Credentials.ClientKey)
if err != nil {
return apiErr(fmt.Errorf("failed to decode client key: %w", err))
}
serverCert, err := base64.StdEncoding.DecodeString(opts.Credentials.ServerCert)
if err != nil {
return apiErr(fmt.Errorf("failed to decode server certificate: %w", err))
}
serverKey, err := base64.StdEncoding.DecodeString(opts.Credentials.ServerKey)
// Decode credentials using shared utility
joinOptions, err := etcd.DecodeJoinCredentials(opts)
if err != nil {
return apiErr(fmt.Errorf("failed to decode server key: %w", err))
return apiErr(err)
}

err = s.etcd.Join(ctx, etcd.JoinOptions{
Leader: &etcd.ClusterMember{
Name: opts.Leader.Name,
PeerURLs: opts.Leader.PeerUrls,
ClientURLs: opts.Leader.ClientUrls,
},
Credentials: &etcd.HostCredentials{
Username: opts.Credentials.Username,
Password: opts.Credentials.Password,
CaCert: caCert,
ClientCert: clientCert,
ClientKey: clientKey,
ServerCert: serverCert,
ServerKey: serverKey,
},
})
err = s.etcd.Join(ctx, *joinOptions)
if err != nil {
return apiErr(fmt.Errorf("failed to join existing cluster: %w", err))
}
Expand Down
78 changes: 78 additions & 0 deletions server/internal/etcd/embedded.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,10 @@ func (e *EmbeddedEtcd) Start(ctx context.Context) error {
e.mu.Lock()
defer e.mu.Unlock()

if e.etcd != nil {
return nil // already started
}

initialized, err := e.IsInitialized()
if err != nil {
return err
Expand Down Expand Up @@ -292,6 +296,7 @@ func (e *EmbeddedEtcd) Shutdown() error {
}
if e.etcd != nil {
e.etcd.Close()
e.etcd = nil
}
return errors.Join(errs...)
}
Expand Down Expand Up @@ -443,6 +448,79 @@ func (e *EmbeddedEtcd) HealthCheck() common.ComponentStatus {
}
}

// ChangeMode transitions this host from running an embedded etcd server to
// acting as a client of the remaining cluster members. The only supported
// target is config.EtcdModeClient; any other mode yields an error.
//
// The sequence is order-sensitive:
//  1. Make sure the embedded server is running and list the cluster members.
//  2. Persist the other members' client URLs as this host's client endpoints.
//  3. Shut the embedded server down and start a remote client.
//  4. Remove this host's member from the cluster through the remote client.
//  5. Delete the embedded data directory and clear the server settings from
//     the generated config.
//
// On success it returns the started remote Etcd implementation that replaces
// the receiver. On failure it returns a nil Etcd and the error; steps that
// already completed are not rolled back.
func (e *EmbeddedEtcd) ChangeMode(ctx context.Context, mode config.EtcdMode) (Etcd, error) {
	if mode != config.EtcdModeClient {
		return nil, fmt.Errorf("invalid mode transition from %s to %s", config.EtcdModeServer, mode)
	}

	// The member list is fetched through the embedded client, so the embedded
	// server has to be up first. Start returns early if it already is.
	if err := e.Start(ctx); err != nil {
		return nil, err
	}

	cfg := e.cfg.Config()

	embeddedClient, err := e.GetClient()
	if err != nil {
		return nil, err
	}

	// Get the full member list before removing this host
	resp, err := embeddedClient.MemberList(ctx)
	if err != nil {
		return nil, fmt.Errorf("failed to list etcd members for server->client transition: %w", err)
	}

	var endpoints []string
	for _, m := range resp.Members {
		// Skip this host's member; we are about to remove it.
		if m.Name == cfg.HostID {
			continue
		}
		endpoints = append(endpoints, m.ClientURLs...)
	}

	// Without another member to talk to, demoting would leave this host with
	// nothing to connect to.
	if len(endpoints) == 0 {
		return nil, fmt.Errorf("cannot demote etcd server on host %s: no remaining cluster members with client URLs", cfg.HostID)
	}

	// Persist the endpoints before shutting down so the remote client created
	// below has somewhere to connect.
	generated := e.cfg.GeneratedConfig()
	generated.EtcdClient.Endpoints = endpoints
	if err := e.cfg.UpdateGeneratedConfig(generated); err != nil {
		return nil, fmt.Errorf("failed to update generated config with client endpoints: %w", err)
	}

	if err := e.Shutdown(); err != nil {
		return nil, err
	}

	remote := NewRemoteEtcd(e.cfg, e.logger)
	if err := remote.Start(ctx); err != nil {
		return nil, fmt.Errorf("failed to start remote client: %w", err)
	}

	remoteClient, err := remote.GetClient()
	if err != nil {
		return nil, fmt.Errorf("failed to get remote client: %w", err)
	}

	// The local server is already stopped, so the membership change is issued
	// through the remaining members.
	if err := RemoveMember(ctx, remoteClient, cfg.HostID); err != nil {
		return nil, fmt.Errorf("failed to remove embedded etcd from cluster: %w", err)
	}

	if err := os.RemoveAll(e.etcdDir()); err != nil {
		return nil, fmt.Errorf("failed to remove embedded etcd data dir: %w", err)
	}

	generated.EtcdMode = config.EtcdModeClient
	generated.EtcdServer = config.EtcdServer{}
	// NOTE(review): cfg was read before the endpoints above were written, so
	// this assignment may discard them from the generated config — confirm
	// that is intended.
	generated.EtcdClient = cfg.EtcdClient
	if err := e.cfg.UpdateGeneratedConfig(generated); err != nil {
		return nil, fmt.Errorf("failed to clear out etcd server settings in generated config: %w", err)
	}

	// Every error path has returned by now; return an explicit nil rather than
	// the stale err variable.
	return remote, nil
}

const maxLearnerStallTime = 5 * time.Minute

type learnerProgress struct {
Expand Down
2 changes: 2 additions & 0 deletions server/internal/etcd/interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
clientv3 "go.etcd.io/etcd/client/v3"

"github.com/pgEdge/control-plane/server/internal/common"
"github.com/pgEdge/control-plane/server/internal/config"
)

type ClusterMember struct {
Expand Down Expand Up @@ -50,4 +51,5 @@ type Etcd interface {
RemoveHost(ctx context.Context, hostID string) error
JoinToken() (string, error)
VerifyJoinToken(in string) error
ChangeMode(ctx context.Context, mode config.EtcdMode) (Etcd, error)
}
Loading