-
Notifications
You must be signed in to change notification settings - Fork 2
feat: add election package #246
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
jason-lynch
wants to merge
2
commits into
main
Choose a base branch
from
feat/PLAT-347/election-pkg
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Some comments aren't visible on the classic Files Changed page.
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,290 @@ | ||
| package election | ||
|
|
||
| import ( | ||
| "context" | ||
| "errors" | ||
| "fmt" | ||
| "sync" | ||
| "sync/atomic" | ||
| "time" | ||
|
|
||
| "github.com/pgEdge/control-plane/server/internal/storage" | ||
| "github.com/rs/zerolog" | ||
| ) | ||
|
|
||
| // ClaimHandler is a callback function invoked when a candidate successfully | ||
| // claims leadership. Handlers are executed in separate goroutines. | ||
| type ClaimHandler func(ctx context.Context) | ||
|
|
||
| // Candidate participates in a single named election. Once elected, candidate | ||
| // maintains leadership until its Stop method is called or until it fails to | ||
| // renew its leadership claim. | ||
| type Candidate struct { | ||
| mu sync.Mutex | ||
| running bool | ||
| store *ElectionStore | ||
| logger zerolog.Logger | ||
| electionName Name | ||
| candidateID string | ||
| isLeader atomic.Bool | ||
| ttl time.Duration | ||
| watchOp storage.WatchOp[*StoredElection] | ||
| ticker *time.Ticker | ||
| done chan struct{} | ||
| errCh chan error | ||
| onClaim []ClaimHandler | ||
| } | ||
|
|
||
| // NewCandidate creates a new Candidate instance to participate in the specified | ||
| // election. The candidateID uniquely identifies this participant. The ttl | ||
| // determines how long a leadership claim remains valid without renewal. The | ||
| // onClaim handlers are invoked when this candidate successfully claims | ||
| // leadership. | ||
| func NewCandidate( | ||
| store *ElectionStore, | ||
| logger zerolog.Logger, | ||
| electionName Name, | ||
| candidateID string, | ||
| ttl time.Duration, | ||
| onClaim []ClaimHandler, | ||
| ) *Candidate { | ||
| return &Candidate{ | ||
| store: store, | ||
| logger: logger.With(). | ||
| Str("component", "election_candidate"). | ||
| Stringer("election_name", electionName). | ||
| Str("candidate_id", candidateID). | ||
| Logger(), | ||
| electionName: electionName, | ||
| candidateID: candidateID, | ||
| ttl: ttl, | ||
| errCh: make(chan error, 1), | ||
| onClaim: onClaim, | ||
| } | ||
| } | ||
|
|
||
| // Start begins participating in the election. It synchronously attempts to | ||
| // claim leadership and starts an asynchronous process to periodically refresh | ||
| // its claim or re-attempt to claim leadership. Start is idempotent. | ||
| func (c *Candidate) Start(ctx context.Context) error { | ||
| c.mu.Lock() | ||
| defer c.mu.Unlock() | ||
|
|
||
| if c.running { | ||
| return nil | ||
| } | ||
|
|
||
| if err := c.checkClaim(ctx); err != nil { | ||
| return err | ||
| } | ||
|
|
||
| // we're intentionally not assigning e.done and e.ticker directly to capture | ||
| // these variables in this closure and avoid a data race if start is called | ||
| // again. | ||
| done := make(chan struct{}, 1) | ||
| ticker := time.NewTicker(c.ttl / 3) | ||
|
|
||
| go func() { | ||
| defer ticker.Stop() | ||
|
|
||
| for { | ||
| select { | ||
| case <-ctx.Done(): | ||
| return | ||
| case <-done: | ||
| return | ||
| case <-ticker.C: | ||
| if err := c.lockAndCheckClaim(ctx); err != nil { | ||
| c.errCh <- err | ||
| } | ||
| } | ||
| } | ||
| }() | ||
|
|
||
| c.running = true | ||
| c.done = done | ||
| c.ticker = ticker | ||
|
|
||
| go c.watch(ctx) | ||
|
|
||
| if c.IsLeader() { | ||
| c.logger.Debug().Msg("i am the current leader") | ||
| } | ||
|
|
||
| return nil | ||
| } | ||
jason-lynch marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| // IsLeader returns true if this candidate currently holds leadership. | ||
| func (c *Candidate) IsLeader() bool { | ||
| return c.isLeader.Load() | ||
| } | ||
|
|
||
| // Stop ceases participation in the election, releases leadership if held, and | ||
| // stops the asynchronous renewal process. Stop is idempotent. | ||
| func (c *Candidate) Stop(ctx context.Context) error { | ||
| c.mu.Lock() | ||
| defer c.mu.Unlock() | ||
|
|
||
| if !c.running { | ||
| return nil | ||
| } | ||
|
|
||
| if c.watchOp != nil { | ||
| c.watchOp.Close() | ||
| c.watchOp = nil | ||
| } | ||
|
|
||
| c.done <- struct{}{} | ||
| c.running = false | ||
|
|
||
| return c.release(ctx) | ||
| } | ||
|
|
||
| // Error returns a channel that receives errors encountered during election | ||
| // operations such as claim attempts, renewals, or watch failures. | ||
| func (c *Candidate) Error() <-chan error { | ||
| return c.errCh | ||
| } | ||
|
|
||
| func (c *Candidate) AddHandlers(handlers ...ClaimHandler) { | ||
| c.mu.Lock() | ||
| defer c.mu.Unlock() | ||
|
|
||
| c.onClaim = append(c.onClaim, handlers...) | ||
| } | ||
|
|
||
| func (c *Candidate) lockAndCheckClaim(ctx context.Context) error { | ||
| c.mu.Lock() | ||
| defer c.mu.Unlock() | ||
|
|
||
| return c.checkClaim(ctx) | ||
| } | ||
|
|
||
| func (c *Candidate) checkClaim(ctx context.Context) error { | ||
| const maxRetries = 3 | ||
| for range maxRetries { | ||
| curr, err := c.store.GetByKey(c.electionName).Exec(ctx) | ||
| switch { | ||
| case errors.Is(err, storage.ErrNotFound): | ||
| return c.attemptClaim(ctx) | ||
| case err != nil: | ||
| return fmt.Errorf("failed to check for existing leader: %w", err) | ||
| } | ||
|
|
||
| c.isLeader.Store(curr.LeaderID == c.candidateID) | ||
| if curr.LeaderID != c.candidateID { | ||
| return nil | ||
| } | ||
|
|
||
| err = c.store.Update(curr). | ||
| WithTTL(c.ttl). | ||
| Exec(ctx) | ||
| switch { | ||
| case errors.Is(err, storage.ErrValueVersionMismatch): | ||
| // Can happen if we caught the claim right before it expired. The | ||
| // continue will retry the operation. When we re-fetch the claim, it | ||
| // will either not exist or it will belong to someone else. | ||
| continue | ||
| case err != nil: | ||
| return fmt.Errorf("failed to refresh claim: %w", err) | ||
| } | ||
|
|
||
| return nil | ||
| } | ||
| return fmt.Errorf("failed to refresh claim after %d retries", maxRetries) | ||
| } | ||
|
|
||
| func (c *Candidate) attemptClaim(ctx context.Context) error { | ||
| c.logger.Debug().Msg("attempting to claim leadership") | ||
|
|
||
| err := c.store. | ||
| Create(&StoredElection{ | ||
| Name: c.electionName, | ||
| LeaderID: c.candidateID, | ||
| CreatedAt: time.Now(), | ||
| }). | ||
| WithTTL(c.ttl). | ||
| Exec(ctx) | ||
| switch { | ||
| case err == nil: | ||
| c.isLeader.Store(true) | ||
| c.logger.Debug().Msg("successfully claimed leadership") | ||
| for _, handler := range c.onClaim { | ||
| go handler(ctx) | ||
| } | ||
| case !errors.Is(err, storage.ErrAlreadyExists): | ||
| return fmt.Errorf("failed to claim leadership: %w", err) | ||
| default: | ||
| c.isLeader.Store(false) | ||
| } | ||
|
|
||
| return nil | ||
| } | ||
|
|
||
| func (c *Candidate) watch(ctx context.Context) { | ||
| c.mu.Lock() | ||
| defer c.mu.Unlock() | ||
|
|
||
| c.logger.Debug().Msg("starting watch") | ||
|
|
||
| c.watchOp = c.store.Watch(c.electionName) | ||
| err := c.watchOp.Watch(ctx, func(evt *storage.Event[*StoredElection]) { | ||
| switch evt.Type { | ||
| case storage.EventTypeDelete: | ||
| // The delete event will fire simultaneously with the ticker in some | ||
| // types of outages, so the claim might have already been created | ||
| // when this handler runs, even though its for a 'delete' event. | ||
| if err := c.lockAndCheckClaim(ctx); err != nil { | ||
| c.errCh <- err | ||
| } | ||
| case storage.EventTypeError: | ||
| c.logger.Debug().Err(evt.Err).Msg("encountered a watch error") | ||
| if errors.Is(evt.Err, storage.ErrWatchClosed) && c.running { | ||
| // Restart the watch if we're still running. | ||
| c.watch(ctx) | ||
| } | ||
| } | ||
| }) | ||
| if err != nil { | ||
| c.errCh <- fmt.Errorf("failed to start watch: %w", err) | ||
| } | ||
| } | ||
jason-lynch marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| func (c *Candidate) release(ctx context.Context) error { | ||
| ctx, cancel := context.WithTimeout(ctx, c.ttl) | ||
| defer cancel() | ||
|
|
||
| if !c.isLeader.Load() { | ||
| return nil | ||
| } | ||
|
|
||
| curr, err := c.store.GetByKey(c.electionName).Exec(ctx) | ||
| switch { | ||
| case errors.Is(err, storage.ErrNotFound): | ||
| // Happens when the claim has expired since the last time we checked it | ||
| // and no one else has claimed it. | ||
| c.isLeader.Store(false) | ||
| return nil | ||
| case err != nil: | ||
| return fmt.Errorf("failed to fetch current leader: %w", err) | ||
| case curr.LeaderID != c.candidateID: | ||
| // Happens when the claim has expired since the last time we checked it | ||
| // and someone else has claimed it. | ||
| c.isLeader.Store(false) | ||
| return nil | ||
| } | ||
|
|
||
| err = c.store.Delete(curr).Exec(ctx) | ||
| switch { | ||
| case errors.Is(err, storage.ErrValueVersionMismatch): | ||
| // Happens when the claim has expired after the above check and someone | ||
| // else has claimed it. | ||
| c.isLeader.Store(false) | ||
| return nil | ||
| case err != nil: | ||
| return fmt.Errorf("failed to release leadership claim: %w", err) | ||
| } | ||
|
|
||
| c.isLeader.Store(false) | ||
| return nil | ||
| } | ||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.