Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions pkg/testing/fixture.go
Original file line number Diff line number Diff line change
Expand Up @@ -860,6 +860,63 @@ func (f *Fixture) ExecStatus(ctx context.Context, opts ...statusOpt) (AgentStatu
}
}

// ExecStatusRaw executes `elastic-agent status --output=json`.
//
// Returns the output parsed as map[string]any and the error from the execution. Keep in mind the agent exits with status 1 if it's
Comment on lines +863 to +865
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The agent control server uses a non-standard format when serializing timestamps, which breaks the ExecStatus command. I've added this method in order to avoid this.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is weird, how is something formatted with time.Format not a valid time.Time?

Do you have an example of the error this causes?

I don't like just duplicating the function like this, it would be better to fix things in place, but I can't suggest alternatives unless I know what specific error we are working around.

Copy link
Contributor Author

@michel-laterman michel-laterman Dec 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agent status returned an error: could not unmarshal agent status output: parsing time "2025-12-18 19:02:32 +0000 UTC" as "2006-01-02T15:04:05Z07:00": cannot parse " 19:02:32 +0000 UTC" as "T".

By default, Go will try to parse strings using the RFC3339 nano format (docs); the control server does not use this format when serializing times to send to the client.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a drawback to changing to RFC3339? It seems like this would always unconditionally fail without a custom unmarshal implementation. Is this just the first time anyone has ever tried to unmarshal this as JSON in Go?

This is clearly just an artifact of the control server, the upgrade watcher is using gRPC instead so doesn't hit this I suspect because it never unmarshals as JSON.

Why do we have this non-standard format?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can't think of a drawback from a technical standpoint; I've made a draft to change it here: #11923.
I don't know why the non-standard format was chosen, perhaps @blakerouse or @michalpristas would know?

// unhealthy, but it still outputs the status successfully. This call does require that the Elastic Agent is running
// and communication over the control protocol is working.
//
// By default, retry logic is applied. Use WithNoRetry to disable this behavior. WithRetryTimeout and WithRetryInterval
// can be used to adjust the retry logic timing. The default retry timeout is one minute and the default retry
// interval is one second.
//
// A nil or empty map and a non-nil error mean the output could not be parsed or was empty. As long as we get
// some output, we don't return any error. It should work with any 8.6+ agent.
func (f *Fixture) ExecStatusRaw(ctx context.Context, opts ...statusOpt) (map[string]any, error) {
	// Seed the retry defaults first so that caller-supplied options can override them.
	var opt statusOpts
	opt.retryTimeout = 1 * time.Minute
	opt.retryInterval = 1 * time.Second
	for _, o := range opts {
		o(&opt)
	}

	// Without retries (or with a zero timeout) only the caller's context bounds the call;
	// otherwise the whole retry loop is capped by retryTimeout.
	var cancel context.CancelFunc
	if opt.noRetry || opt.retryTimeout == 0 {
		ctx, cancel = context.WithCancel(ctx)
	} else {
		ctx, cancel = context.WithTimeout(ctx, opt.retryTimeout)
	}
	defer cancel()

	var lastErr error
	for {
		if ctxErr := ctx.Err(); ctxErr != nil {
			if errors.Is(ctxErr, context.DeadlineExceeded) && lastErr != nil {
				// The retry budget ran out: the last observed failure is more useful
				// to the caller than a bare deadline error.
				return nil, fmt.Errorf("agent status returned an error: %w", lastErr)
			}
			return nil, fmt.Errorf("agent status failed: %w", ctxErr)
		}

		// err is not checked on its own: a non-nil execution error is only reported
		// when the output cannot be used.
		out, err := f.Exec(ctx, []string{"status", "--output", "json"}, opt.cmdOptions...)
		status := map[string]any{}
		uerr := json.Unmarshal(out, &status)
		switch {
		case uerr != nil:
			// unmarshal error means that json was not outputted due to a communication error
			lastErr = fmt.Errorf("could not unmarshal agent status output: %w:\n%s", errors.Join(uerr, err), out)
		case len(status) == 0:
			// still not correct try again for a successful status
			if err != nil {
				lastErr = fmt.Errorf("agent status output is empty: %w", err)
			} else {
				// Wrapping a nil error with %w would render as "%!w(<nil>)".
				lastErr = errors.New("agent status output is empty")
			}
		default:
			return status, nil
		}

		if opt.noRetry {
			// Single attempt requested: report the failure immediately.
			return status, lastErr
		}

		sleepFor(ctx, opt.retryInterval)
	}
}

// ExecInspect executes the inspect subcommand on the prepared Elastic Agent binary.
// It returns the parsed output and the error from the execution or an empty
// AgentInspectOutput and the unmarshalling error if it cannot unmarshal the
Expand Down
14 changes: 7 additions & 7 deletions testing/integration/ess/proxy_url_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -824,8 +824,6 @@ func TestFleetDownloadProxyURL(t *testing.T) {
Sudo: true,
})

t.Skip("Flaky test, see https://github.com/elastic/elastic-agent/issues/11796#issuecomment-3667342065")

ctx := t.Context()
kibClient := info.KibanaClient
fleetServerURL, err := fleettools.DefaultURL(ctx, kibClient)
Expand Down Expand Up @@ -948,10 +946,12 @@ func TestFleetDownloadProxyURL(t *testing.T) {

t.Log("Ensure upgrade has failed")
require.EventuallyWithT(t, func(c *assert.CollectT) {
agent, err := kibClient.GetAgent(ctx, kibana.GetAgentRequest{ID: agentID})
status, err := startFixture.ExecStatusRaw(ctx)
require.NoError(c, err)
require.NotNil(c, agent.UpgradeDetails)
require.Equal(c, "UPG_FAILED", agent.UpgradeDetails.State)
require.NotNil(c, status["upgrade_details"], "Agent status does not contain upgrade_details.")
upgradeDetails, ok := status["upgrade_details"].(map[string]any)
require.True(t, ok, "expected upgrade_details to be an object, got type: %T", status["upgrade_details"])
require.Equal(c, "UPG_FAILED", upgradeDetails["state"])
}, time.Minute*5, time.Second, "Unable to verify that upgrade has failed.")

proxy := proxytest.New(t,
Expand Down Expand Up @@ -997,9 +997,9 @@ func TestFleetDownloadProxyURL(t *testing.T) {

t.Log("Ensure upgrade starts")
require.EventuallyWithT(t, func(c *assert.CollectT) {
agent, err := kibClient.GetAgent(ctx, kibana.GetAgentRequest{ID: agentID})
status, err := startFixture.ExecStatusRaw(ctx)
require.NoError(c, err)
require.NotNil(c, agent.UpgradeDetails)
require.NotNil(c, status["upgrade_details"], "Agent status does not contain upgrade_details.")
}, time.Minute*5, time.Second, "Unable to verify that upgrade details appear.")

t.Log("Waiting for upgrade watcher to start...")
Expand Down