From cc2c005faddc11a0cbfb049b4af52d92d60b5263 Mon Sep 17 00:00:00 2001 From: Daniel Nephin Date: Tue, 15 Feb 2022 14:10:07 -0500 Subject: [PATCH 1/2] debug: limit the size of the trace We've noticed that a trace that is captured over the full duration is too large to open on most machines. A trace.out captured over just the interval period (30s by default) should be more than enough time to capture trace data. --- .changelog/12359.txt | 3 +++ command/debug/debug.go | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) create mode 100644 .changelog/12359.txt diff --git a/.changelog/12359.txt b/.changelog/12359.txt new file mode 100644 index 0000000000..6c6c3e4511 --- /dev/null +++ b/.changelog/12359.txt @@ -0,0 +1,3 @@ +```release-note:improvement +debug: reduce the capture time for trace to only a single interval instead of the full duration to make trace.out easier to open without running into OOM errors. +``` diff --git a/command/debug/debug.go b/command/debug/debug.go index 1749d76b29..acbe36d937 100644 --- a/command/debug/debug.go +++ b/command/debug/debug.go @@ -413,7 +413,7 @@ func (c *cmd) captureLongRunning(ctx context.Context) error { g.Go(func() error { // use ctx without a timeout to allow the trace to finish sending - return c.captureTrace(ctx, s) + return c.captureTrace(ctx, int(c.interval.Seconds())) }) } if c.captureTarget(targetLogs) { @@ -443,8 +443,8 @@ func (c *cmd) captureGoRoutines(outputDir string) error { return ioutil.WriteFile(filepath.Join(outputDir, "goroutine.prof"), gr, 0644) } -func (c *cmd) captureTrace(ctx context.Context, s float64) error { - prof, err := c.client.Debug().PProf(ctx, "trace", int(s)) +func (c *cmd) captureTrace(ctx context.Context, duration int) error { + prof, err := c.client.Debug().PProf(ctx, "trace", duration) if err != nil { return fmt.Errorf("failed to collect cpu profile: %w", err) } From 53ae4b3e2ca3a9274de44e56dde74b06816bb404 Mon Sep 17 00:00:00 2001 From: Daniel Nephin Date: Tue, 15 Feb 2022 
18:16:12 -0500 Subject: [PATCH 2/2] debug: update CLI docs To clarify how trace is captured. Also remove the minimum seconds check, because that is already done in prepare() --- command/debug/debug.go | 8 ++------ website/content/commands/debug.mdx | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/command/debug/debug.go b/command/debug/debug.go index acbe36d937..587462a4b7 100644 --- a/command/debug/debug.go +++ b/command/debug/debug.go @@ -400,15 +400,11 @@ func makeIntervalDir(base string, now time.Time) (string, error) { func (c *cmd) captureLongRunning(ctx context.Context) error { g := new(errgroup.Group) - // Capture a profile/trace with a minimum of 1s - s := c.duration.Seconds() - if s < 1 { - s = 1 - } + if c.captureTarget(targetProfiles) { g.Go(func() error { // use ctx without a timeout to allow the profile to finish sending - return c.captureProfile(ctx, s) + return c.captureProfile(ctx, c.duration.Seconds()) }) g.Go(func() error { diff --git a/website/content/commands/debug.mdx b/website/content/commands/debug.mdx index ded40e58e6..58434cb16a 100644 --- a/website/content/commands/debug.mdx +++ b/website/content/commands/debug.mdx @@ -78,7 +78,7 @@ information when `debug` is running. By default, it captures all information. | `members` | A list of all the WAN and LAN members in the cluster. | | `metrics` | Metrics from the in-memory metrics endpoint in the target, captured at the interval. | | `logs` | `DEBUG` level logs for the target agent, captured for the duration. | -| `pprof` | Golang heap, CPU, goroutine, and trace profiling. CPU and traces are captured for `duration` in a single file while heap and goroutine are separate snapshots for each `interval`. This information is not retrieved unless [`enable_debug`](/docs/agent/options#enable_debug) is set to `true` on the target agent or ACLs are enable and an ACL token with `operator:read` is provided. | +| `pprof` | Golang heap, CPU, goroutine, and trace profiling. 
CPU profile is captured for `duration` in a single file, trace is captured for a single `interval`, while heap and goroutine are separate snapshots for each `interval`. This information is not retrieved unless [`enable_debug`](/docs/agent/options#enable_debug) is set to `true` on the target agent or ACLs are enabled and an ACL token with `operator:read` is provided. | ## Examples