go-watchdog/watchdog_linux.go
Raúl Kripalani 31d951f370 implement automatic heapdumps when usage is above threshold.
A heapdump will be captured when the usage crosses above the threshold.
Staying above the threshold won't trigger another heapdump.
If the usage goes down, then back up, that is considered another
"episode" to be captured in a heapdump.

This feature is driven by three parameters:

* HeapdumpDir: the directory where the watchdog will write the heapdump.
  It will be created if it doesn't exist upon initialization. An error when
  creating the dir will not prevent watchdog initialization; it will just
  disable the heapdump capture feature.

  If zero-valued, the feature is disabled. Heapdumps will be written to path:
  <HeapdumpDir>/<RFC3339Nano formatted timestamp>.heap.

* HeapdumpMaxCaptures: sets the maximum number of heapdumps a process will
  generate. This limits the number of episodes that will be captured, in case
  the utilization climbs repeatedly over the threshold. By default, it is 10.

* HeapdumpThreshold: sets the utilization threshold that will trigger a
  heap dump to be taken automatically. A zero value disables this feature.
  By default, it is disabled.
2021-01-19 20:02:16 +00:00

74 lines
2.1 KiB
Go

package watchdog
import (
"fmt"
"os"
"time"
"github.com/containerd/cgroups"
)
var (
	// memSubsystem is the hierarchy argument handed to cgroups.Load; it is
	// built from the cgroups V1 hierarchy and the Memory subsystem name.
	memSubsystem = cgroups.SingleSubsystem(cgroups.V1, cgroups.Memory)

	// pid is the ID of the current process, captured once when the package
	// is initialized.
	pid = os.Getpid()
)
// CgroupDriven starts a watchdog whose memory limit and usage figures both
// come from the process's memory cgroup.
//
// The cgroup is located through the process's own nested path (derived from
// /proc/self/cgroup). When the process runs as PID 1, which indicates it is
// running inside a container, the root cgroup path is used instead.
//
// The limit is read once, at initialization, from the cgroup memory stats and
// passed to policyCtor to build the policy. Afterwards the cgroup stats are
// re-queried for the current usage by the polling watchdog, which is launched
// in its own goroutine with the given frequency.
//
// An error is returned immediately (together with a nil stop function) if the
// cgroup cannot be loaded, e.g. because the OS does not support cgroups, if
// its memory stats are unavailable, if the reported limit is 0, if the policy
// cannot be constructed, or if the watchdog fails to start. The caller can
// then safely fall back to the system driven watchdog. On success the
// returned error is nil and stopFn is the package's stop function.
//
// NOTE(review): only a limit of exactly 0 is rejected here. Confirm whether an
// unconstrained cgroup can report some other (very large) value and, if so,
// whether that case should fall back as well.
func CgroupDriven(frequency time.Duration, policyCtor PolicyCtor) (err error, stopFn func()) {
	// Default to our own cgroup; as PID 1 we are inside a container and the
	// limits live in the root path.
	cgPath := cgroups.NestedPath("")
	if os.Getpid() == 1 {
		cgPath = cgroups.RootPath
	}

	cg, loadErr := cgroups.Load(memSubsystem, cgPath)
	if loadErr != nil {
		return fmt.Errorf("failed to load cgroup for process: %w", loadErr), nil
	}

	// Take a first reading to discover the memory limit.
	initial, statErr := cg.Stat()
	switch {
	case statErr != nil:
		return fmt.Errorf("failed to load memory cgroup stats: %w", statErr), nil
	case initial.Memory == nil || initial.Memory.Usage == nil:
		return fmt.Errorf("cgroup memory stats are nil; aborting"), nil
	}

	limit := initial.Memory.Usage.Limit
	if limit == 0 {
		return fmt.Errorf("cgroup limit is 0; refusing to start memory watchdog"), nil
	}

	policy, ctorErr := policyCtor(limit)
	if ctorErr != nil {
		return fmt.Errorf("failed to construct policy with limit %d: %w", limit, ctorErr), nil
	}

	if startErr := start(UtilizationProcess); startErr != nil {
		return startErr, nil
	}

	// currentUsage re-reads the cgroup stats on every poll and reports the
	// memory usage figure.
	currentUsage := func() (uint64, error) {
		snapshot, readErr := cg.Stat()
		if readErr != nil {
			return 0, readErr
		}
		if snapshot.Memory == nil || snapshot.Memory.Usage == nil {
			return 0, fmt.Errorf("cgroup memory stats are nil; aborting")
		}
		return snapshot.Memory.Usage.Usage, nil
	}

	// Register the goroutine on the watchdog's wait group before launching it.
	_watchdog.wg.Add(1)
	go pollingWatchdog(policy, frequency, limit, currentUsage)

	return nil, stop
}