go-watchdog/watchdog_test.go

package watchdog

import (
	"fmt"
	"io/ioutil"
	"log"
	"os"
	"path/filepath"
	"runtime"
	"runtime/debug"
	"strconv"
	"testing"
	"time"

	"github.com/elastic/gosigar"
	"github.com/raulk/clock"
	"github.com/stretchr/testify/require"
)

const (
	// EnvTestIsolated is a marker property for the runner to confirm that this
	// test is running in isolation (i.e. a dedicated process).
	EnvTestIsolated = "TEST_ISOLATED"

	// EnvTestDockerMemLimit is the memory limit applied in a docker container.
	EnvTestDockerMemLimit = "TEST_DOCKER_MEMLIMIT"
)

// DockerMemLimit is initialized in the init() function from the
// EnvTestDockerMemLimit env variable.
var DockerMemLimit int // bytes

func init() {
	Logger = &stdlog{log: log.New(os.Stdout, "[watchdog test] ", log.LstdFlags|log.Lmsgprefix), debug: true}

	if l := os.Getenv(EnvTestDockerMemLimit); l != "" {
		l, err := strconv.Atoi(l)
		if err != nil {
			panic(err)
		}
		DockerMemLimit = l
	}
}

func skipIfNotIsolated(t *testing.T) {
	if os.Getenv(EnvTestIsolated) != "1" {
		t.Skipf("skipping test in non-isolated mode")
	}
}

var (
	limit64MiB uint64 = 64 << 20 // 64MiB.
)

func TestControl_Isolated(t *testing.T) {
	skipIfNotIsolated(t)

	debug.SetGCPercent(100)

	rounds := 100
	if DockerMemLimit != 0 {
		rounds /= int(float64(DockerMemLimit)*0.8) / 1024 / 1024
	}

	// retain 1MiB every iteration.
	var retained [][]byte
	for i := 0; i < rounds; i++ {
		b := make([]byte, 1*1024*1024)
		for i := range b {
			b[i] = byte(i)
		}
		retained = append(retained, b)
	}

	for _, b := range retained {
		for i := range b {
			b[i] = byte(i)
		}
	}

	var ms runtime.MemStats
	runtime.ReadMemStats(&ms)
	require.NotZero(t, ms.NumGC)    // GCs have taken place, but...
	require.Zero(t, ms.NumForcedGC) // ... no forced GCs beyond our initial one.
}

func TestHeapDriven_Isolated(t *testing.T) {
	skipIfNotIsolated(t)

	// we can't mock ReadMemStats, because we're relying on the go runtime to
	// enforce the GC run, and the go runtime won't use our mock. Therefore, we
	// need to do the actual thing.
	debug.SetGCPercent(100)

	clk := clock.NewMock()
	Clock = clk

	observations := make([]*runtime.MemStats, 0, 100)
	NotifyGC = func() {
		var ms runtime.MemStats
		runtime.ReadMemStats(&ms)
		observations = append(observations, &ms)
	}

	// limit is 64MiB.
	err, stopFn := HeapDriven(limit64MiB, NewAdaptivePolicy(0.5))
	require.NoError(t, err)
	defer stopFn()

	time.Sleep(500 * time.Millisecond) // give time for the watchdog to init.

	// retain 1MiB every iteration, up to 100MiB (beyond heap limit!).
	var retained [][]byte
	for i := 0; i < 100; i++ {
		retained = append(retained, make([]byte, 1*1024*1024))
	}

	for _, o := range observations {
		fmt.Println("heap alloc:", o.HeapAlloc, "next gc:", o.NextGC, "gc count:", o.NumGC, "forced gc:", o.NumForcedGC)
	}

	var ms runtime.MemStats
	runtime.ReadMemStats(&ms)
	require.GreaterOrEqual(t, ms.NumGC, uint32(5)) // over 5 GCs should've taken place.
}

func TestSystemDriven_Isolated(t *testing.T) {
	skipIfNotIsolated(t)

	debug.SetGCPercent(100)

	clk := clock.NewMock()
	Clock = clk

	// mock the system reporting.
	var actualUsed uint64
	sysmemFn = func(g *gosigar.Mem) error {
		g.ActualUsed = actualUsed
		return nil
	}

	// limit is 64MiB.
	err, stopFn := SystemDriven(limit64MiB, 5*time.Second, NewAdaptivePolicy(0.5))
	require.NoError(t, err)
	defer stopFn()

	time.Sleep(200 * time.Millisecond) // give time for the watchdog to init.

	notifyCh := make(chan struct{}, 1)
	NotifyGC = func() {
		notifyCh <- struct{}{}
	}

	// first tick; used = 0.
	clk.Add(5 * time.Second)
	time.Sleep(200 * time.Millisecond)
	require.Len(t, notifyCh, 0) // no GC has taken place.

	// second tick; used = just over 50%; will trigger GC.
	actualUsed = (limit64MiB / 2) + 1
	clk.Add(5 * time.Second)
	time.Sleep(200 * time.Millisecond)
	require.Len(t, notifyCh, 1)
	<-notifyCh

	// third tick; just below 75%; no GC.
	actualUsed = uint64(float64(limit64MiB)*0.75) - 1
	clk.Add(5 * time.Second)
	time.Sleep(200 * time.Millisecond)
	require.Len(t, notifyCh, 0)

	// fourth tick; 75% exactly; will trigger GC.
	actualUsed = uint64(float64(limit64MiB)*0.75) + 1
	clk.Add(5 * time.Second)
	time.Sleep(200 * time.Millisecond)
	require.Len(t, notifyCh, 1)
	<-notifyCh

	var ms runtime.MemStats
	runtime.ReadMemStats(&ms)
	require.GreaterOrEqual(t, ms.NumForcedGC, uint32(2))
}

// TestHeapdumpCapture tests that heap dumps are captured appropriately.
func TestHeapdumpCapture(t *testing.T) {
	debug.SetGCPercent(100)

	dir, err := ioutil.TempDir("", "")
	require.NoError(t, err)

	t.Cleanup(func() {
		_ = os.RemoveAll(dir)
	})

	assertFileCount := func(expected int) {
		glob, err := filepath.Glob(filepath.Join(dir, "*"))
		require.NoError(t, err)
		require.Len(t, glob, expected)
	}

	HeapdumpDir = dir
	HeapdumpThreshold = 0.5
	HeapdumpMaxCaptures = 5

	// mock clock.
	clk := clock.NewMock()
	Clock = clk

	// mock the system reporting.
	var actualUsed uint64
	sysmemFn = func(g *gosigar.Mem) error {
		g.ActualUsed = actualUsed
		return nil
	}

	// init a system driven watchdog.
	err, stopFn := SystemDriven(limit64MiB, 5*time.Second, NewAdaptivePolicy(0.5))
	require.NoError(t, err)
	defer stopFn()
	time.Sleep(200 * time.Millisecond) // give time for the watchdog to init.

	// first tick; used = 0.
	clk.Add(5 * time.Second)
	time.Sleep(200 * time.Millisecond)
	assertFileCount(0)

	// second tick; used = just over 50%; will trigger a heapdump.
	actualUsed = (limit64MiB / 2) + 1
	clk.Add(5 * time.Second)
	time.Sleep(200 * time.Millisecond)
	assertFileCount(1)

	// third tick; continues above 50%; same episode, no heapdump.
	actualUsed = (limit64MiB / 2) + 10
	clk.Add(5 * time.Second)
	time.Sleep(200 * time.Millisecond)
	assertFileCount(1)

	// fourth tick; below 50%; this resets the episodic flag.
	actualUsed = limit64MiB / 3
	clk.Add(5 * time.Second)
	time.Sleep(200 * time.Millisecond)
	assertFileCount(1)

	// fifth tick; above 50%; this triggers a new heapdump.
	actualUsed = (limit64MiB / 2) + 1
	clk.Add(5 * time.Second)
	time.Sleep(200 * time.Millisecond)
	assertFileCount(2)

	for i := 0; i < 20; i++ {
		// below 50%; this resets the episodic flag.
		actualUsed = limit64MiB / 3
		clk.Add(5 * time.Second)
		time.Sleep(200 * time.Millisecond)

		// above 50%; this triggers a new heapdump.
		actualUsed = (limit64MiB / 2) + 1
		clk.Add(5 * time.Second)
		time.Sleep(200 * time.Millisecond)
	}

	assertFileCount(5) // we only generated 5 heap dumps even though we had more episodes.

	// verify that heap dump file sizes aren't zero.
	glob, err := filepath.Glob(filepath.Join(dir, "*"))
	require.NoError(t, err)
	for _, f := range glob {
		fi, err := os.Stat(f)
		require.NoError(t, err)
		require.NotZero(t, fi.Size())
	}

}