From b09281bd36137474ebfaf08017a9e03b36e7d291 Mon Sep 17 00:00:00 2001
From: Charlie Vieth
Date: Sun, 15 Mar 2015 17:40:38 -0400
Subject: [PATCH 1/4] Update benchmarks to focus on conversion performance.

Benchmark each interpolation function for RGBA and YCbCr images. Reduce
the image size used when benchmarking to limit the influence of memory
performance on the results.
---
 resize_test.go | 121 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 85 insertions(+), 36 deletions(-)

diff --git a/resize_test.go b/resize_test.go
index f49bad6..3ba03f2 100644
--- a/resize_test.go
+++ b/resize_test.go
@@ -85,56 +85,105 @@ func Test_SameSizeReturnsOriginal(t *testing.T) {
 	}
 }
 
-func Benchmark_BigResizeLanczos3(b *testing.B) {
-	var m image.Image
-	for i := 0; i < b.N; i++ {
-		m = Resize(1000, 1000, img, Lanczos3)
-	}
-	m.At(0, 0)
-}
+const (
+	// Use a small image size for benchmarks. We don't want memory performance
+	// to affect the benchmark results.
+	benchMaxX = 250
+	benchMaxY = 250
 
-func Benchmark_Reduction(b *testing.B) {
-	largeImg := image.NewRGBA(image.Rect(0, 0, 1000, 1000))
+	// Resize values near the original size increase the amount of time
+	// resize spends converting the image.
+	benchWidth  = 200
+	benchHeight = 200
+)
 
-	var m image.Image
-	for i := 0; i < b.N; i++ {
-		m = Resize(300, 300, largeImg, Bicubic)
-	}
-	m.At(0, 0)
-}
-
-// Benchmark resize of 16 MPix jpeg image to 800px width.
-func jpegThumb(b *testing.B, interp InterpolationFunction) {
-	input := image.NewYCbCr(image.Rect(0, 0, 4896, 3264), image.YCbCrSubsampleRatio422)
-
-	var output image.Image
-	for i := 0; i < b.N; i++ {
-		output = Resize(800, 0, input, interp)
+func benchRGBA(b *testing.B, interp InterpolationFunction) {
+	m := image.NewRGBA(image.Rect(0, 0, benchMaxX, benchMaxY))
+	// Initialize m's pixels to create a non-uniform image.
+	for y := m.Rect.Min.Y; y < m.Rect.Max.Y; y++ {
+		for x := m.Rect.Min.X; x < m.Rect.Max.X; x++ {
+			i := m.PixOffset(x, y)
+			m.Pix[i+0] = uint8(y + 4*x)
+			m.Pix[i+1] = uint8(y + 4*x)
+			m.Pix[i+2] = uint8(y + 4*x)
+			m.Pix[i+3] = uint8(4*y + x)
+		}
 	}
-	output.At(0, 0)
+	var out image.Image
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		out = Resize(benchWidth, benchHeight, m, interp)
+	}
+	out.At(0, 0)
 }
 
-func Benchmark_LargeJpegThumbNearestNeighbor(b *testing.B) {
-	jpegThumb(b, NearestNeighbor)
+// The names of some interpolation functions are truncated so that the columns
+// of 'go test -bench' line up.
+func Benchmark_Nearest_RGBA(b *testing.B) {
+	benchRGBA(b, NearestNeighbor)
 }
 
-func Benchmark_LargeJpegThumbBilinear(b *testing.B) {
-	jpegThumb(b, Bilinear)
+func Benchmark_Bilinear_RGBA(b *testing.B) {
+	benchRGBA(b, Bilinear)
 }
 
-func Benchmark_LargeJpegThumbBicubic(b *testing.B) {
-	jpegThumb(b, Bicubic)
+func Benchmark_Bicubic_RGBA(b *testing.B) {
+	benchRGBA(b, Bicubic)
 }
 
-func Benchmark_LargeJpegThumbMitchellNetravali(b *testing.B) {
-	jpegThumb(b, MitchellNetravali)
+func Benchmark_Mitchell_RGBA(b *testing.B) {
+	benchRGBA(b, MitchellNetravali)
 }
 
-func Benchmark_LargeJpegThumbLanczos2(b *testing.B) {
-	jpegThumb(b, Lanczos2)
+func Benchmark_Lanczos2_RGBA(b *testing.B) {
+	benchRGBA(b, Lanczos2)
 }
 
-func Benchmark_LargeJpegThumbLanczos3(b *testing.B) {
-	jpegThumb(b, Lanczos3)
+func Benchmark_Lanczos3_RGBA(b *testing.B) {
+	benchRGBA(b, Lanczos3)
+}
+
+func benchYCbCr(b *testing.B, interp InterpolationFunction) {
+	m := image.NewYCbCr(image.Rect(0, 0, benchMaxX, benchMaxY), image.YCbCrSubsampleRatio422)
+	// Initialize m's pixels to create a non-uniform image.
+	for y := m.Rect.Min.Y; y < m.Rect.Max.Y; y++ {
+		for x := m.Rect.Min.X; x < m.Rect.Max.X; x++ {
+			yi := m.YOffset(x, y)
+			ci := m.COffset(x, y)
+			m.Y[yi] = uint8(16*y + x)
+			m.Cb[ci] = uint8(y + 16*x)
+			m.Cr[ci] = uint8(y + 16*x)
+		}
+	}
+	var out image.Image
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		out = Resize(benchWidth, benchHeight, m, interp)
+	}
+	out.At(0, 0)
+}
+
+func Benchmark_Nearest_YCC(b *testing.B) {
+	benchYCbCr(b, NearestNeighbor)
+}
+
+func Benchmark_Bilinear_YCC(b *testing.B) {
+	benchYCbCr(b, Bilinear)
+}
+
+func Benchmark_Bicubic_YCC(b *testing.B) {
+	benchYCbCr(b, Bicubic)
+}
+
+func Benchmark_Mitchell_YCC(b *testing.B) {
+	benchYCbCr(b, MitchellNetravali)
+}
+
+func Benchmark_Lanczos2_YCC(b *testing.B) {
+	benchYCbCr(b, Lanczos2)
+}
+
+func Benchmark_Lanczos3_YCC(b *testing.B) {
+	benchYCbCr(b, Lanczos3)
 }
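Note on the benchmark pattern above: all setup happens before b.ResetTimer(), and the loop's result is kept live afterwards via out.At(0, 0) so the compiler cannot discard the resize work. A minimal, self-contained sketch of the same pattern, assuming a hypothetical stand-in sketchResize in place of this library's Resize:

package sketch

import (
	"image"
	"testing"
)

// sketchResize is a hypothetical stand-in with the same shape as Resize.
func sketchResize(w, h uint, m image.Image) image.Image { return m }

func BenchmarkSketch(b *testing.B) {
	// Setup outside the timed region: a small, non-uniform input keeps the
	// measurement focused on per-pixel work rather than memory traffic.
	m := image.NewRGBA(image.Rect(0, 0, 250, 250))
	for i := range m.Pix {
		m.Pix[i] = uint8(i)
	}
	var out image.Image
	b.ResetTimer() // exclude setup from the measurement
	for i := 0; i < b.N; i++ {
		out = sketchResize(200, 200, m)
	}
	out.At(0, 0) // use the result so the loop is not optimized away
}

The benchmarks can be run without the tests via 'go test -bench . -run NONE'.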
From d8255b04211a049fb5895bc48f2d3e8086730d30 Mon Sep 17 00:00:00 2001
From: Charlie Vieth
Date: Sun, 15 Mar 2015 17:44:39 -0400
Subject: [PATCH 2/4] Optimize bound checks.

Optimizing the bound checks resulted in an overall 14% increase in
conversion performance.
---
 converter.go | 42 +++++++++++++++++++++---------------------
 nearest.go   | 18 +++++++++---------
 2 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/converter.go b/converter.go
index 07d0903..8d9f61b 100644
--- a/converter.go
+++ b/converter.go
@@ -20,24 +20,24 @@ import "image"
 
 // Keep value in [0,255] range.
 func clampUint8(in int32) uint8 {
-	if in < 0 {
-		return 0
+	if uint32(in) < 256 {
+		return uint8(in)
 	}
 	if in > 255 {
 		return 255
 	}
-	return uint8(in)
+	return 0
 }
 
 // Keep value in [0,65535] range.
 func clampUint16(in int64) uint16 {
-	if in < 0 {
-		return 0
+	if uint64(in) < 65536 {
+		return uint16(in)
 	}
 	if in > 65535 {
 		return 65535
 	}
-	return uint16(in)
+	return 0
 }
 
 func resizeGeneric(in image.Image, out *image.RGBA64, scale float64, coeffs []int32, offset []int, filterLength int) {
@@ -102,12 +102,12 @@ func resizeRGBA(in *image.RGBA, out *image.RGBA, scale float64, coeffs []int16,
 		if coeff != 0 {
 			xi := start + i
 			switch {
-			case xi < 0:
-				xi = 0
+			case uint(xi) < uint(maxX):
+				xi *= 4
 			case xi >= maxX:
 				xi = 4 * maxX
 			default:
-				xi *= 4
+				xi = 0
 			}
 			rgba[0] += int32(coeff) * int32(row[xi+0])
 			rgba[1] += int32(coeff) * int32(row[xi+1])
@@ -142,12 +142,12 @@ func resizeRGBA64(in *image.RGBA64, out *image.RGBA64, scale float64, coeffs []i
 		if coeff != 0 {
 			xi := start + i
 			switch {
-			case xi < 0:
-				xi = 0
+			case uint(xi) < uint(maxX):
+				xi *= 8
 			case xi >= maxX:
 				xi = 8 * maxX
 			default:
-				xi *= 8
+				xi = 0
 			}
 			rgba[0] += int64(coeff) * int64(uint16(row[xi+0])<<8|uint16(row[xi+1]))
 			rgba[1] += int64(coeff) * int64(uint16(row[xi+2])<<8|uint16(row[xi+3]))
@@ -222,12 +222,12 @@ func resizeGray16(in *image.Gray16, out *image.Gray16, scale float64, coeffs []i
 		if coeff != 0 {
 			xi := start + i
 			switch {
-			case xi < 0:
-				xi = 0
+			case uint(xi) < uint(maxX):
+				xi *= 2
 			case xi >= maxX:
 				xi = 2 * maxX
 			default:
-				xi *= 2
+				xi = 0
 			}
 			gray += int64(coeff) * int64(uint16(row[xi+0])<<8|uint16(row[xi+1]))
 			sum += int64(coeff)
@@ -258,12 +258,12 @@ func resizeYCbCr(in *ycc, out *ycc, scale float64, coeffs []int16, offset []int,
 		if coeff != 0 {
 			xi := start + i
 			switch {
-			case xi < 0:
-				xi = 0
+			case uint(xi) < uint(maxX):
+				xi *= 3
 			case xi >= maxX:
 				xi = 3 * maxX
 			default:
-				xi *= 3
+				xi = 0
 			}
 			p[0] += int32(coeff) * int32(row[xi+0])
 			p[1] += int32(coeff) * int32(row[xi+1])
@@ -295,12 +295,12 @@ func nearestYCbCr(in *ycc, out *ycc, scale float64, coeffs []bool, offset []int,
 		if coeffs[ci+i] {
 			xi := start + i
 			switch {
-			case xi < 0:
-				xi = 0
+			case uint(xi) < uint(maxX):
+				xi *= 3
 			case xi >= maxX:
 				xi = 3 * maxX
 			default:
-				xi *= 3
+				xi = 0
 			}
 			p[0] += float32(row[xi+0])
 			p[1] += float32(row[xi+1])
diff --git a/nearest.go b/nearest.go
index 0f8c321..78ad3cb 100644
--- a/nearest.go
+++ b/nearest.go
@@ -94,12 +94,12 @@ func nearestRGBA(in *image.RGBA, out *image.RGBA, scale float64, coeffs []bool,
 		if coeffs[ci+i] {
 			xi := start + i
 			switch {
-			case xi < 0:
-				xi = 0
+			case uint(xi) < uint(maxX):
+				xi *= 4
 			case xi >= maxX:
 				xi = 4 * maxX
 			default:
-				xi *= 4
+				xi = 0
 			}
 			rgba[0] += float32(row[xi+0])
 			rgba[1] += float32(row[xi+1])
@@ -133,12 +133,12 @@ func nearestRGBA64(in *image.RGBA64, out *image.RGBA64, scale float64, coeffs []
 		if coeffs[ci+i] {
 			xi := start + i
 			switch {
-			case xi < 0:
-				xi = 0
+			case uint(xi) < uint(maxX):
+				xi *= 8
 			case xi >= maxX:
 				xi = 8 * maxX
 			default:
-				xi *= 8
+				xi = 0
 			}
 			rgba[0] += float32(uint16(row[xi+0])<<8 | uint16(row[xi+1]))
 			rgba[1] += float32(uint16(row[xi+2])<<8 | uint16(row[xi+3]))
@@ -211,12 +211,12 @@ func nearestGray16(in *image.Gray16, out *image.Gray16, scale float64, coeffs []
 		if coeffs[ci+i] {
 			xi := start + i
 			switch {
-			case xi < 0:
-				xi = 0
+			case uint(xi) < uint(maxX):
+				xi *= 2
 			case xi >= maxX:
 				xi = 2 * maxX
 			default:
-				xi *= 2
+				xi = 0
 			}
 			gray += float32(uint16(row[xi+0])<<8 | uint16(row[xi+1]))
 			sum++
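The bound-check optimization above folds the two signed comparisons 'xi < 0' and 'xi < maxX' into one unsigned comparison: converting a negative signed integer to an unsigned type wraps it to a huge value, so for maxX >= 0, 'uint(xi) < uint(maxX)' holds exactly when 0 <= xi < maxX. Reordering the switch also puts the common in-range case first. A self-contained sketch of the idea, using an illustrative clampIndex helper that is not part of the patch:

package main

import "fmt"

// clampIndex mirrors the patched bound check: one unsigned comparison
// covers both the negative case and the in-range case.
func clampIndex(xi, maxX int) int {
	switch {
	case uint(xi) < uint(maxX):
		return xi // in range: 0 <= xi < maxX
	case xi >= maxX:
		return maxX - 1 // clamp to the upper bound
	default:
		return 0 // xi was negative; uint(xi) wrapped to a huge value
	}
}

func main() {
	fmt.Println(clampIndex(-3, 10)) // 0
	fmt.Println(clampIndex(4, 10))  // 4
	fmt.Println(clampIndex(99, 10)) // 9
}

In the patched code the in-range case also performs the stride multiplication (xi *= 4 and friends), but the comparison logic is the same.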
From 076a9ce3b9fe5c61d1bc81a3ef101d565707255e Mon Sep 17 00:00:00 2001
From: Charlie Vieth
Date: Sun, 15 Mar 2015 17:59:38 -0400
Subject: [PATCH 3/4] Limit spawned goroutines to GOMAXPROCS(0).

This change sets the number of spawned goroutines to the GOMAXPROCS
value set by the user. Using more goroutines than available execution
threads is slightly detrimental to performance (~0.5%), and hard-coding
runtime.NumCPU() prevents users of the library from controlling the
number of spawned goroutines.
---
 resize.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/resize.go b/resize.go
index f6d24bb..a913c6f 100644
--- a/resize.go
+++ b/resize.go
@@ -97,7 +97,7 @@ func Resize(width, height uint, img image.Image, interp InterpolationFunction) i
 	}
 
 	taps, kernel := interp.kernel()
-	cpus := runtime.NumCPU()
+	cpus := runtime.GOMAXPROCS(0)
 	wg := sync.WaitGroup{}
 
 	// Generic access to image.Image is slow in tight loops.
@@ -283,7 +283,7 @@ func Resize(width, height uint, img image.Image, interp InterpolationFunction) i
 
 func resizeNearest(width, height uint, scaleX, scaleY float64, img image.Image, interp InterpolationFunction) image.Image {
 	taps, _ := interp.kernel()
-	cpus := runtime.NumCPU()
+	cpus := runtime.GOMAXPROCS(0)
 	wg := sync.WaitGroup{}
 
 	switch input := img.(type) {
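Because runtime.GOMAXPROCS(0) only reports the current setting without changing it, callers can now bound the library's parallelism before invoking Resize. A hedged usage sketch; the import path github.com/nfnt/resize is assumed, and the image dimensions are arbitrary:

package main

import (
	"image"
	"runtime"

	"github.com/nfnt/resize" // assumed import path for this library
)

func main() {
	// Allow two execution threads; after this patch, Resize spawns at
	// most this many goroutines instead of always using runtime.NumCPU().
	runtime.GOMAXPROCS(2)

	img := image.NewRGBA(image.Rect(0, 0, 1024, 768))
	thumb := resize.Resize(200, 0, img, resize.Lanczos3)
	_ = thumb // encode or display the result as needed
}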
From d67851ed1f6f9cbc7bf3a453e8597cac9a0f29b3 Mon Sep 17 00:00:00 2001
From: jst
Date: Wed, 18 Mar 2015 19:56:53 +0100
Subject: [PATCH 4/4] Explain optimization.

---
 converter.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/converter.go b/converter.go
index 8d9f61b..faf49e4 100644
--- a/converter.go
+++ b/converter.go
@@ -20,6 +20,9 @@ import "image"
 
 // Keep value in [0,255] range.
 func clampUint8(in int32) uint8 {
+	// Casting a negative int to a uint yields a very large unsigned value.
+	// This behavior is exploited here, and in other functions, to achieve
+	// higher performance.
 	if uint32(in) < 256 {
 		return uint8(in)
 	}
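To make the new comment concrete: converting a negative int32 to uint32 wraps modulo 2^32, producing a value of at least 2^31, so the single unsigned comparison rejects every negative input along with everything above 255. A small standalone demonstration; clampUint8 is reproduced here as it stands after patches 2 and 4:

package main

import "fmt"

// clampUint8 as patched in converter.go.
func clampUint8(in int32) uint8 {
	// A negative int32 converts to a uint32 >= 1<<31, which is never
	// < 256, so one comparison handles the negative and in-range cases.
	if uint32(in) < 256 {
		return uint8(in)
	}
	if in > 255 {
		return 255
	}
	return 0
}

func main() {
	fmt.Println(uint32(int32(-1))) // 4294967295: the wraparound being exploited
	fmt.Println(clampUint8(-42))   // 0
	fmt.Println(clampUint8(128))   // 128
	fmt.Println(clampUint8(1000))  // 255
}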