Performance Benchmarking and Profiling

Performance testing is crucial for Go applications, especially those with high throughput requirements.

Writing Effective Benchmarks

Go’s testing package provides excellent support for benchmarking:

package performance

import (
	"bytes"
	"crypto/sha256"
	"encoding/json"
	"fmt"
	"sync"
	"testing"
)

// Item represents a data structure to benchmark
type Item struct {
	ID        string   `json:"id"`
	Name      string   `json:"name"`
	Tags      []string `json:"tags"`
	Count     int      `json:"count"`
	Value     float64  `json:"value"`
	IsEnabled bool     `json:"is_enabled"`
}

// generateItem creates a test item
func generateItem(id string) Item {
	return Item{
		ID:        id,
		Name:      "Test Item " + id,
		Tags:      []string{"tag1", "tag2", "tag3", "tag4", "tag5"},
		Count:     42,
		Value:     99.99,
		IsEnabled: true,
	}
}

// BenchmarkJSONMarshal benchmarks JSON marshaling performance
func BenchmarkJSONMarshal(b *testing.B) {
	item := generateItem("test-1")
	
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		_, err := json.Marshal(item)
		if err != nil {
			b.Fatal(err)
		}
	}
}

// BenchmarkJSONMarshalParallel benchmarks JSON marshaling in parallel
func BenchmarkJSONMarshalParallel(b *testing.B) {
	item := generateItem("test-1")
	
	b.ResetTimer()
	b.RunParallel(func(pb *testing.PB) {
		for pb.Next() {
			_, err := json.Marshal(item)
			if err != nil {
				// b.Fatal must not be called from RunParallel goroutines,
				// so report the error and stop this goroutine instead
				b.Error(err)
				return
			}
		}
	})
}

// ItemCache is a simple cache implementation to benchmark
type ItemCache struct {
	items map[string]Item
	mu    sync.RWMutex
}

// NewItemCache creates a new item cache
func NewItemCache() *ItemCache {
	return &ItemCache{
		items: make(map[string]Item),
	}
}

// Get retrieves an item from the cache
func (c *ItemCache) Get(id string) (Item, bool) {
	c.mu.RLock()
	defer c.mu.RUnlock()
	item, ok := c.items[id]
	return item, ok
}

// Set adds an item to the cache
func (c *ItemCache) Set(id string, item Item) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.items[id] = item
}

// BenchmarkCacheGet benchmarks cache retrieval
func BenchmarkCacheGet(b *testing.B) {
	// Benchmark different cache sizes
	benchmarks := []struct {
		name      string
		cacheSize int
	}{
		{"Small_10", 10},
		{"Medium_100", 100},
		{"Large_1000", 1000},
	}
	
	for _, bm := range benchmarks {
		b.Run(bm.name, func(b *testing.B) {
			// Create cache with specified size
			cache := NewItemCache()
			for i := 0; i < bm.cacheSize; i++ {
				id := fmt.Sprintf("item-%d", i)
				cache.Set(id, generateItem(id))
			}
			
			b.ResetTimer()
			for i := 0; i < b.N; i++ {
				// Cycle through the cached items
				id := fmt.Sprintf("item-%d", i%bm.cacheSize)
				_, found := cache.Get(id)
				if !found {
					b.Fatalf("Item %s not found", id)
				}
			}
		})
	}
}

// BenchmarkCacheGetParallel benchmarks parallel cache retrieval
func BenchmarkCacheGetParallel(b *testing.B) {
	// Setup
	cache := NewItemCache()
	for i := 0; i < 1000; i++ {
		id := fmt.Sprintf("item-%d", i)
		cache.Set(id, generateItem(id))
	}
	
	b.ResetTimer()
	b.RunParallel(func(pb *testing.PB) {
		i := 0
		for pb.Next() {
			id := fmt.Sprintf("item-%d", i%1000)
			_, found := cache.Get(id)
			if !found {
				// b.Fatalf must not be called from RunParallel goroutines
				b.Errorf("Item %s not found", id)
				return
			}
			i++
		}
	})
}

// HashItem hashes an item using SHA-256
func HashItem(item Item) []byte {
	data, _ := json.Marshal(item) // marshaling a plain struct like Item cannot fail, so the error is ignored
	hash := sha256.Sum256(data)
	return hash[:]
}

// BenchmarkHashingComparison compares different hashing strategies
func BenchmarkHashingComparison(b *testing.B) {
	item := generateItem("test-1")
	
	b.Run("JSON_Marshal_Then_Hash", func(b *testing.B) {
		for i := 0; i < b.N; i++ {
			data, _ := json.Marshal(item)
			hash := sha256.Sum256(data)
			_ = hash
		}
	})
	
	b.Run("Direct_Field_Concatenation", func(b *testing.B) {
		for i := 0; i < b.N; i++ {
			var buf bytes.Buffer
			buf.WriteString(item.ID)
			buf.WriteString(item.Name)
			for _, tag := range item.Tags {
				buf.WriteString(tag)
			}
			buf.WriteString(fmt.Sprintf("%d", item.Count))
			buf.WriteString(fmt.Sprintf("%f", item.Value))
			buf.WriteString(fmt.Sprintf("%t", item.IsEnabled))
			
			hash := sha256.Sum256(buf.Bytes())
			_ = hash
		}
	})
}

// BenchmarkWithMemoryTracking demonstrates memory allocation tracking
func BenchmarkWithMemoryTracking(b *testing.B) {
	// Run with: go test -bench=BenchmarkWithMemoryTracking -benchmem
	
	b.Run("WithPreallocation", func(b *testing.B) {
		for i := 0; i < b.N; i++ {
			// Preallocate slice with capacity
			data := make([]Item, 0, 1000)
			for j := 0; j < 1000; j++ {
				id := fmt.Sprintf("item-%d", j)
				data = append(data, generateItem(id))
			}
			_ = data
		}
	})
	
	b.Run("WithoutPreallocation", func(b *testing.B) {
		for i := 0; i < b.N; i++ {
			// No preallocation
			var data []Item
			for j := 0; j < 1000; j++ {
				id := fmt.Sprintf("item-%d", j)
				data = append(data, generateItem(id))
			}
			_ = data
		}
	})
}

To run these benchmarks:

# Run all benchmarks
go test -bench=. ./performance

# Run specific benchmark
go test -bench=BenchmarkJSONMarshal ./performance

# Run benchmarks with memory allocation statistics
go test -bench=. -benchmem ./performance

# Run each benchmark for longer to reduce run-to-run noise
go test -bench=. -benchtime=5s ./performance

# Compare benchmarks before and after changes; benchstat needs several
# samples per benchmark to do its statistics, hence -count=10
go test -bench=. -benchmem -count=10 ./performance > before.txt
# Make changes
go test -bench=. -benchmem -count=10 ./performance > after.txt
benchstat before.txt after.txt
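
A typical run prints one line per benchmark; the numbers below are illustrative, not measured results:

BenchmarkJSONMarshal-8             2000000        612 ns/op       512 B/op        2 allocs/op
BenchmarkCacheGet/Small_10-8      10000000        118 ns/op         0 B/op        0 allocs/op

The -8 suffix is GOMAXPROCS, the first column is the iteration count chosen by the framework (b.N), and the B/op and allocs/op columns appear when -benchmem is set.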

These benchmarks demonstrate:

  1. Basic benchmarking: Using testing.B to measure performance
  2. Parallel benchmarks: Testing concurrent performance with b.RunParallel
  3. Sub-benchmarks: Using b.Run to organize related benchmarks
  4. Memory tracking: Measuring allocations with -benchmem (see the sketch after this list)
  5. Comparison benchmarks: Comparing different implementations
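
Two testing.B helpers are worth knowing beyond the flags above: b.ReportAllocs() turns on allocation reporting for a single benchmark even without -benchmem, and b.SetBytes() makes the framework print a MB/s throughput column. A minimal sketch (the benchmark name and payload size are illustrative):

// BenchmarkThroughput reports MB/s by telling the framework how many
// bytes each iteration processes.
func BenchmarkThroughput(b *testing.B) {
	payload := bytes.Repeat([]byte("x"), 64*1024) // 64 KiB per operation
	b.ReportAllocs()                // report allocations even without -benchmem
	b.SetBytes(int64(len(payload))) // enables the MB/s column
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		_ = sha256.Sum256(payload)
	}
}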

CPU and Memory Profiling

Profiling helps identify performance bottlenecks in your code:

package main

import (
	"crypto/sha256"
	"flag"
	"fmt"
	"log"
	"os"
	"runtime"
	"runtime/pprof"
	"sync"
	"time"
)

var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")
var memprofile = flag.String("memprofile", "", "write memory profile to file")

// Worker represents a task processor
type Worker struct {
	ID       int
	Tasks    chan Task
	Results  chan Result
	QuitChan chan bool
	wg       *sync.WaitGroup
}

// Task represents a unit of work
type Task struct {
	ID       int
	Payload  string
	Strength int // Computational intensity
}

// Result represents the outcome of processing a task
type Result struct {
	TaskID    int
	WorkerID  int
	Output    string
	TimeNanos int64
}

// NewWorker creates a new worker
func NewWorker(id int, tasks chan Task, results chan Result, wg *sync.WaitGroup) *Worker {
	return &Worker{
		ID:       id,
		Tasks:    tasks,
		Results:  results,
		QuitChan: make(chan bool),
		wg:       wg,
	}
}

// Start begins the worker's processing loop
func (w *Worker) Start() {
	go func() {
		defer w.wg.Done()
		for {
			select {
			case task, ok := <-w.Tasks:
				if !ok {
					// Task channel closed: no more work, so exit.
					// Without this check a closed channel would yield
					// zero-value tasks forever.
					return
				}
				result := w.processTask(task)
				w.Results <- result
			case <-w.QuitChan:
				return
			}
		}
	}()
}

// Stop signals the worker to stop processing
func (w *Worker) Stop() {
	go func() {
		w.QuitChan <- true
	}()
}

// processTask handles the actual work
func (w *Worker) processTask(task Task) Result {
	// Simulate CPU-intensive work
	start := time.Now()
	
	// This is our "hot" function that will show up in CPU profiles
	output := performComputation(task.Payload, task.Strength)
	
	elapsed := time.Since(start)
	
	return Result{
		TaskID:    task.ID,
		WorkerID:  w.ID,
		Output:    output,
		TimeNanos: elapsed.Nanoseconds(),
	}
}

// performComputation is a CPU-intensive function
func performComputation(input string, strength int) string {
	// Create a large slice to show up in memory profiles
	data := make([]byte, 0, strength*1000)
	
	// Perform some CPU-intensive work
	for i := 0; i < strength; i++ {
		h := sha256.New()
		h.Write([]byte(input))
		hash := h.Sum(nil)
		data = append(data, hash...)
		input = fmt.Sprintf("%x", hash)
	}
	
	return fmt.Sprintf("%x", sha256.Sum256(data))
}

func main() {
	flag.Parse()
	
	// CPU profiling
	if *cpuprofile != "" {
		f, err := os.Create(*cpuprofile)
		if err != nil {
			log.Fatal("could not create CPU profile: ", err)
		}
		defer f.Close()
		if err := pprof.StartCPUProfile(f); err != nil {
			log.Fatal("could not start CPU profile: ", err)
		}
		defer pprof.StopCPUProfile()
	}
	
	// Run the workload
	runWorkload()
	
	// Memory profiling
	if *memprofile != "" {
		f, err := os.Create(*memprofile)
		if err != nil {
			log.Fatal("could not create memory profile: ", err)
		}
		defer f.Close()
		runtime.GC() // Get up-to-date statistics
		if err := pprof.WriteHeapProfile(f); err != nil {
			log.Fatal("could not write memory profile: ", err)
		}
	}
}

func runWorkload() {
	numWorkers := runtime.NumCPU()
	numTasks := 100
	
	// Create channels
	tasks := make(chan Task, numTasks)
	results := make(chan Result, numTasks)
	
	// Create worker pool
	var wg sync.WaitGroup
	wg.Add(numWorkers)
	workers := make([]*Worker, numWorkers)
	
	for i := 0; i < numWorkers; i++ {
		workers[i] = NewWorker(i, tasks, results, &wg)
		workers[i].Start()
	}
	
	// Generate tasks
	go func() {
		for i := 0; i < numTasks; i++ {
			tasks <- Task{
				ID:       i,
				Payload:  fmt.Sprintf("task-%d", i),
				Strength: i % 10, // Vary computational intensity
			}
		}
		close(tasks)
	}()
	
	// Collect results; done is closed once every result has been printed,
	// so main cannot exit while the collector is still working
	done := make(chan struct{})
	go func() {
		defer close(done)
		for i := 0; i < numTasks; i++ {
			result := <-results
			fmt.Printf("Task %d completed by Worker %d in %d ns\n",
				result.TaskID, result.WorkerID, result.TimeNanos)
		}
	}()
	
	// Wait for all workers to finish, then for all results to be reported
	wg.Wait()
	<-done
}

To run with profiling:

# CPU profiling
go build -o app
./app -cpuprofile=cpu.prof

# Memory profiling
./app -memprofile=mem.prof

# Analyze profiles
go tool pprof -http=:8080 cpu.prof
go tool pprof -http=:8080 mem.prof
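
pprof also produces text reports if you prefer the terminal to the web UI (source listing requires the sources to be available locally):

# Text summary of the hottest functions
go tool pprof -top cpu.prof

# Interactive mode; try commands such as top, list performComputation, web
go tool pprof cpu.prof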

This profiling example demonstrates:

  1. CPU profiling: Capturing CPU usage patterns
  2. Memory profiling: Tracking heap allocations
  3. Profile visualization: Using pprof’s web interface
  4. Hotspot identification: Finding performance bottlenecks
  5. Workload simulation: Creating realistic test scenarios
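
For long-running services, an alternative to flag-driven profiles is the standard library's net/http/pprof package, which serves live profiles over HTTP. A minimal sketch (the address and port are arbitrary):

package main

import (
	"log"
	"net/http"
	_ "net/http/pprof" // registers /debug/pprof/* handlers on the default mux
)

func main() {
	// Capture a 30-second CPU profile from the running process with:
	//   go tool pprof http://localhost:6060/debug/pprof/profile?seconds=30
	log.Println(http.ListenAndServe("localhost:6060", nil))
}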

Benchmarking HTTP Handlers

For web services, benchmarking HTTP handlers is crucial:

package api

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"net/http"
	"net/http/httptest"
	"testing"
	"time"

	"github.com/gin-gonic/gin"
	"github.com/stretchr/testify/mock"

	// Project-local package providing the Product model; the import path
	// is an assumption, adjust it to your module.
	"example.com/project/internal/models"
)

// BenchmarkProductListHandler benchmarks the product listing endpoint
func BenchmarkProductListHandler(b *testing.B) {
	// Setup
	gin.SetMode(gin.ReleaseMode) // Disable debug mode for benchmarking
	
	// Benchmark with different dataset sizes
	benchmarks := []struct {
		name     string
		numItems int
	}{
		{name: "small_10_items", numItems: 10},
		{name: "medium_100_items", numItems: 100},
		{name: "large_1000_items", numItems: 1000},
	}
	
	for _, bm := range benchmarks {
		b.Run(bm.name, func(b *testing.B) {
			// Setup mock for this benchmark; match any limit/offset, since
			// the request below carries no pagination parameters and the
			// handler's defaults apply
			mockService := new(MockProductService)
			products := generateTestProducts(bm.numItems)
			mockService.On("ListProducts", mock.Anything, mock.Anything, mock.Anything).
				Return(products, nil)
			
			// Create handler with this mock
			handler := NewProductHandler(mockService)
			router := gin.New()
			router.GET("/products", handler.ListProducts)
			
			// Create request
			req, _ := http.NewRequest(http.MethodGet, "/products", nil)
			
			b.ResetTimer()
			b.ReportAllocs()
			
			for i := 0; i < b.N; i++ {
				// Create a response recorder for each iteration
				w := httptest.NewRecorder()
				
				// Serve the request
				router.ServeHTTP(w, req)
				
				// Verify response code (but don't parse body in benchmark)
				if w.Code != http.StatusOK {
					b.Fatalf("Expected status code 200, got %d", w.Code)
				}
			}
		})
	}
}

// BenchmarkProductCreateHandler benchmarks the product creation endpoint
func BenchmarkProductCreateHandler(b *testing.B) {
	// Setup
	gin.SetMode(gin.ReleaseMode) // Disable debug mode for benchmarking
	
	// Prepare test data
	product := &models.Product{
		Name:        "Test Product",
		Description: "A test product for benchmarking",
		Price:       99.99,
		CategoryID:  "cat-123",
	}
	
	// Serialize once outside the benchmark loop
	jsonData, _ := json.Marshal(product)
	
	// Benchmark with different simulated backend processing times
	benchmarks := []struct {
		name  string
		delay time.Duration // sleep injected into the mock's Run hook
	}{
		{name: "fast_response", delay: 0},
		{name: "medium_response_time", delay: 10 * time.Millisecond},
		{name: "slow_response_time", delay: 50 * time.Millisecond},
	}
	
	for _, bm := range benchmarks {
		b.Run(bm.name, func(b *testing.B) {
			// Setup mock for this benchmark; the Run hook simulates
			// backend processing time before returning success
			delay := bm.delay
			mockService := new(MockProductService)
			mockService.On("CreateProduct", mock.Anything, mock.Anything).
				Run(func(args mock.Arguments) {
					if delay > 0 {
						time.Sleep(delay)
					}
				}).
				Return(nil)
			
			// Create handler with this mock
			handler := NewProductHandler(mockService)
			router := gin.New()
			router.POST("/products", handler.CreateProduct)
			
			b.ResetTimer()
			b.ReportAllocs()
			
			for i := 0; i < b.N; i++ {
				// Create a new request for each iteration
				req, _ := http.NewRequest(http.MethodPost, "/products", bytes.NewBuffer(jsonData))
				req.Header.Set("Content-Type", "application/json")
				
				// Create a response recorder
				w := httptest.NewRecorder()
				
				// Serve the request
				router.ServeHTTP(w, req)
				
				// Verify response code
				if w.Code != http.StatusCreated {
					b.Fatalf("Expected status code 201, got %d", w.Code)
				}
			}
		})
	}
}

// generateTestProducts creates a slice of test products
func generateTestProducts(n int) []*models.Product {
	products := make([]*models.Product, n)
	now := time.Now()
	
	for i := 0; i < n; i++ {
		products[i] = &models.Product{
			ID:          fmt.Sprintf("prod-%d", i),
			Name:        fmt.Sprintf("Product %d", i),
			Description: fmt.Sprintf("Description for product %d", i),
			Price:       float64(10 + i%90),
			CategoryID:  fmt.Sprintf("cat-%d", i%5),
			CreatedAt:   now.Add(-time.Duration(i) * time.Hour),
			UpdatedAt:   now,
		}
	}
	
	return products
}
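
These benchmarks assume a MockProductService and a NewProductHandler constructor defined elsewhere in the project. For reference, here is a minimal testify-based sketch of the mock; the method signatures are assumptions inferred from how the benchmarks use it:

// MockProductService is a testify-based mock of the product service.
// The interface it implements is assumed from the benchmark usage above.
type MockProductService struct {
	mock.Mock
}

func (m *MockProductService) ListProducts(ctx context.Context, limit, offset int) ([]*models.Product, error) {
	args := m.Called(ctx, limit, offset)
	if args.Get(0) == nil {
		return nil, args.Error(1)
	}
	return args.Get(0).([]*models.Product), args.Error(1)
}

func (m *MockProductService) CreateProduct(ctx context.Context, product *models.Product) error {
	args := m.Called(ctx, product)
	return args.Error(0)
}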

This HTTP benchmarking demonstrates:

  1. Handler benchmarking: Measuring API endpoint performance
  2. Data size impact: Testing with different payload sizes
  3. Response time simulation: Measuring the impact of backend delays
  4. Memory allocation tracking: Using b.ReportAllocs() to monitor memory usage
  5. Realistic scenarios: Testing with representative data volumes
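
Benchmarks and profiles combine well: go test can write profiles directly while benchmarking, which is often the quickest way to see where handler time goes (the ./api package path matches this example's layout):

# Profile a specific benchmark
go test -bench=BenchmarkProductListHandler -cpuprofile=cpu.prof -memprofile=mem.prof ./api
go tool pprof -http=:8080 cpu.prof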

Analyzing Benchmark Results

Interpreting benchmark results is crucial for making informed optimizations:

package main

import (
	"fmt"
	"math"
	"sort"
	"strings"
)

// BenchmarkResult represents the outcome of a benchmark run
type BenchmarkResult struct {
	Name         string
	NsPerOp      float64
	AllocsPerOp  int64
	BytesPerOp   int64
	MBPerSecond  float64
	Measurements []float64
}

// AnalyzeBenchmarks demonstrates how to analyze benchmark data
func AnalyzeBenchmarks(results []BenchmarkResult) {
	// Sort by ns/op (fastest first)
	sort.Slice(results, func(i, j int) bool {
		return results[i].NsPerOp < results[j].NsPerOp
	})
	
	// Print summary table
	fmt.Println("Performance Summary (sorted by ns/op):")
	fmt.Printf("%-30s %-15s %-15s %-15s %-15s\n",
		"Benchmark", "Time (ns/op)", "Allocs (count)", "Memory (B/op)", "Throughput (MB/s)")
	fmt.Println(strings.Repeat("-", 90))
	
	for _, r := range results {
		fmt.Printf("%-30s %-15.2f %-15d %-15d %-15.2f\n",
			r.Name, r.NsPerOp, r.AllocsPerOp, r.BytesPerOp, r.MBPerSecond)
	}
	
	// Statistical analysis for a specific benchmark
	if len(results) > 0 {
		result := results[0]
		if len(result.Measurements) > 0 {
			mean, stdDev := calculateStats(result.Measurements)
			cv := (stdDev / mean) * 100 // Coefficient of variation
			
			fmt.Printf("\nStatistical Analysis for %s:\n", result.Name)
			fmt.Printf("  Mean:                %.2f ns/op\n", mean)
			fmt.Printf("  Standard Deviation:  %.2f ns/op\n", stdDev)
			fmt.Printf("  Coefficient of Var:  %.2f%%\n", cv)
			
			// Interpret the results
			fmt.Println("\nInterpretation:")
			if cv < 1.0 {
				fmt.Println("  Excellent stability (CV < 1%)")
			} else if cv < 5.0 {
				fmt.Println("  Good stability (CV < 5%)")
			} else if cv < 10.0 {
				fmt.Println("  Moderate stability (CV < 10%)")
			} else {
				fmt.Println("  Poor stability (CV >= 10%) - Results may not be reliable")
			}
			
			// Performance comparison: results are sorted fastest-first,
			// so results[0] is the baseline and results[1] is slower
			if len(results) > 1 {
				fastest := results[0]
				slower := results[1]
				
				slowdown := (slower.NsPerOp - fastest.NsPerOp) / fastest.NsPerOp * 100
				
				fmt.Printf("\nComparison (%s vs %s):\n", fastest.Name, slower.Name)
				fmt.Printf("  Time difference:     %.2f ns/op (%.2f%% slower)\n",
					slower.NsPerOp-fastest.NsPerOp, slowdown)
				fmt.Printf("  Memory difference:   %d bytes/op\n",
					slower.BytesPerOp-fastest.BytesPerOp)
				fmt.Printf("  Allocation difference: %d allocs/op\n",
					slower.AllocsPerOp-fastest.AllocsPerOp)
			}
		}
	}
}

// calculateStats computes the mean and sample standard deviation
func calculateStats(measurements []float64) (float64, float64) {
	sum := 0.0
	for _, m := range measurements {
		sum += m
	}
	mean := sum / float64(len(measurements))
	
	if len(measurements) < 2 {
		return mean, 0
	}
	
	sumSquaredDiff := 0.0
	for _, m := range measurements {
		diff := m - mean
		sumSquaredDiff += diff * diff
	}
	// Use the sample variance (n-1 denominator), since benchmark runs
	// are a sample of the possible measurements
	variance := sumSquaredDiff / float64(len(measurements)-1)
	stdDev := math.Sqrt(variance)
	
	return mean, stdDev
}

This analysis approach demonstrates:

  1. Result sorting: Ranking implementations by performance
  2. Statistical analysis: Computing mean, standard deviation, and coefficient of variation
  3. Stability assessment: Evaluating the reliability of benchmark results
  4. Comparative analysis: Quantifying improvements between implementations
  5. Throughput reporting: Presenting MB/s figures alongside per-operation timings (see the usage example below)
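
A quick usage example; the numbers are illustrative, not real measurements:

func main() {
	results := []BenchmarkResult{
		{
			Name:         "Direct_Field_Concatenation",
			NsPerOp:      850.0,
			AllocsPerOp:  6,
			BytesPerOp:   448,
			MBPerSecond:  120.5,
			Measurements: []float64{845, 852, 848, 855, 850},
		},
		{
			Name:        "JSON_Marshal_Then_Hash",
			NsPerOp:     1420.0,
			AllocsPerOp: 9,
			BytesPerOp:  912,
			MBPerSecond: 72.3,
		},
	}
	AnalyzeBenchmarks(results)
}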