CPU Profiling and Analysis

CPU profiling is one of the most powerful techniques for identifying performance bottlenecks in Go applications. Let’s explore how to effectively use Go’s CPU profiling tools.

Setting Up CPU Profiling

Go provides multiple ways to enable CPU profiling:

package main

import (
	"flag"
	"fmt"
	"log"
	"os"
	"runtime"
	"runtime/pprof"
	"time"
)

// CPU-intensive function we want to profile
func computeIntensive() {
	// Simulate CPU-intensive work
	total := 0
	for i := 0; i < 100_000_000; i++ {
		total += i % 2
	}
	fmt.Printf("Computation result: %d\n", total)
}

// Method 1: Manual profiling with runtime/pprof
func manualCPUProfiling() {
	// Create CPU profile file
	f, err := os.Create("cpu_profile.prof")
	if err != nil {
		log.Fatal("could not create CPU profile: ", err)
	}
	defer f.Close()
	
	// Start CPU profiling
	if err := pprof.StartCPUProfile(f); err != nil {
		log.Fatal("could not start CPU profile: ", err)
	}
	defer pprof.StopCPUProfile()
	
	// Run the code we want to profile
	fmt.Println("Running CPU-intensive task...")
	computeIntensive()
	
	fmt.Println("CPU profile written to cpu_profile.prof")
	fmt.Println("Analyze with: go tool pprof cpu_profile.prof")
}

// Method 2: Using net/http/pprof for continuous profiling
func httpPprofDemo() {
	// This would typically be added to your main HTTP server
	fmt.Println("In a real application, you would import _ \"net/http/pprof\"")
	fmt.Println("Then access profiles via http://localhost:8080/debug/pprof/")
	fmt.Println("For CPU profile: http://localhost:8080/debug/pprof/profile")
	fmt.Println("Download and analyze with: go tool pprof http://localhost:8080/debug/pprof/profile")
}

// Method 3: Using testing package for benchmark profiling
func benchmarkPprofDemo() {
	fmt.Println("When running benchmarks, use:")
	fmt.Println("go test -bench=. -cpuprofile=cpu.prof ./...")
	fmt.Println("Then analyze with: go tool pprof cpu.prof")
}

// Method 4: Using runtime/trace for execution tracing
func traceDemo() {
	// Create trace file
	f, err := os.Create("trace.out")
	if err != nil {
		log.Fatal("could not create trace file: ", err)
	}
	defer f.Close()
	
	// Start tracing
	if err := runtime.StartTrace(); err != nil {
		log.Fatal("could not start trace: ", err)
	}
	defer runtime.StopTrace()
	
	// Capture trace data
	trace := pprof.Lookup("trace")
	trace.WriteTo(f, 0)
	
	// Run the code we want to trace
	computeIntensive()
	
	fmt.Println("Trace written to trace.out")
	fmt.Println("Analyze with: go tool trace trace.out")
}

func main() {
	// Parse command line flags
	cpuprofile := flag.String("cpuprofile", "", "write cpu profile to file")
	memprofile := flag.String("memprofile", "", "write memory profile to file")
	flag.Parse()
	
	// CPU profiling via command line flag
	if *cpuprofile != "" {
		f, err := os.Create(*cpuprofile)
		if err != nil {
			log.Fatal("could not create CPU profile: ", err)
		}
		defer f.Close()
		if err := pprof.StartCPUProfile(f); err != nil {
			log.Fatal("could not start CPU profile: ", err)
		}
		defer pprof.StopCPUProfile()
	}
	
	// Run the demo functions
	manualCPUProfiling()
	httpPprofDemo()
	benchmarkPprofDemo()
	traceDemo()
	
	// Memory profiling via command line flag
	if *memprofile != "" {
		f, err := os.Create(*memprofile)
		if err != nil {
			log.Fatal("could not create memory profile: ", err)
		}
		defer f.Close()
		runtime.GC() // Get up-to-date statistics
		if err := pprof.WriteHeapProfile(f); err != nil {
			log.Fatal("could not write memory profile: ", err)
		}
	}
}

Analyzing CPU Profiles

Once you’ve collected a CPU profile, the next step is to analyze it effectively:

package main

import (
	"fmt"
	"math/rand"
	"os"
	"runtime/pprof"
	"time"
)

// Inefficient string concatenation function
func inefficientStringConcat(n int) string {
	result := ""
	for i := 0; i < n; i++ {
		result += "x"
	}
	return result
}

// Inefficient sorting algorithm (bubble sort)
func inefficientSort(items []int) {
	n := len(items)
	for i := 0; i < n; i++ {
		for j := 0; j < n-1-i; j++ {
			if items[j] > items[j+1] {
				items[j], items[j+1] = items[j+1], items[j]
			}
		}
	}
}

// Recursive function with exponential complexity
func fibonacci(n int) int {
	if n <= 1 {
		return n
	}
	return fibonacci(n-1) + fibonacci(n-2)
}

// Function with unnecessary allocations
func createLotsOfGarbage(iterations int) {
	for i := 0; i < iterations; i++ {
		_ = make([]byte, 1024)
	}
}

// Main function with various performance issues
func runProfilingDemo() {
	// Create CPU profile
	f, _ := os.Create("cpu_profile_analysis.prof")
	defer f.Close()
	pprof.StartCPUProfile(f)
	defer pprof.StopCPUProfile()
	
	// Run inefficient string concatenation
	inefficientStringConcat(10000)
	
	// Run inefficient sorting
	data := make([]int, 5000)
	for i := range data {
		data[i] = rand.Intn(10000)
	}
	inefficientSort(data)
	
	// Run exponential algorithm
	fibonacci(30)
	
	// Run allocation-heavy code
	createLotsOfGarbage(100000)
}

func main() {
	// Seed random number generator
	rand.Seed(time.Now().UnixNano())
	
	// Run the demo
	runProfilingDemo()
	
	fmt.Println("Profile generated. Analyze with:")
	fmt.Println("go tool pprof -http=:8080 cpu_profile_analysis.prof")
	fmt.Println("Common pprof commands:")
	fmt.Println("  top10 - Show top 10 functions by CPU usage")
	fmt.Println("  list functionName - Show source code with CPU usage")
	fmt.Println("  web - Generate SVG graph visualization")
	fmt.Println("  traces - Show execution traces")
	fmt.Println("  peek regex - Show functions matching regex")
}

Optimizing CPU-Bound Code

After identifying CPU bottlenecks, here are techniques to optimize them:

package main

import (
	"fmt"
	"math/rand"
	"strings"
	"time"
)

// BEFORE: Inefficient string concatenation
func inefficientStringConcat(n int) string {
	result := ""
	for i := 0; i < n; i++ {
		result += "x"
	}
	return result
}

// AFTER: Optimized string concatenation
func efficientStringConcat(n int) string {
	var builder strings.Builder
	builder.Grow(n) // Pre-allocate capacity
	for i := 0; i < n; i++ {
		builder.WriteByte('x')
	}
	return builder.String()
}

// BEFORE: Inefficient sorting (bubble sort)
func inefficientSort(items []int) {
	n := len(items)
	for i := 0; i < n; i++ {
		for j := 0; j < n-1-i; j++ {
			if items[j] > items[j+1] {
				items[j], items[j+1] = items[j+1], items[j]
			}
		}
	}
}

// AFTER: Optimized sorting (quicksort)
func efficientSort(items []int) {
	quicksort(items, 0, len(items)-1)
}

func quicksort(items []int, low, high int) {
	if low < high {
		pivot := partition(items, low, high)
		quicksort(items, low, pivot-1)
		quicksort(items, pivot+1, high)
	}
}

func partition(items []int, low, high int) int {
	pivot := items[high]
	i := low - 1
	
	for j := low; j < high; j++ {
		if items[j] <= pivot {
			i++
			items[i], items[j] = items[j], items[i]
		}
	}
	
	items[i+1], items[high] = items[high], items[i+1]
	return i + 1
}

// BEFORE: Recursive fibonacci with exponential complexity
func inefficientFibonacci(n int) int {
	if n <= 1 {
		return n
	}
	return inefficientFibonacci(n-1) + inefficientFibonacci(n-2)
}

// AFTER: Iterative fibonacci with linear complexity
func efficientFibonacci(n int) int {
	if n <= 1 {
		return n
	}
	
	a, b := 0, 1
	for i := 2; i <= n; i++ {
		a, b = b, a+b
	}
	return b
}

// BEFORE: Function with unnecessary allocations
func inefficientProcessing(data []int) []int {
	result := make([]int, 0)
	for _, v := range data {
		// Create a new slice for each operation
		temp := make([]int, 1)
		temp[0] = v * v
		result = append(result, temp...)
	}
	return result
}

// AFTER: Function with optimized allocations
func efficientProcessing(data []int) []int {
	// Pre-allocate the result slice
	result := make([]int, len(data))
	for i, v := range data {
		// Direct assignment, no temporary allocations
		result[i] = v * v
	}
	return result
}

// Benchmark helper
func benchmark(name string, iterations int, fn func()) {
	start := time.Now()
	for i := 0; i < iterations; i++ {
		fn()
	}
	elapsed := time.Since(start)
	fmt.Printf("%-30s %10d iterations in %s (%s per op)\n", 
		name, iterations, elapsed, elapsed/time.Duration(iterations))
}

func main() {
	// Seed random number generator
	rand.Seed(time.Now().UnixNano())
	
	// Benchmark string concatenation
	benchmark("Inefficient string concat", 100, func() {
		inefficientStringConcat(10000)
	})
	benchmark("Efficient string concat", 100, func() {
		efficientStringConcat(10000)
	})
	
	// Benchmark sorting
	benchmark("Inefficient sorting", 10, func() {
		data := make([]int, 5000)
		for i := range data {
			data[i] = rand.Intn(10000)
		}
		inefficientSort(data)
	})
	benchmark("Efficient sorting", 10, func() {
		data := make([]int, 5000)
		for i := range data {
			data[i] = rand.Intn(10000)
		}
		efficientSort(data)
	})
	
	// Benchmark fibonacci
	benchmark("Inefficient fibonacci", 5, func() {
		inefficientFibonacci(30)
	})
	benchmark("Efficient fibonacci", 5, func() {
		efficientFibonacci(30)
	})
	
	// Benchmark processing
	data := make([]int, 100000)
	for i := range data {
		data[i] = rand.Intn(100)
	}
	benchmark("Inefficient processing", 10, func() {
		inefficientProcessing(data)
	})
	benchmark("Efficient processing", 10, func() {
		efficientProcessing(data)
	})
}