CPU Profiling and Analysis
CPU profiling is one of the most powerful techniques for identifying performance bottlenecks in Go applications. Let’s explore how to effectively use Go’s CPU profiling tools.
Setting Up CPU Profiling
Go provides multiple ways to enable CPU profiling:
package main
import (
"flag"
"fmt"
"log"
"os"
"runtime"
"runtime/pprof"
"time"
)
// CPU-intensive function we want to profile
func computeIntensive() {
// Simulate CPU-intensive work
total := 0
for i := 0; i < 100_000_000; i++ {
total += i % 2
}
fmt.Printf("Computation result: %d\n", total)
}
// Method 1: Manual profiling with runtime/pprof
func manualCPUProfiling() {
// Create CPU profile file
f, err := os.Create("cpu_profile.prof")
if err != nil {
log.Fatal("could not create CPU profile: ", err)
}
defer f.Close()
// Start CPU profiling
if err := pprof.StartCPUProfile(f); err != nil {
log.Fatal("could not start CPU profile: ", err)
}
defer pprof.StopCPUProfile()
// Run the code we want to profile
fmt.Println("Running CPU-intensive task...")
computeIntensive()
fmt.Println("CPU profile written to cpu_profile.prof")
fmt.Println("Analyze with: go tool pprof cpu_profile.prof")
}
// Method 2: Using net/http/pprof for continuous profiling
func httpPprofDemo() {
// This would typically be added to your main HTTP server
fmt.Println("In a real application, you would import _ \"net/http/pprof\"")
fmt.Println("Then access profiles via http://localhost:8080/debug/pprof/")
fmt.Println("For CPU profile: http://localhost:8080/debug/pprof/profile")
fmt.Println("Download and analyze with: go tool pprof http://localhost:8080/debug/pprof/profile")
}
// Method 3: Using testing package for benchmark profiling
func benchmarkPprofDemo() {
fmt.Println("When running benchmarks, use:")
fmt.Println("go test -bench=. -cpuprofile=cpu.prof ./...")
fmt.Println("Then analyze with: go tool pprof cpu.prof")
}
// Method 4: Using runtime/trace for execution tracing
func traceDemo() {
// Create trace file
f, err := os.Create("trace.out")
if err != nil {
log.Fatal("could not create trace file: ", err)
}
defer f.Close()
// Start tracing
if err := runtime.StartTrace(); err != nil {
log.Fatal("could not start trace: ", err)
}
defer runtime.StopTrace()
// Capture trace data
trace := pprof.Lookup("trace")
trace.WriteTo(f, 0)
// Run the code we want to trace
computeIntensive()
fmt.Println("Trace written to trace.out")
fmt.Println("Analyze with: go tool trace trace.out")
}
func main() {
// Parse command line flags
cpuprofile := flag.String("cpuprofile", "", "write cpu profile to file")
memprofile := flag.String("memprofile", "", "write memory profile to file")
flag.Parse()
// CPU profiling via command line flag
if *cpuprofile != "" {
f, err := os.Create(*cpuprofile)
if err != nil {
log.Fatal("could not create CPU profile: ", err)
}
defer f.Close()
if err := pprof.StartCPUProfile(f); err != nil {
log.Fatal("could not start CPU profile: ", err)
}
defer pprof.StopCPUProfile()
}
// Run the demo functions
manualCPUProfiling()
httpPprofDemo()
benchmarkPprofDemo()
traceDemo()
// Memory profiling via command line flag
if *memprofile != "" {
f, err := os.Create(*memprofile)
if err != nil {
log.Fatal("could not create memory profile: ", err)
}
defer f.Close()
runtime.GC() // Get up-to-date statistics
if err := pprof.WriteHeapProfile(f); err != nil {
log.Fatal("could not write memory profile: ", err)
}
}
}
Analyzing CPU Profiles
Once you’ve collected a CPU profile, the next step is to analyze it effectively:
package main
import (
"fmt"
"math/rand"
"os"
"runtime/pprof"
"time"
)
// Inefficient string concatenation function
func inefficientStringConcat(n int) string {
result := ""
for i := 0; i < n; i++ {
result += "x"
}
return result
}
// Inefficient sorting algorithm (bubble sort)
func inefficientSort(items []int) {
n := len(items)
for i := 0; i < n; i++ {
for j := 0; j < n-1-i; j++ {
if items[j] > items[j+1] {
items[j], items[j+1] = items[j+1], items[j]
}
}
}
}
// Recursive function with exponential complexity
func fibonacci(n int) int {
if n <= 1 {
return n
}
return fibonacci(n-1) + fibonacci(n-2)
}
// Function with unnecessary allocations
func createLotsOfGarbage(iterations int) {
for i := 0; i < iterations; i++ {
_ = make([]byte, 1024)
}
}
// Main function with various performance issues
func runProfilingDemo() {
// Create CPU profile
f, _ := os.Create("cpu_profile_analysis.prof")
defer f.Close()
pprof.StartCPUProfile(f)
defer pprof.StopCPUProfile()
// Run inefficient string concatenation
inefficientStringConcat(10000)
// Run inefficient sorting
data := make([]int, 5000)
for i := range data {
data[i] = rand.Intn(10000)
}
inefficientSort(data)
// Run exponential algorithm
fibonacci(30)
// Run allocation-heavy code
createLotsOfGarbage(100000)
}
func main() {
// Seed random number generator
rand.Seed(time.Now().UnixNano())
// Run the demo
runProfilingDemo()
fmt.Println("Profile generated. Analyze with:")
fmt.Println("go tool pprof -http=:8080 cpu_profile_analysis.prof")
fmt.Println("Common pprof commands:")
fmt.Println(" top10 - Show top 10 functions by CPU usage")
fmt.Println(" list functionName - Show source code with CPU usage")
fmt.Println(" web - Generate SVG graph visualization")
fmt.Println(" traces - Show execution traces")
fmt.Println(" peek regex - Show functions matching regex")
}
Optimizing CPU-Bound Code
After identifying CPU bottlenecks, here are techniques to optimize them:
package main
import (
"fmt"
"math/rand"
"strings"
"time"
)
// BEFORE: Inefficient string concatenation
func inefficientStringConcat(n int) string {
result := ""
for i := 0; i < n; i++ {
result += "x"
}
return result
}
// AFTER: Optimized string concatenation
func efficientStringConcat(n int) string {
var builder strings.Builder
builder.Grow(n) // Pre-allocate capacity
for i := 0; i < n; i++ {
builder.WriteByte('x')
}
return builder.String()
}
// BEFORE: Inefficient sorting (bubble sort)
func inefficientSort(items []int) {
n := len(items)
for i := 0; i < n; i++ {
for j := 0; j < n-1-i; j++ {
if items[j] > items[j+1] {
items[j], items[j+1] = items[j+1], items[j]
}
}
}
}
// AFTER: Optimized sorting (quicksort)
func efficientSort(items []int) {
quicksort(items, 0, len(items)-1)
}
func quicksort(items []int, low, high int) {
if low < high {
pivot := partition(items, low, high)
quicksort(items, low, pivot-1)
quicksort(items, pivot+1, high)
}
}
func partition(items []int, low, high int) int {
pivot := items[high]
i := low - 1
for j := low; j < high; j++ {
if items[j] <= pivot {
i++
items[i], items[j] = items[j], items[i]
}
}
items[i+1], items[high] = items[high], items[i+1]
return i + 1
}
// BEFORE: Recursive fibonacci with exponential complexity
func inefficientFibonacci(n int) int {
if n <= 1 {
return n
}
return inefficientFibonacci(n-1) + inefficientFibonacci(n-2)
}
// AFTER: Iterative fibonacci with linear complexity
func efficientFibonacci(n int) int {
if n <= 1 {
return n
}
a, b := 0, 1
for i := 2; i <= n; i++ {
a, b = b, a+b
}
return b
}
// BEFORE: Function with unnecessary allocations
func inefficientProcessing(data []int) []int {
result := make([]int, 0)
for _, v := range data {
// Create a new slice for each operation
temp := make([]int, 1)
temp[0] = v * v
result = append(result, temp...)
}
return result
}
// AFTER: Function with optimized allocations
func efficientProcessing(data []int) []int {
// Pre-allocate the result slice
result := make([]int, len(data))
for i, v := range data {
// Direct assignment, no temporary allocations
result[i] = v * v
}
return result
}
// Benchmark helper
func benchmark(name string, iterations int, fn func()) {
start := time.Now()
for i := 0; i < iterations; i++ {
fn()
}
elapsed := time.Since(start)
fmt.Printf("%-30s %10d iterations in %s (%s per op)\n",
name, iterations, elapsed, elapsed/time.Duration(iterations))
}
func main() {
// Seed random number generator
rand.Seed(time.Now().UnixNano())
// Benchmark string concatenation
benchmark("Inefficient string concat", 100, func() {
inefficientStringConcat(10000)
})
benchmark("Efficient string concat", 100, func() {
efficientStringConcat(10000)
})
// Benchmark sorting
benchmark("Inefficient sorting", 10, func() {
data := make([]int, 5000)
for i := range data {
data[i] = rand.Intn(10000)
}
inefficientSort(data)
})
benchmark("Efficient sorting", 10, func() {
data := make([]int, 5000)
for i := range data {
data[i] = rand.Intn(10000)
}
efficientSort(data)
})
// Benchmark fibonacci
benchmark("Inefficient fibonacci", 5, func() {
inefficientFibonacci(30)
})
benchmark("Efficient fibonacci", 5, func() {
efficientFibonacci(30)
})
// Benchmark processing
data := make([]int, 100000)
for i := range data {
data[i] = rand.Intn(100)
}
benchmark("Inefficient processing", 10, func() {
inefficientProcessing(data)
})
benchmark("Efficient processing", 10, func() {
efficientProcessing(data)
})
}