Trace Analysis

Go’s execution tracer provides detailed insights into runtime behavior:

package main

import (
	"context"
	"fmt"
	"os"
	"runtime/trace"
	"sync"
	"time"
)

// runTracedActivities writes an execution trace to trace.out while
// exercising tasks, regions, user log events, concurrent goroutines,
// and channel blocking, so the resulting trace has varied events to
// explore in `go tool trace`.
func runTracedActivities() {
	// Create the output file for the trace data.
	f, err := os.Create("trace.out")
	if err != nil {
		fmt.Fprintf(os.Stderr, "Failed to create trace file: %v\n", err)
		return
	}
	defer f.Close()
	
	// Start tracing. Deferred calls run LIFO, so trace.Stop executes
	// before f.Close, flushing trace data while the file is still open.
	if err := trace.Start(f); err != nil {
		fmt.Fprintf(os.Stderr, "Failed to start trace: %v\n", err)
		return
	}
	defer trace.Stop()
	
	// Base context for user annotations.
	ctx := context.Background()
	
	// Top-level task grouping everything done in this function.
	ctx, task := trace.NewTask(ctx, "main")
	defer task.End()
	
	// A named region inside the main task.
	trace.WithRegion(ctx, "initialization", func() {
		fmt.Println("Initializing...")
		time.Sleep(10 * time.Millisecond)
	})
	
	// wg tracks EVERY goroutine started below — including the
	// sender/receiver pair — so tracing cannot stop while any of them
	// is still emitting events.
	var wg sync.WaitGroup
	
	// Start multiple goroutines, each with its own task and regions.
	for i := 0; i < 10; i++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			
			// Per-goroutine task. Use the context it returns (gctx) so
			// the regions and logs below attach to THIS task rather than
			// to the parent "main" task; the original discarded it.
			gctx, goroutineTask := trace.NewTask(ctx, fmt.Sprintf("goroutine-%d", id))
			defer goroutineTask.End()
			
			trace.Log(gctx, "goroutine-start", fmt.Sprintf("id=%d", id))
			
			// CPU-bound phase.
			trace.WithRegion(gctx, "computation", func() {
				sum := 0
				for j := 0; j < 1000000; j++ {
					sum += j
				}
				trace.Log(gctx, "computation-result", fmt.Sprintf("sum=%d", sum))
			})
			
			// Simulated I/O phase; duration scales with goroutine id so
			// the timeline shows staggered completion.
			trace.WithRegion(gctx, "io-simulation", func() {
				time.Sleep(time.Duration(id+1) * 10 * time.Millisecond)
			})
			
			trace.Log(gctx, "goroutine-end", fmt.Sprintf("id=%d", id))
		}(i)
	}
	
	// Demonstrate blocking on channel communication.
	trace.WithRegion(ctx, "channel-communication", func() {
		ch := make(chan int)
		
		// Register both goroutines BEFORE launching them so the final
		// wg.Wait() cannot return until they finish. (Previously they
		// were untracked and could outlive trace.Stop and f.Close.)
		wg.Add(2)
		
		// Sender: blocks on the unbuffered channel until the receiver is
		// ready, which shows up as a blocking event in the trace.
		go func() {
			defer wg.Done()
			senderCtx, task := trace.NewTask(ctx, "sender")
			defer task.End()
			
			trace.Log(senderCtx, "send-start", "preparing to send")
			time.Sleep(20 * time.Millisecond) // Simulate preparation
			
			trace.WithRegion(senderCtx, "send-operation", func() {
				ch <- 42 // This will block until receiver is ready
			})
			
			trace.Log(senderCtx, "send-complete", "value sent")
		}()
		
		// Receiver: intentionally delayed so the sender's blocking period
		// is long enough to be visible in the trace timeline.
		go func() {
			defer wg.Done()
			receiverCtx, task := trace.NewTask(ctx, "receiver")
			defer task.End()
			
			trace.Log(receiverCtx, "receive-delay", "waiting before receiving")
			time.Sleep(100 * time.Millisecond) // Delay to demonstrate blocking
			
			trace.WithRegion(receiverCtx, "receive-operation", func() {
				val := <-ch
				trace.Log(receiverCtx, "received-value", fmt.Sprintf("%d", val))
			})
		}()
	})
	
	// Wait for every goroutine (workers, sender, receiver) before the
	// deferred trace.Stop and f.Close run.
	wg.Wait()
}

// main runs the traced workload, then prints instructions for
// inspecting the generated trace file with the go tool.
func main() {
	runTracedActivities()

	fmt.Println("Trace complete. Analyze with:")
	fmt.Println("go tool trace trace.out")
	fmt.Println("\nTrace tool features:")

	// Feature list printed in order, one line each.
	features := []string{
		"1. Timeline view of goroutine execution",
		"2. Synchronization events (channel operations, mutex locks)",
		"3. System events (GC, scheduler)",
		"4. User-defined regions and events",
	}
	for _, line := range features {
		fmt.Println(line)
	}
}

Code-Level Optimization Strategies

Beyond profiling, there are numerous code-level optimizations that can significantly improve Go application performance.

Memory Allocation Optimization

Reducing memory allocations is one of the most effective ways to improve performance:

package main

import (
	"fmt"
	"runtime"
	"strings"
	"testing"
	"time"
)

// BEFORE: appends one element at a time, relying on append's growth
// policy; repeated element-by-element calls may trigger reallocations.
func inefficientAppend(slice []int, value int) []int {
	out := append(slice, value)
	return out
}

// AFTER: grows the backing array at most once per call, with extra
// slack capacity, before appending all values in one operation.
func efficientAppend(slice []int, values ...int) []int {
	spare := cap(slice) - len(slice)
	if spare < len(values) {
		// Reallocate once with headroom for future appends.
		grown := make([]int, len(slice), len(slice)+len(values)+100) // Extra capacity
		copy(grown, slice)
		slice = grown
	}
	return append(slice, values...)
}

// BEFORE: builds the result with string concatenation, which allocates
// a new string on every iteration (quadratic total copying).
//
// Note: the separator is placed between items so the output matches
// efficientStringJoin and strings.Join(items, ",") exactly. The original
// appended a trailing comma, which made the before/after benchmark
// compare functions that produce different outputs.
func inefficientStringJoin(items []string) string {
	result := ""
	for i, item := range items {
		if i > 0 {
			result += ","
		}
		result += item
	}
	return result
}

// AFTER: accumulates into a strings.Builder, amortizing growth and
// avoiding the per-iteration string copies of the naive version.
func efficientStringJoin(items []string) string {
	var b strings.Builder
	b.Grow(len(items) * 8) // Estimate size

	last := len(items) - 1
	for i, item := range items {
		b.WriteString(item)
		// Separator goes between items, never after the final one.
		if i != last {
			b.WriteByte(',')
		}
	}
	return b.String()
}

// BEFORE: allocates a fresh map on every call; counts runes (not bytes)
// because ranging over a string decodes UTF-8 code points.
func inefficientCounter(text string) map[rune]int {
	freq := map[rune]int{}
	for _, r := range text {
		freq[r] = freq[r] + 1
	}
	return freq
}

// AFTER: fills a caller-supplied map so the allocation can be amortized
// across calls; any previous contents are removed first.
func efficientCounter(text string, counts map[rune]int) {
	// Empty the map in place, keeping its allocated buckets.
	for key := range counts {
		delete(counts, key)
	}

	// Tally each rune of the input.
	for _, r := range text {
		counts[r]++
	}
}

// Benchmark helper
func benchmarkAllocation(b *testing.B, f func()) {
	// Warm up
	for i := 0; i < 5; i++ {
		f()
	}
	
	// Reset memory stats
	runtime.GC()
	var stats runtime.MemStats
	runtime.ReadMemStats(&stats)
	allocsBefore := stats.TotalAlloc
	
	// Run benchmark
	start := time.Now()
	for i := 0; i < b.N; i++ {
		f()
	}
	elapsed := time.Since(start)
	
	// Get memory stats
	runtime.ReadMemStats(&stats)
	allocsAfter := stats.TotalAlloc
	
	// Print results
	fmt.Printf("Time: %v, Allocations: %v bytes\n", elapsed, allocsAfter-allocsBefore)
}

// main benchmarks the inefficient and efficient variants side by side.
//
// FIX: benchmarkAllocation takes *testing.B; the original code passed
// testing.B{...} by value, which does not compile (and would copy the
// struct's internal synchronization state). Each call now passes a
// pointer literal instead.
func main() {
	// Benchmark slice append
	fmt.Println("Slice append benchmark:")
	benchmarkAllocation(&testing.B{N: 10000}, func() {
		slice := make([]int, 0)
		for i := 0; i < 1000; i++ {
			slice = inefficientAppend(slice, i)
		}
	})
	
	benchmarkAllocation(&testing.B{N: 10000}, func() {
		slice := make([]int, 0, 1000) // Preallocate
		for i := 0; i < 1000; i++ {
			slice = append(slice, i) // Direct append is better
		}
	})
	
	// Benchmark string join
	fmt.Println("\nString join benchmark:")
	items := make([]string, 1000)
	for i := range items {
		items[i] = fmt.Sprintf("item-%d", i)
	}
	
	benchmarkAllocation(&testing.B{N: 100}, func() {
		_ = inefficientStringJoin(items)
	})
	
	benchmarkAllocation(&testing.B{N: 100}, func() {
		_ = efficientStringJoin(items)
	})
	
	benchmarkAllocation(&testing.B{N: 100}, func() {
		_ = strings.Join(items, ",") // Built-in is even better
	})
	
	// Benchmark counter
	fmt.Println("\nCounter benchmark:")
	text := strings.Repeat("Go is a great language! ", 100)
	
	benchmarkAllocation(&testing.B{N: 1000}, func() {
		_ = inefficientCounter(text)
	})
	
	benchmarkAllocation(&testing.B{N: 1000}, func() {
		counts := make(map[rune]int)
		efficientCounter(text, counts)
	})
}

Compiler Optimizations and Build Flags

Understanding Go’s compiler optimizations can help you write more efficient code:

package main

import (
	"fmt"
	"os"
	"runtime"
	"strings"
)

// add returns a + b; it is small enough for the compiler to inline.
func add(a, b int) int {
	sum := a + b
	return sum
}

// complexFunction alternates adding and subtracting i*b over a
// iterations, clamping the running total at 1000 after every step; its
// loop and branches exceed the inliner's budget, so it stays an
// out-of-line call.
func complexFunction(a, b int) int {
	acc := 0
	for i := 0; i < a; i++ {
		term := i * b
		if i%2 == 0 {
			acc += term
		} else {
			acc -= term
		}
		// Clamp after each step, matching the running total exactly.
		if acc > 1000 {
			acc = 1000
		}
	}
	return acc
}

// sumArray totals the slice's elements; iterating with range lets the
// compiler prove every access is in bounds (bounds check elimination).
func sumArray(arr []int) int {
	total := 0
	for _, v := range arr {
		total += v
	}
	return total
}

// createOnStack returns a local by value; the local does not escape,
// so the compiler can keep it on the stack.
func createOnStack() int {
	v := 42 // only the value is returned, so v stays on the stack
	return v
}

// createOnHeap returns the address of a local, which forces the local
// to escape to the heap.
func createOnHeap() *int {
	v := 42
	p := &v // taking the address makes v escape
	return p
}

// main prints toolchain info and build-flag hints, then exercises the
// functions above so their optimization behavior can be inspected with
// the suggested compiler flags.
func main() {
	// Runtime/toolchain information.
	fmt.Printf("Go version: %s\n", runtime.Version())
	fmt.Printf("GOOS: %s, GOARCH: %s\n", runtime.GOOS, runtime.GOARCH)
	fmt.Printf("GOMAXPROCS: %d\n", runtime.GOMAXPROCS(0))

	// say emits a batch of fixed lines, one Println each, in order.
	say := func(lines ...string) {
		for _, line := range lines {
			fmt.Println(line)
		}
	}

	say(
		"\nBuild flags:",
		"To enable inlining and bounds check elimination:",
		"go build -gcflags=\"-m\" main.go",
		"\nTo disable optimizations (for debugging):",
		"go build -gcflags=\"-N -l\" main.go",
		"\nTo see assembly output:",
		"go build -gcflags=\"-S\" main.go",
		"\nCompiler optimization examples:",
	)

	// Function inlining: a tight loop over a trivially inlinable call.
	result := 0
	for i := 0; i < 1000000; i++ {
		result = add(result, i) // Likely to be inlined
	}
	fmt.Printf("Inlined function result: %d\n", result)

	// Complex function (too large to inline).
	result = complexFunction(100, 5)
	fmt.Printf("Complex function result: %d\n", result)

	// Bounds check elimination.
	arr := make([]int, 1000)
	for i := range arr {
		arr[i] = i
	}
	fmt.Printf("Array sum: %d\n", sumArray(arr))

	// Escape analysis: one stack value, one heap-escaping pointer.
	stackVal := createOnStack()
	heapVal := createOnHeap()
	fmt.Printf("Stack value: %d, Heap value: %d\n", stackVal, *heapVal)

	say(
		"\nTo view escape analysis decisions:",
		"go build -gcflags=\"-m -m\" main.go",
		"\nTo benchmark with different compiler flags:",
		"go test -bench=. -benchmem -gcflags=\"-N -l\" ./...",
		"go test -bench=. -benchmem ./...",
	)
}