Performance Optimization Techniques

Effective synchronization isn’t just about correctness—it’s also about performance. Let’s explore techniques to optimize synchronization in Go applications.

Lock Granularity

The granularity of locks significantly impacts performance:

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
	"time"
)

// CoarseGrainedMap uses a single lock for the entire map
type CoarseGrainedMap struct {
	mu   sync.RWMutex
	data map[string]int
}

// NewCoarseGrainedMap creates a new coarse-grained map
func NewCoarseGrainedMap() *CoarseGrainedMap {
	return &CoarseGrainedMap{
		data: make(map[string]int),
	}
}

// Get retrieves a value from the map
func (m *CoarseGrainedMap) Get(key string) (int, bool) {
	m.mu.RLock()
	defer m.mu.RUnlock()
	val, ok := m.data[key]
	return val, ok
}

// Set stores a value in the map
func (m *CoarseGrainedMap) Set(key string, value int) {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.data[key] = value
}

// FineGrainedMap uses a separate lock for each shard
type FineGrainedMap struct {
	shards    []*mapShard
	shardMask int
}

type mapShard struct {
	mu   sync.RWMutex
	data map[string]int
}

// NewFineGrainedMap creates a new fine-grained map with the specified number of shards
func NewFineGrainedMap(numShards int) *FineGrainedMap {
	// Round up to the next power of two so a bitmask can replace modulo
	shardCount := 1
	for shardCount < numShards {
		shardCount *= 2
	}
	
	m := &FineGrainedMap{
		shards:    make([]*mapShard, shardCount),
		shardMask: shardCount - 1,
	}
	
	for i := 0; i < shardCount; i++ {
		m.shards[i] = &mapShard{
			data: make(map[string]int),
		}
	}
	
	return m
}

// getShard returns the shard for the given key
func (m *FineGrainedMap) getShard(key string) *mapShard {
	// Simple hash function
	hash := 0
	for i := 0; i < len(key); i++ {
		hash = 31*hash + int(key[i])
	}
	return m.shards[hash&m.shardMask]
}

// Get retrieves a value from the map
func (m *FineGrainedMap) Get(key string) (int, bool) {
	shard := m.getShard(key)
	shard.mu.RLock()
	defer shard.mu.RUnlock()
	val, ok := shard.data[key]
	return val, ok
}

// Set stores a value in the map
func (m *FineGrainedMap) Set(key string, value int) {
	shard := m.getShard(key)
	shard.mu.Lock()
	defer shard.mu.Unlock()
	shard.data[key] = value
}

func benchmarkMaps() {
	fmt.Println("\n=== Lock Granularity Benchmark ===")
	
	// Create maps
	coarse := NewCoarseGrainedMap()
	fine := NewFineGrainedMap(16)
	
	// Prepare keys
	const keyCount = 1000
	keys := make([]string, keyCount)
	for i := 0; i < keyCount; i++ {
		keys[i] = fmt.Sprintf("key%d", i)
	}
	
	// Benchmark function
	benchmark := func(name string, m interface{}, readPct int, goroutines int) time.Duration {
		var wg sync.WaitGroup
		var ops int64
		
		// Create getter and setter functions
		var getFn func(string) (int, bool)
		var setFn func(string, int)
		
		switch mp := m.(type) {
		case *CoarseGrainedMap:
			getFn = mp.Get
			setFn = mp.Set
		case *FineGrainedMap:
			getFn = mp.Get
			setFn = mp.Set
		}
		
		// Start timing
		start := time.Now()
		
		// Launch goroutines
		for i := 0; i < goroutines; i++ {
			wg.Add(1)
			go func(id int) {
				defer wg.Done()
				
				// Per-goroutine LCG state, so goroutines don't
				// contend on a shared random number source
				seed := int64(id)
				
				// Perform operations
				for j := 0; j < 10000; j++ {
					// Determine operation type
					seed = (seed*1103515245 + 12345) & 0x7fffffff
					isRead := int(seed%100) < readPct
					
					// Select a key
					seed = (seed*1103515245 + 12345) & 0x7fffffff
					key := keys[seed%keyCount]
					
					if isRead {
						// Read operation
						_, _ = getFn(key)
					} else {
						// Write operation
						setFn(key, int(seed))
					}
					
					atomic.AddInt64(&ops, 1)
				}
			}(i)
		}
		
		// Wait for all goroutines to complete
		wg.Wait()
		
		// Return elapsed time
		elapsed := time.Since(start)
		fmt.Printf("%s: %v for %d operations (%.2f ops/ms)\n",
			name, elapsed, ops, float64(ops)/(float64(elapsed)/float64(time.Millisecond)))
		
		return elapsed
	}
	
	// Run benchmarks with different read/write ratios
	fmt.Println("Read-heavy workload (95% reads):")
	coarseTime := benchmark("Coarse-grained", coarse, 95, 8)
	fineTime := benchmark("Fine-grained", fine, 95, 8)
	fmt.Printf("Improvement: %.2fx\n", float64(coarseTime)/float64(fineTime))
	
	fmt.Println("\nBalanced workload (50% reads):")
	coarseTime = benchmark("Coarse-grained", coarse, 50, 8)
	fineTime = benchmark("Fine-grained", fine, 50, 8)
	fmt.Printf("Improvement: %.2fx\n", float64(coarseTime)/float64(fineTime))
}

func main() {
	benchmarkMaps()
}
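
Hand-rolled timing loops like the one above are convenient for a self-contained demo, but Go's testing package produces steadier numbers and manages iteration counts for you. Below is a minimal sketch of the same read-heavy comparison as a standard benchmark; it assumes the two map types live in the same package as this _test.go file:

package main

import (
	"fmt"
	"testing"
)

// benchKeys is built once so key formatting doesn't dominate the timings.
var benchKeys = func() []string {
	keys := make([]string, 1000)
	for i := range keys {
		keys[i] = fmt.Sprintf("key%d", i)
	}
	return keys
}()

// runMixed performs roughly 5% writes and 95% reads.
func runMixed(b *testing.B, get func(string) (int, bool), set func(string, int)) {
	b.RunParallel(func(pb *testing.PB) {
		i := 0
		for pb.Next() {
			key := benchKeys[i%len(benchKeys)]
			if i%20 == 0 {
				set(key, i)
			} else {
				get(key)
			}
			i++
		}
	})
}

func BenchmarkCoarseGrainedMap(b *testing.B) {
	m := NewCoarseGrainedMap()
	runMixed(b, m.Get, m.Set)
}

func BenchmarkFineGrainedMap(b *testing.B) {
	m := NewFineGrainedMap(16)
	runMixed(b, m.Get, m.Set)
}

Run it with go test -bench=Map -cpu=1,4,8 to watch the gap between the two implementations widen as parallelism grows.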

Key insights about lock granularity:

  1. Coarse-grained Locking: Uses fewer locks but causes more contention.
  2. Fine-grained Locking: Uses more locks but reduces contention.
  3. Sharding: Divides data into independent partitions, each with its own lock (a sturdier shard hash is sketched after this list).
  4. Performance Impact: The optimal granularity depends on the access pattern and concurrency level.
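
The handwritten polynomial hash in getShard is adequate for a demo, but it mixes short, similar keys poorly, which can leave some shards much hotter than others. Here is a minimal sketch of a sturdier shard selector built on the standard library's hash/maphash (it assumes Go 1.19+ for maphash.String; shardSeed and shardIndex are names of our own, not part of the map types above):

package main

import (
	"fmt"
	"hash/maphash"
)

// shardSeed must stay fixed for the lifetime of the map so that a given
// key always lands in the same shard.
var shardSeed = maphash.MakeSeed()

// shardIndex picks a shard using maphash, which mixes bits far better than
// the 31*hash loop. shardMask must be shardCount-1 with a power-of-two
// shard count, exactly as in FineGrainedMap.
func shardIndex(key string, shardMask uint64) uint64 {
	return maphash.String(shardSeed, key) & shardMask
}

func main() {
	for _, key := range []string{"key1", "key2", "key3"} {
		fmt.Printf("%s -> shard %d\n", key, shardIndex(key, 15))
	}
}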

Lock Contention Profiling

Identifying lock contention is crucial for optimizing synchronization:

package main

import (
	"fmt"
	"sync"
	"time"
)

// MutexWithMetrics adds instrumentation to track lock contention
type MutexWithMetrics struct {
	mu            sync.Mutex
	name          string
	acquisitions  int64
	contentions   int64
	totalWaitTime time.Duration
	maxWaitTime   time.Duration
}

// NewMutexWithMetrics creates a new instrumented mutex
func NewMutexWithMetrics(name string) *MutexWithMetrics {
	return &MutexWithMetrics{
		name: name,
	}
}

// Lock acquires the mutex, tracking contention metrics
func (m *MutexWithMetrics) Lock() {
	start := time.Now()
	
	// Try to acquire the lock without blocking
	if m.mu.TryLock() {
		// Acquired without contention
		m.acquisitions++
		return
	}
	
	// Contention occurred, acquire with blocking
	m.mu.Lock()
	
	// Record metrics only while holding the lock; updating
	// m.contentions before Lock() would itself be a data race
	m.contentions++
	waitTime := time.Since(start)
	m.totalWaitTime += waitTime
	if waitTime > m.maxWaitTime {
		m.maxWaitTime = waitTime
	}
	
	m.acquisitions++
}

// Unlock releases the mutex
func (m *MutexWithMetrics) Unlock() {
	m.mu.Unlock()
}

// ReportMetrics prints contention statistics
func (m *MutexWithMetrics) ReportMetrics() {
	fmt.Printf("=== Mutex Metrics: %s ===\n", m.name)
	fmt.Printf("Total acquisitions: %d\n", m.acquisitions)
	fmt.Printf("Contentions: %d (%.2f%%)\n", m.contentions,
		float64(m.contentions)*100/float64(m.acquisitions))
	
	if m.contentions > 0 {
		fmt.Printf("Average wait time: %v\n", m.totalWaitTime/time.Duration(m.contentions))
		fmt.Printf("Maximum wait time: %v\n", m.maxWaitTime)
	}
}

func demonstrateMutexMetrics() {
	fmt.Println("\n=== Lock Contention Profiling ===")
	
	// Create instrumented mutexes
	highContentionMutex := NewMutexWithMetrics("High Contention")
	lowContentionMutex := NewMutexWithMetrics("Low Contention")
	
	// Function that simulates work with the mutex
	simulateWork := func(m *MutexWithMetrics, holdTime time.Duration, iterations int) {
		for i := 0; i < iterations; i++ {
			m.Lock()
			time.Sleep(holdTime)
			m.Unlock()
			
			// Pause between acquisitions
			time.Sleep(1 * time.Millisecond)
		}
	}
	
	// Launch goroutines with high contention
	var wg sync.WaitGroup
	for i := 0; i < 10; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			simulateWork(highContentionMutex, 5*time.Millisecond, 20)
		}()
	}
	
	// Launch goroutines with low contention
	for i := 0; i < 3; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			simulateWork(lowContentionMutex, 1*time.Millisecond, 20)
		}()
	}
	
	wg.Wait()
	
	// Report metrics
	highContentionMutex.ReportMetrics()
	lowContentionMutex.ReportMetrics()
}

func main() {
	demonstrateMutexMetrics()
}
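
Hand-rolled instrumentation like MutexWithMetrics is useful for targeted experiments, but the Go runtime can profile contention on every sync.Mutex for you. Here is a minimal sketch using runtime.SetMutexProfileFraction together with net/http/pprof; the sampling rate and port are arbitrary choices for illustration:

package main

import (
	"log"
	"net/http"
	_ "net/http/pprof" // registers the /debug/pprof/* handlers
	"runtime"
)

func main() {
	// Sample roughly 1 in 5 mutex contention events; the default
	// rate of 0 disables mutex profiling entirely.
	runtime.SetMutexProfileFraction(5)

	// ... start the real application here ...

	// Inspect contention with:
	//   go tool pprof http://localhost:6060/debug/pprof/mutex
	log.Fatal(http.ListenAndServe("localhost:6060", nil))
}

The same profile is available from go test via the -mutexprofile flag, which is often the quickest way to check a suspect code path.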

Lock-Free Techniques

For some data structures, lock-free approaches can provide significant performance benefits:

package main

import (
	"fmt"
	"runtime"
	"sync"
	"sync/atomic"
	"time"
)

// LockFreeQueue is a simple lock-free queue implementation
type LockFreeQueue struct {
	head  atomic.Pointer[node]
	tail  atomic.Pointer[node]
	count int64
}

type node struct {
	value interface{}
	next  atomic.Pointer[node]
}

// NewLockFreeQueue creates a new lock-free queue
func NewLockFreeQueue() *LockFreeQueue {
	q := &LockFreeQueue{}
	
	// Create sentinel node
	sentinel := &node{}
	q.head.Store(sentinel)
	q.tail.Store(sentinel)
	
	return q
}

// Enqueue adds an item to the queue
func (q *LockFreeQueue) Enqueue(value interface{}) {
	// Create new node
	newNode := &node{value: value}
	
	for {
		// Get current tail
		tail := q.tail.Load()
		next := tail.next.Load()
		
		// Check if tail is still valid
		if tail != q.tail.Load() {
			continue
		}
		
		// If tail.next is not nil, tail is falling behind
		if next != nil {
			// Try to advance tail
			q.tail.CompareAndSwap(tail, next)
			continue
		}
		
		// Try to link the new node
		if tail.next.CompareAndSwap(nil, newNode) {
			// Success, try to advance tail
			q.tail.CompareAndSwap(tail, newNode)
			atomic.AddInt64(&q.count, 1)
			return
		}
	}
}

// Dequeue removes and returns an item from the queue
func (q *LockFreeQueue) Dequeue() (interface{}, bool) {
	for {
		// Get current head and tail
		head := q.head.Load()
		tail := q.tail.Load()
		next := head.next.Load()
		
		// Check if head is still valid
		if head != q.head.Load() {
			continue
		}
		
		// If queue is empty
		if head == tail {
			if next == nil {
				return nil, false
			}
			
			// Tail is falling behind, try to advance it
			q.tail.CompareAndSwap(tail, next)
			continue
		}
		
		// Queue is not empty, try to dequeue
		if q.head.CompareAndSwap(head, next) {
			value := next.value
			atomic.AddInt64(&q.count, -1)
			return value, true
		}
	}
}

// Size returns the approximate size of the queue
func (q *LockFreeQueue) Size() int {
	return int(atomic.LoadInt64(&q.count))
}

// LockBasedQueue is a simple lock-based queue implementation
type LockBasedQueue struct {
	mu    sync.Mutex
	head  *node
	tail  *node
	count int
}

// NewLockBasedQueue creates a new lock-based queue
func NewLockBasedQueue() *LockBasedQueue {
	sentinel := &node{}
	return &LockBasedQueue{
		head: sentinel,
		tail: sentinel,
	}
}

// Enqueue adds an item to the queue
func (q *LockBasedQueue) Enqueue(value interface{}) {
	q.mu.Lock()
	defer q.mu.Unlock()
	
	newNode := &node{value: value}
	q.tail.next.Store(newNode)
	q.tail = newNode
	q.count++
}

// Dequeue removes and returns an item from the queue
func (q *LockBasedQueue) Dequeue() (interface{}, bool) {
	q.mu.Lock()
	defer q.mu.Unlock()
	
	if q.head == q.tail {
		return nil, false
	}
	
	next := q.head.next.Load()
	q.head = next
	q.count--
	
	return next.value, true
}

// Size returns the size of the queue
func (q *LockBasedQueue) Size() int {
	q.mu.Lock()
	defer q.mu.Unlock()
	return q.count
}

func benchmarkQueues() {
	fmt.Println("\n=== Lock-Free vs. Lock-Based Queue Benchmark ===")
	
	// Create queues
	lockFree := NewLockFreeQueue()
	lockBased := NewLockBasedQueue()
	
	// Benchmark function
	benchmark := func(enqueue, dequeue func(int) bool, iterations, goroutines int) time.Duration {
		var wg sync.WaitGroup
		
		// Start timing
		start := time.Now()
		
		// Launch producer goroutines
		for i := 0; i < goroutines/2; i++ {
			wg.Add(1)
			go func(id int) {
				defer wg.Done()
				
				// Produce items
				for j := 0; j < iterations; j++ {
					enqueue(id*iterations + j)
				}
			}(i)
		}
		
		// Launch consumer goroutines
		for i := 0; i < goroutines/2; i++ {
			wg.Add(1)
			go func() {
				defer wg.Done()
				
				// Consume items
				for j := 0; j < iterations; j++ {
					for !dequeue(j) {
						// Retry if queue is empty
						runtime.Gosched()
					}
				}
			}()
		}
		
		// Wait for all goroutines to complete
		wg.Wait()
		
		// Return elapsed time
		return time.Since(start)
	}
	
	// Run benchmarks
	iterations := 10000
	goroutines := 8
	
	lockFreeTime := benchmark(
		func(i int) bool { lockFree.Enqueue(i); return true },
		func(i int) bool { _, ok := lockFree.Dequeue(); return ok },
		iterations, goroutines)
	
	lockBasedTime := benchmark(
		func(i int) bool { lockBased.Enqueue(i); return true },
		func(i int) bool { _, ok := lockBased.Dequeue(); return ok },
		iterations, goroutines)
	
	// Compare performance
	fmt.Printf("Lock-Free Queue Time: %v\n", lockFreeTime)
	fmt.Printf("Lock-Based Queue Time: %v\n", lockBasedTime)
	fmt.Printf("Performance Ratio: %.2fx\n", float64(lockBasedTime)/float64(lockFreeTime))
}

func main() {
	benchmarkQueues()
}
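
Queues are not the only structures where lock-free techniques pay off. For read-mostly data such as configuration, a copy-on-write scheme built on atomic.Pointer gives readers wait-free access to an immutable snapshot. A minimal sketch follows; the Config type and its fields are invented for illustration:

package main

import (
	"fmt"
	"sync/atomic"
)

// Config is an immutable snapshot; writers replace the whole value
// rather than mutating it in place.
type Config struct {
	MaxConns  int
	TimeoutMs int
}

var current atomic.Pointer[Config]

// load returns the active snapshot; readers never block.
func load() *Config { return current.Load() }

// update publishes a new snapshot derived from the old one, retrying
// if another writer got there first.
func update(mutate func(Config) Config) {
	for {
		old := current.Load()
		next := mutate(*old)
		if current.CompareAndSwap(old, &next) {
			return
		}
	}
}

func main() {
	current.Store(&Config{MaxConns: 10, TimeoutMs: 500})

	update(func(c Config) Config {
		c.MaxConns = 100
		return c
	})

	fmt.Printf("%+v\n", *load())
}

Writers pay for the copy and the occasional retry, which is a good trade whenever reads vastly outnumber writes.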