Performance Optimization Techniques
Effective synchronization isn’t just about correctness—it’s also about performance. Let’s explore techniques to optimize synchronization in Go applications.
Lock Granularity
The granularity of locks significantly impacts performance:
package main

import (
    "fmt"
    "sync"
    "sync/atomic"
    "time"
)

// CoarseGrainedMap uses a single lock for the entire map
type CoarseGrainedMap struct {
    mu   sync.RWMutex
    data map[string]int
}

// NewCoarseGrainedMap creates a new coarse-grained map
func NewCoarseGrainedMap() *CoarseGrainedMap {
    return &CoarseGrainedMap{
        data: make(map[string]int),
    }
}

// Get retrieves a value from the map
func (m *CoarseGrainedMap) Get(key string) (int, bool) {
    m.mu.RLock()
    defer m.mu.RUnlock()
    val, ok := m.data[key]
    return val, ok
}

// Set stores a value in the map
func (m *CoarseGrainedMap) Set(key string, value int) {
    m.mu.Lock()
    defer m.mu.Unlock()
    m.data[key] = value
}
// FineGrainedMap uses a separate lock for each shard
type FineGrainedMap struct {
    shards    []*mapShard
    shardMask int
}

type mapShard struct {
    mu   sync.RWMutex
    data map[string]int
}

// NewFineGrainedMap creates a new fine-grained map with the specified number of shards
func NewFineGrainedMap(numShards int) *FineGrainedMap {
    // Round up to a power of 2 so a shard can be selected with a cheap
    // bitmask (hash & shardMask) instead of a modulo
    shardCount := 1
    for shardCount < numShards {
        shardCount *= 2
    }
    m := &FineGrainedMap{
        shards:    make([]*mapShard, shardCount),
        shardMask: shardCount - 1,
    }
    for i := 0; i < shardCount; i++ {
        m.shards[i] = &mapShard{
            data: make(map[string]int),
        }
    }
    return m
}

// getShard returns the shard for the given key
func (m *FineGrainedMap) getShard(key string) *mapShard {
    // Simple multiplicative hash over the key's bytes
    hash := 0
    for i := 0; i < len(key); i++ {
        hash = 31*hash + int(key[i])
    }
    return m.shards[hash&m.shardMask]
}

// Get retrieves a value from the map
func (m *FineGrainedMap) Get(key string) (int, bool) {
    shard := m.getShard(key)
    shard.mu.RLock()
    defer shard.mu.RUnlock()
    val, ok := shard.data[key]
    return val, ok
}

// Set stores a value in the map
func (m *FineGrainedMap) Set(key string, value int) {
    shard := m.getShard(key)
    shard.mu.Lock()
    defer shard.mu.Unlock()
    shard.data[key] = value
}
func benchmarkMaps() {
    fmt.Println("\n=== Lock Granularity Benchmark ===")

    // Create maps
    coarse := NewCoarseGrainedMap()
    fine := NewFineGrainedMap(16)

    // Prepare keys
    const keyCount = 1000
    keys := make([]string, keyCount)
    for i := 0; i < keyCount; i++ {
        keys[i] = fmt.Sprintf("key%d", i)
    }

    // Benchmark function
    benchmark := func(name string, m interface{}, readPct int, goroutines int) time.Duration {
        var wg sync.WaitGroup
        var ops int64

        // Create getter and setter functions
        var getFn func(string) (int, bool)
        var setFn func(string, int)
        switch mp := m.(type) {
        case *CoarseGrainedMap:
            getFn = mp.Get
            setFn = mp.Set
        case *FineGrainedMap:
            getFn = mp.Get
            setFn = mp.Set
        }

        // Start timing
        start := time.Now()

        // Launch goroutines
        for i := 0; i < goroutines; i++ {
            wg.Add(1)
            go func(id int) {
                defer wg.Done()
                // Seed for a simple goroutine-local pseudo-random generator
                seed := int64(id)

                // Perform operations
                for j := 0; j < 10000; j++ {
                    // Determine operation type
                    seed = (seed*1103515245 + 12345) & 0x7fffffff
                    isRead := int(seed%100) < readPct

                    // Select a key
                    seed = (seed*1103515245 + 12345) & 0x7fffffff
                    key := keys[seed%keyCount]

                    if isRead {
                        // Read operation
                        _, _ = getFn(key)
                    } else {
                        // Write operation
                        setFn(key, int(seed))
                    }
                    atomic.AddInt64(&ops, 1)
                }
            }(i)
        }

        // Wait for all goroutines to complete
        wg.Wait()

        elapsed := time.Since(start)
        fmt.Printf("%s: %v for %d operations (%.2f ops/ms)\n",
            name, elapsed, ops, float64(ops)/(float64(elapsed)/float64(time.Millisecond)))
        return elapsed
    }

    // Run benchmarks with different read/write ratios
    fmt.Println("Read-heavy workload (95% reads):")
    coarseTime := benchmark("Coarse-grained", coarse, 95, 8)
    fineTime := benchmark("Fine-grained", fine, 95, 8)
    fmt.Printf("Improvement: %.2fx\n", float64(coarseTime)/float64(fineTime))

    fmt.Println("\nBalanced workload (50% reads):")
    coarseTime = benchmark("Coarse-grained", coarse, 50, 8)
    fineTime = benchmark("Fine-grained", fine, 50, 8)
    fmt.Printf("Improvement: %.2fx\n", float64(coarseTime)/float64(fineTime))
}

func main() {
    benchmarkMaps()
}
Key insights about lock granularity:
- Coarse-grained Locking: Uses fewer locks but causes more contention.
- Fine-grained Locking: Uses more locks but reduces contention.
- Sharding: Divides data into independent partitions, each with its own lock (see the hash sketch after this list).
- Performance Impact: The optimal granularity depends on the access pattern and concurrency level.
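The hand-rolled multiplicative hash in getShard works, but the standard library's hash/fnv package is a sturdier choice for shard selection. Below is a minimal sketch of an alternative selector; the getShardFNV name is our own, and it assumes the FineGrainedMap and mapShard types from the example above, plus "hash/fnv" added to the import block:

// getShardFNV selects a shard using the standard library's FNV-1a hash.
// Hypothetical alternative to getShard above; requires "hash/fnv" in the imports.
func (m *FineGrainedMap) getShardFNV(key string) *mapShard {
    h := fnv.New32a()
    h.Write([]byte(key)) // a hash.Hash's Write never returns an error
    return m.shards[int(h.Sum32())&m.shardMask]
}

For read-mostly maps, the standard library's sync.Map is also worth benchmarking against a sharded design before committing to either.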
Lock Contention Profiling
Identifying lock contention is crucial for optimizing synchronization:
package main

import (
    "fmt"
    "sync"
    "time"
)
// MutexWithMetrics adds instrumentation to track lock contention
type MutexWithMetrics struct {
    mu            sync.Mutex
    name          string
    acquisitions  int64
    contentions   int64
    totalWaitTime time.Duration
    maxWaitTime   time.Duration
}

// NewMutexWithMetrics creates a new instrumented mutex
func NewMutexWithMetrics(name string) *MutexWithMetrics {
    return &MutexWithMetrics{
        name: name,
    }
}

// Lock acquires the mutex, tracking contention metrics. The metric
// fields are updated only while the mutex is held, so they need no
// separate synchronization.
func (m *MutexWithMetrics) Lock() {
    // Fast path: try to acquire the lock without blocking (Go 1.18+)
    if m.mu.TryLock() {
        // Acquired without contention
        m.acquisitions++
        return
    }
    // Contention occurred: block, and measure how long we waited
    start := time.Now()
    m.mu.Lock()
    waitTime := time.Since(start)
    m.contentions++
    m.totalWaitTime += waitTime
    if waitTime > m.maxWaitTime {
        m.maxWaitTime = waitTime
    }
    m.acquisitions++
}

// Unlock releases the mutex
func (m *MutexWithMetrics) Unlock() {
    m.mu.Unlock()
}

// ReportMetrics prints contention statistics. Call it only after all
// goroutines using the mutex have finished.
func (m *MutexWithMetrics) ReportMetrics() {
    fmt.Printf("=== Mutex Metrics: %s ===\n", m.name)
    fmt.Printf("Total acquisitions: %d\n", m.acquisitions)
    fmt.Printf("Contentions: %d (%.2f%%)\n", m.contentions,
        float64(m.contentions)*100/float64(m.acquisitions))
    if m.contentions > 0 {
        fmt.Printf("Average wait time: %v\n", m.totalWaitTime/time.Duration(m.contentions))
        fmt.Printf("Maximum wait time: %v\n", m.maxWaitTime)
    }
}
func demonstrateMutexMetrics() {
    fmt.Println("\n=== Lock Contention Profiling ===")

    // Create instrumented mutexes
    highContentionMutex := NewMutexWithMetrics("High Contention")
    lowContentionMutex := NewMutexWithMetrics("Low Contention")

    // Function that simulates work with the mutex
    simulateWork := func(m *MutexWithMetrics, holdTime time.Duration, iterations int) {
        for i := 0; i < iterations; i++ {
            m.Lock()
            time.Sleep(holdTime)
            m.Unlock()
            // Pause between acquisitions
            time.Sleep(1 * time.Millisecond)
        }
    }

    // Launch goroutines with high contention
    var wg sync.WaitGroup
    for i := 0; i < 10; i++ {
        wg.Add(1)
        go func() {
            defer wg.Done()
            simulateWork(highContentionMutex, 5*time.Millisecond, 20)
        }()
    }

    // Launch goroutines with low contention
    for i := 0; i < 3; i++ {
        wg.Add(1)
        go func() {
            defer wg.Done()
            simulateWork(lowContentionMutex, 1*time.Millisecond, 20)
        }()
    }

    wg.Wait()

    // Report metrics
    highContentionMutex.ReportMetrics()
    lowContentionMutex.ReportMetrics()
}

func main() {
    demonstrateMutexMetrics()
}
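Instrumented wrappers like MutexWithMetrics are handy for targeting a specific lock, but Go's runtime also ships a built-in mutex contention profiler that covers the standard library mutexes throughout the program. A minimal sketch of enabling it (the port is an arbitrary choice):

package main

import (
    "net/http"
    _ "net/http/pprof" // registers the /debug/pprof/* handlers
    "runtime"
)

func main() {
    // Sample roughly 1 in 5 mutex contention events
    runtime.SetMutexProfileFraction(5)

    // The contention profile is now served at /debug/pprof/mutex;
    // inspect it with: go tool pprof http://localhost:6060/debug/pprof/mutex
    http.ListenAndServe("localhost:6060", nil)
}

The runtime's profiler samples where goroutines wait on contended locks, which makes it the natural first stop before building custom instrumentation.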
Lock-Free Techniques
For some data structures, lock-free approaches can provide significant performance benefits:
package main

import (
    "fmt"
    "runtime"
    "sync"
    "sync/atomic"
    "time"
)
// LockFreeQueue is a simple lock-free queue implementation
type LockFreeQueue struct {
    head  atomic.Pointer[node]
    tail  atomic.Pointer[node]
    count int64
}

type node struct {
    value interface{}
    next  atomic.Pointer[node]
}

// NewLockFreeQueue creates a new lock-free queue
func NewLockFreeQueue() *LockFreeQueue {
    q := &LockFreeQueue{}
    // Create sentinel node
    sentinel := &node{}
    q.head.Store(sentinel)
    q.tail.Store(sentinel)
    return q
}

// Enqueue adds an item to the queue
func (q *LockFreeQueue) Enqueue(value interface{}) {
    // Create new node
    newNode := &node{value: value}
    for {
        // Get current tail
        tail := q.tail.Load()
        next := tail.next.Load()
        // Check if tail is still valid
        if tail != q.tail.Load() {
            continue
        }
        // If tail.next is not nil, tail is falling behind
        if next != nil {
            // Try to advance tail
            q.tail.CompareAndSwap(tail, next)
            continue
        }
        // Try to link the new node
        if tail.next.CompareAndSwap(nil, newNode) {
            // Success, try to advance tail
            q.tail.CompareAndSwap(tail, newNode)
            atomic.AddInt64(&q.count, 1)
            return
        }
    }
}

// Dequeue removes and returns an item from the queue
func (q *LockFreeQueue) Dequeue() (interface{}, bool) {
    for {
        // Get current head and tail
        head := q.head.Load()
        tail := q.tail.Load()
        next := head.next.Load()
        // Check if head is still valid
        if head != q.head.Load() {
            continue
        }
        // If queue is empty
        if head == tail {
            if next == nil {
                return nil, false
            }
            // Tail is falling behind, try to advance it
            q.tail.CompareAndSwap(tail, next)
            continue
        }
        // Queue is not empty, try to dequeue
        if q.head.CompareAndSwap(head, next) {
            value := next.value
            atomic.AddInt64(&q.count, -1)
            return value, true
        }
    }
}

// Size returns the approximate size of the queue
func (q *LockFreeQueue) Size() int {
    return int(atomic.LoadInt64(&q.count))
}
// LockBasedQueue is a simple lock-based queue implementation
type LockBasedQueue struct {
    mu    sync.Mutex
    head  *node
    tail  *node
    count int
}

// NewLockBasedQueue creates a new lock-based queue
func NewLockBasedQueue() *LockBasedQueue {
    sentinel := &node{}
    return &LockBasedQueue{
        head: sentinel,
        tail: sentinel,
    }
}

// Enqueue adds an item to the queue
func (q *LockBasedQueue) Enqueue(value interface{}) {
    q.mu.Lock()
    defer q.mu.Unlock()
    newNode := &node{value: value}
    q.tail.next.Store(newNode)
    q.tail = newNode
    q.count++
}

// Dequeue removes and returns an item from the queue
func (q *LockBasedQueue) Dequeue() (interface{}, bool) {
    q.mu.Lock()
    defer q.mu.Unlock()
    if q.head == q.tail {
        return nil, false
    }
    next := q.head.next.Load()
    q.head = next
    q.count--
    return next.value, true
}

// Size returns the size of the queue
func (q *LockBasedQueue) Size() int {
    q.mu.Lock()
    defer q.mu.Unlock()
    return q.count
}
func benchmarkQueues() {
    fmt.Println("\n=== Lock-Free vs. Lock-Based Queue Benchmark ===")

    // Create queues
    lockFree := NewLockFreeQueue()
    lockBased := NewLockBasedQueue()

    // Benchmark function
    benchmark := func(name string, enqueue, dequeue func(int) bool, iterations, goroutines int) time.Duration {
        var wg sync.WaitGroup

        // Start timing
        start := time.Now()

        // Launch producer goroutines
        for i := 0; i < goroutines/2; i++ {
            wg.Add(1)
            go func(id int) {
                defer wg.Done()
                // Produce items
                for j := 0; j < iterations; j++ {
                    enqueue(id*iterations + j)
                }
            }(i)
        }

        // Launch consumer goroutines
        for i := 0; i < goroutines/2; i++ {
            wg.Add(1)
            go func() {
                defer wg.Done()
                // Consume items
                for j := 0; j < iterations; j++ {
                    for !dequeue(j) {
                        // Queue is momentarily empty; yield and retry
                        runtime.Gosched()
                    }
                }
            }()
        }

        // Wait for all goroutines to complete
        wg.Wait()

        elapsed := time.Since(start)
        fmt.Printf("%s: %v\n", name, elapsed)
        return elapsed
    }

    // Run benchmarks
    iterations := 10000
    goroutines := 8

    lockFreeTime := benchmark("Lock-Free Queue",
        func(i int) bool { lockFree.Enqueue(i); return true },
        func(i int) bool { _, ok := lockFree.Dequeue(); return ok },
        iterations, goroutines)

    lockBasedTime := benchmark("Lock-Based Queue",
        func(i int) bool { lockBased.Enqueue(i); return true },
        func(i int) bool { _, ok := lockBased.Dequeue(); return ok },
        iterations, goroutines)

    // Compare performance
    fmt.Printf("Performance ratio (lock-based / lock-free): %.2fx\n",
        float64(lockBasedTime)/float64(lockFreeTime))
}

func main() {
    benchmarkQueues()
}
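The queue above follows the classic Michael-Scott design. The same compare-and-swap loop generalizes to other structures; here is a minimal sketch of a lock-free (Treiber) stack built on the same atomic.Pointer pattern. The LockFreeStack and stackNode names are our own. Note that Go's garbage collector sidesteps the classic ABA hazard for this kind of code, because a node's memory cannot be reused while any goroutine still holds a reference to it.

package main

import (
    "fmt"
    "sync/atomic"
)

type stackNode struct {
    value int
    next  *stackNode
}

// LockFreeStack is a minimal Treiber stack: a single atomic pointer
// to the top node, updated with compare-and-swap.
type LockFreeStack struct {
    top atomic.Pointer[stackNode]
}

// Push adds a value, retrying until the CAS on top succeeds.
func (s *LockFreeStack) Push(v int) {
    n := &stackNode{value: v}
    for {
        old := s.top.Load()
        n.next = old
        if s.top.CompareAndSwap(old, n) {
            return
        }
    }
}

// Pop removes and returns the top value, or false if the stack is empty.
func (s *LockFreeStack) Pop() (int, bool) {
    for {
        old := s.top.Load()
        if old == nil {
            return 0, false
        }
        if s.top.CompareAndSwap(old, old.next) {
            return old.value, true
        }
    }
}

func main() {
    var s LockFreeStack
    s.Push(1)
    s.Push(2)
    if v, ok := s.Pop(); ok {
        fmt.Println(v) // prints 2
    }
}

As with the queue benchmark, measure before adopting lock-free code: under low contention a plain mutex is often just as fast and far easier to reason about.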