Error Handling and Recovery

When Context Errors Aren’t Really Errors

Here’s something that took me way too long to figure out: not all context “errors” are actually problems. When a user cancels a request, that’s not a system failure—that’s the system working correctly. When an operation times out because the user set an aggressive deadline, that might be expected behavior, not a bug.

The challenge is building systems that can distinguish between different types of context errors and respond appropriately to each one.

Understanding Context Error Types

Context errors come in different flavors, and each one tells you something different about what happened:

// ContextErrorAnalyzer turns the error returned by an operation into a short
// classification label. It remembers when it was created so that it can tell
// how long the operation had been running when the error surfaced.
type ContextErrorAnalyzer struct {
    // operation is the caller-supplied label; it is stored but not read by
    // any method in this block.
    operation string
    // startTime is captured at construction and used to measure elapsed time.
    startTime time.Time
}

// NewContextErrorAnalyzer returns an analyzer for the named operation whose
// clock starts at the moment of the call.
func NewContextErrorAnalyzer(operation string) *ContextErrorAnalyzer {
    analyzer := new(ContextErrorAnalyzer)
    analyzer.operation = operation
    analyzer.startTime = time.Now()
    return analyzer
}

// AnalyzeError classifies err and returns one of:
//
//   - "success":           err is nil
//   - "user_cancelled":    err is context.Canceled and the cancellation looks user-driven
//   - "system_cancelled":  err is context.Canceled otherwise
//   - "deadline_exceeded": err is context.DeadlineExceeded and ctx's own deadline has passed
//   - "timeout":           err is context.DeadlineExceeded but ctx has no deadline or it has not passed
//   - "other_error":       anything else
//
// Matching uses errors.Is, so wrapped context errors are recognised.
// Cancellation is tested before deadline expiry.
func (cea *ContextErrorAnalyzer) AnalyzeError(ctx context.Context, err error) string {
    if err == nil {
        return "success"
    }

    if errors.Is(err, context.Canceled) {
        if !cea.looksLikeUserCancellation(ctx) {
            return "system_cancelled"
        }
        return "user_cancelled"
    }

    if !errors.Is(err, context.DeadlineExceeded) {
        return "other_error"
    }

    // Both WithTimeout and WithDeadline set a deadline on the context, so the
    // only thing that can be observed here is whether ctx itself has expired.
    deadline, hasDeadline := ctx.Deadline()
    if hasDeadline && time.Now().After(deadline) {
        return "deadline_exceeded"
    }
    return "timeout"
}

// looksLikeUserCancellation reports whether a cancellation should be
// attributed to the user. It is a heuristic: any cancellation within 100ms of
// the analyzer's creation counts as user-initiated, and so does a context
// carrying the value "user" under the string key "cancellation_source".
func (cea *ContextErrorAnalyzer) looksLikeUserCancellation(ctx context.Context) bool {
    const quickCancelWindow = 100 * time.Millisecond

    if time.Since(cea.startTime) < quickCancelWindow {
        return true
    }

    marker, isString := ctx.Value("cancellation_source").(string)
    return isString && marker == "user"
}

// handleOperation shows how a caller can act on the analyzer's verdict: it
// runs fetchUserData, classifies whatever error comes back, and then either
// swallows it, wraps it with extra context, or passes it through unchanged.
func handleOperation(ctx context.Context) error {
    // The analyzer is created before the fetch starts so that its
    // elapsed-time measurement covers the whole call.
    analyzer := NewContextErrorAnalyzer("user_data_fetch")
    err := fetchUserData(ctx)

    switch analyzer.AnalyzeError(ctx, err) {
    case "deadline_exceeded":
        log.Error("Hard deadline exceeded - system may be overloaded")
        return fmt.Errorf("deadline exceeded: %w", err)
    case "timeout":
        log.Warn("Operation timed out - may need performance investigation")
        return fmt.Errorf("operation timeout: %w", err)
    case "user_cancelled":
        log.Info("User cancelled operation - no action needed")
        // A cancellation attributed to the user is reported as success.
        return nil
    }

    // Every other classification (including success, where err is nil)
    // hands the original value back untouched.
    return err
}

This analysis helps you respond appropriately instead of treating all context errors the same way.

Smart Retry Strategies

Not all context errors should trigger retries. Here’s how I build retry logic that understands context:

// ContextAwareRetry runs an operation several times with exponential backoff
// between tries, while respecting the caller's context: it never starts an
// attempt on a finished context, never retries a cancellation, and only
// retries a deadline error when the context still has time left.
type ContextAwareRetry struct {
    // maxAttempts is the total number of tries, including the first one.
    maxAttempts int
    // baseDelay is the wait before the first retry.
    baseDelay time.Duration
    // maxDelay caps every individual wait.
    maxDelay time.Duration
    // backoffFactor multiplies the wait after each failed attempt.
    backoffFactor float64
}

// NewContextAwareRetry returns a retrier configured for 3 attempts, a 100ms
// initial delay that doubles after every failure, and a 5s cap per delay.
func NewContextAwareRetry() *ContextAwareRetry {
    return &ContextAwareRetry{
        maxAttempts:   3,
        baseDelay:     100 * time.Millisecond,
        maxDelay:      5 * time.Second,
        backoffFactor: 2.0,
    }
}

// Execute calls operation until it succeeds, fails with a non-retryable
// error, the attempt budget is spent, or ctx is done.
//
// Return values:
//   - nil when an attempt succeeds;
//   - ctx.Err() when ctx is done before an attempt or during a backoff wait;
//   - the operation's error, unchanged, when it is not retryable;
//   - an error wrapping the last failure (errors.Is / errors.As still match
//     it) when every attempt failed with a retryable error;
//   - a configuration error when maxAttempts is not positive.
func (car *ContextAwareRetry) Execute(ctx context.Context, operation func(context.Context) error) error {
    if car.maxAttempts <= 0 {
        // Without this guard the loop body never runs and the exhaustion
        // message below would wrap a nil error.
        return fmt.Errorf("retry not attempted: maxAttempts is %d", car.maxAttempts)
    }

    var lastErr error
    for attempt := 0; attempt < car.maxAttempts; attempt++ {
        // Don't start work the caller has already given up on.
        if err := ctx.Err(); err != nil {
            return err
        }

        lastErr = operation(ctx)
        if lastErr == nil {
            return nil
        }

        // Non-retryable errors go back to the caller exactly as received,
        // regardless of how many attempts remain.
        if !car.isRetryable(ctx, lastErr, attempt) {
            return lastErr
        }

        // The final attempt has nothing to wait for; fall through to the
        // exhaustion error instead of sleeping.
        if attempt == car.maxAttempts-1 {
            break
        }

        if err := car.wait(ctx, car.calculateDelay(attempt)); err != nil {
            return err
        }
    }

    return fmt.Errorf("operation failed after %d attempts: %w", car.maxAttempts, lastErr)
}

// shouldRetry reports whether another attempt should follow the failed
// attempt number attempt (zero-based): there must be attempts left and the
// error must be retryable.
func (car *ContextAwareRetry) shouldRetry(ctx context.Context, err error, attempt int) bool {
    if attempt >= car.maxAttempts-1 {
        return false
    }
    return car.isRetryable(ctx, err, attempt)
}

// isRetryable classifies err without looking at the attempt budget.
// Cancellations are never retried. Deadline errors are retried only when ctx
// has no deadline, or has more than twice the upcoming backoff delay left.
// Every other error is retried.
func (car *ContextAwareRetry) isRetryable(ctx context.Context, err error, attempt int) bool {
    if errors.Is(err, context.Canceled) {
        return false
    }

    if errors.Is(err, context.DeadlineExceeded) {
        deadline, hasDeadline := ctx.Deadline()
        if !hasDeadline {
            return true
        }
        // Budget against the delay Execute will actually sleep for after
        // this attempt, which is calculateDelay(attempt).
        return time.Until(deadline) > 2*car.calculateDelay(attempt)
    }

    return true
}

// wait blocks for d or until ctx is done, whichever comes first, and returns
// ctx.Err() in the latter case. The timer is stopped on return so it does not
// stay pending after an early exit.
func (car *ContextAwareRetry) wait(ctx context.Context, d time.Duration) error {
    timer := time.NewTimer(d)
    defer timer.Stop()

    select {
    case <-timer.C:
        return nil
    case <-ctx.Done():
        return ctx.Err()
    }
}

// calculateDelay returns baseDelay * backoffFactor^attempt, capped at maxDelay.
func (car *ContextAwareRetry) calculateDelay(attempt int) time.Duration {
    scaled := float64(car.baseDelay) * math.Pow(car.backoffFactor, float64(attempt))

    // Clamp while still in floating point: converting a float64 that does not
    // fit in an int64 yields an implementation-dependent value, which could
    // slip past an integer comparison as a negative duration.
    if math.IsNaN(scaled) || scaled >= float64(car.maxDelay) {
        return car.maxDelay
    }
    return time.Duration(scaled)
}

This retry logic understands context constraints and won’t waste time on futile retry attempts.

Graceful Degradation

When operations fail due to context issues, sometimes you can provide partial functionality instead of complete failure:

// GracefulDegradation keeps a registry of fallback functions, keyed by
// operation name, that can stand in for a primary operation when it fails
// with a context error.
type GracefulDegradation struct {
    fallbacks map[string]FallbackFunc
}

// FallbackFunc produces a substitute result for an operation whose primary
// implementation failed.
type FallbackFunc func(ctx context.Context) (interface{}, error)

// NewGracefulDegradation returns a registry with no fallbacks registered.
func NewGracefulDegradation() *GracefulDegradation {
    registry := map[string]FallbackFunc{}
    return &GracefulDegradation{fallbacks: registry}
}

// RegisterFallback associates fallback with the given operation name,
// replacing any fallback previously stored under that name. The map is
// written without locking.
func (gd *GracefulDegradation) RegisterFallback(operation string, fallback FallbackFunc) {
    gd.fallbacks[operation] = fallback
}

// ExecuteWithFallback runs primary and returns its result on success. If
// primary fails with a context timeout or cancellation and a fallback is
// registered for operation, the fallback's result and error are returned
// instead. In every other failure case the primary error is returned with a
// nil result.
//
// The fallback receives a context derived from context.Background(): it is
// not affected by ctx's cancellation or deadline (and has no deadline of its
// own), and carries over only the user ID and request ID when ctx has them.
func (gd *GracefulDegradation) ExecuteWithFallback(ctx context.Context, operation string,
    primary func(context.Context) (interface{}, error)) (interface{}, error) {

    value, primaryErr := primary(ctx)
    if primaryErr == nil {
        return value, nil
    }
    if !gd.shouldUseFallback(primaryErr) {
        return nil, primaryErr
    }

    // Start from a context that is independent of the failed one, then copy
    // across the identifying values.
    detached := context.Background()
    if id, found := GetUserID(ctx); found {
        detached = WithUserID(detached, id)
    }
    if id, found := GetRequestID(ctx); found {
        detached = WithRequestID(detached, id)
    }

    fallback, registered := gd.fallbacks[operation]
    if !registered {
        return nil, primaryErr
    }

    log.Printf("Primary operation failed, trying fallback for %s", operation)
    return fallback(detached)
}

// shouldUseFallback reports whether err is (or wraps) a context deadline or
// cancellation error; only those trigger the fallback path.
func (gd *GracefulDegradation) shouldUseFallback(err error) bool {
    switch {
    case errors.Is(err, context.DeadlineExceeded), errors.Is(err, context.Canceled):
        return true
    default:
        return false
    }
}

// fetchUserProfile is an example of wiring a fallback: it asks the database
// for the user's profile through ExecuteWithFallback and registers a
// fallback that returns the cached profile.
//
// It returns the error from ExecuteWithFallback unchanged, and a descriptive
// error (rather than panicking) if the value that comes back is not a
// *UserProfile.
func fetchUserProfile(ctx context.Context, userID string) (*UserProfile, error) {
    gd := NewGracefulDegradation()

    // Register fallback that returns cached data
    gd.RegisterFallback("user_profile", func(ctx context.Context) (interface{}, error) {
        return getCachedUserProfile(userID), nil
    })

    result, err := gd.ExecuteWithFallback(ctx, "user_profile", func(ctx context.Context) (interface{}, error) {
        return fetchUserProfileFromDB(ctx, userID)
    })
    if err != nil {
        return nil, err
    }

    // The result travels as interface{}, so verify its dynamic type: a bare
    // assertion would panic on a nil or unexpectedly typed value.
    profile, ok := result.(*UserProfile)
    if !ok {
        return nil, fmt.Errorf("user_profile: unexpected result type %T", result)
    }
    return profile, nil
}

This degradation strategy provides partial functionality when full operations fail due to context constraints.

Context-Aware Circuit Breaker

Circuit breakers need to understand context errors to avoid tripping on user cancellations:

// ErrCircuitOpen is returned by ContextCircuitBreaker.Execute when the
// breaker is open and the call is rejected without running the operation.
var ErrCircuitOpen = errors.New("circuit breaker is open")

// ContextCircuitBreaker is a circuit breaker that ignores context
// cancellations when counting failures. It opens after threshold consecutive
// counted failures, rejects calls while open, and lets a probe call through
// once timeout has elapsed since the last counted failure.
type ContextCircuitBreaker struct {
    // state is the current position in the Closed/Open/HalfOpen cycle.
    state CircuitState
    // failures counts counted failures since the last success.
    failures int
    // successes counts every successful call.
    successes int
    // lastFailure is when the most recent counted failure was recorded.
    lastFailure time.Time
    // timeout is how long the breaker stays open before allowing a probe.
    timeout time.Duration
    // threshold is the failure count at which the breaker opens.
    threshold int
    // mu guards every field above.
    mu sync.RWMutex
}

// CircuitState identifies which phase the breaker is in.
type CircuitState int

const (
    // Closed lets every call through.
    Closed CircuitState = iota
    // Open rejects calls until the timeout has elapsed.
    Open
    // HalfOpen lets calls through to probe whether the dependency recovered.
    HalfOpen
)

// NewContextCircuitBreaker returns a closed breaker that opens after
// threshold counted failures and waits timeout before probing again.
func NewContextCircuitBreaker(threshold int, timeout time.Duration) *ContextCircuitBreaker {
    return &ContextCircuitBreaker{
        state:     Closed,
        threshold: threshold,
        timeout:   timeout,
    }
}

// Execute runs operation unless the breaker is open, in which case it
// returns ErrCircuitOpen without calling it. The operation's own error is
// returned unchanged; it counts toward opening the breaker unless it is a
// context cancellation.
func (ccb *ContextCircuitBreaker) Execute(ctx context.Context, operation func(context.Context) error) error {
    if !ccb.allow() {
        return ErrCircuitOpen
    }

    err := operation(ctx)
    if err == nil {
        ccb.recordSuccess()
        return nil
    }

    // Only count real failures, not user cancellations
    if ccb.isRealFailure(err) {
        ccb.recordFailure()
    }
    return err
}

// allow reports whether a call may proceed, moving an open breaker to
// HalfOpen once the timeout has elapsed. The state check, the lastFailure
// read and the transition all happen under one lock so they cannot race with
// recordFailure or with one another.
func (ccb *ContextCircuitBreaker) allow() bool {
    ccb.mu.Lock()
    defer ccb.mu.Unlock()

    if ccb.state != Open {
        return true
    }
    if time.Since(ccb.lastFailure) < ccb.timeout {
        return false
    }
    ccb.state = HalfOpen
    return true
}

// isRealFailure reports whether err should count against the breaker.
// Context cancellations do not; deadline errors and all other errors do.
func (ccb *ContextCircuitBreaker) isRealFailure(err error) bool {
    return !errors.Is(err, context.Canceled)
}

// recordFailure notes a counted failure and opens the breaker when the
// threshold is reached, or immediately when the failing call was a HalfOpen
// probe.
func (ccb *ContextCircuitBreaker) recordFailure() {
    ccb.mu.Lock()
    defer ccb.mu.Unlock()

    ccb.failures++
    ccb.lastFailure = time.Now()

    if ccb.state == HalfOpen || ccb.failures >= ccb.threshold {
        ccb.state = Open
    }
}

// recordSuccess notes a successful call. A success while Closed or HalfOpen
// clears the failure count, so only consecutive failures can reach the
// threshold; a HalfOpen success also closes the breaker. A success that
// arrives while the breaker is Open (a call that started before it opened)
// leaves the state and failure count alone.
func (ccb *ContextCircuitBreaker) recordSuccess() {
    ccb.mu.Lock()
    defer ccb.mu.Unlock()

    ccb.successes++
    switch ccb.state {
    case HalfOpen:
        ccb.state = Closed
        ccb.failures = 0
    case Closed:
        ccb.failures = 0
    }
}

// setState replaces the breaker state under the write lock.
func (ccb *ContextCircuitBreaker) setState(state CircuitState) {
    ccb.mu.Lock()
    defer ccb.mu.Unlock()
    ccb.state = state
}

This circuit breaker won’t trip just because users are cancelling requests—it focuses on actual system failures.

Error Aggregation Across Operations

When you’re running multiple operations, you need smart error aggregation that understands context:

// ContextErrorCollector accumulates errors from several operations and
// decides whether enough real failures have occurred to abort. It is guarded
// by a mutex, so its methods may be called from multiple goroutines.
type ContextErrorCollector struct {
    // errors holds every recorded failure in arrival order.
    errors []ContextError
    // threshold is the number of non-cancellation errors that triggers abort.
    threshold int
    // mu guards errors.
    mu sync.Mutex
}

// ContextError is one recorded failure together with its classification.
type ContextError struct {
    // Operation is the caller-supplied name of the operation that failed.
    Operation string
    // Error is the error as passed to AddError.
    Error error
    // ErrorType is "cancelled", "timeout" or "other".
    ErrorType string
    // Timestamp is when the error was recorded.
    Timestamp time.Time
}

// NewContextErrorCollector returns an empty collector that signals abort once
// threshold non-cancellation errors have been recorded.
func NewContextErrorCollector(threshold int) *ContextErrorCollector {
    return &ContextErrorCollector{threshold: threshold}
}

// AddError records err for the named operation, classifying it as
// "cancelled" (context.Canceled), "timeout" (context.DeadlineExceeded) or
// "other". Wrapped context errors are recognised. A nil err is ignored so
// that a successful result is never counted as a failure.
func (cec *ContextErrorCollector) AddError(operation string, err error) {
    if err == nil {
        return
    }

    kind := "other"
    switch {
    case errors.Is(err, context.Canceled):
        kind = "cancelled"
    case errors.Is(err, context.DeadlineExceeded):
        kind = "timeout"
    }

    cec.mu.Lock()
    defer cec.mu.Unlock()

    cec.errors = append(cec.errors, ContextError{
        Operation: operation,
        Error:     err,
        ErrorType: kind,
        Timestamp: time.Now(),
    })
}

// ShouldAbort reports whether the number of recorded errors that are not
// cancellations has reached the threshold.
func (cec *ContextErrorCollector) ShouldAbort() bool {
    cec.mu.Lock()
    defer cec.mu.Unlock()

    // Cheap exit: with fewer entries than the threshold there is no need to
    // inspect them.
    if len(cec.errors) < cec.threshold {
        return false
    }

    // Count only real failures, not user cancellations
    realFailures := 0
    for i := range cec.errors {
        if cec.errors[i].ErrorType != "cancelled" {
            realFailures++
        }
    }

    return realFailures >= cec.threshold
}

// GetSummary returns a one-line count of recorded errors per category, in the
// form "Errors: N cancelled, N timeout, N other".
func (cec *ContextErrorCollector) GetSummary() string {
    cec.mu.Lock()
    defer cec.mu.Unlock()

    counts := make(map[string]int, 3)
    for i := range cec.errors {
        counts[cec.errors[i].ErrorType]++
    }

    return fmt.Sprintf("Errors: %d cancelled, %d timeout, %d other",
        counts["cancelled"], counts["timeout"], counts["other"])
}

This collector helps you make intelligent decisions about when to abort complex operations based on the types of errors you’re seeing.

The key insight about context error handling is that context errors are communication, not just failures. They tell you about user intentions, system constraints, and operational conditions. When you handle them appropriately, you build systems that are both robust and user-friendly.

In our final part, we’ll cover production best practices that tie everything together—monitoring, performance optimization, and operational considerations for context-aware systems.