When Context Errors Aren’t Really Errors
Here’s something that took me way too long to figure out: not all context “errors” are actually problems. When a user cancels a request, that’s not a system failure—that’s the system working correctly. When an operation times out because the user set an aggressive deadline, that might be expected behavior, not a bug.
The challenge is building systems that can distinguish between different types of context errors and respond appropriately to each one.
Understanding Context Error Types
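Go’s context package reports just two sentinel errors—context.Canceled and context.DeadlineExceeded—but each one can mean different things depending on who triggered it and when. The analyzer below refines them into more specific categories: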
// ContextErrorAnalyzer classifies the error returned by a context-bound
// operation into a short category label.
type ContextErrorAnalyzer struct {
	operation string    // label supplied by the caller; stored only
	startTime time.Time // when the analyzer was created
}

// NewContextErrorAnalyzer returns an analyzer for the named operation whose
// clock starts at the moment of the call.
func NewContextErrorAnalyzer(operation string) *ContextErrorAnalyzer {
	analyzer := new(ContextErrorAnalyzer)
	analyzer.operation = operation
	analyzer.startTime = time.Now()
	return analyzer
}

// AnalyzeError maps err to one of the labels "success", "user_cancelled",
// "system_cancelled", "deadline_exceeded", "timeout" or "other_error".
//
// Cancellation is checked before deadline expiry, so an error that matches
// both is reported as a cancellation.
func (cea *ContextErrorAnalyzer) AnalyzeError(ctx context.Context, err error) string {
	if err == nil {
		return "success"
	}

	if errors.Is(err, context.Canceled) {
		if cea.looksLikeUserCancellation(ctx) {
			return "user_cancelled"
		}
		return "system_cancelled"
	}

	if !errors.Is(err, context.DeadlineExceeded) {
		return "other_error"
	}

	// "deadline_exceeded" means ctx's own deadline has already passed;
	// otherwise the expiry is reported as a plain "timeout".
	deadline, hasDeadline := ctx.Deadline()
	if hasDeadline && time.Now().After(deadline) {
		return "deadline_exceeded"
	}
	return "timeout"
}

// looksLikeUserCancellation is a heuristic: it reports true when less than
// 100ms have passed since the analyzer was created, or when ctx carries the
// value "user" under the key "cancellation_source".
func (cea *ContextErrorAnalyzer) looksLikeUserCancellation(ctx context.Context) bool {
	const quickCancelWindow = 100 * time.Millisecond

	// Quick cancellations are often user-initiated (they hit cancel).
	if time.Since(cea.startTime) < quickCancelWindow {
		return true
	}

	// Otherwise rely on an explicit marker placed in the context.
	return ctx.Value("cancellation_source") == "user"
}
// Usage in your error handling.
//
// handleOperation runs fetchUserData and translates the analyzer's label into
// a return value: a user cancellation is logged and reported as success,
// "timeout" and "deadline_exceeded" are logged and returned wrapped, and every
// other label returns the fetch error unchanged (nil when the fetch succeeded).
func handleOperation(ctx context.Context) error {
	analyzer := NewContextErrorAnalyzer("user_data_fetch")
	fetchErr := fetchUserData(ctx)

	classification := analyzer.AnalyzeError(ctx, fetchErr)
	if classification == "user_cancelled" {
		log.Info("User cancelled operation - no action needed")
		return nil // Treat as success
	}
	if classification == "timeout" {
		log.Warn("Operation timed out - may need performance investigation")
		return fmt.Errorf("operation timeout: %w", fetchErr)
	}
	if classification == "deadline_exceeded" {
		log.Error("Hard deadline exceeded - system may be overloaded")
		return fmt.Errorf("deadline exceeded: %w", fetchErr)
	}
	return fetchErr
}
This analysis helps you respond appropriately instead of treating all context errors the same way.
Smart Retry Strategies
Not all context errors should trigger retries. Here’s how I build retry logic that understands context:
// ContextAwareRetry retries an operation with exponential backoff while
// honoring the caller's context.
type ContextAwareRetry struct {
	maxAttempts   int           // total number of tries, including the first
	baseDelay     time.Duration // wait before the second try
	maxDelay      time.Duration // upper bound for any single wait
	backoffFactor float64       // multiplier applied to the wait per attempt
}

// NewContextAwareRetry returns a retrier configured for 3 attempts, a 100ms
// base delay doubling on each attempt, and a 5s cap per wait.
func NewContextAwareRetry() *ContextAwareRetry {
	return &ContextAwareRetry{
		maxAttempts:   3,
		baseDelay:     100 * time.Millisecond,
		maxDelay:      5 * time.Second,
		backoffFactor: 2.0,
	}
}

// Execute calls operation until it succeeds, shouldRetry declines another
// attempt, or ctx is done.
//
// It returns nil on success, ctx.Err() when ctx ends before an attempt or
// during a backoff wait, and otherwise the error of the last attempt
// (unwrapped). If maxAttempts is not positive, operation is never called and
// a descriptive error is returned.
func (car *ContextAwareRetry) Execute(ctx context.Context, operation func(context.Context) error) error {
	var lastErr error
	for attempt := 0; attempt < car.maxAttempts; attempt++ {
		// Check if we should even try.
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}

		lastErr = operation(ctx)
		if lastErr == nil {
			return nil // Success!
		}

		// shouldRetry also rejects the final attempt, so exhausting the
		// attempts returns lastErr from here.
		if !car.shouldRetry(ctx, lastErr, attempt) {
			return lastErr
		}

		// Wait with context awareness. An explicit timer (rather than
		// time.After) lets us release it as soon as ctx ends.
		timer := time.NewTimer(car.calculateDelay(attempt))
		select {
		case <-timer.C:
			// Continue to next attempt.
		case <-ctx.Done():
			timer.Stop()
			return ctx.Err()
		}
	}

	if lastErr == nil {
		// The loop body never ran (maxAttempts <= 0); there is no error to wrap.
		return fmt.Errorf("operation failed after %d attempts: no attempts were made", car.maxAttempts)
	}
	return fmt.Errorf("operation failed after %d attempts: %w", car.maxAttempts, lastErr)
}

// shouldRetry reports whether another attempt should follow the failed
// attempt with the given zero-based index.
func (car *ContextAwareRetry) shouldRetry(ctx context.Context, err error, attempt int) bool {
	// Don't retry if we're out of attempts.
	if attempt >= car.maxAttempts-1 {
		return false
	}

	// Never retry cancellations.
	if errors.Is(err, context.Canceled) {
		return false
	}

	// Retry timeouts, but only if ctx leaves enough time.
	if errors.Is(err, context.DeadlineExceeded) {
		if deadline, hasDeadline := ctx.Deadline(); hasDeadline {
			remaining := time.Until(deadline)
			nextDelay := car.calculateDelay(attempt + 1)
			return remaining > nextDelay*2 // Need at least 2x delay time remaining
		}
		return true
	}

	// Retry other errors.
	return true
}

// calculateDelay returns baseDelay * backoffFactor^attempt, capped at
// maxDelay.
//
// The cap is applied while the value is still a float64: converting an
// out-of-range float to time.Duration is implementation-defined and can yield
// a negative duration, which would slip past a cap applied afterwards.
func (car *ContextAwareRetry) calculateDelay(attempt int) time.Duration {
	scaled := float64(car.baseDelay) * math.Pow(car.backoffFactor, float64(attempt))
	if math.IsNaN(scaled) || scaled >= float64(car.maxDelay) {
		return car.maxDelay
	}
	return time.Duration(scaled)
}
This retry logic understands context constraints and won’t waste time on futile retry attempts.
Graceful Degradation
When operations fail due to context issues, sometimes you can provide partial functionality instead of complete failure:
// GracefulDegradation keeps a registry of fallback functions keyed by
// operation name.
type GracefulDegradation struct {
	fallbacks map[string]FallbackFunc
}

// FallbackFunc produces a substitute result for an operation.
type FallbackFunc func(ctx context.Context) (interface{}, error)

// NewGracefulDegradation returns a GracefulDegradation with an empty registry.
func NewGracefulDegradation() *GracefulDegradation {
	return &GracefulDegradation{
		fallbacks: make(map[string]FallbackFunc),
	}
}

// RegisterFallback associates fallback with operation, replacing any fallback
// previously registered under the same name.
func (gd *GracefulDegradation) RegisterFallback(operation string, fallback FallbackFunc) {
	// Allocate lazily so a zero-value GracefulDegradation is usable too;
	// writing to a nil map would panic.
	if gd.fallbacks == nil {
		gd.fallbacks = make(map[string]FallbackFunc)
	}
	gd.fallbacks[operation] = fallback
}
// ExecuteWithFallback runs primary and, when it fails with an error accepted
// by shouldUseFallback, runs the fallback registered for operation instead.
//
// The fallback receives a context derived from context.Background(), so it is
// not affected by the cancellation or deadline of ctx. The user ID and request
// ID are carried over when ctx provides them (via GetUserID/GetRequestID,
// which are defined outside this block).
//
// It returns primary's result on success, the fallback's result when a
// fallback ran, and (nil, primary's error) when no fallback applies or none is
// registered.
func (gd *GracefulDegradation) ExecuteWithFallback(ctx context.Context, operation string,
	primary func(context.Context) (interface{}, error)) (interface{}, error) {
	result, primaryErr := primary(ctx)
	switch {
	case primaryErr == nil:
		return result, nil
	case !gd.shouldUseFallback(primaryErr):
		return nil, primaryErr
	}

	// Build a detached context: copy the important values, not the cancellation.
	detached := context.Background()
	if userID, found := GetUserID(ctx); found {
		detached = WithUserID(detached, userID)
	}
	if requestID, found := GetRequestID(ctx); found {
		detached = WithRequestID(detached, requestID)
	}

	fallback, registered := gd.fallbacks[operation]
	if !registered {
		return nil, primaryErr
	}

	log.Printf("Primary operation failed, trying fallback for %s", operation)
	return fallback(detached)
}
// shouldUseFallback reports whether err is (or wraps) a context timeout or
// cancellation; no other error qualifies for a fallback.
func (gd *GracefulDegradation) shouldUseFallback(err error) bool {
	switch {
	case errors.Is(err, context.DeadlineExceeded), errors.Is(err, context.Canceled):
		return true
	default:
		return false
	}
}
// Example usage.
//
// fetchUserProfile loads a profile through fetchUserProfileFromDB and falls
// back to getCachedUserProfile when ExecuteWithFallback decides the fallback
// applies. It returns an error if the lookup fails or if the value produced is
// not a *UserProfile.
func fetchUserProfile(ctx context.Context, userID string) (*UserProfile, error) {
	gd := NewGracefulDegradation()

	// Register fallback that returns cached data.
	gd.RegisterFallback("user_profile", func(ctx context.Context) (interface{}, error) {
		return getCachedUserProfile(userID), nil
	})

	result, err := gd.ExecuteWithFallback(ctx, "user_profile", func(ctx context.Context) (interface{}, error) {
		return fetchUserProfileFromDB(ctx, userID)
	})
	if err != nil {
		return nil, err
	}

	// Checked assertion: result is an interface{} and an unchecked assertion
	// would panic on an untyped nil or on any other dynamic type.
	profile, ok := result.(*UserProfile)
	if !ok {
		return nil, fmt.Errorf("user profile lookup returned unexpected type %T", result)
	}
	return profile, nil
}
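This degradation strategy provides partial functionality when full operations fail due to context constraints. Note that the fallback runs on a fresh background context, so it inherits neither the caller’s cancellation nor its deadline; keep fallbacks fast (a cache read, for example) or give them a timeout of their own.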
Context-Aware Circuit Breaker
Circuit breakers need to understand context errors to avoid tripping on user cancellations:
// ContextCircuitBreaker is a circuit breaker that does not count context
// cancellations as failures. All mutable fields are guarded by mu.
type ContextCircuitBreaker struct {
	state       CircuitState
	failures    int           // failures since the last counted success
	successes   int           // total successful operations
	lastFailure time.Time     // time of the most recent counted failure
	timeout     time.Duration // how long the breaker stays Open before probing
	threshold   int           // failure count at which the breaker opens
	mu          sync.RWMutex
}

// CircuitState is the state of a ContextCircuitBreaker.
type CircuitState int

const (
	// Closed lets every operation run.
	Closed CircuitState = iota
	// Open rejects operations until the timeout has elapsed.
	Open
	// HalfOpen lets operations run as probes after the timeout.
	HalfOpen
)

// NewContextCircuitBreaker returns a Closed breaker that opens after threshold
// counted failures in a row and starts probing again once timeout has elapsed
// since the last failure.
func NewContextCircuitBreaker(threshold int, timeout time.Duration) *ContextCircuitBreaker {
	return &ContextCircuitBreaker{
		state:     Closed,
		threshold: threshold,
		timeout:   timeout,
	}
}

// Execute runs operation unless the breaker is Open and its timeout has not
// yet elapsed, in which case it returns an error without calling operation.
// Otherwise it returns operation's error unchanged and records the outcome.
func (ccb *ContextCircuitBreaker) Execute(ctx context.Context, operation func(context.Context) error) error {
	// Snapshot both fields under the lock; recordFailure writes lastFailure
	// concurrently, so reading it unlocked would be a data race.
	ccb.mu.RLock()
	state, lastFailure := ccb.state, ccb.lastFailure
	ccb.mu.RUnlock()

	if state == Open {
		if time.Since(lastFailure) < ccb.timeout {
			return fmt.Errorf("circuit breaker is open")
		}
		ccb.setState(HalfOpen)
	}

	err := operation(ctx)
	if err == nil {
		ccb.recordSuccess()
		return nil
	}

	// Only count real failures, not cancellations.
	if ccb.isRealFailure(err) {
		ccb.recordFailure()
	}
	return err
}

// isRealFailure reports whether err should count against the breaker. Every
// error counts, including context.DeadlineExceeded, except context.Canceled.
func (ccb *ContextCircuitBreaker) isRealFailure(err error) bool {
	return !errors.Is(err, context.Canceled)
}

// recordFailure counts a failure and opens the breaker when the threshold is
// reached or when the failure happened during a HalfOpen probe.
func (ccb *ContextCircuitBreaker) recordFailure() {
	ccb.mu.Lock()
	defer ccb.mu.Unlock()

	ccb.failures++
	ccb.lastFailure = time.Now()
	if ccb.state == HalfOpen || ccb.failures >= ccb.threshold {
		ccb.state = Open
	}
}

// recordSuccess counts a success. In the Closed state it ends the current
// failure streak, so isolated failures do not accumulate over the breaker's
// lifetime; in the HalfOpen state it also closes the breaker. A success that
// arrives while the breaker is Open leaves the failure count untouched.
func (ccb *ContextCircuitBreaker) recordSuccess() {
	ccb.mu.Lock()
	defer ccb.mu.Unlock()

	ccb.successes++
	switch ccb.state {
	case HalfOpen:
		ccb.state = Closed
		ccb.failures = 0
	case Closed:
		ccb.failures = 0
	}
}

// setState replaces the breaker state under the write lock.
func (ccb *ContextCircuitBreaker) setState(state CircuitState) {
	ccb.mu.Lock()
	defer ccb.mu.Unlock()
	ccb.state = state
}
This circuit breaker won’t trip just because users are cancelling requests—it focuses on actual system failures.
Error Aggregation Across Operations
When you’re running multiple operations, you need smart error aggregation that understands context:
// ContextErrorCollector accumulates errors from several operations and
// decides when enough non-cancellation failures have occurred to abort.
// Its methods lock mu around every access to the recorded errors.
type ContextErrorCollector struct {
	errors    []ContextError
	threshold int // number of non-cancellation errors that triggers an abort
	mu        sync.Mutex
}

// ContextError is one recorded failure.
type ContextError struct {
	Operation string    // name passed to AddError
	Error     error     // the recorded error
	ErrorType string    // "cancelled", "timeout" or "other"
	Timestamp time.Time // when the error was recorded
}

// NewContextErrorCollector returns an empty collector with the given abort
// threshold.
func NewContextErrorCollector(threshold int) *ContextErrorCollector {
	return &ContextErrorCollector{threshold: threshold}
}

// AddError records err for operation, classifying it as "cancelled"
// (context.Canceled), "timeout" (context.DeadlineExceeded) or "other".
// A nil err is ignored so that a successful result passed in unconditionally
// is not counted as a failure.
func (cec *ContextErrorCollector) AddError(operation string, err error) {
	if err == nil {
		return
	}

	errorType := "other"
	switch {
	case errors.Is(err, context.Canceled):
		errorType = "cancelled"
	case errors.Is(err, context.DeadlineExceeded):
		errorType = "timeout"
	}

	cec.mu.Lock()
	defer cec.mu.Unlock()
	cec.errors = append(cec.errors, ContextError{
		Operation: operation,
		Error:     err,
		ErrorType: errorType,
		Timestamp: time.Now(),
	})
}

// ShouldAbort reports whether the number of recorded errors that are not
// cancellations has reached the threshold.
func (cec *ContextErrorCollector) ShouldAbort() bool {
	cec.mu.Lock()
	defer cec.mu.Unlock()

	// Cheap exit: with fewer records than the threshold the answer is no.
	if len(cec.errors) < cec.threshold {
		return false
	}

	// Count only real failures, not cancellations.
	realFailures := 0
	for i := range cec.errors {
		if cec.errors[i].ErrorType != "cancelled" {
			realFailures++
		}
	}
	return realFailures >= cec.threshold
}

// GetSummary returns the per-category counts of the recorded errors as a
// single line of text.
func (cec *ContextErrorCollector) GetSummary() string {
	cec.mu.Lock()
	defer cec.mu.Unlock()

	counts := make(map[string]int, 3)
	for i := range cec.errors {
		counts[cec.errors[i].ErrorType]++
	}
	return fmt.Sprintf("Errors: %d cancelled, %d timeout, %d other",
		counts["cancelled"], counts["timeout"], counts["other"])
}
This collector helps you make intelligent decisions about when to abort complex operations based on the types of errors you’re seeing.
The key insight about context error handling is that context errors are communication, not just failures. They tell you about user intentions, system constraints, and operational conditions. When you handle them appropriately, you build systems that are both robust and user-friendly.
In our final part, we’ll cover production best practices that tie everything together—monitoring, performance optimization, and operational considerations for context-aware systems.