Implementing Health and Readiness Endpoints
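Orchestrators and load balancers typically probe two endpoints: a health endpoint that reports overall system health and a readiness endpoint that says whether the service should receive traffic. The example below aggregates per-component status into both endpoints and flips them to an unavailable state once shutdown begins.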
package main
import (
"context"
"encoding/json"
"fmt"
"log"
"net/http"
"os"
"os/signal"
"sync"
"syscall"
"time"
)
// HealthStatus represents the health state of a component
type HealthStatus string
const (
StatusHealthy HealthStatus = "healthy"
StatusDegraded HealthStatus = "degraded"
StatusUnhealthy HealthStatus = "unhealthy"
StatusShutdown HealthStatus = "shutdown"
)
// HealthCheck represents a component that can report its health
type HealthCheck interface {
Name() string
Check() HealthStatus
}
// Component represents a service component with health reporting
type Component struct {
name string
status HealthStatus
mu sync.RWMutex
}
func NewComponent(name string) *Component {
return &Component{
name: name,
status: StatusHealthy,
}
}
func (c *Component) Name() string {
return c.name
}
func (c *Component) Check() HealthStatus {
c.mu.RLock()
defer c.mu.RUnlock()
return c.status
}
func (c *Component) SetStatus(status HealthStatus) {
c.mu.Lock()
defer c.mu.Unlock()
c.status = status
}
// HealthServer provides health and readiness endpoints
type HealthServer struct {
components []HealthCheck
server *http.Server
isShutdown bool
mu sync.RWMutex
}
func NewHealthServer(addr string) *HealthServer {
return &HealthServer{
components: []HealthCheck{},
server: &http.Server{
Addr: addr,
},
}
}
// AddComponent adds a component to health monitoring. It is not synchronized,
// so register all components before calling Start.
func (hs *HealthServer) AddComponent(component HealthCheck) {
hs.components = append(hs.components, component)
}
// Start begins serving health and readiness endpoints
func (hs *HealthServer) Start() error {
mux := http.NewServeMux()
// Health endpoint returns overall system health
mux.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
hs.mu.RLock()
if hs.isShutdown {
hs.mu.RUnlock()
w.WriteHeader(http.StatusServiceUnavailable)
json.NewEncoder(w).Encode(map[string]string{
"status": string(StatusShutdown),
})
return
}
hs.mu.RUnlock()
overallStatus := StatusHealthy
componentStatuses := make(map[string]string)
for _, component := range hs.components {
status := component.Check()
componentStatuses[component.Name()] = string(status)
if status == StatusUnhealthy {
overallStatus = StatusUnhealthy
} else if status == StatusDegraded && overallStatus != StatusUnhealthy {
overallStatus = StatusDegraded
}
}
response := map[string]interface{}{
"status": string(overallStatus),
"components": componentStatuses,
"timestamp": time.Now().Format(time.RFC3339),
}
if overallStatus != StatusHealthy {
w.WriteHeader(http.StatusServiceUnavailable)
}
json.NewEncoder(w).Encode(response)
})
// Readiness endpoint indicates if the service is ready to receive traffic
mux.HandleFunc("/ready", func(w http.ResponseWriter, r *http.Request) {
hs.mu.RLock()
isShutdown := hs.isShutdown
hs.mu.RUnlock()
if isShutdown {
w.WriteHeader(http.StatusServiceUnavailable)
json.NewEncoder(w).Encode(map[string]string{
"status": "not ready - shutting down",
})
return
}
w.WriteHeader(http.StatusOK)
json.NewEncoder(w).Encode(map[string]string{
"status": "ready",
})
})
hs.server.Handler = mux
go func() {
log.Printf("Health server listening on %s", hs.server.Addr)
if err := hs.server.ListenAndServe(); err != http.ErrServerClosed {
log.Printf("Health server error: %v", err)
}
}()
return nil
}
// BeginShutdown marks the service as shutting down
func (hs *HealthServer) BeginShutdown() {
hs.mu.Lock()
defer hs.mu.Unlock()
hs.isShutdown = true
log.Println("Health server marked as shutting down")
}
// Shutdown stops the health server
func (hs *HealthServer) Shutdown(ctx context.Context) error {
log.Println("Shutting down health server...")
return hs.server.Shutdown(ctx)
}
func main() {
// Create components
dbComponent := NewComponent("database")
apiComponent := NewComponent("api")
// Create health server
healthServer := NewHealthServer(":8081")
healthServer.AddComponent(dbComponent)
healthServer.AddComponent(apiComponent)
// Create API server
apiServer := &http.Server{
Addr: ":8080",
Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
time.Sleep(2 * time.Second) // Simulate work
fmt.Fprintf(w, "API response")
}),
}
// Start servers
if err := healthServer.Start(); err != nil {
log.Fatalf("Failed to start health server: %v", err)
}
go func() {
log.Printf("API server listening on %s", apiServer.Addr)
if err := apiServer.ListenAndServe(); err != http.ErrServerClosed {
log.Printf("API server error: %v", err)
}
}()
// Channel to listen for interrupt signals
shutdown := make(chan os.Signal, 1)
signal.Notify(shutdown, os.Interrupt, syscall.SIGTERM)
// Block until we receive a signal
sig := <-shutdown
log.Printf("Received signal: %v", sig)
// Mark as shutting down in health checks
healthServer.BeginShutdown()
// Simulate degraded status during shutdown
dbComponent.SetStatus(StatusDegraded)
// Create a deadline for graceful shutdown
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
// Shutdown API server first
log.Println("Shutting down API server...")
if err := apiServer.Shutdown(ctx); err != nil {
log.Printf("API server shutdown error: %v", err)
}
// Update component status
apiComponent.SetStatus(StatusShutdown)
// Shutdown health server last
if err := healthServer.Shutdown(ctx); err != nil {
log.Printf("Health server shutdown error: %v", err)
}
log.Println("Application shutdown complete")
}
This pattern demonstrates:
- Implementing health and readiness endpoints
- Tracking component health status
- Updating health status during shutdown
- Using health checks to coordinate with orchestration systems
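The Component type above has its status set manually; in practice a check usually probes a real dependency. Here is a minimal sketch of a HealthCheck backed by a database connection pool, assuming the HealthStatus and HealthCheck definitions from the example above and a *sql.DB (db) that your application opened elsewhere:

	// DBHealthCheck reports health by pinging a database connection pool.
	// Requires "context", "database/sql", and "time" in the import block.
	type DBHealthCheck struct {
		name string
		db   *sql.DB
	}

	func (d *DBHealthCheck) Name() string { return d.name }

	func (d *DBHealthCheck) Check() HealthStatus {
		// Bound the probe so a hung database cannot stall the /health handler.
		ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
		defer cancel()
		if err := d.db.PingContext(ctx); err != nil {
			return StatusUnhealthy
		}
		return StatusHealthy
	}

It would then be registered with healthServer.AddComponent(&DBHealthCheck{name: "database", db: db}) in place of the manually controlled component.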
Production Deployment Strategies
Graceful shutdown matters most in production, where orchestration systems like Kubernetes routinely stop, reschedule, and scale your services.
Connection Draining for Zero-Downtime Deployments
Before the process exits, you often need to ensure that in-flight requests have completed:
package main
import (
"context"
"fmt"
"log"
"net/http"
"os"
"os/signal"
"sync"
"syscall"
"time"
)
// ConnectionTracker keeps track of active connections
type ConnectionTracker struct {
activeConnections int
draining bool
mu sync.Mutex
drainComplete chan struct{}
}
func NewConnectionTracker() *ConnectionTracker {
return &ConnectionTracker{
drainComplete: make(chan struct{}),
}
}
// ConnectionStarted increments the active connection counter
func (ct *ConnectionTracker) ConnectionStarted() {
ct.mu.Lock()
defer ct.mu.Unlock()
ct.activeConnections++
log.Printf("Connection started. Active connections: %d", ct.activeConnections)
}
// ConnectionFinished decrements the active connection counter
func (ct *ConnectionTracker) ConnectionFinished() {
ct.mu.Lock()
defer ct.mu.Unlock()
ct.activeConnections--
log.Printf("Connection finished. Active connections: %d", ct.activeConnections)
// If we're draining and this was the last connection, signal completion
if ct.draining && ct.activeConnections == 0 {
select {
case <-ct.drainComplete:
// Channel already closed
default:
close(ct.drainComplete)
}
}
}
// WaitForDrain waits for all connections to finish
func (ct *ConnectionTracker) WaitForDrain(ctx context.Context) error {
ct.mu.Lock()
ct.draining = true
if ct.activeConnections == 0 {
ct.mu.Unlock()
return nil
}
ct.mu.Unlock()
select {
case <-ct.drainComplete:
return nil
case <-ctx.Done():
return ctx.Err()
}
}
// ConnectionDrainingHandler wraps an HTTP handler to track connections
func ConnectionDrainingHandler(tracker *ConnectionTracker, next http.Handler) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
tracker.ConnectionStarted()
defer tracker.ConnectionFinished()
next.ServeHTTP(w, r)
})
}
func main() {
// Create connection tracker
tracker := NewConnectionTracker()
// Create server with connection tracking
server := &http.Server{
Addr: ":8080",
Handler: ConnectionDrainingHandler(tracker, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
// Simulate a long-running request of 2-4 seconds, derived from the
// current wall-clock second as a cheap pseudo-random source
duration := time.Duration(2+time.Now().Second()%3) * time.Second
log.Printf("Handling request, will take %v", duration)
time.Sleep(duration)
fmt.Fprintf(w, "Request processed after %v", duration)
})),
}
// Start server
go func() {
log.Printf("Server listening on %s", server.Addr)
if err := server.ListenAndServe(); err != http.ErrServerClosed {
log.Printf("Server error: %v", err)
}
}()
// Channel to listen for interrupt signals
shutdown := make(chan os.Signal, 1)
signal.Notify(shutdown, os.Interrupt, syscall.SIGTERM)
// Block until we receive a signal
sig := <-shutdown
log.Printf("Received signal: %v", sig)
// Create a deadline for graceful shutdown
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
// Step 1: Stop accepting new connections
log.Println("Shutting down server - no longer accepting new connections")
if err := server.Shutdown(ctx); err != nil {
log.Printf("Server shutdown error: %v", err)
}
// Step 2: Wait for existing connections to drain
log.Println("Waiting for active connections to complete...")
if err := tracker.WaitForDrain(ctx); err != nil {
log.Printf("Connection draining error: %v", err)
} else {
log.Println("All connections drained successfully")
}
log.Println("Server shutdown complete")
}
This pattern demonstrates:
- Tracking active connections
- Gracefully rejecting new connections while allowing existing ones to complete
- Waiting for all in-flight requests to finish before final shutdown
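Note that http.Server.Shutdown already waits for active requests to finish before returning, so the tracker above is mainly useful for logging and for draining work that Shutdown does not know about. If connection-level rather than request-level visibility is enough, net/http also exposes a ConnState hook on the server; a minimal sketch, assuming all you need is a counter for logs or metrics:

	package main

	import (
		"log"
		"net"
		"net/http"
		"sync/atomic"
	)

	func main() {
		var open int64 // connections accepted but not yet closed

		server := &http.Server{
			Addr: ":8080",
			// ConnState is invoked by net/http whenever a connection changes
			// state, so no handler wrapping is required.
			ConnState: func(c net.Conn, state http.ConnState) {
				switch state {
				case http.StateNew:
					log.Printf("open connections: %d", atomic.AddInt64(&open, 1))
				case http.StateHijacked, http.StateClosed:
					log.Printf("open connections: %d", atomic.AddInt64(&open, -1))
				}
			},
		}
		log.Fatal(server.ListenAndServe())
	}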
Kubernetes-Ready Graceful Shutdown
When running in Kubernetes, you need to handle termination signals and coordinate with the container lifecycle:
package main
import (
"context"
"fmt"
"log"
"net/http"
"os"
"os/signal"
"sync"
"syscall"
"time"
)
// ShutdownManager coordinates the shutdown process
type ShutdownManager struct {
shutdownTimeout time.Duration
preStopTimeout time.Duration
server *http.Server
readyToShutdown bool
mu sync.RWMutex
}
func NewShutdownManager(server *http.Server, shutdownTimeout, preStopTimeout time.Duration) *ShutdownManager {
return &ShutdownManager{
server: server,
shutdownTimeout: shutdownTimeout,
preStopTimeout: preStopTimeout,
}
}
// StartPreStop marks the service as no longer ready and waits for the preStop hook duration
func (sm *ShutdownManager) StartPreStop() {
sm.mu.Lock()
sm.readyToShutdown = true
sm.mu.Unlock()
log.Printf("PreStop hook received, waiting %v before starting shutdown", sm.preStopTimeout)
time.Sleep(sm.preStopTimeout)
}
// IsReady returns whether the service is ready to receive traffic
func (sm *ShutdownManager) IsReady() bool {
sm.mu.RLock()
defer sm.mu.RUnlock()
return !sm.readyToShutdown
}
// Shutdown performs the actual server shutdown
func (sm *ShutdownManager) Shutdown() error {
log.Println("Starting graceful shutdown...")
ctx, cancel := context.WithTimeout(context.Background(), sm.shutdownTimeout)
defer cancel()
return sm.server.Shutdown(ctx)
}
func main() {
// Create server
server := &http.Server{
Addr: ":8080",
Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
time.Sleep(2 * time.Second) // Simulate work
fmt.Fprintf(w, "Hello, World!")
}),
}
// Create shutdown manager
shutdownManager := NewShutdownManager(
server,
30*time.Second, // Shutdown timeout
5*time.Second, // PreStop hook duration
)
// Create health server
healthServer := &http.Server{
Addr: ":8081",
Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
// Kubernetes readiness probe
if r.URL.Path == "/ready" {
if shutdownManager.IsReady() {
w.WriteHeader(http.StatusOK)
fmt.Fprintln(w, "Ready")
} else {
// Return not ready during shutdown
w.WriteHeader(http.StatusServiceUnavailable)
fmt.Fprintln(w, "Not Ready - Shutting Down")
}
return
}
// Kubernetes liveness probe
if r.URL.Path == "/health" {
w.WriteHeader(http.StatusOK)
fmt.Fprintln(w, "Healthy")
return
}
w.WriteHeader(http.StatusNotFound)
}),
}
// Start servers
go func() {
log.Printf("Main server listening on %s", server.Addr)
if err := server.ListenAndServe(); err != http.ErrServerClosed {
log.Printf("Main server error: %v", err)
}
}()
go func() {
log.Printf("Health server listening on %s", healthServer.Addr)
if err := healthServer.ListenAndServe(); err != http.ErrServerClosed {
log.Printf("Health server error: %v", err)
}
}()
// Channel to listen for interrupt signals
shutdown := make(chan os.Signal, 1)
signal.Notify(shutdown, syscall.SIGINT, syscall.SIGTERM)
// Block until we receive a signal
sig := <-shutdown
log.Printf("Received signal: %v", sig)
// Start the pre-stop process
// This simulates the Kubernetes preStop hook
shutdownManager.StartPreStop()
// Shutdown the main server
if err := shutdownManager.Shutdown(); err != nil {
log.Printf("Main server shutdown error: %v", err)
}
// Shutdown the health server last
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
if err := healthServer.Shutdown(ctx); err != nil {
log.Printf("Health server shutdown error: %v", err)
}
log.Println("Application shutdown complete")
}
This pattern demonstrates:
- Coordinating with Kubernetes lifecycle hooks
- Implementing readiness probes that reflect shutdown state
- Using a preStop hook delay to allow for load balancer reconfiguration
- Proper sequencing of shutdown steps
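In the pod spec, the preStop hook is usually configured as an exec or httpGet action, paired with a terminationGracePeriodSeconds large enough to cover the preStop delay plus the shutdown timeout (5s + 30s here, which exceeds Kubernetes' 30-second default, so the grace period would need to be raised to roughly 40 seconds). If the httpGet form is preferred, the health server could expose the hook directly; a hedged sketch building on the ShutdownManager above, with /prestop as a hypothetical path:

	// Added alongside the /ready and /health branches in the health server's handler.
	// Kubernetes waits for the preStop hook to return before sending SIGTERM, so it is
	// acceptable for this handler to flip readiness and then sleep for preStopTimeout.
	if r.URL.Path == "/prestop" {
		shutdownManager.StartPreStop()
		w.WriteHeader(http.StatusOK)
		fmt.Fprintln(w, "PreStop complete")
		return
	}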
Monitoring and Logging During Shutdown
Proper monitoring and logging during shutdown is essential for troubleshooting and ensuring clean termination.
Structured Shutdown Logging
package main
import (
"context"
"encoding/json"
"fmt"
"log"
"net/http"
"os"
"os/signal"
"sync"
"syscall"
"time"
)
// LogLevel represents the severity of a log message
type LogLevel string
const (
LogLevelInfo LogLevel = "INFO"
LogLevelWarning LogLevel = "WARNING"
LogLevelError LogLevel = "ERROR"
)
// StructuredLogger provides structured logging
type StructuredLogger struct {
mu sync.Mutex
}
// Log outputs a structured log message
func (l *StructuredLogger) Log(level LogLevel, message string, fields map[string]interface{}) {
l.mu.Lock()
defer l.mu.Unlock()
if fields == nil {
fields = make(map[string]interface{})
}
fields["timestamp"] = time.Now().Format(time.RFC3339)
fields["level"] = level
fields["message"] = message
jsonData, err := json.Marshal(fields)
if err != nil {
log.Printf("Error marshaling log: %v", err)
return
}
fmt.Println(string(jsonData))
}
// ShutdownMonitor tracks the shutdown process
type ShutdownMonitor struct {
logger *StructuredLogger
startTime time.Time
shutdownSteps map[string]ShutdownStepStatus
mu sync.Mutex
}
// ShutdownStepStatus represents the status of a shutdown step
type ShutdownStepStatus struct {
Status string
StartTime time.Time
EndTime time.Time
Duration time.Duration
Error error
}
func NewShutdownMonitor(logger *StructuredLogger) *ShutdownMonitor {
return &ShutdownMonitor{
logger: logger,
shutdownSteps: make(map[string]ShutdownStepStatus),
}
}
// StartShutdown begins the shutdown process
func (sm *ShutdownMonitor) StartShutdown() {
sm.mu.Lock()
defer sm.mu.Unlock()
sm.startTime = time.Now()
sm.logger.Log(LogLevelInfo, "Starting application shutdown", map[string]interface{}{
"shutdown_id": sm.startTime.UnixNano(),
})
}
// BeginStep marks the beginning of a shutdown step
func (sm *ShutdownMonitor) BeginStep(stepName string) {
sm.mu.Lock()
defer sm.mu.Unlock()
sm.shutdownSteps[stepName] = ShutdownStepStatus{
Status: "in_progress",
StartTime: time.Now(),
}
sm.logger.Log(LogLevelInfo, fmt.Sprintf("Beginning shutdown step: %s", stepName), map[string]interface{}{
"step": stepName,
"status": "in_progress",
"shutdown_id": sm.startTime.UnixNano(),
})
}
// EndStep marks the end of a shutdown step
func (sm *ShutdownMonitor) EndStep(stepName string, err error) {
sm.mu.Lock()
defer sm.mu.Unlock()
step, exists := sm.shutdownSteps[stepName]
if !exists {
sm.logger.Log(LogLevelWarning, fmt.Sprintf("Ending unknown shutdown step: %s", stepName), map[string]interface{}{
"step": stepName,
"shutdown_id": sm.startTime.UnixNano(),
})
return
}
step.EndTime = time.Now()
step.Duration = step.EndTime.Sub(step.StartTime)
if err != nil {
step.Status = "failed"
step.Error = err
sm.logger.Log(LogLevelError, fmt.Sprintf("Shutdown step failed: %s", stepName), map[string]interface{}{
"step": stepName,
"status": "failed",
"duration_ms": step.Duration.Milliseconds(),
"error": err.Error(),
"shutdown_id": sm.startTime.UnixNano(),
})
} else {
step.Status = "completed"
sm.logger.Log(LogLevelInfo, fmt.Sprintf("Shutdown step completed: %s", stepName), map[string]interface{}{
"step": stepName,
"status": "completed",
"duration_ms": step.Duration.Milliseconds(),
"shutdown_id": sm.startTime.UnixNano(),
})
}
sm.shutdownSteps[stepName] = step
}
// CompleteShutdown finalizes the shutdown process
func (sm *ShutdownMonitor) CompleteShutdown() {
sm.mu.Lock()
defer sm.mu.Unlock()
duration := time.Since(sm.startTime)
// Count successes and failures
successes := 0
failures := 0
for _, step := range sm.shutdownSteps {
if step.Status == "completed" {
successes++
} else if step.Status == "failed" {
failures++
}
}
sm.logger.Log(LogLevelInfo, "Application shutdown complete", map[string]interface{}{
"shutdown_id": sm.startTime.UnixNano(),
"duration_ms": duration.Milliseconds(),
"total_steps": len(sm.shutdownSteps),
"success_steps": successes,
"failed_steps": failures,
})
}
func main() {
// Create structured logger
logger := &StructuredLogger{}
// Create shutdown monitor
monitor := NewShutdownMonitor(logger)
// Create server
server := &http.Server{
Addr: ":8080",
Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
time.Sleep(1 * time.Second) // Simulate work
fmt.Fprintf(w, "Hello, World!")
}),
}
// Start server
go func() {
logger.Log(LogLevelInfo, "Starting server", map[string]interface{}{
"address": server.Addr,
})
if err := server.ListenAndServe(); err != http.ErrServerClosed {
logger.Log(LogLevelError, "Server error", map[string]interface{}{
"error": err.Error(),
})
}
}()
// Channel to listen for interrupt signals
shutdown := make(chan os.Signal, 1)
signal.Notify(shutdown, os.Interrupt, syscall.SIGTERM)
// Block until we receive a signal
sig := <-shutdown
logger.Log(LogLevelInfo, "Received termination signal", map[string]interface{}{
"signal": sig.String(),
})
// Start the shutdown process
monitor.StartShutdown()
// Step 1: Stop accepting new connections
monitor.BeginStep("server_shutdown")
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
defer cancel()
err := server.Shutdown(ctx)
monitor.EndStep("server_shutdown", err)
// Step 2: Close database connections (simulated)
monitor.BeginStep("database_shutdown")
time.Sleep(2 * time.Second) // Simulate DB shutdown
monitor.EndStep("database_shutdown", nil)
// Step 3: Flush metrics (simulated)
monitor.BeginStep("metrics_flush")
time.Sleep(1 * time.Second) // Simulate metrics flush
// Simulate an error
monitor.EndStep("metrics_flush", fmt.Errorf("failed to flush metrics: connection timeout"))
// Complete the shutdown process
monitor.CompleteShutdown()
}
This pattern demonstrates:
- Structured logging during shutdown
- Tracking individual shutdown steps
- Measuring shutdown duration
- Reporting success and failure metrics
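If you are on Go 1.21 or later, the standard library's log/slog package can replace a hand-rolled structured logger; a minimal sketch of the same per-step records, with field names carried over from the example above:

	package main

	import (
		"log/slog"
		"os"
		"time"
	)

	func main() {
		// A JSON handler emits one structured record per line, similar to the
		// StructuredLogger above but with levels and attributes built in.
		logger := slog.New(slog.NewJSONHandler(os.Stdout, nil))

		start := time.Now()
		logger.Info("Beginning shutdown step", "step", "server_shutdown", "status", "in_progress")
		// ... perform the step ...
		logger.Info("Shutdown step completed",
			"step", "server_shutdown",
			"status", "completed",
			"duration_ms", time.Since(start).Milliseconds(),
		)
	}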
The Bottom Line
Implementing robust graceful shutdown patterns is not just a best practice—it’s a critical requirement for production-grade Go applications. By properly handling termination signals, coordinating resource cleanup, and managing connection draining, you can ensure that your services terminate cleanly without disrupting users or compromising data integrity.
The patterns we’ve explored in this guide provide a comprehensive toolkit for implementing graceful shutdown in various contexts:
- Signal Handling: Capturing OS signals to trigger controlled shutdown
- Context-Based Cancellation: Propagating shutdown signals throughout your application
- HTTP Server Shutdown: Allowing in-flight requests to complete before termination
- Resource Cleanup: Properly closing database connections and other resources
- Worker Pool Management: Gracefully stopping worker pools and background tasks
- Service Coordination: Shutting down services in the correct order based on dependencies
- Health Checks: Integrating with orchestration systems through health and readiness endpoints
- Connection Draining: Ensuring zero-downtime deployments through proper connection handling
- Kubernetes Integration: Coordinating with container lifecycle hooks
- Monitoring and Logging: Tracking and troubleshooting the shutdown process
When implementing these patterns, remember these key principles:
- Timeout Everything: Always use timeouts to prevent indefinite blocking during shutdown
- Order Matters: Shut down services in the reverse order of their dependencies
- Be Defensive: Handle errors during shutdown gracefully
- Monitor and Log: Track the shutdown process for troubleshooting
- Test Thoroughly: Verify shutdown behavior under various conditions (see the test sketch after this list)
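For the last point, a hedged sketch of such a test using only the standard library: it starts a server on an ephemeral port, fires a slow request, and asserts that the request still completes while Shutdown is in progress (the handler delay and timeouts are illustrative):

	package main

	import (
		"context"
		"fmt"
		"net"
		"net/http"
		"testing"
		"time"
	)

	func TestShutdownWaitsForInFlightRequests(t *testing.T) {
		srv := &http.Server{
			Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
				time.Sleep(500 * time.Millisecond) // simulate slow work
				fmt.Fprint(w, "done")
			}),
		}
		ln, err := net.Listen("tcp", "127.0.0.1:0")
		if err != nil {
			t.Fatal(err)
		}
		go srv.Serve(ln)

		// Fire a request, then begin shutdown while it is still being handled.
		errCh := make(chan error, 1)
		go func() {
			resp, err := http.Get("http://" + ln.Addr().String())
			if err == nil {
				resp.Body.Close()
			}
			errCh <- err
		}()
		time.Sleep(100 * time.Millisecond) // let the request reach the handler

		ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
		defer cancel()
		if err := srv.Shutdown(ctx); err != nil {
			t.Fatalf("shutdown did not complete cleanly: %v", err)
		}
		if err := <-errCh; err != nil {
			t.Fatalf("in-flight request failed during shutdown: %v", err)
		}
	}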
By applying these patterns and principles, you can build Go applications that not only perform well during normal operation but also terminate gracefully when needed, ensuring reliability and data integrity even during deployments, scaling events, or unexpected failures.