Implementing Health and Readiness Endpoints
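Orchestrators and load balancers typically probe two endpoints: a health endpoint that reports overall system health and a readiness endpoint that says whether the service should receive traffic. The example below aggregates per-component status into both endpoints and flips them to an unavailable state once shutdown begins.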
package main
import (
"context"
"encoding/json"
"fmt"
"log"
"net/http"
"os"
"os/signal"
"sync"
"syscall"
"time"
)
// HealthStatus represents the health state of a component
type HealthStatus string
const (
StatusHealthy HealthStatus = "healthy"
StatusDegraded HealthStatus = "degraded"
StatusUnhealthy HealthStatus = "unhealthy"
StatusShutdown HealthStatus = "shutdown"
)
// HealthCheck represents a component that can report its health
type HealthCheck interface {
Name() string
Check() HealthStatus
}
// Component represents a service component with health reporting
type Component struct {
name string
status HealthStatus
mu sync.RWMutex
}
func NewComponent(name string) *Component {
return &Component{
name: name,
status: StatusHealthy,
}
}
func (c *Component) Name() string {
return c.name
}
func (c *Component) Check() HealthStatus {
c.mu.RLock()
defer c.mu.RUnlock()
return c.status
}
func (c *Component) SetStatus(status HealthStatus) {
c.mu.Lock()
defer c.mu.Unlock()
c.status = status
}
// HealthServer provides health and readiness endpoints
type HealthServer struct {
components []HealthCheck
server *http.Server
isShutdown bool
mu sync.RWMutex
}
func NewHealthServer(addr string) *HealthServer {
return &HealthServer{
components: []HealthCheck{},
server: &http.Server{
Addr: addr,
},
}
}
// AddComponent adds a component to health monitoring. It is not synchronized,
// so register all components before calling Start.
func (hs *HealthServer) AddComponent(component HealthCheck) {
hs.components = append(hs.components, component)
}
// Start begins serving health and readiness endpoints
func (hs *HealthServer) Start() error {
mux := http.NewServeMux()
// Health endpoint returns overall system health
mux.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
hs.mu.RLock()
if hs.isShutdown {
hs.mu.RUnlock()
w.WriteHeader(http.StatusServiceUnavailable)
json.NewEncoder(w).Encode(map[string]string{
"status": string(StatusShutdown),
})
return
}
hs.mu.RUnlock()
overallStatus := StatusHealthy
componentStatuses := make(map[string]string)
for _, component := range hs.components {
status := component.Check()
componentStatuses[component.Name()] = string(status)
if status == StatusUnhealthy {
overallStatus = StatusUnhealthy
} else if status == StatusDegraded && overallStatus != StatusUnhealthy {
overallStatus = StatusDegraded
}
}
response := map[string]interface{}{
"status": string(overallStatus),
"components": componentStatuses,
"timestamp": time.Now().Format(time.RFC3339),
}
if overallStatus != StatusHealthy {
w.WriteHeader(http.StatusServiceUnavailable)
}
json.NewEncoder(w).Encode(response)
})
// Readiness endpoint indicates if the service is ready to receive traffic
mux.HandleFunc("/ready", func(w http.ResponseWriter, r *http.Request) {
hs.mu.RLock()
isShutdown := hs.isShutdown
hs.mu.RUnlock()
if isShutdown {
w.WriteHeader(http.StatusServiceUnavailable)
json.NewEncoder(w).Encode(map[string]string{
"status": "not ready - shutting down",
})
return
}
w.WriteHeader(http.StatusOK)
json.NewEncoder(w).Encode(map[string]string{
"status": "ready",
})
})
hs.server.Handler = mux
go func() {
log.Printf("Health server listening on %s", hs.server.Addr)
if err := hs.server.ListenAndServe(); err != http.ErrServerClosed {
log.Printf("Health server error: %v", err)
}
}()
return nil
}
// BeginShutdown marks the service as shutting down
func (hs *HealthServer) BeginShutdown() {
hs.mu.Lock()
defer hs.mu.Unlock()
hs.isShutdown = true
log.Println("Health server marked as shutting down")
}
// Shutdown stops the health server
func (hs *HealthServer) Shutdown(ctx context.Context) error {
log.Println("Shutting down health server...")
return hs.server.Shutdown(ctx)
}
func main() {
// Create components
dbComponent := NewComponent("database")
apiComponent := NewComponent("api")
// Create health server
healthServer := NewHealthServer(":8081")
healthServer.AddComponent(dbComponent)
healthServer.AddComponent(apiComponent)
// Create API server
apiServer := &http.Server{
Addr: ":8080",
Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
time.Sleep(2 * time.Second) // Simulate work
fmt.Fprintf(w, "API response")
}),
}
// Start servers
if err := healthServer.Start(); err != nil {
log.Fatalf("Failed to start health server: %v", err)
}
go func() {
log.Printf("API server listening on %s", apiServer.Addr)
if err := apiServer.ListenAndServe(); err != http.ErrServerClosed {
log.Printf("API server error: %v", err)
}
}()
// Channel to listen for interrupt signals
shutdown := make(chan os.Signal, 1)
signal.Notify(shutdown, os.Interrupt, syscall.SIGTERM)
// Block until we receive a signal
sig := <-shutdown
log.Printf("Received signal: %v", sig)
// Mark as shutting down in health checks
healthServer.BeginShutdown()
// Simulate degraded status during shutdown
dbComponent.SetStatus(StatusDegraded)
// Create a deadline for graceful shutdown
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
// Shutdown API server first
log.Println("Shutting down API server...")
if err := apiServer.Shutdown(ctx); err != nil {
log.Printf("API server shutdown error: %v", err)
}
// Update component status
apiComponent.SetStatus(StatusShutdown)
// Shutdown health server last
if err := healthServer.Shutdown(ctx); err != nil {
log.Printf("Health server shutdown error: %v", err)
}
log.Println("Application shutdown complete")
}
This pattern demonstrates:
- Implementing health and readiness endpoints
- Tracking component health status
- Updating health status during shutdown
- Using health checks to coordinate with orchestration systems
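The Component type above has its status set manually; in practice a check usually probes a real dependency. Here is a minimal sketch of a HealthCheck backed by a database connection pool, assuming the HealthStatus and HealthCheck definitions from the example above and a *sql.DB (db) that your application opened elsewhere:

	// DBHealthCheck reports health by pinging a database connection pool.
	// Requires "context", "database/sql", and "time" in the import block.
	type DBHealthCheck struct {
		name string
		db   *sql.DB
	}

	func (d *DBHealthCheck) Name() string { return d.name }

	func (d *DBHealthCheck) Check() HealthStatus {
		// Bound the probe so a hung database cannot stall the /health handler.
		ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
		defer cancel()
		if err := d.db.PingContext(ctx); err != nil {
			return StatusUnhealthy
		}
		return StatusHealthy
	}

It would then be registered with healthServer.AddComponent(&DBHealthCheck{name: "database", db: db}) in place of the manually controlled component.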
Production Deployment Strategies
Graceful shutdown matters most in production, where orchestration systems like Kubernetes routinely stop, reschedule, and scale your services.
Connection Draining for Zero-Downtime Deployments
Before the process exits, you often need to ensure that in-flight requests have completed:
package main
import (
"context"
"fmt"
"log"
"net/http"
"os"
"os/signal"
"sync"
"syscall"
"time"
)
// ConnectionTracker keeps track of active connections
type ConnectionTracker struct {
activeConnections int
draining bool
mu sync.Mutex
drainComplete chan struct{}
}
func NewConnectionTracker() *ConnectionTracker {
return &ConnectionTracker{
drainComplete: make(chan struct{}),
}
}
// ConnectionStarted increments the active connection counter
func (ct *ConnectionTracker) ConnectionStarted() {
ct.mu.Lock()
defer ct.mu.Unlock()
ct.activeConnections++
log.Printf("Connection started. Active connections: %d", ct.activeConnections)
}
// ConnectionFinished decrements the active connection counter
func (ct *ConnectionTracker) ConnectionFinished() {
ct.mu.Lock()
defer ct.mu.Unlock()
ct.activeConnections--
log.Printf("Connection finished. Active connections: %d", ct.activeConnections)
// If we're draining and this was the last connection, signal completion
if ct.draining && ct.activeConnections == 0 {
select {
case <-ct.drainComplete:
// Channel already closed
default:
close(ct.drainComplete)
}
}
}
// WaitForDrain waits for all connections to finish
func (ct *ConnectionTracker) WaitForDrain(ctx context.Context) error {
ct.mu.Lock()
ct.draining = true
if ct.activeConnections == 0 {
ct.mu.Unlock()
return nil
}
ct.mu.Unlock()
select {
case <-ct.drainComplete:
return nil
case <-ctx.Done():
return ctx.Err()
}
}
// ConnectionDrainingHandler wraps an HTTP handler to track connections
func ConnectionDrainingHandler(tracker *ConnectionTracker, next http.Handler) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
tracker.ConnectionStarted()
defer tracker.ConnectionFinished()
next.ServeHTTP(w, r)
})
}
func main() {
// Create connection tracker
tracker := NewConnectionTracker()
// Create server with connection tracking
server := &http.Server{
Addr: ":8080",
Handler: ConnectionDrainingHandler(tracker, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
// Simulate a long-running request of 2-4 seconds, derived from the
// current wall-clock second as a cheap pseudo-random source
duration := time.Duration(2+time.Now().Second()%3) * time.Second
log.Printf("Handling request, will take %v", duration)
time.Sleep(duration)
fmt.Fprintf(w, "Request processed after %v", duration)
})),
}
// Start server
go func() {
log.Printf("Server listening on %s", server.Addr)
if err := server.ListenAndServe(); err != http.ErrServerClosed {
log.Printf("Server error: %v", err)
}
}()
// Channel to listen for interrupt signals
shutdown := make(chan os.Signal, 1)
signal.Notify(shutdown, os.Interrupt, syscall.SIGTERM)
// Block until we receive a signal
sig := <-shutdown
log.Printf("Received signal: %v", sig)
// Create a deadline for graceful shutdown
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
// Step 1: Stop accepting new connections
log.Println("Shutting down server - no longer accepting new connections")
if err := server.Shutdown(ctx); err != nil {
log.Printf("Server shutdown error: %v", err)
}
// Step 2: Wait for existing connections to drain
log.Println("Waiting for active connections to complete...")
if err := tracker.WaitForDrain(ctx); err != nil {
log.Printf("Connection draining error: %v", err)
} else {
log.Println("All connections drained successfully")
}
log.Println("Server shutdown complete")
}
This pattern demonstrates:
- Tracking active connections
- Gracefully rejecting new connections while allowing existing ones to complete
- Waiting for all in-flight requests to finish before final shutdown
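Note that http.Server.Shutdown already waits for active requests to finish before returning, so the tracker above is mainly useful for logging and for draining work that Shutdown does not know about. If connection-level rather than request-level visibility is enough, net/http also exposes a ConnState hook on the server; a minimal sketch, assuming all you need is a counter for logs or metrics:

	package main

	import (
		"log"
		"net"
		"net/http"
		"sync/atomic"
	)

	func main() {
		var open int64 // connections accepted but not yet closed

		server := &http.Server{
			Addr: ":8080",
			// ConnState is invoked by net/http whenever a connection changes
			// state, so no handler wrapping is required.
			ConnState: func(c net.Conn, state http.ConnState) {
				switch state {
				case http.StateNew:
					log.Printf("open connections: %d", atomic.AddInt64(&open, 1))
				case http.StateHijacked, http.StateClosed:
					log.Printf("open connections: %d", atomic.AddInt64(&open, -1))
				}
			},
		}
		log.Fatal(server.ListenAndServe())
	}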
Kubernetes-Ready Graceful Shutdown
When running in Kubernetes, you need to handle termination signals and coordinate with the container lifecycle:
package main
import (
"context"
"fmt"
"log"
"net/http"
"os"
"os/signal"
"sync"
"syscall"
"time"
)
// ShutdownManager coordinates the shutdown process
type ShutdownManager struct {
shutdownTimeout time.Duration
preStopTimeout time.Duration
server *http.Server
readyToShutdown bool
mu sync.RWMutex
}
func NewShutdownManager(server *http.Server, shutdownTimeout, preStopTimeout time.Duration) *ShutdownManager {
return &ShutdownManager{
server: server,
shutdownTimeout: shutdownTimeout,
preStopTimeout: preStopTimeout,
}
}
// StartPreStop marks the service as no longer ready and waits for the preStop hook duration
func (sm *ShutdownManager) StartPreStop() {
sm.mu.Lock()
sm.readyToShutdown = true
sm.mu.Unlock()
log.Printf("PreStop hook received, waiting %v before starting shutdown", sm.preStopTimeout)
time.Sleep(sm.preStopTimeout)
}
// IsReady returns whether the service is ready to receive traffic
func (sm *ShutdownManager) IsReady() bool {
sm.mu.RLock()
defer sm.mu.RUnlock()
return !sm.readyToShutdown
}
// Shutdown performs the actual server shutdown
func (sm *ShutdownManager) Shutdown() error {
log.Println("Starting graceful shutdown...")
ctx, cancel := context.WithTimeout(context.Background(), sm.shutdownTimeout)
defer cancel()
return sm.server.Shutdown(ctx)
}
func main() {
// Create server
server := &http.Server{
Addr: ":8080",
Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
time.Sleep(2 * time.Second) // Simulate work
fmt.Fprintf(w, "Hello, World!")
}),
}
// Create shutdown manager
shutdownManager := NewShutdownManager(
server,
30*time.Second, // Shutdown timeout
5*time.Second, // PreStop hook duration
)
// Create health server
healthServer := &http.Server{
Addr: ":8081",
Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
// Kubernetes readiness probe
if r.URL.Path == "/ready" {
if shutdownManager.IsReady() {
w.WriteHeader(http.StatusOK)
fmt.Fprintln(w, "Ready")
} else {
// Return not ready during shutdown
w.WriteHeader(http.StatusServiceUnavailable)
fmt.Fprintln(w, "Not Ready - Shutting Down")
}
return
}
// Kubernetes liveness probe
if r.URL.Path == "/health" {
w.WriteHeader(http.StatusOK)
fmt.Fprintln(w, "Healthy")
return
}
w.WriteHeader(http.StatusNotFound)
}),
}
// Start servers
go func() {
log.Printf("Main server listening on %s", server.Addr)
if err := server.ListenAndServe(); err != http.ErrServerClosed {
log.Printf("Main server error: %v", err)
}
}()
go func() {
log.Printf("Health server listening on %s", healthServer.Addr)
if err := healthServer.ListenAndServe(); err != http.ErrServerClosed {
log.Printf("Health server error: %v", err)
}
}()
// Channel to listen for interrupt signals
shutdown := make(chan os.Signal, 1)
signal.Notify(shutdown, syscall.SIGINT, syscall.SIGTERM)
// Block until we receive a signal
sig := <-shutdown
log.Printf("Received signal: %v", sig)
// Start the pre-stop process
// This simulates the Kubernetes preStop hook
shutdownManager.StartPreStop()
// Shutdown the main server
if err := shutdownManager.Shutdown(); err != nil {
log.Printf("Main server shutdown error: %v", err)
}
// Shutdown the health server last
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
if err := healthServer.Shutdown(ctx); err != nil {
log.Printf("Health server shutdown error: %v", err)
}
log.Println("Application shutdown complete")
}
This pattern demonstrates:
- Coordinating with Kubernetes lifecycle hooks
- Implementing readiness probes that reflect shutdown state
- Using a preStop hook delay to allow for load balancer reconfiguration
- Proper sequencing of shutdown steps
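In the pod spec, the preStop hook is usually configured as an exec or httpGet action, paired with a terminationGracePeriodSeconds large enough to cover the preStop delay plus the shutdown timeout (5s + 30s here, which exceeds Kubernetes' 30-second default, so the grace period would need to be raised to roughly 40 seconds). If the httpGet form is preferred, the health server could expose the hook directly; a hedged sketch building on the ShutdownManager above, with /prestop as a hypothetical path:

	// Added alongside the /ready and /health branches in the health server's handler.
	// Kubernetes waits for the preStop hook to return before sending SIGTERM, so it is
	// acceptable for this handler to flip readiness and then sleep for preStopTimeout.
	if r.URL.Path == "/prestop" {
		shutdownManager.StartPreStop()
		w.WriteHeader(http.StatusOK)
		fmt.Fprintln(w, "PreStop complete")
		return
	}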
Monitoring and Logging During Shutdown
Proper monitoring and logging during shutdown is essential for troubleshooting and ensuring clean termination.
Structured Shutdown Logging
package main
import (
"context"
"encoding/json"
"fmt"
"log"
"net/http"
"os"
"os/signal"
"sync"
"syscall"
"time"
)
// LogLevel represents the severity of a log message
type LogLevel string
const (
LogLevelInfo LogLevel = "INFO"
LogLevelWarning LogLevel = "WARNING"
LogLevelError LogLevel = "ERROR"
)
// StructuredLogger provides structured logging
type StructuredLogger struct {
mu sync.Mutex
}
// Log outputs a structured log message
func (l *StructuredLogger) Log(level LogLevel, message string, fields map[string]interface{}) {
l.mu.Lock()
defer l.mu.Unlock()
if fields == nil {
fields = make(map[string]interface{})
}
fields["timestamp"] = time.Now().Format(time.RFC3339)
fields["level"] = level
fields["message"] = message
jsonData, err := json.Marshal(fields)
if err != nil {
log.Printf("Error marshaling log: %v", err)
return
}
fmt.Println(string(jsonData))
}
// ShutdownMonitor tracks the shutdown process
type ShutdownMonitor struct {
logger *StructuredLogger
startTime time.Time
shutdownSteps map[string]ShutdownStepStatus
mu sync.Mutex
}
// ShutdownStepStatus represents the status of a shutdown step
type ShutdownStepStatus struct {
Status string
StartTime time.Time
EndTime time.Time
Duration time.Duration
Error error
}
func NewShutdownMonitor(logger *StructuredLogger) *ShutdownMonitor {
return &ShutdownMonitor{
logger: logger,
shutdownSteps: make(map[string]ShutdownStepStatus),
}
}
// StartShutdown begins the shutdown process
func (sm *ShutdownMonitor) StartShutdown() {
sm.mu.Lock()
defer sm.mu.Unlock()
sm.startTime = time.Now()
sm.logger.Log(LogLevelInfo, "Starting application shutdown", map[string]interface{}{
"shutdown_id": sm.startTime.UnixNano(),
})
}
// BeginStep marks the beginning of a shutdown step
func (sm *ShutdownMonitor) BeginStep(stepName string) {
sm.mu.Lock()
defer sm.mu.Unlock()
sm.shutdownSteps[stepName] = ShutdownStepStatus{
Status: "in_progress",
StartTime: time.Now(),
}
sm.logger.Log(LogLevelInfo, fmt.Sprintf("Beginning shutdown step: %s", stepName), map[string]interface{}{
"step": stepName,
"status": "in_progress",
"shutdown_id": sm.startTime.UnixNano(),
})
}
// EndStep marks the end of a shutdown step
func (sm *ShutdownMonitor) EndStep(stepName string, err error) {
sm.mu.Lock()
defer sm.mu.Unlock()
step, exists := sm.shutdownSteps[stepName]
if !exists {
sm.logger.Log(LogLevelWarning, fmt.Sprintf("Ending unknown shutdown step: %s", stepName), map[string]interface{}{
"step": stepName,
"shutdown_id": sm.startTime.UnixNano(),
})
return
}
step.EndTime = time.Now()
step.Duration = step.EndTime.Sub(step.StartTime)
if err != nil {
step.Status = "failed"
step.Error = err
sm.logger.Log(LogLevelError, fmt.Sprintf("Shutdown step failed: %s", stepName), map[string]interface{}{
"step": stepName,
"status": "failed",
"duration_ms": step.Duration.Milliseconds(),
"error": err.Error(),
"shutdown_id": sm.startTime.UnixNano(),
})
} else {
step.Status = "completed"
sm.logger.Log(LogLevelInfo, fmt.Sprintf("Shutdown step completed: %s", stepName), map[string]interface{}{
"step": stepName,
"status": "completed",
"duration_ms": step.Duration.Milliseconds(),
"shutdown_id": sm.startTime.UnixNano(),
})
}
sm.shutdownSteps[stepName] = step
}
// CompleteShutdown finalizes the shutdown process
func (sm *ShutdownMonitor) CompleteShutdown() {
sm.mu.Lock()
defer sm.mu.Unlock()
duration := time.Since(sm.startTime)
// Count successes and failures
successes := 0
failures := 0
for _, step := range sm.shutdownSteps {
if step.Status == "completed" {
successes++
} else if step.Status == "failed" {
failures++
}
}
sm.logger.Log(LogLevelInfo, "Application shutdown complete", map[string]interface{}{
"shutdown_id": sm.startTime.UnixNano(),
"duration_ms": duration.Milliseconds(),
"total_steps": len(sm.shutdownSteps),
"success_steps": successes,
"failed_steps": failures,
})
}
func main() {
// Create structured logger
logger := &StructuredLogger{}
// Create shutdown monitor
monitor := NewShutdownMonitor(logger)
// Create server
server := &http.Server{
Addr: ":8080",
Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
time.Sleep(1 * time.Second) // Simulate work
fmt.Fprintf(w, "Hello, World!")
}),
}
// Start server
go func() {
logger.Log(LogLevelInfo, "Starting server", map[string]interface{}{
"address": server.Addr,
})
if err := server.ListenAndServe(); err != http.ErrServerClosed {
logger.Log(LogLevelError, "Server error", map[string]interface{}{
"error": err.Error(),
})
}
}()
// Channel to listen for interrupt signals
shutdown := make(chan os.Signal, 1)
signal.Notify(shutdown, os.Interrupt, syscall.SIGTERM)
// Block until we receive a signal
sig := <-shutdown
logger.Log(LogLevelInfo, "Received termination signal", map[string]interface{}{
"signal": sig.String(),
})
// Start the shutdown process
monitor.StartShutdown()
// Step 1: Stop accepting new connections
monitor.BeginStep("server_shutdown")
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
defer cancel()
err := server.Shutdown(ctx)
monitor.EndStep("server_shutdown", err)
// Step 2: Close database connections (simulated)
monitor.BeginStep("database_shutdown")
time.Sleep(2 * time.Second) // Simulate DB shutdown
monitor.EndStep("database_shutdown", nil)
// Step 3: Flush metrics (simulated)
monitor.BeginStep("metrics_flush")
time.Sleep(1 * time.Second) // Simulate metrics flush
// Simulate an error
monitor.EndStep("metrics_flush", fmt.Errorf("failed to flush metrics: connection timeout"))
// Complete the shutdown process
monitor.CompleteShutdown()
}
This pattern demonstrates:
- Structured logging during shutdown
- Tracking individual shutdown steps
- Measuring shutdown duration
- Reporting success and failure metrics
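If you are on Go 1.21 or later, the standard library's log/slog package can replace a hand-rolled structured logger; a minimal sketch of the same per-step records, with field names carried over from the example above:

	package main

	import (
		"log/slog"
		"os"
		"time"
	)

	func main() {
		// A JSON handler emits one structured record per line, similar to the
		// StructuredLogger above but with levels and attributes built in.
		logger := slog.New(slog.NewJSONHandler(os.Stdout, nil))

		start := time.Now()
		logger.Info("Beginning shutdown step", "step", "server_shutdown", "status", "in_progress")
		// ... perform the step ...
		logger.Info("Shutdown step completed",
			"step", "server_shutdown",
			"status", "completed",
			"duration_ms", time.Since(start).Milliseconds(),
		)
	}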
The Bottom Line
Implementing robust graceful shutdown patterns is not just a best practice—it’s a critical requirement for production-grade Go applications. By properly handling termination signals, coordinating resource cleanup, and managing connection draining, you can ensure that your services terminate cleanly without disrupting users or compromising data integrity.
The patterns we’ve explored in this guide provide a comprehensive toolkit for implementing graceful shutdown in various contexts:
- Signal Handling: Capturing OS signals to trigger controlled shutdown
- Context-Based Cancellation: Propagating shutdown signals throughout your application
- HTTP Server Shutdown: Allowing in-flight requests to complete before termination
- Resource Cleanup: Properly closing database connections and other resources
- Worker Pool Management: Gracefully stopping worker pools and background tasks
- Service Coordination: Shutting down services in the correct order based on dependencies
- Health Checks: Integrating with orchestration systems through health and readiness endpoints
- Connection Draining: Ensuring zero-downtime deployments through proper connection handling
- Kubernetes Integration: Coordinating with container lifecycle hooks
- Monitoring and Logging: Tracking and troubleshooting the shutdown process
When implementing these patterns, remember these key principles:
- Timeout Everything: Always use timeouts to prevent indefinite blocking during shutdown
- Order Matters: Shut down services in the reverse order of their dependencies
- Be Defensive: Handle errors during shutdown gracefully
- Monitor and Log: Track the shutdown process for troubleshooting
- Test Thoroughly: Verify shutdown behavior under various conditions (see the test sketch after this list)
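For the last point, a hedged sketch of such a test using only the standard library: it starts a server on an ephemeral port, fires a slow request, and asserts that the request still completes while Shutdown is in progress (the handler delay and timeouts are illustrative):

	package main

	import (
		"context"
		"fmt"
		"net"
		"net/http"
		"testing"
		"time"
	)

	func TestShutdownWaitsForInFlightRequests(t *testing.T) {
		srv := &http.Server{
			Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
				time.Sleep(500 * time.Millisecond) // simulate slow work
				fmt.Fprint(w, "done")
			}),
		}
		ln, err := net.Listen("tcp", "127.0.0.1:0")
		if err != nil {
			t.Fatal(err)
		}
		go srv.Serve(ln)

		// Fire a request, then begin shutdown while it is still being handled.
		errCh := make(chan error, 1)
		go func() {
			resp, err := http.Get("http://" + ln.Addr().String())
			if err == nil {
				resp.Body.Close()
			}
			errCh <- err
		}()
		time.Sleep(100 * time.Millisecond) // let the request reach the handler

		ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
		defer cancel()
		if err := srv.Shutdown(ctx); err != nil {
			t.Fatalf("shutdown did not complete cleanly: %v", err)
		}
		if err := <-errCh; err != nil {
			t.Fatalf("in-flight request failed during shutdown: %v", err)
		}
	}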
By applying these patterns and principles, you can build Go applications that not only perform well during normal operation but also terminate gracefully when needed, ensuring reliability and data integrity even during deployments, scaling events, or unexpected failures.