Advanced Error Handling

gRPC provides a rich error model that can be leveraged for detailed error reporting:

package main

import (
	"context"
	"database/sql"
	"errors"
	"log"
	"net"

	"github.com/example/service/proto"
	"google.golang.org/grpc"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
	"google.golang.org/protobuf/types/known/timestamppb"
)

// Sentinel errors for the domain layer. They are plain error values (not
// distinct types), so callers match them with errors.Is, which also sees
// through wrapping. The handlers in this file translate them into gRPC
// status codes before returning to the client.
var (
	// ErrNotFound signals that the requested resource does not exist.
	ErrNotFound      = errors.New("resource not found")
	// ErrAlreadyExists signals a conflict with an existing resource.
	ErrAlreadyExists = errors.New("resource already exists")
	// ErrDatabase signals a storage-layer failure.
	ErrDatabase      = errors.New("database error")
	// ErrValidation signals that input failed validation.
	ErrValidation    = errors.New("validation error")
)

// userService is the gRPC UserService server implementation. Its handlers
// map domain errors onto gRPC status codes rather than returning raw errors.
type userService struct {
	// Embedding the generated Unimplemented server supplies default
	// implementations for any RPC this type does not define itself.
	proto.UnimplementedUserServiceServer
	db *sql.DB // Database handle (database/sql manages the underlying connection pool)
}

// GetUser looks up a single user by ID.
//
// Status codes returned to the client:
//   - InvalidArgument: the request carries an empty user ID.
//   - NotFound: the lookup reported ErrNotFound.
//   - Internal: the lookup reported ErrDatabase (details are logged, not sent).
//   - Unknown: any other lookup failure (details are logged, not sent).
func (s *userService) GetUser(ctx context.Context, req *proto.GetUserRequest) (*proto.User, error) {
	id := req.UserId
	if id == "" {
		return nil, status.Error(codes.InvalidArgument, "user ID cannot be empty")
	}

	found, lookupErr := s.findUserByID(ctx, id)
	if lookupErr == nil {
		return found, nil
	}

	// Translate the domain error into a gRPC status. Only the not-found case
	// echoes request data back; everything else is logged server-side and
	// replaced by a generic message.
	if errors.Is(lookupErr, ErrNotFound) {
		return nil, status.Errorf(codes.NotFound, "user not found: %s", id)
	}
	if errors.Is(lookupErr, ErrDatabase) {
		log.Printf("Database error: %v", lookupErr)
		return nil, status.Error(codes.Internal, "internal server error")
	}

	log.Printf("Unknown error: %v", lookupErr)
	return nil, status.Error(codes.Unknown, "unknown error")
}

// CreateUser validates the request, rejects duplicate e-mail addresses, and
// stores the new user.
//
// Status codes returned to the client:
//   - InvalidArgument: validation failed; a proto.ValidationError detail is
//     attached when it can be encoded.
//   - AlreadyExists: a user with the same e-mail exists, whether detected by
//     the up-front lookup or reported by the insert itself.
//   - Internal: any other lookup or insert failure (details are logged only).
func (s *userService) CreateUser(ctx context.Context, req *proto.CreateUserRequest) (*proto.User, error) {
	if err := validateCreateUserRequest(req); err != nil {
		st := status.New(codes.InvalidArgument, "validation failed")

		// NOTE(review): the attached detail is hard-coded to the e-mail field
		// no matter which check failed. Once validateCreateUserRequest exposes
		// the failing field and message, build the detail from err instead.
		detailed, detailErr := st.WithDetails(
			&proto.ValidationError{
				Field:   "email",
				Message: "invalid email format",
			},
		)
		if detailErr != nil {
			// Attaching details is best-effort; fall back to the plain status.
			return nil, st.Err()
		}
		return nil, detailed.Err()
	}

	// Fast-path duplicate check. A nil error means a user was found.
	_, err := s.findUserByEmail(ctx, req.Email)
	switch {
	case err == nil:
		return nil, status.Errorf(codes.AlreadyExists, "user with email %s already exists", req.Email)
	case !errors.Is(err, ErrNotFound):
		log.Printf("Database error: %v", err)
		return nil, status.Error(codes.Internal, "internal server error")
	}

	user, err := s.createUser(ctx, req)
	if err != nil {
		// The lookup above and this insert are not atomic: a concurrent
		// request can create the same e-mail in between. Presumably createUser
		// reports that as ErrAlreadyExists (confirm against its
		// implementation); surface it as a conflict, not an internal failure.
		if errors.Is(err, ErrAlreadyExists) {
			return nil, status.Errorf(codes.AlreadyExists, "user with email %s already exists", req.Email)
		}
		log.Printf("Failed to create user: %v", err)
		return nil, status.Error(codes.Internal, "failed to create user")
	}

	return user, nil
}

Client-side error handling:

package main

import (
	"context"
	"log"
	"time"

	"github.com/example/service/proto"
	"google.golang.org/grpc"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/credentials/insecure"
	"google.golang.org/grpc/status"
)

// main dials the local user service, requests a user that is not expected to
// exist, and logs the outcome. gRPC status errors are reported by code along
// with any attached validation details.
func main() {
	conn, err := grpc.Dial("localhost:50051",
		grpc.WithTransportCredentials(insecure.NewCredentials()))
	if err != nil {
		log.Fatalf("failed to connect: %v", err)
	}
	defer conn.Close()

	client := proto.NewUserServiceClient(conn)
	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()

	// Try to get a non-existent user.
	request := &proto.GetUserRequest{UserId: "nonexistent"}
	user, err := client.GetUser(ctx, request)
	if err == nil {
		log.Printf("Got user: %v", user)
		return
	}

	st, isStatus := status.FromError(err)
	if !isStatus {
		log.Printf("Non-gRPC error: %v", err)
		return
	}

	// Pick a human-readable label for the status code.
	label := "Unexpected error"
	switch st.Code() {
	case codes.NotFound:
		label = "User not found"
	case codes.InvalidArgument:
		label = "Invalid argument"
	case codes.Internal:
		label = "Internal error"
	}
	log.Printf("%s: %s", label, st.Message())

	// Report any structured validation details attached to the status.
	for _, detail := range st.Details() {
		if v, isValidation := detail.(*proto.ValidationError); isValidation {
			log.Printf("Validation error: field %s - %s", v.Field, v.Message)
		}
	}
}

Retry and Circuit Breaking

Implementing retry logic and circuit breaking for resilient clients:

package main

import (
	"context"
	"log"
	"math/rand"
	"sync"
	"time"

	"github.com/example/service/proto"
	"github.com/sony/gobreaker"
	"google.golang.org/grpc"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/credentials/insecure"
	"google.golang.org/grpc/status"
)

// ResilienceClient wraps a gRPC UserService client with retry and circuit
// breaking capabilities. Construct it with NewResilienceClient.
type ResilienceClient struct {
	client     proto.UserServiceClient   // underlying generated gRPC client
	cb         *gobreaker.CircuitBreaker // every call is executed through this breaker
	maxRetries int                       // number of retries after the first attempt
}

// NewResilienceClient builds a ResilienceClient on top of conn.
//
// maxRetries is the number of additional attempts made after the first one
// fails with a retriable error. The circuit breaker is named "user-service"
// and trips once at least 5 requests have been counted and at least 50% of
// them failed.
func NewResilienceClient(conn *grpc.ClientConn, maxRetries int) *ResilienceClient {
	const (
		minRequests      = 5   // sample size required before the breaker may trip
		failureThreshold = 0.5 // failure ratio at which the breaker trips
	)

	// shouldTrip decides whether the closed breaker should open.
	shouldTrip := func(counts gobreaker.Counts) bool {
		if counts.Requests < minRequests {
			return false
		}
		ratio := float64(counts.TotalFailures) / float64(counts.Requests)
		return ratio >= failureThreshold
	}

	// logTransition records every breaker state change.
	logTransition := func(name string, from gobreaker.State, to gobreaker.State) {
		log.Printf("Circuit breaker %s state changed from %s to %s", name, from, to)
	}

	userClient := proto.NewUserServiceClient(conn)
	breaker := gobreaker.NewCircuitBreaker(gobreaker.Settings{
		Name:          "user-service",
		MaxRequests:   5,                // requests allowed through while half-open
		Interval:      10 * time.Second, // cyclic period of the closed state
		Timeout:       30 * time.Second, // time spent open before moving to half-open
		ReadyToTrip:   shouldTrip,
		OnStateChange: logTransition,
	})

	return &ResilienceClient{
		client:     userClient,
		cb:         breaker,
		maxRetries: maxRetries,
	}
}

// GetUser calls the GetUser RPC through the circuit breaker, retrying
// retriable failures (see isRetriable) with exponential backoff plus jitter.
//
// Each attempt gets its own 2-second timeout derived from ctx. Up to
// maxRetries additional attempts follow the first one; a negative maxRetries
// is treated as zero so the RPC is always attempted at least once. The whole
// retry sequence runs inside a single breaker execution, so the breaker
// records one outcome per call to this method.
//
// It returns the user on success. On failure it returns the last RPC error,
// a gRPC status derived from ctx if the caller cancels during a backoff wait,
// or the breaker's own error unchanged.
func (c *ResilienceClient) GetUser(ctx context.Context, userID string) (*proto.User, error) {
	maxRetries := c.maxRetries
	if maxRetries < 0 {
		maxRetries = 0
	}

	result, err := c.cb.Execute(func() (interface{}, error) {
		for attempt := 0; ; attempt++ {
			// Release the per-attempt context as soon as the unary call
			// returns. Deferring inside the loop would keep every attempt's
			// timer alive until the whole retry sequence finished.
			attemptCtx, cancel := context.WithTimeout(ctx, 2*time.Second)
			user, err := c.client.GetUser(attemptCtx, &proto.GetUserRequest{
				UserId: userID,
			})
			cancel()

			if err == nil {
				return user, nil
			}

			// Give up on non-retriable errors, when attempts are exhausted,
			// or when the caller's own context is done (a retry could not
			// succeed anyway).
			if attempt >= maxRetries || !isRetriable(err) || ctx.Err() != nil {
				return nil, err
			}

			// Exponential backoff (100ms, 200ms, 400ms, ...) with up to
			// 99ms of jitter.
			backoff := time.Duration(1<<uint(attempt)) * 100 * time.Millisecond
			jitter := time.Duration(rand.Intn(100)) * time.Millisecond
			sleepTime := backoff + jitter

			log.Printf("Request failed with retriable error: %v. Retrying in %v...", err, sleepTime)

			// Wait for the backoff, but stop early if the caller cancels.
			timer := time.NewTimer(sleepTime)
			select {
			case <-timer.C:
			case <-ctx.Done():
				timer.Stop()
				return nil, status.FromContextError(ctx.Err()).Err()
			}
		}
	})
	if err != nil {
		return nil, err
	}

	// Checked assertion: never panic on an unexpected breaker result.
	user, ok := result.(*proto.User)
	if !ok {
		return nil, status.Errorf(codes.Internal, "unexpected result type %T from circuit breaker", result)
	}
	return user, nil
}

// isRetriable reports whether err is worth retrying. Only gRPC status errors
// with one of the following codes qualify:
//
//   - Unavailable: the server is currently unavailable
//   - ResourceExhausted: client or server has insufficient quota
//   - DeadlineExceeded: the request timed out
//   - Aborted: concurrency conflict
//
// A nil error and any error that is not a gRPC status are never retried.
func isRetriable(err error) bool {
	if err == nil {
		return false
	}

	st, isStatus := status.FromError(err)
	if !isStatus {
		return false
	}

	code := st.Code()
	return code == codes.Unavailable ||
		code == codes.ResourceExhausted ||
		code == codes.DeadlineExceeded ||
		code == codes.Aborted
}

Health Checking

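Implementing health checks for service monitoring. The custom service below mirrors the standard gRPC health checking protocol (`grpc.health.v1.Health`); prefer the standard service when you need compatibility with existing health-probe tooling: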

// health_service.proto
syntax = "proto3";
package health;
option go_package = "github.com/example/health";

// HealthService exposes the serving status of named services.
service HealthService {
  // Check is a unary RPC: it returns the current status of the service
  // named in the request.
  rpc Check(HealthCheckRequest) returns (HealthCheckResponse);
  
  // Watch is a server-streaming RPC: the response is a stream of
  // HealthCheckResponse messages for the service named in the request.
  rpc Watch(HealthCheckRequest) returns (stream HealthCheckResponse);
}

// HealthCheckRequest identifies the service whose health is being queried.
message HealthCheckRequest {
  // Name of the service to query. The Go server shown in this document
  // treats an empty name as a query for the overall server health.
  string service = 1;
}

// HealthCheckResponse carries the serving status of the queried service.
message HealthCheckResponse {
  // ServingStatus enumerates the possible health states.
  enum ServingStatus {
    // Zero value; this is what an unset status field decodes to in proto3.
    UNKNOWN = 0;
    SERVING = 1;
    NOT_SERVING = 2;
    // Returned by the Go server shown in this document when the requested
    // service name has no recorded status.
    SERVICE_UNKNOWN = 3;
  }
  // Current status of the service named in the request.
  ServingStatus status = 1;
}

Server implementation:

package main

import (
	"context"
	"log"
	"net"
	"sync"
	"time"

	"github.com/example/health"
	"google.golang.org/grpc"
)

// healthServer implements the HealthService gRPC server. It keeps the last
// known status per service name and the streams currently watching each one.
// Create it with newHealthServer so that both maps are initialized.
type healthServer struct {
	health.UnimplementedHealthServiceServer
	// mu guards statusMap and statusWatchers.
	mu             sync.RWMutex
	// statusMap maps a service name to its recorded serving status.
	statusMap      map[string]health.HealthCheckResponse_ServingStatus
	// statusWatchers maps a service name to the Watch streams registered for it.
	statusWatchers map[string][]health.HealthService_WatchServer
}

// newHealthServer returns a healthServer whose status and watcher maps are
// allocated and empty, ready to be registered with a gRPC server.
func newHealthServer() *healthServer {
	srv := new(healthServer)
	srv.statusMap = map[string]health.HealthCheckResponse_ServingStatus{}
	srv.statusWatchers = map[string][]health.HealthService_WatchServer{}
	return srv
}

// Check reports the current status of the service named in req.
//
// An empty service name asks about the server as a whole and always yields
// SERVING. A name with no recorded status yields SERVICE_UNKNOWN. Otherwise
// the recorded status is returned. The error result is always nil.
func (s *healthServer) Check(ctx context.Context, req *health.HealthCheckRequest) (*health.HealthCheckResponse, error) {
	s.mu.RLock()
	defer s.mu.RUnlock()

	// Start from the overall-health answer and refine it for named services.
	resp := &health.HealthCheckResponse{Status: health.HealthCheckResponse_SERVING}
	if name := req.Service; name != "" {
		current, known := s.statusMap[name]
		if !known {
			current = health.HealthCheckResponse_SERVICE_UNKNOWN
		}
		resp.Status = current
	}

	return resp, nil
}

// Watch registers stream as a watcher for the service named in req, sends
// the service's current status once, and then keeps the stream open until
// the client disconnects or the RPC is cancelled.
//
// The initial status is the recorded one if present, SERVING for the empty
// (overall) service name, and SERVICE_UNKNOWN otherwise. The watcher is
// removed again when Watch returns, so finished streams do not accumulate in
// statusWatchers.
//
// It returns the Send error if the initial status cannot be delivered, and
// nil once the stream's context is done.
//
// NOTE(review): nothing in the visible code pushes later status changes to
// the registered streams; presumably a status setter elsewhere iterates
// statusWatchers under s.mu. Confirm against that code.
func (s *healthServer) Watch(req *health.HealthCheckRequest, stream health.HealthService_WatchServer) error {
	service := req.Service

	// Register the watcher and read the current status in one critical
	// section, so no status change can fall between the two steps.
	// Appending to the nil slice of a missing key is fine in Go.
	s.mu.Lock()
	s.statusWatchers[service] = append(s.statusWatchers[service], stream)
	status, known := s.statusMap[service]
	s.mu.Unlock()

	// Always unregister on exit, including the early return below.
	defer func() {
		s.mu.Lock()
		defer s.mu.Unlock()

		watchers := s.statusWatchers[service]
		for i, w := range watchers {
			if w != stream {
				continue
			}
			// Shift the tail down and clear the vacated slot so the backing
			// array does not keep the finished stream alive.
			copy(watchers[i:], watchers[i+1:])
			watchers[len(watchers)-1] = nil
			watchers = watchers[:len(watchers)-1]
			break
		}
		if len(watchers) == 0 {
			delete(s.statusWatchers, service)
		} else {
			s.statusWatchers[service] = watchers
		}
	}()

	if !known {
		if service == "" {
			status = health.HealthCheckResponse_SERVING
		} else {
			status = health.HealthCheckResponse_SERVICE_UNKNOWN
		}
	}

	// Send the initial status.
	if err := stream.Send(&health.HealthCheckResponse{Status: status}); err != nil {
		return err
	}

	// Keep the stream open until the client disconnects.
	<-stream.Context().Done()

	return nil
}

Production Deployment Strategies

Deploying gRPC services in production requires careful consideration of infrastructure, monitoring, and scaling strategies.