Monitoring and Observability
Monitoring multi-cloud infrastructure means aggregating metrics, logs, and traces from every provider into unified dashboards and alerting systems. Each provider has capable native monitoring (CloudWatch, Azure Monitor, Google Cloud Monitoring), but none of them sees the whole picture; you need centralized observability to understand the health and performance of the entire system.
Unified Metrics Collection
Set up centralized metrics collection by deploying Prometheus and Grafana with the kube-prometheus-stack chart, plus one exporter per cloud provider so their native metrics land in the same place:
# Prometheus deployment for centralized metrics
resource "kubernetes_namespace" "monitoring" {
metadata {
name = "monitoring"
}
}
resource "helm_release" "prometheus" {
name = "prometheus"
repository = "https://prometheus-community.github.io/helm-charts"
chart = "kube-prometheus-stack"
namespace = kubernetes_namespace.monitoring.metadata[0].name
values = [
yamlencode({
prometheus = {
prometheusSpec = {
retention = "30d"
storageSpec = {
volumeClaimTemplate = {
spec = {
storageClassName = "fast-ssd"
accessModes = ["ReadWriteOnce"]
resources = {
requests = {
storage = "100Gi"
}
}
}
}
}
additionalScrapeConfigs = [
{
job_name = "aws-cloudwatch"
static_configs = [{
targets = ["cloudwatch-exporter:9106"]
}]
},
{
job_name = "azure-monitor"
static_configs = [{
targets = ["azure-exporter:9107"]
}]
},
{
job_name = "gcp-monitoring"
static_configs = [{
targets = ["gcp-exporter:9108"]
}]
}
]
}
}
grafana = {
adminPassword = var.grafana_admin_password
persistence = {
enabled = true
size = "10Gi"
}
}
})
]
}
# CloudWatch Exporter for AWS metrics
resource "kubernetes_deployment" "cloudwatch_exporter" {
metadata {
name = "cloudwatch-exporter"
namespace = kubernetes_namespace.monitoring.metadata[0].name
}
spec {
replicas = 1
selector {
match_labels = {
app = "cloudwatch-exporter"
}
}
template {
metadata {
labels = {
app = "cloudwatch-exporter"
}
}
spec {
container {
name = "cloudwatch-exporter"
image = "prom/cloudwatch-exporter:latest"
port {
container_port = 9106
}
env {
name = "AWS_REGION"
value = var.aws_region
}
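# No static AWS keys are set here; the exporter picks up credentials from the
# pod's IAM role (IRSA) or the node instance profile, which needs CloudWatch read access.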
volume_mount {
name = "config"
mount_path = "/config"
}
}
volume {
name = "config"
config_map {
name = kubernetes_config_map.cloudwatch_config.metadata[0].name
}
}
}
}
}
}
resource "kubernetes_config_map" "cloudwatch_config" {
metadata {
name = "cloudwatch-exporter-config"
namespace = kubernetes_namespace.monitoring.metadata[0].name
}
data = {
"config.yml" = yamlencode({
region = var.aws_region
metrics = [
{
aws_namespace = "AWS/EC2"
aws_metric_name = "CPUUtilization"
aws_dimensions = ["InstanceId"]
aws_statistics = ["Average"]
},
{
aws_namespace = "AWS/RDS"
aws_metric_name = "DatabaseConnections"
aws_dimensions = ["DBInstanceIdentifier"]
aws_statistics = ["Average"]
},
{
aws_namespace = "AWS/S3"
aws_metric_name = "BucketSizeBytes"
aws_dimensions = ["BucketName", "StorageType"]
aws_statistics = ["Average"]
}
]
})
}
}
# Azure Monitor Exporter
resource "kubernetes_deployment" "azure_exporter" {
metadata {
name = "azure-exporter"
namespace = kubernetes_namespace.monitoring.metadata[0].name
}
spec {
replicas = 1
selector {
match_labels = {
app = "azure-exporter"
}
}
template {
metadata {
labels = {
app = "azure-exporter"
}
}
spec {
container {
name = "azure-exporter"
image = "webdevops/azure-metrics-exporter:latest"
port {
container_port = 9107
}
env {
name = "AZURE_SUBSCRIPTION_ID"
value = var.azure_subscription_id
}
env {
name = "AZURE_CLIENT_ID"
value_from {
secret_key_ref {
name = kubernetes_secret.azure_credentials.metadata[0].name
key = "client_id"
}
}
}
env {
name = "AZURE_CLIENT_SECRET"
value_from {
secret_key_ref {
name = kubernetes_secret.azure_credentials.metadata[0].name
key = "client_secret"
}
}
}
# The Azure SDK also needs the tenant ID for service-principal auth;
# this assumes the same secret stores it under a "tenant_id" key.
env {
name = "AZURE_TENANT_ID"
value_from {
secret_key_ref {
name = kubernetes_secret.azure_credentials.metadata[0].name
key = "tenant_id"
}
}
}
}
}
}
}
}
# GCP Monitoring Exporter
resource "kubernetes_deployment" "gcp_exporter" {
metadata {
name = "gcp-exporter"
namespace = kubernetes_namespace.monitoring.metadata[0].name
}
spec {
replicas = 1
selector {
match_labels = {
app = "gcp-exporter"
}
}
template {
metadata {
labels = {
app = "gcp-exporter"
}
}
spec {
container {
name = "gcp-exporter"
image = "prometheuscommunity/stackdriver-exporter:latest"
port {
container_port = 9108
}
env {
name = "GOOGLE_APPLICATION_CREDENTIALS"
value = "/credentials/service-account.json"
}
env {
name = "STACKDRIVER_EXPORTER_GOOGLE_PROJECT_ID"
value = var.gcp_project_id
}
volume_mount {
name = "gcp-credentials"
mount_path = "/credentials"
}
}
volume {
name = "gcp-credentials"
secret {
secret_name = kubernetes_secret.gcp_credentials.metadata[0].name
}
}
}
}
}
}
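The scrape jobs above address the exporters as cloudwatch-exporter:9106, azure-exporter:9107, and gcp-exporter:9108, which assumes a ClusterIP Service in front of each Deployment with those names and ports; the kubernetes_secret.azure_credentials and kubernetes_secret.gcp_credentials resources referenced above are likewise assumed to exist with the service-principal and service-account material. A minimal sketch of the Services (the selectors match the Deployment labels defined earlier):
# ClusterIP Services so Prometheus can reach each exporter at the hostname used in additionalScrapeConfigs
resource "kubernetes_service" "exporters" {
  for_each = {
    "cloudwatch-exporter" = 9106
    "azure-exporter"      = 9107
    "gcp-exporter"        = 9108
  }
  metadata {
    name      = each.key
    namespace = kubernetes_namespace.monitoring.metadata[0].name
  }
  spec {
    selector = {
      app = each.key
    }
    port {
      port        = each.value
      target_port = each.value
    }
  }
}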
Cross-Cloud Alerting System
Implement unified alerting with a script that polls each provider's monitoring API, evaluates thresholds, and routes notifications to Slack and PagerDuty:
#!/usr/bin/env python3
# scripts/multi_cloud_alerting.py
import boto3
import json
import requests
from azure.monitor.query import LogsQueryClient
from azure.identity import DefaultAzureCredential
from google.cloud import monitoring_v3
from typing import Dict, List, Any
from datetime import datetime, timedelta


class MultiCloudAlertManager:
    def __init__(self, config: Dict[str, Any]):
        self.config = config

        # Initialize cloud monitoring clients
        self.aws_cloudwatch = boto3.client('cloudwatch')
        self.azure_credential = DefaultAzureCredential()
        self.azure_logs_client = LogsQueryClient(self.azure_credential)
        self.gcp_monitoring = monitoring_v3.MetricServiceClient()

        # Alert channels
        self.slack_webhook = config.get('slack_webhook_url')
        self.pagerduty_key = config.get('pagerduty_integration_key')
    def check_all_providers(self) -> Dict[str, Any]:
        """Check health across all cloud providers"""
        results = {
            'timestamp': datetime.utcnow().isoformat(),
            'overall_status': 'healthy',
            'providers': {},
            'alerts': []
        }

        # Check each provider
        for provider_config in self.config['providers']:
            provider_name = provider_config['name']
            try:
                if provider_name == 'aws':
                    provider_results = self._check_aws_health(provider_config)
                elif provider_name == 'azure':
                    provider_results = self._check_azure_health(provider_config)
                elif provider_name == 'gcp':
                    provider_results = self._check_gcp_health(provider_config)
                else:
                    continue

                results['providers'][provider_name] = provider_results

                # Collect alerts; never downgrade a status that is already critical
                if provider_results['alerts']:
                    results['alerts'].extend(provider_results['alerts'])
                    if results['overall_status'] != 'critical':
                        results['overall_status'] = 'degraded'
            except Exception as e:
                results['providers'][provider_name] = {
                    'status': 'error',
                    'error': str(e),
                    'alerts': [{
                        'severity': 'critical',
                        'message': f"Failed to check {provider_name}: {str(e)}"
                    }]
                }
                results['alerts'].append({
                    'provider': provider_name,
                    'severity': 'critical',
                    'message': f"Monitoring failure: {str(e)}"
                })
                results['overall_status'] = 'critical'

        return results
    def _check_aws_health(self, config: Dict[str, Any]) -> Dict[str, Any]:
        """Check AWS resource health"""
        results = {
            'status': 'healthy',
            'metrics': {},
            'alerts': []
        }

        # Check EC2 instances
        for instance_check in config.get('ec2_checks', []):
            metric_data = self.aws_cloudwatch.get_metric_statistics(
                Namespace='AWS/EC2',
                MetricName='CPUUtilization',
                Dimensions=[{'Name': 'InstanceId', 'Value': instance_check['instance_id']}],
                StartTime=datetime.utcnow() - timedelta(minutes=10),
                EndTime=datetime.utcnow(),
                Period=300,
                Statistics=['Average']
            )
            if metric_data['Datapoints']:
                # CloudWatch does not return datapoints in time order, so pick the newest one
                latest = max(metric_data['Datapoints'], key=lambda d: d['Timestamp'])
                cpu_usage = latest['Average']
                results['metrics'][f"ec2_{instance_check['instance_id']}_cpu"] = cpu_usage
                if cpu_usage > instance_check.get('cpu_threshold', 80):
                    results['alerts'].append({
                        'severity': 'warning',
                        'resource': instance_check['instance_id'],
                        'message': f"High CPU usage: {cpu_usage:.1f}%"
                    })

        # Check RDS instances
        for rds_check in config.get('rds_checks', []):
            metric_data = self.aws_cloudwatch.get_metric_statistics(
                Namespace='AWS/RDS',
                MetricName='DatabaseConnections',
                Dimensions=[{'Name': 'DBInstanceIdentifier', 'Value': rds_check['db_instance']}],
                StartTime=datetime.utcnow() - timedelta(minutes=10),
                EndTime=datetime.utcnow(),
                Period=300,
                Statistics=['Average']
            )
            if metric_data['Datapoints']:
                latest = max(metric_data['Datapoints'], key=lambda d: d['Timestamp'])
                connections = latest['Average']
                results['metrics'][f"rds_{rds_check['db_instance']}_connections"] = connections
                if connections > rds_check.get('connection_threshold', 80):
                    results['alerts'].append({
                        'severity': 'warning',
                        'resource': rds_check['db_instance'],
                        'message': f"High database connections: {connections}"
                    })

        if results['alerts']:
            results['status'] = 'degraded'
        return results
    def _check_azure_health(self, config: Dict[str, Any]) -> Dict[str, Any]:
        """Check Azure resource health"""
        results = {
            'status': 'healthy',
            'metrics': {},
            'alerts': []
        }

        # Check virtual machines via the Log Analytics workspace
        for vm_check in config.get('vm_checks', []):
            query = f"""
            Perf
            | where TimeGenerated > ago(10m)
            | where Computer == "{vm_check['vm_name']}"
            | where CounterName == "% Processor Time"
            | summarize avg(CounterValue) by bin(TimeGenerated, 5m)
            | order by TimeGenerated desc
            | limit 1
            """
            try:
                response = self.azure_logs_client.query_workspace(
                    workspace_id=config['workspace_id'],
                    query=query,
                    timespan=timedelta(minutes=10)
                )
                if response.tables and response.tables[0].rows:
                    cpu_usage = response.tables[0].rows[0][1]
                    results['metrics'][f"vm_{vm_check['vm_name']}_cpu"] = cpu_usage
                    if cpu_usage > vm_check.get('cpu_threshold', 80):
                        results['alerts'].append({
                            'severity': 'warning',
                            'resource': vm_check['vm_name'],
                            'message': f"High CPU usage: {cpu_usage:.1f}%"
                        })
            except Exception as e:
                results['alerts'].append({
                    'severity': 'error',
                    'resource': vm_check['vm_name'],
                    'message': f"Failed to query metrics: {str(e)}"
                })

        if results['alerts']:
            results['status'] = 'degraded'
        return results
    def _check_gcp_health(self, config: Dict[str, Any]) -> Dict[str, Any]:
        """Check GCP resource health"""
        results = {
            'status': 'healthy',
            'metrics': {},
            'alerts': []
        }
        project_name = f"projects/{config['project_id']}"

        # Check Compute Engine instances
        for instance_check in config.get('instance_checks', []):
            interval = monitoring_v3.TimeInterval({
                "end_time": {"seconds": int(datetime.utcnow().timestamp())},
                "start_time": {"seconds": int((datetime.utcnow() - timedelta(minutes=10)).timestamp())},
            })
            request = monitoring_v3.ListTimeSeriesRequest({
                "name": project_name,
                "filter": f'metric.type="compute.googleapis.com/instance/cpu/utilization" AND resource.labels.instance_name="{instance_check["instance_name"]}"',
                "interval": interval,
                "view": monitoring_v3.ListTimeSeriesRequest.TimeSeriesView.FULL,
            })
            try:
                page_result = self.gcp_monitoring.list_time_series(request=request)
                for time_series in page_result:
                    if time_series.points:
                        # Points are returned newest first; utilization is reported as a 0-1 ratio
                        cpu_usage = time_series.points[0].value.double_value * 100
                        results['metrics'][f"gce_{instance_check['instance_name']}_cpu"] = cpu_usage
                        if cpu_usage > instance_check.get('cpu_threshold', 80):
                            results['alerts'].append({
                                'severity': 'warning',
                                'resource': instance_check['instance_name'],
                                'message': f"High CPU usage: {cpu_usage:.1f}%"
                            })
            except Exception as e:
                results['alerts'].append({
                    'severity': 'error',
                    'resource': instance_check['instance_name'],
                    'message': f"Failed to query metrics: {str(e)}"
                })

        if results['alerts']:
            results['status'] = 'degraded'
        return results
    def send_alerts(self, alerts: List[Dict[str, Any]]):
        """Send alerts to configured channels"""
        if not alerts:
            return

        # Group alerts by severity; provider query failures ('error') are routed
        # with the warnings so they are not silently dropped
        critical_alerts = [a for a in alerts if a.get('severity') == 'critical']
        warning_alerts = [a for a in alerts if a.get('severity') in ('warning', 'error')]

        # Send to Slack
        if self.slack_webhook:
            self._send_slack_alert(critical_alerts, warning_alerts)

        # Send to PagerDuty for critical alerts
        if self.pagerduty_key and critical_alerts:
            self._send_pagerduty_alert(critical_alerts)
    def _send_slack_alert(self, critical_alerts: List, warning_alerts: List):
        """Send alert to Slack"""
        color = "danger" if critical_alerts else "warning"
        message = {
            "attachments": [{
                "color": color,
                "title": "Multi-Cloud Infrastructure Alert",
                "fields": []
            }]
        }
        if critical_alerts:
            message["attachments"][0]["fields"].append({
                "title": f"Critical Alerts ({len(critical_alerts)})",
                "value": "\n".join([f"• {alert['message']}" for alert in critical_alerts[:5]]),
                "short": False
            })
        if warning_alerts:
            message["attachments"][0]["fields"].append({
                "title": f"Warning Alerts ({len(warning_alerts)})",
                "value": "\n".join([f"• {alert['message']}" for alert in warning_alerts[:5]]),
                "short": False
            })
        response = requests.post(self.slack_webhook, json=message, timeout=10)
        response.raise_for_status()
    def _send_pagerduty_alert(self, critical_alerts: List):
        """Send critical alert to PagerDuty"""
        payload = {
            "routing_key": self.pagerduty_key,
            "event_action": "trigger",
            "payload": {
                "summary": f"Multi-Cloud Critical Alert: {len(critical_alerts)} issues detected",
                "source": "multi-cloud-monitor",
                "severity": "critical",
                "custom_details": {
                    "alerts": critical_alerts
                }
            }
        }
        response = requests.post("https://events.pagerduty.com/v2/enqueue", json=payload, timeout=10)
        response.raise_for_status()
def main():
    import argparse

    parser = argparse.ArgumentParser(description='Multi-Cloud Alert Manager')
    parser.add_argument('--config', required=True, help='Configuration file')
    parser.add_argument('--send-alerts', action='store_true', help='Send alerts to configured channels')
    args = parser.parse_args()

    with open(args.config, 'r') as f:
        config = json.load(f)

    alert_manager = MultiCloudAlertManager(config)
    results = alert_manager.check_all_providers()

    print(f"Overall Status: {results['overall_status']}")
    print(f"Total Alerts: {len(results['alerts'])}")
    for provider, provider_results in results['providers'].items():
        print(f"\n{provider.upper()}:")
        print(f"  Status: {provider_results['status']}")
        print(f"  Alerts: {len(provider_results.get('alerts', []))}")

    if args.send_alerts and results['alerts']:
        alert_manager.send_alerts(results['alerts'])
        print(f"\n📧 Sent {len(results['alerts'])} alerts")


if __name__ == "__main__":
    main()
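The script reads a JSON configuration describing each provider's checks and the alert channels. A sketch of the shape it expects — the IDs, thresholds, workspace, and project values below are placeholders:
{
  "slack_webhook_url": "https://hooks.slack.com/services/T000/B000/XXXXXXXX",
  "pagerduty_integration_key": "your-pagerduty-routing-key",
  "providers": [
    {
      "name": "aws",
      "ec2_checks": [
        {"instance_id": "i-0123456789abcdef0", "cpu_threshold": 80}
      ],
      "rds_checks": [
        {"db_instance": "prod-db", "connection_threshold": 100}
      ]
    },
    {
      "name": "azure",
      "workspace_id": "<log-analytics-workspace-id>",
      "vm_checks": [
        {"vm_name": "prod-vm-01", "cpu_threshold": 80}
      ]
    },
    {
      "name": "gcp",
      "project_id": "my-gcp-project",
      "instance_checks": [
        {"instance_name": "prod-instance-01", "cpu_threshold": 80}
      ]
    }
  ]
}
Run the check on a schedule, for example from cron every five minutes (paths are illustrative):
*/5 * * * * /usr/bin/python3 /opt/monitoring/scripts/multi_cloud_alerting.py --config /etc/monitoring/alerting-config.json --send-alerts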
Unified Dashboard Creation
Create Grafana dashboards that put all three providers side by side, provisioned through Grafana's HTTP API:
#!/bin/bash
# scripts/setup-dashboards.sh
set -e
GRAFANA_URL=${1:-"http://localhost:3000"}
GRAFANA_USER=${2:-"admin"}
GRAFANA_PASSWORD=${3:-"admin"}
create_multi_cloud_dashboard() {
echo "Creating multi-cloud overview dashboard..."
cat > multi-cloud-dashboard.json << 'EOF'
{
"dashboard": {
"title": "Multi-Cloud Infrastructure Overview",
"tags": ["multi-cloud", "overview"],
"timezone": "browser",
"panels": [
{
"title": "AWS EC2 CPU Utilization",
"type": "stat",
"targets": [
{
"expr": "aws_ec2_cpuutilization_average",
"legendFormat": "{{instance_id}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"thresholds": {
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 70},
{"color": "red", "value": 90}
]
}
}
},
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 0}
},
{
"title": "Azure VM CPU Utilization",
"type": "stat",
"targets": [
{
"expr": "azure_vm_cpu_percent",
"legendFormat": "{{vm_name}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"thresholds": {
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 70},
{"color": "red", "value": 90}
]
}
}
},
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 0}
},
{
"title": "GCP Compute CPU Utilization",
"type": "stat",
"targets": [
{
"expr": "gcp_compute_instance_cpu_utilization",
"legendFormat": "{{instance_name}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"thresholds": {
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 70},
{"color": "red", "value": 90}
]
}
}
},
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 0}
},
{
"title": "Cross-Cloud Network Latency",
"type": "graph",
"targets": [
{
"expr": "probe_duration_seconds{job=\"blackbox\"}",
"legendFormat": "{{instance}}"
}
],
"yAxes": [
{
"label": "Latency (seconds)",
"min": 0
}
],
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 8}
},
{
"title": "Storage Usage by Provider",
"type": "piechart",
"targets": [
{
"expr": "aws_s3_bucket_size_bytes",
"legendFormat": "AWS S3"
},
{
"expr": "azure_storage_account_used_capacity",
"legendFormat": "Azure Storage"
},
{
"expr": "gcp_storage_bucket_size",
"legendFormat": "GCP Storage"
}
],
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16}
},
{
"title": "Database Connections",
"type": "graph",
"targets": [
{
"expr": "aws_rds_database_connections",
"legendFormat": "AWS RDS {{db_instance_identifier}}"
},
{
"expr": "azure_sql_connections",
"legendFormat": "Azure SQL {{server_name}}"
},
{
"expr": "gcp_cloudsql_connections",
"legendFormat": "GCP Cloud SQL {{database_id}}"
}
],
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 16}
}
],
"time": {
"from": "now-1h",
"to": "now"
},
"refresh": "30s"
},
"overwrite": true
}
EOF
# Import dashboard to Grafana
curl -sS --fail -X POST \
-H "Content-Type: application/json" \
-u "$GRAFANA_USER:$GRAFANA_PASSWORD" \
-d @multi-cloud-dashboard.json \
"$GRAFANA_URL/api/dashboards/db"
echo "✅ Multi-cloud dashboard created"
}
create_cost_dashboard() {
echo "Creating cost monitoring dashboard..."
cat > cost-dashboard.json << 'EOF'
{
"dashboard": {
"title": "Multi-Cloud Cost Analysis",
"tags": ["cost", "billing"],
"panels": [
{
"title": "Daily Costs by Provider",
"type": "graph",
"targets": [
{
"expr": "aws_billing_estimated_charges",
"legendFormat": "AWS"
},
{
"expr": "azure_consumption_cost",
"legendFormat": "Azure"
},
{
"expr": "gcp_billing_cost",
"legendFormat": "GCP"
}
],
"yAxes": [
{
"label": "Cost (USD)",
"min": 0
}
],
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 0}
},
{
"title": "Cost by Service Category",
"type": "table",
"targets": [
{
"expr": "sum by (service) (aws_billing_estimated_charges)",
"format": "table"
}
],
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}
},
{
"title": "Monthly Cost Trend",
"type": "graph",
"targets": [
{
"expr": "increase(aws_billing_estimated_charges[30d])",
"legendFormat": "AWS Monthly"
}
],
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8}
}
]
},
"overwrite": true
}
EOF
curl -sS --fail -X POST \
-H "Content-Type: application/json" \
-u "$GRAFANA_USER:$GRAFANA_PASSWORD" \
-d @cost-dashboard.json \
"$GRAFANA_URL/api/dashboards/db"
echo "✅ Cost dashboard created"
}
create_sla_dashboard() {
echo "Creating SLA monitoring dashboard..."
cat > sla-dashboard.json << 'EOF'
{
"dashboard": {
"title": "Multi-Cloud SLA Monitoring",
"tags": ["sla", "uptime"],
"panels": [
{
"title": "Service Uptime",
"type": "stat",
"targets": [
{
"expr": "avg_over_time(up{job=\"multi-cloud-services\"}[24h]) * 100",
"legendFormat": "{{service}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 95,
"max": 100,
"thresholds": {
"steps": [
{"color": "red", "value": null},
{"color": "yellow", "value": 99},
{"color": "green", "value": 99.9}
]
}
}
},
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 0}
},
{
"title": "Response Time SLA",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.95, http_request_duration_seconds_bucket)",
"legendFormat": "95th percentile"
},
{
"expr": "histogram_quantile(0.99, http_request_duration_seconds_bucket)",
"legendFormat": "99th percentile"
}
],
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 8}
}
]
},
"overwrite": true
}
EOF
curl -sS --fail -X POST \
-H "Content-Type: application/json" \
-u "$GRAFANA_USER:$GRAFANA_PASSWORD" \
-d @sla-dashboard.json \
"$GRAFANA_URL/api/dashboards/db"
echo "✅ SLA dashboard created"
}
# Create all dashboards
create_multi_cloud_dashboard
create_cost_dashboard
create_sla_dashboard
# Cleanup temp files
rm -f multi-cloud-dashboard.json cost-dashboard.json sla-dashboard.json
echo "✅ All multi-cloud dashboards created successfully"
echo "Access them at: $GRAFANA_URL"
What’s Next
Unified monitoring and observability provide the visibility needed to operate multi-cloud infrastructure effectively. With comprehensive metrics, alerting, and dashboards in place, you can maintain high availability and performance across all your cloud providers.
In the final part of this guide, we’ll explore governance and cost management strategies that help you maintain control, compliance, and cost efficiency across your entire multi-cloud environment.