Monitoring and Observability
Monitoring multi-cloud infrastructure means aggregating metrics, logs, and traces from every provider into unified dashboards and alerting systems. Each provider has capable native monitoring (CloudWatch, Azure Monitor, Google Cloud Monitoring), but none of them sees the whole picture; you need centralized observability to understand the health and performance of the entire system.
Unified Metrics Collection
Set up centralized metrics collection by deploying Prometheus and Grafana with the kube-prometheus-stack chart, plus one exporter per cloud provider so their native metrics land in the same place:
# Prometheus deployment for centralized metrics
resource "kubernetes_namespace" "monitoring" {
metadata {
name = "monitoring"
}
}
resource "helm_release" "prometheus" {
name = "prometheus"
repository = "https://prometheus-community.github.io/helm-charts"
chart = "kube-prometheus-stack"
namespace = kubernetes_namespace.monitoring.metadata[0].name
values = [
yamlencode({
prometheus = {
prometheusSpec = {
retention = "30d"
storageSpec = {
volumeClaimTemplate = {
spec = {
storageClassName = "fast-ssd"
accessModes = ["ReadWriteOnce"]
resources = {
requests = {
storage = "100Gi"
}
}
}
}
}
additionalScrapeConfigs = [
{
job_name = "aws-cloudwatch"
static_configs = [{
targets = ["cloudwatch-exporter:9106"]
}]
},
{
job_name = "azure-monitor"
static_configs = [{
targets = ["azure-exporter:9107"]
}]
},
{
job_name = "gcp-monitoring"
static_configs = [{
targets = ["gcp-exporter:9108"]
}]
}
]
}
}
grafana = {
adminPassword = var.grafana_admin_password
persistence = {
enabled = true
size = "10Gi"
}
}
})
]
}
# CloudWatch Exporter for AWS metrics
resource "kubernetes_deployment" "cloudwatch_exporter" {
metadata {
name = "cloudwatch-exporter"
namespace = kubernetes_namespace.monitoring.metadata[0].name
}
spec {
replicas = 1
selector {
match_labels = {
app = "cloudwatch-exporter"
}
}
template {
metadata {
labels = {
app = "cloudwatch-exporter"
}
}
spec {
container {
name = "cloudwatch-exporter"
image = "prom/cloudwatch-exporter:latest"
port {
container_port = 9106
}
env {
name = "AWS_REGION"
value = var.aws_region
}
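# No static AWS keys are set here; the exporter picks up credentials from the
# pod's IAM role (IRSA) or the node instance profile, which needs CloudWatch read access.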
volume_mount {
name = "config"
mount_path = "/config"
}
}
volume {
name = "config"
config_map {
name = kubernetes_config_map.cloudwatch_config.metadata[0].name
}
}
}
}
}
}
resource "kubernetes_config_map" "cloudwatch_config" {
metadata {
name = "cloudwatch-exporter-config"
namespace = kubernetes_namespace.monitoring.metadata[0].name
}
data = {
"config.yml" = yamlencode({
region = var.aws_region
metrics = [
{
aws_namespace = "AWS/EC2"
aws_metric_name = "CPUUtilization"
aws_dimensions = ["InstanceId"]
aws_statistics = ["Average"]
},
{
aws_namespace = "AWS/RDS"
aws_metric_name = "DatabaseConnections"
aws_dimensions = ["DBInstanceIdentifier"]
aws_statistics = ["Average"]
},
{
aws_namespace = "AWS/S3"
aws_metric_name = "BucketSizeBytes"
aws_dimensions = ["BucketName", "StorageType"]
aws_statistics = ["Average"]
}
]
})
}
}
# Azure Monitor Exporter
resource "kubernetes_deployment" "azure_exporter" {
metadata {
name = "azure-exporter"
namespace = kubernetes_namespace.monitoring.metadata[0].name
}
spec {
replicas = 1
selector {
match_labels = {
app = "azure-exporter"
}
}
template {
metadata {
labels = {
app = "azure-exporter"
}
}
spec {
container {
name = "azure-exporter"
image = "webdevops/azure-metrics-exporter:latest"
port {
container_port = 9107
}
env {
name = "AZURE_SUBSCRIPTION_ID"
value = var.azure_subscription_id
}
env {
name = "AZURE_CLIENT_ID"
value_from {
secret_key_ref {
name = kubernetes_secret.azure_credentials.metadata[0].name
key = "client_id"
}
}
}
env {
name = "AZURE_CLIENT_SECRET"
value_from {
secret_key_ref {
name = kubernetes_secret.azure_credentials.metadata[0].name
key = "client_secret"
}
}
}
# The Azure SDK also needs the tenant ID for service-principal auth;
# this assumes the same secret stores it under a "tenant_id" key.
env {
name = "AZURE_TENANT_ID"
value_from {
secret_key_ref {
name = kubernetes_secret.azure_credentials.metadata[0].name
key = "tenant_id"
}
}
}
}
}
}
}
}
# GCP Monitoring Exporter
resource "kubernetes_deployment" "gcp_exporter" {
metadata {
name = "gcp-exporter"
namespace = kubernetes_namespace.monitoring.metadata[0].name
}
spec {
replicas = 1
selector {
match_labels = {
app = "gcp-exporter"
}
}
template {
metadata {
labels = {
app = "gcp-exporter"
}
}
spec {
container {
name = "gcp-exporter"
image = "prometheuscommunity/stackdriver-exporter:latest"
port {
container_port = 9108
}
env {
name = "GOOGLE_APPLICATION_CREDENTIALS"
value = "/credentials/service-account.json"
}
env {
name = "STACKDRIVER_EXPORTER_GOOGLE_PROJECT_ID"
value = var.gcp_project_id
}
volume_mount {
name = "gcp-credentials"
mount_path = "/credentials"
}
}
volume {
name = "gcp-credentials"
secret {
secret_name = kubernetes_secret.gcp_credentials.metadata[0].name
}
}
}
}
}
}
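The scrape jobs above address the exporters as cloudwatch-exporter:9106, azure-exporter:9107, and gcp-exporter:9108, which assumes a ClusterIP Service in front of each Deployment with those names and ports; the kubernetes_secret.azure_credentials and kubernetes_secret.gcp_credentials resources referenced above are likewise assumed to exist with the service-principal and service-account material. A minimal sketch of the Services (the selectors match the Deployment labels defined earlier):
# ClusterIP Services so Prometheus can reach each exporter at the hostname used in additionalScrapeConfigs
resource "kubernetes_service" "exporters" {
  for_each = {
    "cloudwatch-exporter" = 9106
    "azure-exporter"      = 9107
    "gcp-exporter"        = 9108
  }
  metadata {
    name      = each.key
    namespace = kubernetes_namespace.monitoring.metadata[0].name
  }
  spec {
    selector = {
      app = each.key
    }
    port {
      port        = each.value
      target_port = each.value
    }
  }
}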
Cross-Cloud Alerting System
Implement unified alerting with a script that polls each provider's monitoring API, evaluates thresholds, and routes notifications to Slack and PagerDuty:
#!/usr/bin/env python3
# scripts/multi_cloud_alerting.py
import boto3
import json
import requests
from azure.monitor.query import LogsQueryClient
from azure.identity import DefaultAzureCredential
from google.cloud import monitoring_v3
from typing import Dict, List, Any
from datetime import datetime, timedelta


class MultiCloudAlertManager:
    def __init__(self, config: Dict[str, Any]):
        self.config = config

        # Initialize cloud monitoring clients
        self.aws_cloudwatch = boto3.client('cloudwatch')
        self.azure_credential = DefaultAzureCredential()
        self.azure_logs_client = LogsQueryClient(self.azure_credential)
        self.gcp_monitoring = monitoring_v3.MetricServiceClient()

        # Alert channels
        self.slack_webhook = config.get('slack_webhook_url')
        self.pagerduty_key = config.get('pagerduty_integration_key')
    def check_all_providers(self) -> Dict[str, Any]:
        """Check health across all cloud providers"""
        results = {
            'timestamp': datetime.utcnow().isoformat(),
            'overall_status': 'healthy',
            'providers': {},
            'alerts': []
        }

        # Check each provider
        for provider_config in self.config['providers']:
            provider_name = provider_config['name']
            try:
                if provider_name == 'aws':
                    provider_results = self._check_aws_health(provider_config)
                elif provider_name == 'azure':
                    provider_results = self._check_azure_health(provider_config)
                elif provider_name == 'gcp':
                    provider_results = self._check_gcp_health(provider_config)
                else:
                    continue

                results['providers'][provider_name] = provider_results

                # Collect alerts; never downgrade a status that is already critical
                if provider_results['alerts']:
                    results['alerts'].extend(provider_results['alerts'])
                    if results['overall_status'] != 'critical':
                        results['overall_status'] = 'degraded'
            except Exception as e:
                results['providers'][provider_name] = {
                    'status': 'error',
                    'error': str(e),
                    'alerts': [{
                        'severity': 'critical',
                        'message': f"Failed to check {provider_name}: {str(e)}"
                    }]
                }
                results['alerts'].append({
                    'provider': provider_name,
                    'severity': 'critical',
                    'message': f"Monitoring failure: {str(e)}"
                })
                results['overall_status'] = 'critical'

        return results
    def _check_aws_health(self, config: Dict[str, Any]) -> Dict[str, Any]:
        """Check AWS resource health"""
        results = {
            'status': 'healthy',
            'metrics': {},
            'alerts': []
        }

        # Check EC2 instances
        for instance_check in config.get('ec2_checks', []):
            metric_data = self.aws_cloudwatch.get_metric_statistics(
                Namespace='AWS/EC2',
                MetricName='CPUUtilization',
                Dimensions=[{'Name': 'InstanceId', 'Value': instance_check['instance_id']}],
                StartTime=datetime.utcnow() - timedelta(minutes=10),
                EndTime=datetime.utcnow(),
                Period=300,
                Statistics=['Average']
            )
            if metric_data['Datapoints']:
                # CloudWatch does not return datapoints in time order, so pick the newest one
                latest = max(metric_data['Datapoints'], key=lambda d: d['Timestamp'])
                cpu_usage = latest['Average']
                results['metrics'][f"ec2_{instance_check['instance_id']}_cpu"] = cpu_usage
                if cpu_usage > instance_check.get('cpu_threshold', 80):
                    results['alerts'].append({
                        'severity': 'warning',
                        'resource': instance_check['instance_id'],
                        'message': f"High CPU usage: {cpu_usage:.1f}%"
                    })

        # Check RDS instances
        for rds_check in config.get('rds_checks', []):
            metric_data = self.aws_cloudwatch.get_metric_statistics(
                Namespace='AWS/RDS',
                MetricName='DatabaseConnections',
                Dimensions=[{'Name': 'DBInstanceIdentifier', 'Value': rds_check['db_instance']}],
                StartTime=datetime.utcnow() - timedelta(minutes=10),
                EndTime=datetime.utcnow(),
                Period=300,
                Statistics=['Average']
            )
            if metric_data['Datapoints']:
                latest = max(metric_data['Datapoints'], key=lambda d: d['Timestamp'])
                connections = latest['Average']
                results['metrics'][f"rds_{rds_check['db_instance']}_connections"] = connections
                if connections > rds_check.get('connection_threshold', 80):
                    results['alerts'].append({
                        'severity': 'warning',
                        'resource': rds_check['db_instance'],
                        'message': f"High database connections: {connections}"
                    })

        if results['alerts']:
            results['status'] = 'degraded'
        return results
    def _check_azure_health(self, config: Dict[str, Any]) -> Dict[str, Any]:
        """Check Azure resource health"""
        results = {
            'status': 'healthy',
            'metrics': {},
            'alerts': []
        }

        # Check virtual machines via the Log Analytics workspace
        for vm_check in config.get('vm_checks', []):
            query = f"""
            Perf
            | where TimeGenerated > ago(10m)
            | where Computer == "{vm_check['vm_name']}"
            | where CounterName == "% Processor Time"
            | summarize avg(CounterValue) by bin(TimeGenerated, 5m)
            | order by TimeGenerated desc
            | limit 1
            """
            try:
                response = self.azure_logs_client.query_workspace(
                    workspace_id=config['workspace_id'],
                    query=query,
                    timespan=timedelta(minutes=10)
                )
                if response.tables and response.tables[0].rows:
                    cpu_usage = response.tables[0].rows[0][1]
                    results['metrics'][f"vm_{vm_check['vm_name']}_cpu"] = cpu_usage
                    if cpu_usage > vm_check.get('cpu_threshold', 80):
                        results['alerts'].append({
                            'severity': 'warning',
                            'resource': vm_check['vm_name'],
                            'message': f"High CPU usage: {cpu_usage:.1f}%"
                        })
            except Exception as e:
                results['alerts'].append({
                    'severity': 'error',
                    'resource': vm_check['vm_name'],
                    'message': f"Failed to query metrics: {str(e)}"
                })

        if results['alerts']:
            results['status'] = 'degraded'
        return results
    def _check_gcp_health(self, config: Dict[str, Any]) -> Dict[str, Any]:
        """Check GCP resource health"""
        results = {
            'status': 'healthy',
            'metrics': {},
            'alerts': []
        }
        project_name = f"projects/{config['project_id']}"

        # Check Compute Engine instances
        for instance_check in config.get('instance_checks', []):
            interval = monitoring_v3.TimeInterval({
                "end_time": {"seconds": int(datetime.utcnow().timestamp())},
                "start_time": {"seconds": int((datetime.utcnow() - timedelta(minutes=10)).timestamp())},
            })
            request = monitoring_v3.ListTimeSeriesRequest({
                "name": project_name,
                "filter": f'metric.type="compute.googleapis.com/instance/cpu/utilization" AND resource.labels.instance_name="{instance_check["instance_name"]}"',
                "interval": interval,
                "view": monitoring_v3.ListTimeSeriesRequest.TimeSeriesView.FULL,
            })
            try:
                page_result = self.gcp_monitoring.list_time_series(request=request)
                for time_series in page_result:
                    if time_series.points:
                        # Points are returned newest first; utilization is reported as a 0-1 ratio
                        cpu_usage = time_series.points[0].value.double_value * 100
                        results['metrics'][f"gce_{instance_check['instance_name']}_cpu"] = cpu_usage
                        if cpu_usage > instance_check.get('cpu_threshold', 80):
                            results['alerts'].append({
                                'severity': 'warning',
                                'resource': instance_check['instance_name'],
                                'message': f"High CPU usage: {cpu_usage:.1f}%"
                            })
            except Exception as e:
                results['alerts'].append({
                    'severity': 'error',
                    'resource': instance_check['instance_name'],
                    'message': f"Failed to query metrics: {str(e)}"
                })

        if results['alerts']:
            results['status'] = 'degraded'
        return results
    def send_alerts(self, alerts: List[Dict[str, Any]]):
        """Send alerts to configured channels"""
        if not alerts:
            return

        # Group alerts by severity; provider query failures ('error') are routed
        # with the warnings so they are not silently dropped
        critical_alerts = [a for a in alerts if a.get('severity') == 'critical']
        warning_alerts = [a for a in alerts if a.get('severity') in ('warning', 'error')]

        # Send to Slack
        if self.slack_webhook:
            self._send_slack_alert(critical_alerts, warning_alerts)

        # Send to PagerDuty for critical alerts
        if self.pagerduty_key and critical_alerts:
            self._send_pagerduty_alert(critical_alerts)
    def _send_slack_alert(self, critical_alerts: List, warning_alerts: List):
        """Send alert to Slack"""
        color = "danger" if critical_alerts else "warning"
        message = {
            "attachments": [{
                "color": color,
                "title": "Multi-Cloud Infrastructure Alert",
                "fields": []
            }]
        }
        if critical_alerts:
            message["attachments"][0]["fields"].append({
                "title": f"Critical Alerts ({len(critical_alerts)})",
                "value": "\n".join([f"• {alert['message']}" for alert in critical_alerts[:5]]),
                "short": False
            })
        if warning_alerts:
            message["attachments"][0]["fields"].append({
                "title": f"Warning Alerts ({len(warning_alerts)})",
                "value": "\n".join([f"• {alert['message']}" for alert in warning_alerts[:5]]),
                "short": False
            })
        response = requests.post(self.slack_webhook, json=message, timeout=10)
        response.raise_for_status()
    def _send_pagerduty_alert(self, critical_alerts: List):
        """Send critical alert to PagerDuty"""
        payload = {
            "routing_key": self.pagerduty_key,
            "event_action": "trigger",
            "payload": {
                "summary": f"Multi-Cloud Critical Alert: {len(critical_alerts)} issues detected",
                "source": "multi-cloud-monitor",
                "severity": "critical",
                "custom_details": {
                    "alerts": critical_alerts
                }
            }
        }
        response = requests.post("https://events.pagerduty.com/v2/enqueue", json=payload, timeout=10)
        response.raise_for_status()
def main():
    import argparse

    parser = argparse.ArgumentParser(description='Multi-Cloud Alert Manager')
    parser.add_argument('--config', required=True, help='Configuration file')
    parser.add_argument('--send-alerts', action='store_true', help='Send alerts to configured channels')
    args = parser.parse_args()

    with open(args.config, 'r') as f:
        config = json.load(f)

    alert_manager = MultiCloudAlertManager(config)
    results = alert_manager.check_all_providers()

    print(f"Overall Status: {results['overall_status']}")
    print(f"Total Alerts: {len(results['alerts'])}")
    for provider, provider_results in results['providers'].items():
        print(f"\n{provider.upper()}:")
        print(f"  Status: {provider_results['status']}")
        print(f"  Alerts: {len(provider_results.get('alerts', []))}")

    if args.send_alerts and results['alerts']:
        alert_manager.send_alerts(results['alerts'])
        print(f"\n📧 Sent {len(results['alerts'])} alerts")


if __name__ == "__main__":
    main()
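The script reads a JSON configuration describing each provider's checks and the alert channels. A sketch of the shape it expects — the IDs, thresholds, workspace, and project values below are placeholders:
{
  "slack_webhook_url": "https://hooks.slack.com/services/T000/B000/XXXXXXXX",
  "pagerduty_integration_key": "your-pagerduty-routing-key",
  "providers": [
    {
      "name": "aws",
      "ec2_checks": [
        {"instance_id": "i-0123456789abcdef0", "cpu_threshold": 80}
      ],
      "rds_checks": [
        {"db_instance": "prod-db", "connection_threshold": 100}
      ]
    },
    {
      "name": "azure",
      "workspace_id": "<log-analytics-workspace-id>",
      "vm_checks": [
        {"vm_name": "prod-vm-01", "cpu_threshold": 80}
      ]
    },
    {
      "name": "gcp",
      "project_id": "my-gcp-project",
      "instance_checks": [
        {"instance_name": "prod-instance-01", "cpu_threshold": 80}
      ]
    }
  ]
}
Run the check on a schedule, for example from cron every five minutes (paths are illustrative):
*/5 * * * * /usr/bin/python3 /opt/monitoring/scripts/multi_cloud_alerting.py --config /etc/monitoring/alerting-config.json --send-alerts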
Unified Dashboard Creation
Create Grafana dashboards that put all three providers side by side, provisioned through Grafana's HTTP API:
#!/bin/bash
# scripts/setup-dashboards.sh
set -e
GRAFANA_URL=${1:-"http://localhost:3000"}
GRAFANA_USER=${2:-"admin"}
GRAFANA_PASSWORD=${3:-"admin"}
create_multi_cloud_dashboard() {
echo "Creating multi-cloud overview dashboard..."
cat > multi-cloud-dashboard.json << 'EOF'
{
"dashboard": {
"title": "Multi-Cloud Infrastructure Overview",
"tags": ["multi-cloud", "overview"],
"timezone": "browser",
"panels": [
{
"title": "AWS EC2 CPU Utilization",
"type": "stat",
"targets": [
{
"expr": "aws_ec2_cpuutilization_average",
"legendFormat": "{{instance_id}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"thresholds": {
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 70},
{"color": "red", "value": 90}
]
}
}
},
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 0}
},
{
"title": "Azure VM CPU Utilization",
"type": "stat",
"targets": [
{
"expr": "azure_vm_cpu_percent",
"legendFormat": "{{vm_name}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"thresholds": {
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 70},
{"color": "red", "value": 90}
]
}
}
},
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 0}
},
{
"title": "GCP Compute CPU Utilization",
"type": "stat",
"targets": [
{
"expr": "gcp_compute_instance_cpu_utilization",
"legendFormat": "{{instance_name}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"thresholds": {
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 70},
{"color": "red", "value": 90}
]
}
}
},
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 0}
},
{
"title": "Cross-Cloud Network Latency",
"type": "graph",
"targets": [
{
"expr": "probe_duration_seconds{job=\"blackbox\"}",
"legendFormat": "{{instance}}"
}
],
"yAxes": [
{
"label": "Latency (seconds)",
"min": 0
}
],
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 8}
},
{
"title": "Storage Usage by Provider",
"type": "piechart",
"targets": [
{
"expr": "aws_s3_bucket_size_bytes",
"legendFormat": "AWS S3"
},
{
"expr": "azure_storage_account_used_capacity",
"legendFormat": "Azure Storage"
},
{
"expr": "gcp_storage_bucket_size",
"legendFormat": "GCP Storage"
}
],
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16}
},
{
"title": "Database Connections",
"type": "graph",
"targets": [
{
"expr": "aws_rds_database_connections",
"legendFormat": "AWS RDS {{db_instance_identifier}}"
},
{
"expr": "azure_sql_connections",
"legendFormat": "Azure SQL {{server_name}}"
},
{
"expr": "gcp_cloudsql_connections",
"legendFormat": "GCP Cloud SQL {{database_id}}"
}
],
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 16}
}
],
"time": {
"from": "now-1h",
"to": "now"
},
"refresh": "30s"
},
"overwrite": true
}
EOF
# Import dashboard to Grafana
curl -sS --fail -X POST \
-H "Content-Type: application/json" \
-u "$GRAFANA_USER:$GRAFANA_PASSWORD" \
-d @multi-cloud-dashboard.json \
"$GRAFANA_URL/api/dashboards/db"
echo "✅ Multi-cloud dashboard created"
}
create_cost_dashboard() {
echo "Creating cost monitoring dashboard..."
cat > cost-dashboard.json << 'EOF'
{
"dashboard": {
"title": "Multi-Cloud Cost Analysis",
"tags": ["cost", "billing"],
"panels": [
{
"title": "Daily Costs by Provider",
"type": "graph",
"targets": [
{
"expr": "aws_billing_estimated_charges",
"legendFormat": "AWS"
},
{
"expr": "azure_consumption_cost",
"legendFormat": "Azure"
},
{
"expr": "gcp_billing_cost",
"legendFormat": "GCP"
}
],
"yAxes": [
{
"label": "Cost (USD)",
"min": 0
}
],
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 0}
},
{
"title": "Cost by Service Category",
"type": "table",
"targets": [
{
"expr": "sum by (service) (aws_billing_estimated_charges)",
"format": "table"
}
],
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}
},
{
"title": "Monthly Cost Trend",
"type": "graph",
"targets": [
{
"expr": "increase(aws_billing_estimated_charges[30d])",
"legendFormat": "AWS Monthly"
}
],
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8}
}
]
},
"overwrite": true
}
EOF
curl -sS --fail -X POST \
-H "Content-Type: application/json" \
-u "$GRAFANA_USER:$GRAFANA_PASSWORD" \
-d @cost-dashboard.json \
"$GRAFANA_URL/api/dashboards/db"
echo "✅ Cost dashboard created"
}
create_sla_dashboard() {
echo "Creating SLA monitoring dashboard..."
cat > sla-dashboard.json << 'EOF'
{
"dashboard": {
"title": "Multi-Cloud SLA Monitoring",
"tags": ["sla", "uptime"],
"panels": [
{
"title": "Service Uptime",
"type": "stat",
"targets": [
{
"expr": "avg_over_time(up{job=\"multi-cloud-services\"}[24h]) * 100",
"legendFormat": "{{service}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 95,
"max": 100,
"thresholds": {
"steps": [
{"color": "red", "value": null},
{"color": "yellow", "value": 99},
{"color": "green", "value": 99.9}
]
}
}
},
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 0}
},
{
"title": "Response Time SLA",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.95, http_request_duration_seconds_bucket)",
"legendFormat": "95th percentile"
},
{
"expr": "histogram_quantile(0.99, http_request_duration_seconds_bucket)",
"legendFormat": "99th percentile"
}
],
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 8}
}
]
},
"overwrite": true
}
EOF
curl -sS --fail -X POST \
-H "Content-Type: application/json" \
-u "$GRAFANA_USER:$GRAFANA_PASSWORD" \
-d @sla-dashboard.json \
"$GRAFANA_URL/api/dashboards/db"
echo "✅ SLA dashboard created"
}
# Create all dashboards
create_multi_cloud_dashboard
create_cost_dashboard
create_sla_dashboard
# Cleanup temp files
rm -f multi-cloud-dashboard.json cost-dashboard.json sla-dashboard.json
echo "✅ All multi-cloud dashboards created successfully"
echo "Access them at: $GRAFANA_URL"
What’s Next
Unified monitoring and observability provide the visibility needed to operate multi-cloud infrastructure effectively. With comprehensive metrics, alerting, and dashboards in place, you can maintain high availability and performance across all your cloud providers.
In the final part of this guide, we’ll explore governance and cost management strategies that help you maintain control, compliance, and cost efficiency across your entire multi-cloud environment.