Cost Optimization

AWS costs can spiral out of control quickly without proper governance and optimization strategies. Terraform helps by making cost controls repeatable and enforceable, but you need to understand AWS pricing models, implement proper tagging strategies, and automate resource lifecycle management to keep costs under control.

This part covers the patterns and practices for implementing cost optimization with Terraform, from basic tagging strategies to advanced automation that right-sizes resources and manages their lifecycle.

Comprehensive Tagging Strategy

Consistent tagging is the foundation of cost management and allocation:

# Global tagging strategy
locals {
  # Required tags for all resources
  required_tags = {
    Environment   = var.environment
    Project       = var.project_name
    Owner         = var.team_name
    CostCenter    = var.cost_center
    ManagedBy     = "terraform"
    # NOTE: timestamp() changes on every run, so this tag causes perpetual
    # diffs; prefer a static input variable or ignore_changes on tags
    CreatedDate   = formatdate("YYYY-MM-DD", timestamp())
  }
  
  # Optional tags that can be merged
  optional_tags = {
    Application = var.application_name
    Component   = var.component_name
    Version     = var.application_version
  }
  
  # Combined tags
  common_tags = merge(local.required_tags, local.optional_tags)
}
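
The locals above assume a handful of input variables. A minimal sketch of their declarations (names and types follow the references above; application_name, component_name, and application_version follow the same pattern):

variable "environment" {
  description = "Deployment environment (e.g., dev, staging, production)"
  type        = string
}

variable "project_name" {
  description = "Project identifier for cost allocation"
  type        = string
}

variable "team_name" {
  description = "Owning team, recorded in the Owner tag"
  type        = string
}

variable "cost_center" {
  description = "Cost center code for chargeback reporting"
  type        = string
}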

# Provider-level default tags
provider "aws" {
  region = var.aws_region
  
  default_tags {
    tags = local.required_tags
  }
}

# Resource-specific tagging, with enforcement via a postcondition
resource "aws_instance" "web" {
  count = var.web_instance_count  # assumed count variable; tags below use count.index

  ami           = data.aws_ami.amazon_linux.id
  instance_type = var.instance_type

  tags = merge(local.common_tags, {
    Name         = "${var.name_prefix}-web-${count.index + 1}"
    Role         = "webserver"
    Backup       = "daily"
    AutoShutdown = var.environment != "production" ? "true" : "false"
  })

  # Enforce required tags at apply time
  lifecycle {
    postcondition {
      # Check tags_all, which includes the provider's default_tags
      condition = alltrue([
        for tag in keys(local.required_tags) :
        contains(keys(self.tags_all), tag)
      ])
      error_message = "All required tags must be present: ${join(", ", keys(local.required_tags))}"
    }
  }
}

Resource Right-Sizing

Implement policies to prevent oversized resources:

# Instance type validation
variable "allowed_instance_types" {
  description = "Allowed EC2 instance types by environment"
  type = map(list(string))
  
  default = {
    dev = [
      "t3.nano", "t3.micro", "t3.small", "t3.medium"
    ]
    staging = [
      "t3.small", "t3.medium", "t3.large",
      "m5.large", "m5.xlarge"
    ]
    production = [
      "t3.medium", "t3.large", "t3.xlarge",
      "m5.large", "m5.xlarge", "m5.2xlarge",
      "c5.large", "c5.xlarge", "c5.2xlarge"
    ]
  }
}

resource "aws_instance" "web" {
  ami           = data.aws_ami.amazon_linux.id
  instance_type = var.instance_type
  
  lifecycle {
    precondition {
      condition = contains(
        var.allowed_instance_types[var.environment],
        var.instance_type
      )
      error_message = "Instance type ${var.instance_type} is not allowed in ${var.environment} environment. Allowed types: ${join(", ", var.allowed_instance_types[var.environment])}"
    }
  }
}
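
As a complement to resource preconditions, Terraform 1.9 and later allows variable validation blocks to reference other variables, so the same check can fail at plan time before any resource is evaluated. A sketch:

variable "instance_type" {
  description = "EC2 instance type for web servers"
  type        = string

  validation {
    # Cross-variable references in validation require Terraform >= 1.9
    condition = contains(
      var.allowed_instance_types[var.environment],
      var.instance_type
    )
    error_message = "Instance type is not in the allowed list for this environment."
  }
}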

# RDS instance size controls
variable "allowed_db_instance_classes" {
  description = "Allowed RDS instance classes by environment"
  type = map(list(string))
  
  default = {
    dev = [
      "db.t3.micro", "db.t3.small"
    ]
    staging = [
      "db.t3.small", "db.t3.medium", "db.r5.large"
    ]
    production = [
      "db.t3.medium", "db.t3.large",
      "db.r5.large", "db.r5.xlarge", "db.r5.2xlarge"
    ]
  }
}

resource "aws_db_instance" "main" {
  identifier     = "${var.name_prefix}-database"
  engine         = "mysql"
  engine_version = "8.0"
  instance_class = var.db_instance_class
  
  lifecycle {
    precondition {
      condition = contains(
        var.allowed_db_instance_classes[var.environment],
        var.db_instance_class
      )
      error_message = "DB instance class ${var.db_instance_class} is not allowed in ${var.environment} environment."
    }
  }
}

Automated Resource Scheduling

Implement automated start/stop for non-production resources:

# Lambda function for EC2 scheduling
resource "aws_lambda_function" "ec2_scheduler" {
  filename         = "ec2_scheduler.zip"
  function_name    = "${var.name_prefix}-ec2-scheduler"
  role            = aws_iam_role.ec2_scheduler.arn
  handler         = "index.handler"
  runtime         = "python3.9"
  timeout         = 60
  
  environment {
    variables = {
      ENVIRONMENT = var.environment
    }
  }
  
  tags = local.common_tags
}
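
The ec2_scheduler.zip artifact is assumed to exist alongside the configuration. One way to build it (the source path is illustrative) is the archive_file data source, which also lets Terraform detect code changes:

data "archive_file" "ec2_scheduler" {
  type        = "zip"
  source_file = "${path.module}/lambda/ec2_scheduler/index.py"  # assumed handler location
  output_path = "${path.module}/ec2_scheduler.zip"
}

Point the function's filename at data.archive_file.ec2_scheduler.output_path and set source_code_hash = data.archive_file.ec2_scheduler.output_base64sha256 so code changes trigger a redeploy.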

# IAM role for scheduler
resource "aws_iam_role" "ec2_scheduler" {
  name = "${var.name_prefix}-ec2-scheduler-role"
  
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Action = "sts:AssumeRole"
        Effect = "Allow"
        Principal = {
          Service = "lambda.amazonaws.com"
        }
      }
    ]
  })
}

resource "aws_iam_role_policy" "ec2_scheduler" {
  name = "${var.name_prefix}-ec2-scheduler-policy"
  role = aws_iam_role.ec2_scheduler.id
  
  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect = "Allow"
        Action = [
          "logs:CreateLogGroup",
          "logs:CreateLogStream",
          "logs:PutLogEvents"
        ]
        Resource = "arn:aws:logs:*:*:*"
      },
      {
        Effect = "Allow"
        Action = [
          "ec2:DescribeInstances",
          "ec2:StartInstances",
          "ec2:StopInstances"
        ]
        Resource = "*"
      }
    ]
  })
}

# CloudWatch Events for scheduling
resource "aws_cloudwatch_event_rule" "stop_instances" {
  name                = "${var.name_prefix}-stop-instances"
  description         = "Stop non-production instances at 6 PM"
  schedule_expression = "cron(0 18 ? * MON-FRI *)"
  
  tags = local.common_tags
}

resource "aws_cloudwatch_event_rule" "start_instances" {
  name                = "${var.name_prefix}-start-instances"
  description         = "Start non-production instances at 8 AM"
  schedule_expression = "cron(0 8 ? * MON-FRI *)"
  
  tags = local.common_tags
}

resource "aws_cloudwatch_event_target" "stop_instances" {
  rule      = aws_cloudwatch_event_rule.stop_instances.name
  target_id = "StopInstancesTarget"
  arn       = aws_lambda_function.ec2_scheduler.arn
  
  input = jsonencode({
    action = "stop"
  })
}

resource "aws_cloudwatch_event_target" "start_instances" {
  rule      = aws_cloudwatch_event_rule.start_instances.name
  target_id = "StartInstancesTarget"
  arn       = aws_lambda_function.ec2_scheduler.arn
  
  input = jsonencode({
    action = "start"
  })
}

resource "aws_lambda_permission" "allow_cloudwatch_stop" {
  statement_id  = "AllowExecutionFromCloudWatchStop"
  action        = "lambda:InvokeFunction"
  function_name = aws_lambda_function.ec2_scheduler.function_name
  principal     = "events.amazonaws.com"
  source_arn    = aws_cloudwatch_event_rule.stop_instances.arn
}

resource "aws_lambda_permission" "allow_cloudwatch_start" {
  statement_id  = "AllowExecutionFromCloudWatchStart"
  action        = "lambda:InvokeFunction"
  function_name = aws_lambda_function.ec2_scheduler.function_name
  principal     = "events.amazonaws.com"
  source_arn    = aws_cloudwatch_event_rule.start_instances.arn
}
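
Classic EventBridge rules evaluate cron expressions in UTC. If schedules should track a local business day (including daylight saving shifts), EventBridge Scheduler supports an explicit timezone. A sketch, assuming an IAM role (scheduler_invoke, not shown) that EventBridge Scheduler can assume to invoke the function:

resource "aws_scheduler_schedule" "stop_instances_local" {
  name                         = "${var.name_prefix}-stop-instances-local"
  schedule_expression          = "cron(0 18 ? * MON-FRI *)"
  schedule_expression_timezone = "America/New_York"  # illustrative

  flexible_time_window {
    mode = "OFF"
  }

  target {
    arn      = aws_lambda_function.ec2_scheduler.arn
    role_arn = aws_iam_role.scheduler_invoke.arn  # assumed role with lambda:InvokeFunction
    input    = jsonencode({ action = "stop" })
  }
}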

Storage Lifecycle Management

Implement intelligent tiering and lifecycle policies:

# S3 bucket with intelligent tiering
resource "aws_s3_bucket" "data_storage" {
  bucket = "${var.name_prefix}-data-storage"
  
  tags = local.common_tags
}

resource "aws_s3_bucket_intelligent_tiering_configuration" "data_storage" {
  bucket = aws_s3_bucket.data_storage.id
  name   = "EntireBucket"
  
  status = "Enabled"
  
  filter {
    prefix = ""
  }
  
  # Archive tier after 125 days without access (minimum is 90)
  tiering {
    access_tier = "ARCHIVE_ACCESS"
    days        = 125
  }

  # Deep Archive after 180 days (the minimum AWS allows for this tier)
  tiering {
    access_tier = "DEEP_ARCHIVE_ACCESS"
    days        = 180
  }
}

# Lifecycle configuration for different data types
resource "aws_s3_bucket_lifecycle_configuration" "data_storage" {
  bucket = aws_s3_bucket.data_storage.id
  
  rule {
    id     = "logs_lifecycle"
    status = "Enabled"
    
    filter {
      prefix = "logs/"
    }
    
    transition {
      days          = 30
      storage_class = "STANDARD_IA"
    }
    
    transition {
      days          = 90
      storage_class = "GLACIER"
    }
    
    transition {
      days          = 365
      storage_class = "DEEP_ARCHIVE"
    }
    
    expiration {
      days = 2555  # 7 years
    }
  }
  
  rule {
    id     = "temp_data_cleanup"
    status = "Enabled"
    
    filter {
      prefix = "temp/"
    }
    
    expiration {
      days = 7
    }
  }
  
  rule {
    id     = "incomplete_multipart_uploads"
    status = "Enabled"
    
    abort_incomplete_multipart_upload {
      days_after_initiation = 1
    }
  }
}
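
If versioning is enabled on this bucket, noncurrent object versions keep accruing storage charges invisibly. A rule along these lines (a sketch to add to the lifecycle configuration above) expires them:

  rule {
    id     = "noncurrent_version_cleanup"
    status = "Enabled"

    noncurrent_version_expiration {
      noncurrent_days = 30  # retain superseded versions for 30 days
    }
  }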

# EBS volume optimization
resource "aws_ebs_volume" "data" {
  availability_zone = var.availability_zone
  size              = var.volume_size
  type              = var.environment == "production" ? "gp3" : "gp2"
  encrypted         = true
  
  # Use gp3 for better cost/performance in production. throughput and iops
  # are plain arguments (dynamic blocks only work for nested blocks) and
  # apply only to gp3 volumes
  throughput = var.environment == "production" ? 125 : null   # gp3 baseline
  iops       = var.environment == "production" ? 3000 : null  # gp3 baseline
  
  tags = merge(local.common_tags, {
    Name = "${var.name_prefix}-data-volume"
    Type = "data"
  })
}

Reserved Instance and Savings Plans Management

Track and manage reserved capacity:

# Data source to check existing reserved instances (confirm this data
# source is available in your AWS provider version before relying on it)
data "aws_ec2_reserved_instances" "existing" {
  filter {
    name   = "state"
    values = ["active"]
  }
}

# Local calculation for RI coverage
locals {
  # Running on-demand instances by type, taken directly from input
  running_instances = var.instance_counts
  
  # RI coverage by type (assumes at most one active listing per instance
  # type; aggregate with the ... grouping operator if that may not hold)
  ri_coverage = {
    for ri in data.aws_ec2_reserved_instances.existing.reserved_instances :
    ri.instance_type => ri.instance_count
  }
  
  # Identify gaps in RI coverage
  ri_gaps = {
    for instance_type, running_count in local.running_instances :
    instance_type => max(0, running_count - lookup(local.ri_coverage, instance_type, 0))
  }
}

# Output RI recommendations
output "ri_recommendations" {
  description = "Reserved Instance purchase recommendations"
  value = {
    for instance_type, gap in local.ri_gaps :
    instance_type => {
      running_instances = local.running_instances[instance_type]
      reserved_instances = lookup(local.ri_coverage, instance_type, 0)
      recommended_purchase = gap
    }
    if gap > 0
  }
}
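
The calculation above assumes an instance_counts input describing the steady-state fleet. Its shape, with illustrative values:

variable "instance_counts" {
  description = "Running on-demand instance counts by type, used for RI gap analysis"
  type        = map(number)

  default = {
    "t3.medium" = 4
    "m5.large"  = 2
  }
}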

Cost Monitoring and Alerting

Set up comprehensive cost monitoring:

# Budget for overall account spending
resource "aws_budgets_budget" "monthly_budget" {
  name         = "${var.name_prefix}-monthly-budget"
  budget_type  = "COST"
  limit_amount = var.monthly_budget_limit
  limit_unit   = "USD"
  time_unit    = "MONTHLY"
  
  # cost_filters (a map) was deprecated and removed in AWS provider v5;
  # use cost_filter blocks instead
  cost_filter {
    name   = "LinkedAccount"
    values = [data.aws_caller_identity.current.account_id]
  }
  
  notification {
    comparison_operator        = "GREATER_THAN"
    threshold                  = 80
    threshold_type             = "PERCENTAGE"
    notification_type          = "ACTUAL"
    subscriber_email_addresses = var.budget_notification_emails
  }

  notification {
    comparison_operator        = "GREATER_THAN"
    threshold                  = 100
    threshold_type             = "PERCENTAGE"
    notification_type          = "FORECASTED"
    subscriber_email_addresses = var.budget_notification_emails
  }
}

# Service-specific budgets
resource "aws_budgets_budget" "service_budgets" {
  for_each = var.service_budgets
  
  name         = "${var.name_prefix}-${each.key}-budget"
  budget_type  = "COST"
  limit_amount = each.value.limit
  limit_unit   = "USD"
  time_unit    = "MONTHLY"
  
  cost_filter {
    name   = "Service"
    values = [each.value.service_name]
  }
  
  notification {
    comparison_operator        = "GREATER_THAN"
    threshold                  = each.value.threshold
    threshold_type             = "PERCENTAGE"
    notification_type          = "ACTUAL"
    subscriber_email_addresses = var.budget_notification_emails
  }
}
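
The for_each above expects a map describing each service budget. A sketch of that variable (service names must match the strings AWS billing uses; limits and thresholds are illustrative):

variable "service_budgets" {
  description = "Per-service monthly budget limits and alert thresholds"
  type = map(object({
    service_name = string
    limit        = string
    threshold    = number
  }))

  default = {
    ec2 = {
      service_name = "Amazon Elastic Compute Cloud - Compute"
      limit        = "1000"
      threshold    = 80
    }
    rds = {
      service_name = "Amazon Relational Database Service"
      limit        = "500"
      threshold    = 80
    }
  }
}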

# Cost anomaly detection; the Terraform resource is aws_ce_anomaly_monitor
resource "aws_ce_anomaly_monitor" "service_anomaly" {
  name         = "${var.name_prefix}-service-anomaly-monitor"
  monitor_type = "CUSTOM"

  # A CUSTOM monitor takes a Cost Explorer expression scoping it to specific
  # services; a DIMENSIONAL monitor (monitor_dimension = "SERVICE") would
  # watch every service instead
  monitor_specification = jsonencode({
    Dimensions = {
      Key          = "SERVICE"
      MatchOptions = ["EQUALS"]
      Values       = ["Amazon Elastic Compute Cloud - Compute", "Amazon Relational Database Service"]
    }
  })

  tags = local.common_tags
}

resource "aws_ce_anomaly_subscription" "service_anomaly" {
  name      = "${var.name_prefix}-anomaly-subscription"
  frequency = "DAILY"
  
  monitor_arn_list = [
    aws_ce_anomaly_monitor.service_anomaly.arn
  ]
  
  subscriber {
    type    = "EMAIL"
    address = var.cost_anomaly_email
  }
  
  threshold_expression {
    # A single condition sits directly under threshold_expression; and/or
    # blocks are for combining two or more conditions
    dimension {
      key           = "ANOMALY_TOTAL_IMPACT_ABSOLUTE"
      values        = ["50"]  # alert on anomalies with >= $50 total impact
      match_options = ["GREATER_THAN_OR_EQUAL"]
    }
  }
  
  tags = local.common_tags
}

Spot Instance Integration

Spot Instances offer deep discounts (often 60-90 percent off On-Demand) in exchange for possible interruption, making them a good fit for fault-tolerant workloads:

# Launch template used by the mixed-instances ASG below. The template itself
# does not request Spot capacity; the ASG's instances_distribution decides
# which launches are Spot versus On-Demand.
resource "aws_launch_template" "spot_template" {
  name_prefix   = "${var.name_prefix}-spot-"
  image_id      = data.aws_ami.amazon_linux.id
  instance_type = var.spot_instance_type
  
  vpc_security_group_ids = [aws_security_group.web.id]
  
  iam_instance_profile {
    name = aws_iam_instance_profile.app_server.name
  }
  
  user_data = base64encode(templatefile("${path.module}/user_data.sh", {
    environment = var.environment
  }))
  
  tag_specifications {
    resource_type = "instance"
    tags = merge(local.common_tags, {
      Name = "${var.name_prefix}-spot-instance"
      Type = "spot"
    })
  }
}

# Auto Scaling Group with mixed instances
resource "aws_autoscaling_group" "mixed_instances" {
  name                = "${var.name_prefix}-mixed-asg"
  vpc_zone_identifier = var.private_subnet_ids
  target_group_arns   = [aws_lb_target_group.web.arn]
  health_check_type   = "ELB"
  
  min_size         = var.min_size
  max_size         = var.max_size
  desired_capacity = var.desired_capacity
  
  mixed_instances_policy {
    launch_template {
      launch_template_specification {
        launch_template_id = aws_launch_template.spot_template.id
        version            = "$Latest"
      }
      
      override {
        instance_type     = "t3.medium"
        weighted_capacity = "1"
      }
      
      override {
        instance_type     = "t3.large"
        weighted_capacity = "2"
      }
    }
    
    instances_distribution {
      on_demand_base_capacity                  = 1
      on_demand_percentage_above_base_capacity = 25
      spot_allocation_strategy                 = "capacity-optimized"
      # spot_instance_pools applies only to the lowest-price allocation
      # strategy, so it is omitted here
      spot_max_price                           = var.spot_max_price
    }
  }
  
  tag {
    key                 = "Name"
    value               = "${var.name_prefix}-mixed-asg"
    propagate_at_launch = true
  }
  
  dynamic "tag" {
    for_each = local.common_tags
    content {
      key                 = tag.key
      value               = tag.value
      propagate_at_launch = true
    }
  }
}
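
One related setting worth considering for Spot-heavy groups: Capacity Rebalancing lets the group proactively replace Spot instances that receive a rebalance recommendation before they are interrupted. It is a single argument to add to the group above:

  # Inside aws_autoscaling_group "mixed_instances"
  capacity_rebalance = true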

Resource Cleanup Automation

Automate cleanup of unused resources:

# Lambda function for resource cleanup
resource "aws_lambda_function" "resource_cleanup" {
  filename         = "resource_cleanup.zip"
  function_name    = "${var.name_prefix}-resource-cleanup"
  role            = aws_iam_role.resource_cleanup.arn
  handler         = "index.handler"
  runtime         = "python3.9"
  timeout         = 300
  
  environment {
    variables = {
      ENVIRONMENT = var.environment
      DRY_RUN     = var.cleanup_dry_run
    }
  }
  
  tags = local.common_tags
}

# Schedule cleanup to run weekly
resource "aws_cloudwatch_event_rule" "resource_cleanup" {
  name                = "${var.name_prefix}-resource-cleanup"
  description         = "Weekly resource cleanup"
  schedule_expression = "cron(0 2 ? * SUN *)"  # 2 AM every Sunday
  
  tags = local.common_tags
}

resource "aws_cloudwatch_event_target" "resource_cleanup" {
  rule      = aws_cloudwatch_event_rule.resource_cleanup.name
  target_id = "ResourceCleanupTarget"
  arn       = aws_lambda_function.resource_cleanup.arn
}

resource "aws_lambda_permission" "allow_cloudwatch_cleanup" {
  statement_id  = "AllowExecutionFromCloudWatch"
  action        = "lambda:InvokeFunction"
  function_name = aws_lambda_function.resource_cleanup.function_name
  principal     = "events.amazonaws.com"
  source_arn    = aws_cloudwatch_event_rule.resource_cleanup.arn
}

What’s Next

Cost optimization provides the financial discipline needed for sustainable AWS operations, but implementing reusable patterns and modules is what makes these optimizations scalable across your organization.

In the next part, we’ll explore AWS-specific module patterns that encapsulate these cost optimization strategies along with security and operational best practices, creating reusable building blocks for your infrastructure.